Simplify data ingestion service

rafaeldpsilva
2025-09-10 15:21:53 +01:00
parent fa694443e7
commit 13556347b0
18 changed files with 826 additions and 1560 deletions


@@ -0,0 +1,710 @@
"""
Data validation and enrichment for time series data.
Provides quality assessment, metadata enrichment, and data transformation capabilities.
"""
import json
import logging
import statistics
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
import hashlib
import re
import math
logger = logging.getLogger(__name__)
class DataValidator:
"""Validates, enriches, and transforms time series data"""
def __init__(self, db, redis_client):
self.db = db
self.redis = redis_client
self.validation_rules = {}
self.enrichment_cache = {}
self.quality_thresholds = {
"completeness": 0.8,
"accuracy": 0.9,
"consistency": 0.85,
"timeliness": 0.9
}
async def initialize(self):
"""Initialize validator with default rules and configurations"""
try:
await self._load_validation_rules()
await self._load_enrichment_metadata()
logger.info("Data validator initialized successfully")
except Exception as e:
logger.error(f"Error initializing data validator: {e}")
raise
async def validate_and_enrich_data(self, data: List[Dict[str, Any]],
source_name: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
"""Validate and enrich time series data, returning processed data and quality report"""
try:
logger.info(f"Validating and enriching {len(data)} records from {source_name}")
# Initialize validation report
quality_report = {
"source": source_name,
"total_records": len(data),
"processed_records": 0,
"rejected_records": 0,
"quality_scores": {},
"issues_found": [],
"processing_time": datetime.utcnow().isoformat()
}
enriched_data = []
# Process each record
for i, record in enumerate(data):
try:
# Validate record
validation_result = await self._validate_record(record, source_name)
if validation_result["is_valid"]:
# Enrich the record
enriched_record = await self._enrich_record(record, source_name, validation_result)
enriched_data.append(enriched_record)
quality_report["processed_records"] += 1
else:
quality_report["rejected_records"] += 1
quality_report["issues_found"].extend(validation_result["issues"])
logger.warning(f"Record {i} rejected: {validation_result['issues']}")
except Exception as e:
logger.error(f"Error processing record {i}: {e}")
quality_report["rejected_records"] += 1
quality_report["issues_found"].append(f"Processing error: {str(e)}")
# Calculate overall quality scores
quality_report["quality_scores"] = await self._calculate_quality_scores(enriched_data, quality_report)
# Store quality report
await self._store_quality_report(quality_report, source_name)
logger.info(f"Validation complete: {quality_report['processed_records']}/{quality_report['total_records']} records processed")
return enriched_data, quality_report
except Exception as e:
logger.error(f"Error in data validation and enrichment: {e}")
raise
async def _validate_record(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
"""Validate a single record against quality rules"""
validation_result = {
"is_valid": True,
"issues": [],
"quality_metrics": {}
}
try:
# Check required fields
required_fields = ["sensor_id", "timestamp", "value"]
for field in required_fields:
if field not in record or record[field] is None:
validation_result["is_valid"] = False
validation_result["issues"].append(f"Missing required field: {field}")
if not validation_result["is_valid"]:
return validation_result
# Validate timestamp
timestamp_validation = await self._validate_timestamp(record["timestamp"])
validation_result["quality_metrics"]["timestamp_quality"] = timestamp_validation["score"]
if not timestamp_validation["is_valid"]:
validation_result["issues"].extend(timestamp_validation["issues"])
# Validate numeric value
value_validation = await self._validate_numeric_value(record["value"], record.get("unit"))
validation_result["quality_metrics"]["value_quality"] = value_validation["score"]
if not value_validation["is_valid"]:
validation_result["issues"].extend(value_validation["issues"])
# Validate sensor ID format
sensor_validation = await self._validate_sensor_id(record["sensor_id"])
validation_result["quality_metrics"]["sensor_id_quality"] = sensor_validation["score"]
if not sensor_validation["is_valid"]:
validation_result["issues"].extend(sensor_validation["issues"])
# Check for duplicates
duplicate_check = await self._check_for_duplicates(record, source_name)
validation_result["quality_metrics"]["uniqueness"] = duplicate_check["score"]
if not duplicate_check["is_unique"]:
validation_result["issues"].extend(duplicate_check["issues"])
# Calculate overall validity
if validation_result["issues"]:
# Allow minor issues but flag major ones
major_issues = [issue for issue in validation_result["issues"]
if "Missing required field" in issue or "Invalid" in issue]
validation_result["is_valid"] = len(major_issues) == 0
except Exception as e:
logger.error(f"Error validating record: {e}")
validation_result["is_valid"] = False
validation_result["issues"].append(f"Validation error: {str(e)}")
return validation_result
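# Illustrative shape of the returned dict (values hypothetical):
# {"is_valid": True, "issues": [],
#  "quality_metrics": {"timestamp_quality": 1.0, "value_quality": 0.9,
#                      "sensor_id_quality": 1.0, "uniqueness": 1.0}}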
async def _enrich_record(self, record: Dict[str, Any], source_name: str,
validation_result: Dict[str, Any]) -> Dict[str, Any]:
"""Enrich a record with additional metadata and derived fields"""
try:
enriched = record.copy()
# Add validation metadata
enriched["data_quality"] = {
"quality_score": statistics.mean(validation_result["quality_metrics"].values()) if validation_result["quality_metrics"] else 0.0,
"quality_metrics": validation_result["quality_metrics"],
"validation_timestamp": datetime.utcnow().isoformat()
}
# Add source information
enriched["source_info"] = {
"source_name": source_name,
"ingestion_time": datetime.utcnow().isoformat(),
"record_id": hashlib.md5(f"{source_name}_{record.get('sensor_id', 'unknown')}_{record.get('timestamp', 0)}".encode()).hexdigest()
}
# Normalize timestamp format
enriched["timestamp"] = await self._normalize_timestamp(record["timestamp"])
enriched["timestamp_iso"] = datetime.fromtimestamp(enriched["timestamp"]).isoformat()
# Infer and enrich sensor type
sensor_type_info = await self._infer_sensor_type(record)
enriched["sensor_type"] = sensor_type_info["type"]
enriched["sensor_category"] = sensor_type_info["category"]
# Add unit standardization
unit_info = await self._standardize_unit(record.get("unit"))
enriched["unit"] = unit_info["standard_unit"]
enriched["unit_info"] = unit_info
# Calculate derived metrics
derived_metrics = await self._calculate_derived_metrics(enriched, source_name)
enriched["derived_metrics"] = derived_metrics
# Add location and context information
context_info = await self._enrich_with_context(enriched, source_name)
enriched["metadata"] = {**enriched.get("metadata", {}), **context_info}
# Add temporal features
temporal_features = await self._extract_temporal_features(enriched["timestamp"])
enriched["temporal"] = temporal_features
# Energy-specific enrichments
if sensor_type_info["category"] == "energy":
energy_enrichment = await self._enrich_energy_data(enriched)
enriched.update(energy_enrichment)
return enriched
except Exception as e:
logger.error(f"Error enriching record: {e}")
return record
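# On success the record gains these top-level keys on top of whatever the
# source supplied: data_quality, source_info, timestamp/timestamp_iso,
# sensor_type, sensor_category, unit/unit_info, derived_metrics, metadata,
# temporal, and (for energy sensors) usage_pattern, demand_level and
# tariff_period.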
async def _validate_timestamp(self, timestamp) -> Dict[str, Any]:
"""Validate timestamp format and reasonableness"""
result = {"is_valid": True, "issues": [], "score": 1.0}
try:
# Convert to numeric timestamp
if isinstance(timestamp, str):
try:
# Try parsing ISO format
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
ts = dt.timestamp()
except (ValueError, TypeError):
# Try parsing as unix timestamp string
ts = float(timestamp)
else:
ts = float(timestamp)
# Check if timestamp is reasonable (not too far in past/future)
current_time = datetime.utcnow().timestamp()
max_age = 365 * 24 * 3600 # 1 year
max_future = 24 * 3600 # 1 day
if ts < current_time - max_age:
result["issues"].append("Timestamp too old (more than 1 year)")
result["score"] -= 0.3
elif ts > current_time + max_future:
result["issues"].append("Timestamp too far in future")
result["score"] -= 0.3
# Check for reasonable precision (sub-millisecond is suspicious for energy data)
if round(ts, 3) != ts:
result["score"] -= 0.1  # Minor issue
except (ValueError, TypeError) as e:
result["is_valid"] = False
result["issues"].append(f"Invalid timestamp format: {e}")
result["score"] = 0.0
return result
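# Examples (hypothetical inputs): "2025-09-10T14:21:53Z" parses via the ISO
# branch; "1694354513" falls through to the unix-string branch; a reading
# stamped two years ago stays valid but loses 0.3 from its score.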
async def _validate_numeric_value(self, value, unit: Optional[str] = None) -> Dict[str, Any]:
"""Validate numeric value reasonableness"""
result = {"is_valid": True, "issues": [], "score": 1.0}
try:
numeric_value = float(value)
# Check for negative values (usually invalid for energy data)
if numeric_value < 0:
result["issues"].append("Negative energy value")
result["score"] -= 0.4
# Check for unreasonably large values
unit_str = (unit or "").lower()
if "wh" in unit_str:
# Energy values
if numeric_value > 100000: # >100kWh seems excessive for single reading
result["issues"].append("Unusually high energy value")
result["score"] -= 0.2
elif "w" in unit_str:
# Power values
if numeric_value > 50000: # >50kW seems excessive
result["issues"].append("Unusually high power value")
result["score"] -= 0.2
# Check for zero values (might indicate sensor issues)
if numeric_value == 0:
result["score"] -= 0.1
# Check for NaN or infinity
if math.isnan(numeric_value) or math.isinf(numeric_value):
result["is_valid"] = False
result["issues"].append("Invalid numeric value (NaN or Infinity)")
result["score"] = 0.0
except (ValueError, TypeError) as e:
result["is_valid"] = False
result["issues"].append(f"Non-numeric value: {e}")
result["score"] = 0.0
return result
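# Examples: value=-1.2 with unit "kWh" scores 0.6 yet remains valid at this
# stage (negative is treated as a minor issue); float("nan") is rejected
# outright with score 0.0.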
async def _validate_sensor_id(self, sensor_id: str) -> Dict[str, Any]:
"""Validate sensor ID format and consistency"""
result = {"is_valid": True, "issues": [], "score": 1.0}
try:
if not isinstance(sensor_id, str) or len(sensor_id) == 0:
result["is_valid"] = False
result["issues"].append("Empty or invalid sensor ID")
result["score"] = 0.0
return result
# Check length
if len(sensor_id) < 3:
result["issues"].append("Very short sensor ID")
result["score"] -= 0.2
elif len(sensor_id) > 50:
result["issues"].append("Very long sensor ID")
result["score"] -= 0.1
# Check for reasonable characters
if not re.match(r'^[a-zA-Z0-9_\-\.]+$', sensor_id):
result["issues"].append("Sensor ID contains unusual characters")
result["score"] -= 0.1
except Exception as e:
result["issues"].append(f"Sensor ID validation error: {e}")
result["score"] -= 0.1
return result
async def _check_for_duplicates(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
"""Check for duplicate records"""
result = {"is_unique": True, "issues": [], "score": 1.0}
try:
# Create record signature
signature = hashlib.md5(
f"{source_name}_{record.get('sensor_id')}_{record.get('timestamp')}_{record.get('value')}".encode()
).hexdigest()
# Check cache for recent duplicates
cache_key = f"record_signature:{signature}"
exists = await self.redis.exists(cache_key)
if exists:
result["is_unique"] = False
result["issues"].append("Duplicate record detected")
result["score"] = 0.0
else:
# Store signature with short expiration (1 hour)
await self.redis.setex(cache_key, 3600, "1")
except Exception as e:
logger.debug(f"Error checking duplicates: {e}")
# Don't fail validation for cache errors
return result
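# Example: two records from the same source with identical sensor_id,
# timestamp and value hash to the same MD5 signature, so the second one seen
# within the one-hour Redis TTL is flagged as a duplicate.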
async def _normalize_timestamp(self, timestamp) -> int:
"""Normalize timestamp to unix timestamp"""
try:
if isinstance(timestamp, str):
try:
# Try ISO format first
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
return int(dt.timestamp())
except (ValueError, TypeError):
# Try as unix timestamp string
return int(float(timestamp))
else:
return int(float(timestamp))
except (ValueError, TypeError):
# Fallback to current time
return int(datetime.utcnow().timestamp())
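# Examples: an ISO string such as "2025-09-10T14:21:53+00:00" becomes its
# integer epoch value; "1694354513.7" truncates to 1694354513; anything
# unparseable silently falls back to the current time.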
async def _infer_sensor_type(self, record: Dict[str, Any]) -> Dict[str, str]:
"""Infer sensor type from record data"""
sensor_id = record.get("sensor_id", "").lower()
unit = (record.get("unit", "") or "").lower()
value = record.get("value", 0)
metadata = record.get("metadata", {})
# Energy sensors
if "wh" in unit or "energy" in sensor_id or "consumption" in sensor_id:
return {"type": "energy", "category": "energy"}
elif "w" in unit and "wh" not in unit:
return {"type": "power", "category": "energy"}
# Environmental sensors
elif "temp" in sensor_id or "°c" in unit or "celsius" in unit:
return {"type": "temperature", "category": "environmental"}
elif "humid" in sensor_id or "%" in unit:
return {"type": "humidity", "category": "environmental"}
elif "co2" in sensor_id or "ppm" in unit:
return {"type": "co2", "category": "environmental"}
# Motion/occupancy sensors
elif "motion" in sensor_id or "occupancy" in sensor_id or ("motion" in str(metadata).lower()):
return {"type": "motion", "category": "occupancy"}
# Generation sensors
elif "generation" in sensor_id or "solar" in sensor_id or "generation" in str(metadata).lower():
return {"type": "generation", "category": "energy"}
# Default to energy if unclear
else:
return {"type": "energy", "category": "energy"}
async def _standardize_unit(self, unit: Optional[str]) -> Dict[str, Any]:
"""Standardize unit format"""
if not unit:
return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}
unit_lower = unit.lower().strip()
# Energy units
if unit_lower in ["kwh", "kw-h", "kw_h"]:
return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}
elif unit_lower in ["wh", "w-h", "w_h"]:
return {"standard_unit": "kWh", "conversion_factor": 0.001, "unit_type": "energy"}
elif unit_lower in ["mwh", "mw-h", "mw_h"]:
return {"standard_unit": "kWh", "conversion_factor": 1000.0, "unit_type": "energy"}
# Power units
elif unit_lower in ["kw", "kilowatt", "kilowatts"]:
return {"standard_unit": "kW", "conversion_factor": 1.0, "unit_type": "power"}
elif unit_lower in ["w", "watt", "watts"]:
return {"standard_unit": "kW", "conversion_factor": 0.001, "unit_type": "power"}
elif unit_lower in ["mw", "megawatt", "megawatts"]:
return {"standard_unit": "kW", "conversion_factor": 1000.0, "unit_type": "power"}
# Temperature units
elif unit_lower in ["°c", "celsius", "c"]:
return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature"}
elif unit_lower in ["°f", "fahrenheit", "f"]:
return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature", "requires_conversion": True}
# Default
else:
return {"standard_unit": unit, "conversion_factor": 1.0, "unit_type": "unknown"}
async def _calculate_derived_metrics(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
"""Calculate derived metrics from the record"""
derived = {}
try:
value = float(record.get("value", 0))
unit_info = record.get("unit_info", {})
# Apply unit conversion if needed
if unit_info.get("conversion_factor", 1.0) != 1.0:
derived["original_value"] = value
derived["converted_value"] = value * unit_info["conversion_factor"]
# Energy-specific calculations
if unit_info.get("unit_type") == "energy":
# Estimate cost (simplified)
cost_per_kwh = 0.12 # Example rate
derived["estimated_cost"] = value * cost_per_kwh
# Estimate CO2 emissions (simplified)
co2_per_kwh = 0.4 # kg CO2 per kWh (example grid factor)
derived["estimated_co2_kg"] = value * co2_per_kwh
# Add value range classification
derived["value_range"] = await self._classify_value_range(value, unit_info.get("unit_type"))
except Exception as e:
logger.debug(f"Error calculating derived metrics: {e}")
return derived
async def _classify_value_range(self, value: float, unit_type: str) -> str:
"""Classify value into ranges for better understanding"""
if unit_type == "energy":
if value < 1:
return "very_low"
elif value < 10:
return "low"
elif value < 50:
return "medium"
elif value < 200:
return "high"
else:
return "very_high"
elif unit_type == "power":
if value < 0.5:
return "very_low"
elif value < 5:
return "low"
elif value < 20:
return "medium"
elif value < 100:
return "high"
else:
return "very_high"
else:
return "unknown"
async def _enrich_with_context(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
"""Enrich record with contextual information"""
context = {}
try:
# Add geographical context if available
context["data_source"] = "real_community"
context["source_type"] = "ftp_ingestion"
# Add data freshness
ingestion_time = datetime.utcnow()
# utcfromtimestamp keeps the subtraction against utcnow() in the same timezone
data_time = datetime.utcfromtimestamp(record["timestamp"])
context["data_age_minutes"] = (ingestion_time - data_time).total_seconds() / 60
# Classify data freshness
if context["data_age_minutes"] < 15:
context["freshness"] = "real_time"
elif context["data_age_minutes"] < 60:
context["freshness"] = "near_real_time"
elif context["data_age_minutes"] < 1440: # 24 hours
context["freshness"] = "recent"
else:
context["freshness"] = "historical"
except Exception as e:
logger.debug(f"Error adding context: {e}")
return context
async def _extract_temporal_features(self, timestamp: int) -> Dict[str, Any]:
"""Extract temporal features from timestamp"""
dt = datetime.fromtimestamp(timestamp)
return {
"hour": dt.hour,
"day_of_week": dt.weekday(),
"day_of_month": dt.day,
"month": dt.month,
"quarter": (dt.month - 1) // 3 + 1,
"is_weekend": dt.weekday() >= 5,
"is_business_hours": 8 <= dt.hour <= 17,
"season": self._get_season(dt.month)
}
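# Example: a Wednesday 14:00 reading yields hour=14, day_of_week=2,
# is_weekend=False, is_business_hours=True and, in July, season="summer".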
def _get_season(self, month: int) -> str:
"""Get season from month"""
if month in [12, 1, 2]:
return "winter"
elif month in [3, 4, 5]:
return "spring"
elif month in [6, 7, 8]:
return "summer"
else:
return "autumn"
async def _enrich_energy_data(self, record: Dict[str, Any]) -> Dict[str, Any]:
"""Add energy-specific enrichments"""
enrichment = {}
try:
value = record.get("derived_metrics", {}).get("converted_value", record.get("value", 0))
temporal = record.get("temporal", {})
# Energy usage patterns
if temporal.get("is_business_hours"):
enrichment["usage_pattern"] = "business_hours"
elif temporal.get("is_weekend"):
enrichment["usage_pattern"] = "weekend"
else:
enrichment["usage_pattern"] = "off_hours"
# Demand classification
if value > 100:
enrichment["demand_level"] = "high"
elif value > 50:
enrichment["demand_level"] = "medium"
elif value > 10:
enrichment["demand_level"] = "low"
else:
enrichment["demand_level"] = "minimal"
# Peak/off-peak classification
hour = temporal.get("hour", 0)
if 17 <= hour <= 21: # Evening peak
enrichment["tariff_period"] = "peak"
elif hour >= 22 or hour <= 6: # Night off-peak (wraps past midnight)
enrichment["tariff_period"] = "off_peak"
else:
enrichment["tariff_period"] = "standard"
except Exception as e:
logger.debug(f"Error enriching energy data: {e}")
return enrichment
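# Example: a 14:00 weekday reading of 60 kWh maps to usage_pattern
# "business_hours", demand_level "medium" and tariff_period "standard".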
async def _calculate_quality_scores(self, data: List[Dict[str, Any]], quality_report: Dict[str, Any]) -> Dict[str, float]:
"""Calculate overall quality scores"""
if not data:
return {"overall": 0.0, "completeness": 0.0, "accuracy": 0.0, "consistency": 0.0, "timeliness": 0.0}
# Completeness score
total_expected_fields = len(data) * 4 # sensor_id, timestamp, value, unit
total_present_fields = sum(1 for record in data
for field in ["sensor_id", "timestamp", "value", "unit"]
if record.get(field) is not None)
completeness = total_present_fields / total_expected_fields if total_expected_fields > 0 else 0.0
# Accuracy score (based on validation scores)
accuracy_scores = [record.get("data_quality", {}).get("quality_score", 0) for record in data]
accuracy = statistics.mean(accuracy_scores) if accuracy_scores else 0.0
# Consistency score (coefficient of variation for quality scores)
if len(accuracy_scores) > 1:
std_dev = statistics.stdev(accuracy_scores)
mean_score = statistics.mean(accuracy_scores)
consistency = max(0.0, 1.0 - (std_dev / mean_score)) if mean_score > 0 else 0.0
else:
consistency = 1.0
# Timeliness score (based on data age)
current_time = datetime.utcnow().timestamp()
ages = [(current_time - record.get("timestamp", current_time)) / 3600 for record in data] # age in hours
avg_age = statistics.mean(ages) if ages else 0
timeliness = max(0.0, 1.0 - (avg_age / 24)) # Decrease score as data gets older than 24 hours
# Overall score
overall = statistics.mean([completeness, accuracy, consistency, timeliness])
return {
"overall": round(overall, 3),
"completeness": round(completeness, 3),
"accuracy": round(accuracy, 3),
"consistency": round(consistency, 3),
"timeliness": round(timeliness, 3)
}
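# Worked example (hypothetical batch): completeness=1.0, accuracy=0.9,
# consistency=0.9 and timeliness=0.8 give overall = mean(1.0, 0.9, 0.9, 0.8)
# = 0.9.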
async def _store_quality_report(self, quality_report: Dict[str, Any], source_name: str):
"""Store quality report in database"""
try:
quality_report["_id"] = f"{source_name}_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
await self.db.quality_reports.insert_one(quality_report)
# Also cache in Redis for quick access
cache_key = f"quality_report:{source_name}:latest"
await self.redis.setex(cache_key, 3600, json.dumps(quality_report, default=str))
except Exception as e:
logger.error(f"Error storing quality report: {e}")
async def _load_validation_rules(self):
"""Load validation rules configuration"""
# Default validation rules
self.validation_rules = {
"energy": {
"min_value": 0,
"max_value": 100000,
"required_precision": 0.01
},
"power": {
"min_value": 0,
"max_value": 50000,
"required_precision": 0.1
},
"temperature": {
"min_value": -50,
"max_value": 100,
"required_precision": 0.1
}
}
logger.info("Loaded default validation rules")
async def _load_enrichment_metadata(self):
"""Load enrichment metadata"""
# Load any cached enrichment data
try:
cache_keys = []
async for key in self.redis.scan_iter(match="enrichment:*"):
cache_keys.append(key)
logger.info(f"Loaded {len(cache_keys)} enrichment cache entries")
except Exception as e:
logger.debug(f"Error loading enrichment metadata: {e}")
async def get_quality_summary(self, source_name: Optional[str] = None) -> Dict[str, Any]:
"""Get quality summary for sources"""
try:
match_filter = {"source": source_name} if source_name else {}
# Get recent quality reports
cursor = self.db.quality_reports.find(match_filter).sort("processing_time", -1).limit(50)
reports = []
async for report in cursor:
report["_id"] = str(report["_id"])
reports.append(report)
if not reports:
return {"message": "No quality reports found"}
# Calculate summary statistics
avg_quality = statistics.mean([r["quality_scores"]["overall"] for r in reports])
total_processed = sum([r["processed_records"] for r in reports])
total_rejected = sum([r["rejected_records"] for r in reports])
return {
"total_reports": len(reports),
"average_quality": round(avg_quality, 3),
"total_processed_records": total_processed,
"total_rejected_records": total_rejected,
"success_rate": round(total_processed / (total_processed + total_rejected) * 100, 2) if (total_processed + total_rejected) > 0 else 0,
"latest_report": reports[0] if reports else None
}
except Exception as e:
logger.error(f"Error getting quality summary: {e}")
return {"error": str(e)}