""" Data validation and enrichment for time series data. Provides quality assessment, metadata enrichment, and data transformation capabilities. """ import asyncio import json import logging import statistics from datetime import datetime, timedelta from typing import List, Dict, Any, Optional, Tuple import hashlib import re from collections import defaultdict import math logger = logging.getLogger(__name__) class DataValidator: """Validates, enriches, and transforms time series data""" def __init__(self, db, redis_client): self.db = db self.redis = redis_client self.validation_rules = {} self.enrichment_cache = {} self.quality_thresholds = { "completeness": 0.8, "accuracy": 0.9, "consistency": 0.85, "timeliness": 0.9 } async def initialize(self): """Initialize validator with default rules and configurations""" try: await self._load_validation_rules() await self._load_enrichment_metadata() logger.info("Data validator initialized successfully") except Exception as e: logger.error(f"Error initializing data validator: {e}") raise async def validate_and_enrich_data(self, data: List[Dict[str, Any]], source_name: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]: """Validate and enrich time series data, returning processed data and quality report""" try: logger.info(f"Validating and enriching {len(data)} records from {source_name}") # Initialize validation report quality_report = { "source": source_name, "total_records": len(data), "processed_records": 0, "rejected_records": 0, "quality_scores": {}, "issues_found": [], "processing_time": datetime.utcnow().isoformat() } enriched_data = [] # Process each record for i, record in enumerate(data): try: # Validate record validation_result = await self._validate_record(record, source_name) if validation_result["is_valid"]: # Enrich the record enriched_record = await self._enrich_record(record, source_name, validation_result) enriched_data.append(enriched_record) quality_report["processed_records"] += 1 else: quality_report["rejected_records"] += 1 quality_report["issues_found"].extend(validation_result["issues"]) logger.warning(f"Record {i} rejected: {validation_result['issues']}") except Exception as e: logger.error(f"Error processing record {i}: {e}") quality_report["rejected_records"] += 1 quality_report["issues_found"].append(f"Processing error: {str(e)}") # Calculate overall quality scores quality_report["quality_scores"] = await self._calculate_quality_scores(enriched_data, quality_report) # Store quality report await self._store_quality_report(quality_report, source_name) logger.info(f"Validation complete: {quality_report['processed_records']}/{quality_report['total_records']} records processed") return enriched_data, quality_report except Exception as e: logger.error(f"Error in data validation and enrichment: {e}") raise async def _validate_record(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]: """Validate a single record against quality rules""" validation_result = { "is_valid": True, "issues": [], "quality_metrics": {} } try: # Check required fields required_fields = ["sensor_id", "timestamp", "value"] for field in required_fields: if field not in record or record[field] is None: validation_result["is_valid"] = False validation_result["issues"].append(f"Missing required field: {field}") if not validation_result["is_valid"]: return validation_result # Validate timestamp timestamp_validation = await self._validate_timestamp(record["timestamp"]) validation_result["quality_metrics"]["timestamp_quality"] = 
timestamp_validation["score"] if not timestamp_validation["is_valid"]: validation_result["issues"].extend(timestamp_validation["issues"]) # Validate numeric value value_validation = await self._validate_numeric_value(record["value"], record.get("unit")) validation_result["quality_metrics"]["value_quality"] = value_validation["score"] if not value_validation["is_valid"]: validation_result["issues"].extend(value_validation["issues"]) # Validate sensor ID format sensor_validation = await self._validate_sensor_id(record["sensor_id"]) validation_result["quality_metrics"]["sensor_id_quality"] = sensor_validation["score"] if not sensor_validation["is_valid"]: validation_result["issues"].extend(sensor_validation["issues"]) # Check for duplicates duplicate_check = await self._check_for_duplicates(record, source_name) validation_result["quality_metrics"]["uniqueness"] = duplicate_check["score"] if not duplicate_check["is_unique"]: validation_result["issues"].extend(duplicate_check["issues"]) # Calculate overall validity if validation_result["issues"]: # Allow minor issues but flag major ones major_issues = [issue for issue in validation_result["issues"] if "Missing required field" in issue or "Invalid" in issue] validation_result["is_valid"] = len(major_issues) == 0 except Exception as e: logger.error(f"Error validating record: {e}") validation_result["is_valid"] = False validation_result["issues"].append(f"Validation error: {str(e)}") return validation_result async def _enrich_record(self, record: Dict[str, Any], source_name: str, validation_result: Dict[str, Any]) -> Dict[str, Any]: """Enrich a record with additional metadata and derived fields""" try: enriched = record.copy() # Add validation metadata enriched["data_quality"] = { "quality_score": statistics.mean(validation_result["quality_metrics"].values()) if validation_result["quality_metrics"] else 0.0, "quality_metrics": validation_result["quality_metrics"], "validation_timestamp": datetime.utcnow().isoformat() } # Add source information enriched["source_info"] = { "source_name": source_name, "ingestion_time": datetime.utcnow().isoformat(), "record_id": hashlib.md5(f"{source_name}_{record.get('sensor_id', 'unknown')}_{record.get('timestamp', 0)}".encode()).hexdigest() } # Normalize timestamp format enriched["timestamp"] = await self._normalize_timestamp(record["timestamp"]) enriched["timestamp_iso"] = datetime.fromtimestamp(enriched["timestamp"]).isoformat() # Infer and enrich sensor type sensor_type_info = await self._infer_sensor_type(record) enriched["sensor_type"] = sensor_type_info["type"] enriched["sensor_category"] = sensor_type_info["category"] # Add unit standardization unit_info = await self._standardize_unit(record.get("unit")) enriched["unit"] = unit_info["standard_unit"] enriched["unit_info"] = unit_info # Calculate derived metrics derived_metrics = await self._calculate_derived_metrics(enriched, source_name) enriched["derived_metrics"] = derived_metrics # Add location and context information context_info = await self._enrich_with_context(enriched, source_name) enriched["metadata"] = {**enriched.get("metadata", {}), **context_info} # Add temporal features temporal_features = await self._extract_temporal_features(enriched["timestamp"]) enriched["temporal"] = temporal_features # Energy-specific enrichments if sensor_type_info["category"] == "energy": energy_enrichment = await self._enrich_energy_data(enriched) enriched.update(energy_enrichment) return enriched except Exception as e: logger.error(f"Error enriching record: {e}") 
    async def _validate_timestamp(self, timestamp) -> Dict[str, Any]:
        """Validate timestamp format and reasonableness"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            # Convert to numeric timestamp
            if isinstance(timestamp, str):
                try:
                    # Try parsing ISO format
                    dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                    ts = dt.timestamp()
                except ValueError:
                    # Fall back to parsing as a unix timestamp string
                    ts = float(timestamp)
            else:
                ts = float(timestamp)

            # Check if timestamp is reasonable (not too far in past/future)
            current_time = datetime.utcnow().timestamp()
            max_age = 365 * 24 * 3600  # 1 year
            max_future = 24 * 3600     # 1 day

            if ts < current_time - max_age:
                result["issues"].append("Timestamp too old (more than 1 year)")
                result["score"] -= 0.3
            elif ts > current_time + max_future:
                result["issues"].append("Timestamp too far in future")
                result["score"] -= 0.3

            # Check for reasonable precision (sub-millisecond is unusual for energy data)
            fractional = str(ts).split('.')
            if ts != int(ts) and len(fractional) > 1 and len(fractional[1]) > 3:
                result["score"] -= 0.1  # Minor issue

        except (ValueError, TypeError, OverflowError) as e:
            result["is_valid"] = False
            result["issues"].append(f"Invalid timestamp format: {e}")
            result["score"] = 0.0

        return result

    async def _validate_numeric_value(self, value, unit: Optional[str] = None) -> Dict[str, Any]:
        """Validate numeric value reasonableness"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            numeric_value = float(value)

            # Check for NaN or infinity first, so the range checks below are meaningful
            if math.isnan(numeric_value) or math.isinf(numeric_value):
                result["is_valid"] = False
                result["issues"].append("Invalid numeric value (NaN or Infinity)")
                result["score"] = 0.0
                return result

            # Check for negative values (usually invalid for energy data)
            if numeric_value < 0:
                result["issues"].append("Negative energy value")
                result["score"] -= 0.4

            # Check for unreasonably large values
            unit_str = (unit or "").lower()
            if "wh" in unit_str:  # Energy values
                if numeric_value > 100000:  # >100kWh seems excessive for a single reading
                    result["issues"].append("Unusually high energy value")
                    result["score"] -= 0.2
            elif "w" in unit_str:  # Power values
                if numeric_value > 50000:  # >50kW seems excessive
                    result["issues"].append("Unusually high power value")
                    result["score"] -= 0.2

            # Check for zero values (might indicate sensor issues)
            if numeric_value == 0:
                result["score"] -= 0.1

        except (ValueError, TypeError) as e:
            result["is_valid"] = False
            result["issues"].append(f"Non-numeric value: {e}")
            result["score"] = 0.0

        return result

    async def _validate_sensor_id(self, sensor_id: str) -> Dict[str, Any]:
        """Validate sensor ID format and consistency"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            if not isinstance(sensor_id, str) or len(sensor_id) == 0:
                result["is_valid"] = False
                result["issues"].append("Empty or invalid sensor ID")
                result["score"] = 0.0
                return result

            # Check length
            if len(sensor_id) < 3:
                result["issues"].append("Very short sensor ID")
                result["score"] -= 0.2
            elif len(sensor_id) > 50:
                result["issues"].append("Very long sensor ID")
                result["score"] -= 0.1

            # Check for reasonable characters
            if not re.match(r'^[A-Za-z0-9_.-]+$', sensor_id):
                result["issues"].append("Sensor ID contains unusual characters")
                result["score"] -= 0.1

        except Exception as e:
            result["issues"].append(f"Sensor ID validation error: {e}")
            result["score"] -= 0.1

        return result
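    # Illustrative only: awaiting _validate_numeric_value(-5, "kWh") would return
    # {"is_valid": True, "issues": ["Negative energy value"], "score": 0.6}.
    # The record is not rejected outright because _validate_record() only treats
    # "Missing required field" and "Invalid ..." issues as fatal.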
f"{source_name}_{record.get('sensor_id')}_{record.get('timestamp')}_{record.get('value')}".encode() ).hexdigest() # Check cache for recent duplicates cache_key = f"record_signature:{signature}" exists = await self.redis.exists(cache_key) if exists: result["is_unique"] = False result["issues"].append("Duplicate record detected") result["score"] = 0.0 else: # Store signature with short expiration (1 hour) await self.redis.setex(cache_key, 3600, "1") except Exception as e: logger.debug(f"Error checking duplicates: {e}") # Don't fail validation for cache errors return result async def _normalize_timestamp(self, timestamp) -> int: """Normalize timestamp to unix timestamp""" try: if isinstance(timestamp, str): try: # Try ISO format first dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00')) return int(dt.timestamp()) except: # Try as unix timestamp string return int(float(timestamp)) else: return int(float(timestamp)) except: # Fallback to current time return int(datetime.utcnow().timestamp()) async def _infer_sensor_type(self, record: Dict[str, Any]) -> Dict[str, str]: """Infer sensor type from record data""" sensor_id = record.get("sensor_id", "").lower() unit = (record.get("unit", "") or "").lower() value = record.get("value", 0) metadata = record.get("metadata", {}) # Energy sensors if "wh" in unit or "energy" in sensor_id or "consumption" in sensor_id: return {"type": "energy", "category": "energy"} elif "w" in unit and "wh" not in unit: return {"type": "power", "category": "energy"} # Environmental sensors elif "temp" in sensor_id or "°c" in unit or "celsius" in unit: return {"type": "temperature", "category": "environmental"} elif "humid" in sensor_id or "%" in unit: return {"type": "humidity", "category": "environmental"} elif "co2" in sensor_id or "ppm" in unit: return {"type": "co2", "category": "environmental"} # Motion/occupancy sensors elif "motion" in sensor_id or "occupancy" in sensor_id or ("motion" in str(metadata).lower()): return {"type": "motion", "category": "occupancy"} # Generation sensors elif "generation" in sensor_id or "solar" in sensor_id or "generation" in str(metadata).lower(): return {"type": "generation", "category": "energy"} # Default to energy if unclear else: return {"type": "energy", "category": "energy"} async def _standardize_unit(self, unit: Optional[str]) -> Dict[str, Any]: """Standardize unit format""" if not unit: return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"} unit_lower = unit.lower().strip() # Energy units if unit_lower in ["kwh", "kw-h", "kw_h"]: return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"} elif unit_lower in ["wh", "w-h", "w_h"]: return {"standard_unit": "kWh", "conversion_factor": 0.001, "unit_type": "energy"} elif unit_lower in ["mwh", "mw-h", "mw_h"]: return {"standard_unit": "kWh", "conversion_factor": 1000.0, "unit_type": "energy"} # Power units elif unit_lower in ["kw", "kilowatt", "kilowatts"]: return {"standard_unit": "kW", "conversion_factor": 1.0, "unit_type": "power"} elif unit_lower in ["w", "watt", "watts"]: return {"standard_unit": "kW", "conversion_factor": 0.001, "unit_type": "power"} elif unit_lower in ["mw", "megawatt", "megawatts"]: return {"standard_unit": "kW", "conversion_factor": 1000.0, "unit_type": "power"} # Temperature units elif unit_lower in ["°c", "celsius", "c"]: return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature"} elif unit_lower in ["°f", "fahrenheit", "f"]: return {"standard_unit": "°C", "conversion_factor": 
1.0, "unit_type": "temperature", "requires_conversion": True} # Default else: return {"standard_unit": unit, "conversion_factor": 1.0, "unit_type": "unknown"} async def _calculate_derived_metrics(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]: """Calculate derived metrics from the record""" derived = {} try: value = float(record.get("value", 0)) unit_info = record.get("unit_info", {}) # Apply unit conversion if needed if unit_info.get("conversion_factor", 1.0) != 1.0: derived["original_value"] = value derived["converted_value"] = value * unit_info["conversion_factor"] # Energy-specific calculations if unit_info.get("unit_type") == "energy": # Estimate cost (simplified) cost_per_kwh = 0.12 # Example rate derived["estimated_cost"] = value * cost_per_kwh # Estimate CO2 emissions (simplified) co2_per_kwh = 0.4 # kg CO2 per kWh (example grid factor) derived["estimated_co2_kg"] = value * co2_per_kwh # Add value range classification derived["value_range"] = await self._classify_value_range(value, unit_info.get("unit_type")) except Exception as e: logger.debug(f"Error calculating derived metrics: {e}") return derived async def _classify_value_range(self, value: float, unit_type: str) -> str: """Classify value into ranges for better understanding""" if unit_type == "energy": if value < 1: return "very_low" elif value < 10: return "low" elif value < 50: return "medium" elif value < 200: return "high" else: return "very_high" elif unit_type == "power": if value < 0.5: return "very_low" elif value < 5: return "low" elif value < 20: return "medium" elif value < 100: return "high" else: return "very_high" else: return "unknown" async def _enrich_with_context(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]: """Enrich record with contextual information""" context = {} try: # Add geographical context if available context["data_source"] = "real_community" context["source_type"] = "ftp_ingestion" # Add data freshness ingestion_time = datetime.utcnow() data_time = datetime.fromtimestamp(record["timestamp"]) context["data_age_minutes"] = (ingestion_time - data_time).total_seconds() / 60 # Classify data freshness if context["data_age_minutes"] < 15: context["freshness"] = "real_time" elif context["data_age_minutes"] < 60: context["freshness"] = "near_real_time" elif context["data_age_minutes"] < 1440: # 24 hours context["freshness"] = "recent" else: context["freshness"] = "historical" except Exception as e: logger.debug(f"Error adding context: {e}") return context async def _extract_temporal_features(self, timestamp: int) -> Dict[str, Any]: """Extract temporal features from timestamp""" dt = datetime.fromtimestamp(timestamp) return { "hour": dt.hour, "day_of_week": dt.weekday(), "day_of_month": dt.day, "month": dt.month, "quarter": (dt.month - 1) // 3 + 1, "is_weekend": dt.weekday() >= 5, "is_business_hours": 8 <= dt.hour <= 17, "season": self._get_season(dt.month) } def _get_season(self, month: int) -> str: """Get season from month""" if month in [12, 1, 2]: return "winter" elif month in [3, 4, 5]: return "spring" elif month in [6, 7, 8]: return "summer" else: return "autumn" async def _enrich_energy_data(self, record: Dict[str, Any]) -> Dict[str, Any]: """Add energy-specific enrichments""" enrichment = {} try: value = record.get("derived_metrics", {}).get("converted_value", record.get("value", 0)) temporal = record.get("temporal", {}) # Energy usage patterns if temporal.get("is_business_hours"): enrichment["usage_pattern"] = "business_hours" elif 
temporal.get("is_weekend"): enrichment["usage_pattern"] = "weekend" else: enrichment["usage_pattern"] = "off_hours" # Demand classification if value > 100: enrichment["demand_level"] = "high" elif value > 50: enrichment["demand_level"] = "medium" elif value > 10: enrichment["demand_level"] = "low" else: enrichment["demand_level"] = "minimal" # Peak/off-peak classification hour = temporal.get("hour", 0) if 17 <= hour <= 21: # Evening peak enrichment["tariff_period"] = "peak" elif 22 <= hour <= 6: # Night off-peak enrichment["tariff_period"] = "off_peak" else: enrichment["tariff_period"] = "standard" except Exception as e: logger.debug(f"Error enriching energy data: {e}") return enrichment async def _calculate_quality_scores(self, data: List[Dict[str, Any]], quality_report: Dict[str, Any]) -> Dict[str, float]: """Calculate overall quality scores""" if not data: return {"overall": 0.0, "completeness": 0.0, "accuracy": 0.0, "consistency": 0.0, "timeliness": 0.0} # Completeness score total_expected_fields = len(data) * 4 # sensor_id, timestamp, value, unit total_present_fields = sum(1 for record in data for field in ["sensor_id", "timestamp", "value", "unit"] if record.get(field) is not None) completeness = total_present_fields / total_expected_fields if total_expected_fields > 0 else 0.0 # Accuracy score (based on validation scores) accuracy_scores = [record.get("data_quality", {}).get("quality_score", 0) for record in data] accuracy = statistics.mean(accuracy_scores) if accuracy_scores else 0.0 # Consistency score (coefficient of variation for quality scores) if len(accuracy_scores) > 1: std_dev = statistics.stdev(accuracy_scores) mean_score = statistics.mean(accuracy_scores) consistency = 1.0 - (std_dev / mean_score) if mean_score > 0 else 0.0 else: consistency = 1.0 # Timeliness score (based on data age) current_time = datetime.utcnow().timestamp() ages = [(current_time - record.get("timestamp", current_time)) / 3600 for record in data] # age in hours avg_age = statistics.mean(ages) if ages else 0 timeliness = max(0.0, 1.0 - (avg_age / 24)) # Decrease score as data gets older than 24 hours # Overall score overall = statistics.mean([completeness, accuracy, consistency, timeliness]) return { "overall": round(overall, 3), "completeness": round(completeness, 3), "accuracy": round(accuracy, 3), "consistency": round(consistency, 3), "timeliness": round(timeliness, 3) } async def _store_quality_report(self, quality_report: Dict[str, Any], source_name: str): """Store quality report in database""" try: quality_report["_id"] = f"{source_name}_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}" await self.db.quality_reports.insert_one(quality_report) # Also cache in Redis for quick access cache_key = f"quality_report:{source_name}:latest" await self.redis.setex(cache_key, 3600, json.dumps(quality_report, default=str)) except Exception as e: logger.error(f"Error storing quality report: {e}") async def _load_validation_rules(self): """Load validation rules configuration""" # Default validation rules self.validation_rules = { "energy": { "min_value": 0, "max_value": 100000, "required_precision": 0.01 }, "power": { "min_value": 0, "max_value": 50000, "required_precision": 0.1 }, "temperature": { "min_value": -50, "max_value": 100, "required_precision": 0.1 } } logger.info("Loaded default validation rules") async def _load_enrichment_metadata(self): """Load enrichment metadata""" # Load any cached enrichment data try: cache_keys = [] async for key in self.redis.scan_iter(match="enrichment:*"): 
    async def get_quality_summary(self, source_name: Optional[str] = None) -> Dict[str, Any]:
        """Get quality summary for sources"""
        try:
            match_filter = {"source": source_name} if source_name else {}

            # Get recent quality reports
            cursor = self.db.quality_reports.find(match_filter).sort("processing_time", -1).limit(50)
            reports = []
            async for report in cursor:
                report["_id"] = str(report["_id"])
                reports.append(report)

            if not reports:
                return {"message": "No quality reports found"}

            # Calculate summary statistics
            avg_quality = statistics.mean([r["quality_scores"]["overall"] for r in reports])
            total_processed = sum(r["processed_records"] for r in reports)
            total_rejected = sum(r["rejected_records"] for r in reports)
            total_records = total_processed + total_rejected

            return {
                "total_reports": len(reports),
                "average_quality": round(avg_quality, 3),
                "total_processed_records": total_processed,
                "total_rejected_records": total_rejected,
                "success_rate": round(total_processed / total_records * 100, 2) if total_records > 0 else 0,
                "latest_report": reports[0] if reports else None,
            }

        except Exception as e:
            logger.error(f"Error getting quality summary: {e}")
            return {"error": str(e)}
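
# Minimal usage sketch (illustrative, not part of this module's API): DataValidator
# expects an async MongoDB-style database handle (e.g. motor's AsyncIOMotorClient)
# and an async Redis client (e.g. redis.asyncio.Redis); the hosts, database name,
# and sample record below are assumptions for the sketch.
#
#   import asyncio
#   from motor.motor_asyncio import AsyncIOMotorClient
#   from redis.asyncio import Redis
#
#   async def main():
#       db = AsyncIOMotorClient("mongodb://localhost:27017")["timeseries"]
#       redis_client = Redis(host="localhost", port=6379)
#       validator = DataValidator(db, redis_client)
#       await validator.initialize()
#       records = [{"sensor_id": "meter_01", "timestamp": 1700000000,
#                   "value": 2.5, "unit": "kWh"}]
#       enriched, report = await validator.validate_and_enrich_data(records, "demo_source")
#       print(report["quality_scores"])
#
#   asyncio.run(main())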