- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
"""
|
|
Data validation and enrichment for time series data.
|
|
Provides quality assessment, metadata enrichment, and data transformation capabilities.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import statistics
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
import hashlib
|
|
import re
|
|
from collections import defaultdict
|
|
import math
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|

class DataValidator:
    """Validates, enriches, and transforms time series data"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.validation_rules = {}
        self.enrichment_cache = {}
        self.quality_thresholds = {
            "completeness": 0.8,
            "accuracy": 0.9,
            "consistency": 0.85,
            "timeliness": 0.9
        }

    async def initialize(self):
        """Initialize validator with default rules and configurations"""
        try:
            await self._load_validation_rules()
            await self._load_enrichment_metadata()
            logger.info("Data validator initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing data validator: {e}")
            raise

    async def validate_and_enrich_data(self, data: List[Dict[str, Any]],
                                       source_name: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """Validate and enrich time series data, returning processed data and quality report"""
        try:
            logger.info(f"Validating and enriching {len(data)} records from {source_name}")

            # Initialize validation report
            quality_report = {
                "source": source_name,
                "total_records": len(data),
                "processed_records": 0,
                "rejected_records": 0,
                "quality_scores": {},
                "issues_found": [],
                "processing_time": datetime.utcnow().isoformat()
            }

            enriched_data = []

            # Process each record
            for i, record in enumerate(data):
                try:
                    # Validate record
                    validation_result = await self._validate_record(record, source_name)

                    if validation_result["is_valid"]:
                        # Enrich the record
                        enriched_record = await self._enrich_record(record, source_name, validation_result)
                        enriched_data.append(enriched_record)
                        quality_report["processed_records"] += 1
                    else:
                        quality_report["rejected_records"] += 1
                        quality_report["issues_found"].extend(validation_result["issues"])
                        logger.warning(f"Record {i} rejected: {validation_result['issues']}")

                except Exception as e:
                    logger.error(f"Error processing record {i}: {e}")
                    quality_report["rejected_records"] += 1
                    quality_report["issues_found"].append(f"Processing error: {str(e)}")

            # Calculate overall quality scores
            quality_report["quality_scores"] = await self._calculate_quality_scores(enriched_data, quality_report)

            # Store quality report
            await self._store_quality_report(quality_report, source_name)

            logger.info(f"Validation complete: {quality_report['processed_records']}/{quality_report['total_records']} records processed")

            return enriched_data, quality_report

        except Exception as e:
            logger.error(f"Error in data validation and enrichment: {e}")
            raise

    async def _validate_record(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Validate a single record against quality rules"""
        validation_result = {
            "is_valid": True,
            "issues": [],
            "quality_metrics": {}
        }

        try:
            # Check required fields
            required_fields = ["sensor_id", "timestamp", "value"]
            for field in required_fields:
                if field not in record or record[field] is None:
                    validation_result["is_valid"] = False
                    validation_result["issues"].append(f"Missing required field: {field}")

            if not validation_result["is_valid"]:
                return validation_result

            # Validate timestamp
            timestamp_validation = await self._validate_timestamp(record["timestamp"])
            validation_result["quality_metrics"]["timestamp_quality"] = timestamp_validation["score"]
            if not timestamp_validation["is_valid"]:
                validation_result["issues"].extend(timestamp_validation["issues"])

            # Validate numeric value
            value_validation = await self._validate_numeric_value(record["value"], record.get("unit"))
            validation_result["quality_metrics"]["value_quality"] = value_validation["score"]
            if not value_validation["is_valid"]:
                validation_result["issues"].extend(value_validation["issues"])

            # Validate sensor ID format
            sensor_validation = await self._validate_sensor_id(record["sensor_id"])
            validation_result["quality_metrics"]["sensor_id_quality"] = sensor_validation["score"]
            if not sensor_validation["is_valid"]:
                validation_result["issues"].extend(sensor_validation["issues"])

            # Check for duplicates
            duplicate_check = await self._check_for_duplicates(record, source_name)
            validation_result["quality_metrics"]["uniqueness"] = duplicate_check["score"]
            if not duplicate_check["is_unique"]:
                validation_result["issues"].extend(duplicate_check["issues"])

            # Calculate overall validity
            if validation_result["issues"]:
                # Allow minor issues but flag major ones
                major_issues = [issue for issue in validation_result["issues"]
                                if "Missing required field" in issue or "Invalid" in issue]
                validation_result["is_valid"] = len(major_issues) == 0

        except Exception as e:
            logger.error(f"Error validating record: {e}")
            validation_result["is_valid"] = False
            validation_result["issues"].append(f"Validation error: {str(e)}")

        return validation_result
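
    # Note: records carrying only minor issues (e.g. a duplicate flag) still
    # pass _validate_record; rejection requires a missing required field or
    # an issue whose text contains "Invalid".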

    async def _enrich_record(self, record: Dict[str, Any], source_name: str,
                             validation_result: Dict[str, Any]) -> Dict[str, Any]:
        """Enrich a record with additional metadata and derived fields"""
        try:
            enriched = record.copy()

            # Add validation metadata
            enriched["data_quality"] = {
                "quality_score": statistics.mean(validation_result["quality_metrics"].values()) if validation_result["quality_metrics"] else 0.0,
                "quality_metrics": validation_result["quality_metrics"],
                "validation_timestamp": datetime.utcnow().isoformat()
            }

            # Add source information
            enriched["source_info"] = {
                "source_name": source_name,
                "ingestion_time": datetime.utcnow().isoformat(),
                "record_id": hashlib.md5(f"{source_name}_{record.get('sensor_id', 'unknown')}_{record.get('timestamp', 0)}".encode()).hexdigest()
            }

            # Normalize timestamp format (rendered in UTC, matching the utcnow() calls above)
            enriched["timestamp"] = await self._normalize_timestamp(record["timestamp"])
            enriched["timestamp_iso"] = datetime.utcfromtimestamp(enriched["timestamp"]).isoformat()

            # Infer and enrich sensor type
            sensor_type_info = await self._infer_sensor_type(record)
            enriched["sensor_type"] = sensor_type_info["type"]
            enriched["sensor_category"] = sensor_type_info["category"]

            # Add unit standardization
            unit_info = await self._standardize_unit(record.get("unit"))
            enriched["unit"] = unit_info["standard_unit"]
            enriched["unit_info"] = unit_info

            # Calculate derived metrics
            derived_metrics = await self._calculate_derived_metrics(enriched, source_name)
            enriched["derived_metrics"] = derived_metrics

            # Add location and context information
            context_info = await self._enrich_with_context(enriched, source_name)
            enriched["metadata"] = {**enriched.get("metadata", {}), **context_info}

            # Add temporal features
            temporal_features = await self._extract_temporal_features(enriched["timestamp"])
            enriched["temporal"] = temporal_features

            # Energy-specific enrichments
            if sensor_type_info["category"] == "energy":
                energy_enrichment = await self._enrich_energy_data(enriched)
                enriched.update(energy_enrichment)

            return enriched

        except Exception as e:
            logger.error(f"Error enriching record: {e}")
            return record
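
    # record_id above is an MD5 digest of source, sensor, and timestamp,
    # used purely as a stable identifier for tracing and deduplication,
    # not for any security purpose.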

    async def _validate_timestamp(self, timestamp) -> Dict[str, Any]:
        """Validate timestamp format and reasonableness"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            # Convert to numeric timestamp
            if isinstance(timestamp, str):
                try:
                    # Try parsing ISO format
                    dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                    ts = dt.timestamp()
                except ValueError:
                    # Fall back to parsing as a unix timestamp string
                    ts = float(timestamp)
            else:
                ts = float(timestamp)

            # Check if timestamp is reasonable (not too far in past/future)
            current_time = datetime.utcnow().timestamp()
            max_age = 365 * 24 * 3600  # 1 year
            max_future = 24 * 3600  # 1 day

            if ts < current_time - max_age:
                result["issues"].append("Timestamp too old (more than 1 year)")
                result["score"] -= 0.3
            elif ts > current_time + max_future:
                result["issues"].append("Timestamp too far in future")
                result["score"] -= 0.3

            # Check for reasonable precision (not too precise for energy data)
            if ts != int(ts) and len(str(ts).split('.')[1]) > 3:
                result["score"] -= 0.1  # Minor issue

        except (ValueError, TypeError) as e:
            result["is_valid"] = False
            result["issues"].append(f"Invalid timestamp format: {e}")
            result["score"] = 0.0

        return result
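
    # Scores are penalties subtracted from 1.0: a parseable but year-old
    # timestamp keeps is_valid=True with score 0.7, while an unparseable
    # one is hard-invalid with score 0.0.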

    async def _validate_numeric_value(self, value, unit: Optional[str] = None) -> Dict[str, Any]:
        """Validate numeric value reasonableness"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            numeric_value = float(value)

            # Check for negative values (usually invalid for energy data)
            if numeric_value < 0:
                result["issues"].append("Negative energy value")
                result["score"] -= 0.4

            # Check for unreasonably large values
            unit_str = (unit or "").lower()
            if "wh" in unit_str:
                # Energy values
                if numeric_value > 100000:  # >100kWh seems excessive for a single reading
                    result["issues"].append("Unusually high energy value")
                    result["score"] -= 0.2
            elif "w" in unit_str:
                # Power values
                if numeric_value > 50000:  # >50kW seems excessive
                    result["issues"].append("Unusually high power value")
                    result["score"] -= 0.2

            # Check for zero values (might indicate sensor issues)
            if numeric_value == 0:
                result["score"] -= 0.1

            # Check for NaN or infinity
            if math.isnan(numeric_value) or math.isinf(numeric_value):
                result["is_valid"] = False
                result["issues"].append("Invalid numeric value (NaN or Infinity)")
                result["score"] = 0.0

        except (ValueError, TypeError) as e:
            result["is_valid"] = False
            result["issues"].append(f"Non-numeric value: {e}")
            result["score"] = 0.0

        return result

    async def _validate_sensor_id(self, sensor_id: str) -> Dict[str, Any]:
        """Validate sensor ID format and consistency"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            if not isinstance(sensor_id, str) or len(sensor_id) == 0:
                result["is_valid"] = False
                result["issues"].append("Empty or invalid sensor ID")
                result["score"] = 0.0
                return result

            # Check length
            if len(sensor_id) < 3:
                result["issues"].append("Very short sensor ID")
                result["score"] -= 0.2
            elif len(sensor_id) > 50:
                result["issues"].append("Very long sensor ID")
                result["score"] -= 0.1

            # Check for reasonable characters
            if not re.match(r'^[a-zA-Z0-9_\-\.]+$', sensor_id):
                result["issues"].append("Sensor ID contains unusual characters")
                result["score"] -= 0.1

        except Exception as e:
            result["issues"].append(f"Sensor ID validation error: {e}")
            result["score"] -= 0.1

        return result

    async def _check_for_duplicates(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Check for duplicate records"""
        result = {"is_unique": True, "issues": [], "score": 1.0}

        try:
            # Create record signature
            signature = hashlib.md5(
                f"{source_name}_{record.get('sensor_id')}_{record.get('timestamp')}_{record.get('value')}".encode()
            ).hexdigest()

            # Check cache for recent duplicates
            cache_key = f"record_signature:{signature}"
            exists = await self.redis.exists(cache_key)

            if exists:
                result["is_unique"] = False
                result["issues"].append("Duplicate record detected")
                result["score"] = 0.0
            else:
                # Store signature with short expiration (1 hour)
                await self.redis.setex(cache_key, 3600, "1")

        except Exception as e:
            logger.debug(f"Error checking duplicates: {e}")
            # Don't fail validation for cache errors

        return result
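
    # Deduplication is best-effort: signatures expire from Redis after one
    # hour, so an identical record re-ingested later is not flagged, and
    # cache errors deliberately do not fail validation.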

    async def _normalize_timestamp(self, timestamp) -> int:
        """Normalize timestamp to unix timestamp"""
        try:
            if isinstance(timestamp, str):
                try:
                    # Try ISO format first
                    dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                    return int(dt.timestamp())
                except ValueError:
                    # Try as unix timestamp string
                    return int(float(timestamp))
            else:
                return int(float(timestamp))
        except (ValueError, TypeError):
            # Fallback to current time
            return int(datetime.utcnow().timestamp())

    async def _infer_sensor_type(self, record: Dict[str, Any]) -> Dict[str, str]:
        """Infer sensor type from record data"""
        sensor_id = record.get("sensor_id", "").lower()
        unit = (record.get("unit", "") or "").lower()
        value = record.get("value", 0)
        metadata = record.get("metadata", {})

        # Energy sensors
        if "wh" in unit or "energy" in sensor_id or "consumption" in sensor_id:
            return {"type": "energy", "category": "energy"}
        elif "w" in unit and "wh" not in unit:
            return {"type": "power", "category": "energy"}

        # Environmental sensors
        elif "temp" in sensor_id or "°c" in unit or "celsius" in unit:
            return {"type": "temperature", "category": "environmental"}
        elif "humid" in sensor_id or "%" in unit:
            return {"type": "humidity", "category": "environmental"}
        elif "co2" in sensor_id or "ppm" in unit:
            return {"type": "co2", "category": "environmental"}

        # Motion/occupancy sensors
        elif "motion" in sensor_id or "occupancy" in sensor_id or ("motion" in str(metadata).lower()):
            return {"type": "motion", "category": "occupancy"}

        # Generation sensors
        elif "generation" in sensor_id or "solar" in sensor_id or "generation" in str(metadata).lower():
            return {"type": "generation", "category": "energy"}

        # Default to energy if unclear
        else:
            return {"type": "energy", "category": "energy"}
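
    # Inference is heuristic substring matching: unit "kWh" hits the energy
    # branch because "wh" is tested before the bare "w" power branch.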

    async def _standardize_unit(self, unit: Optional[str]) -> Dict[str, Any]:
        """Standardize unit format"""
        if not unit:
            return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}

        unit_lower = unit.lower().strip()

        # Energy units
        if unit_lower in ["kwh", "kw-h", "kw_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}
        elif unit_lower in ["wh", "w-h", "w_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 0.001, "unit_type": "energy"}
        elif unit_lower in ["mwh", "mw-h", "mw_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 1000.0, "unit_type": "energy"}

        # Power units
        elif unit_lower in ["kw", "kilowatt", "kilowatts"]:
            return {"standard_unit": "kW", "conversion_factor": 1.0, "unit_type": "power"}
        elif unit_lower in ["w", "watt", "watts"]:
            return {"standard_unit": "kW", "conversion_factor": 0.001, "unit_type": "power"}
        elif unit_lower in ["mw", "megawatt", "megawatts"]:
            return {"standard_unit": "kW", "conversion_factor": 1000.0, "unit_type": "power"}

        # Temperature units
        elif unit_lower in ["°c", "celsius", "c"]:
            return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature"}
        elif unit_lower in ["°f", "fahrenheit", "f"]:
            return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature", "requires_conversion": True}

        # Default
        else:
            return {"standard_unit": unit, "conversion_factor": 1.0, "unit_type": "unknown"}
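
    # Example: _standardize_unit("Wh") returns conversion_factor 0.001, so a
    # 1500 Wh reading becomes 1.5 kWh downstream. Fahrenheit is flagged with
    # requires_conversion because °F→°C is affine, not a pure scale factor.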

    async def _calculate_derived_metrics(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Calculate derived metrics from the record"""
        derived = {}

        try:
            value = float(record.get("value", 0))
            unit_info = record.get("unit_info", {})

            # Apply unit conversion if needed
            if unit_info.get("conversion_factor", 1.0) != 1.0:
                derived["original_value"] = value
                derived["converted_value"] = value * unit_info["conversion_factor"]

            # Energy-specific calculations (on the kWh-standardized value)
            if unit_info.get("unit_type") == "energy":
                kwh_value = derived.get("converted_value", value)

                # Estimate cost (simplified)
                cost_per_kwh = 0.12  # Example rate
                derived["estimated_cost"] = kwh_value * cost_per_kwh

                # Estimate CO2 emissions (simplified)
                co2_per_kwh = 0.4  # kg CO2 per kWh (example grid factor)
                derived["estimated_co2_kg"] = kwh_value * co2_per_kwh

            # Add value range classification
            derived["value_range"] = await self._classify_value_range(value, unit_info.get("unit_type"))

        except Exception as e:
            logger.debug(f"Error calculating derived metrics: {e}")

        return derived
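
    # Worked example with the illustrative rates above: a 10 kWh reading
    # yields estimated_cost = 10 * 0.12 = 1.20 and
    # estimated_co2_kg = 10 * 0.4 = 4.0. Both constants are placeholders,
    # not live tariff or grid-intensity data.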

    async def _classify_value_range(self, value: float, unit_type: Optional[str]) -> str:
        """Classify value into ranges for better understanding"""
        if unit_type == "energy":
            if value < 1:
                return "very_low"
            elif value < 10:
                return "low"
            elif value < 50:
                return "medium"
            elif value < 200:
                return "high"
            else:
                return "very_high"
        elif unit_type == "power":
            if value < 0.5:
                return "very_low"
            elif value < 5:
                return "low"
            elif value < 20:
                return "medium"
            elif value < 100:
                return "high"
            else:
                return "very_high"
        else:
            return "unknown"

    async def _enrich_with_context(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Enrich record with contextual information"""
        context = {}

        try:
            # Add geographical context if available
            context["data_source"] = "real_community"
            context["source_type"] = "ftp_ingestion"

            # Add data freshness (both sides in UTC to avoid timezone skew)
            ingestion_time = datetime.utcnow()
            data_time = datetime.utcfromtimestamp(record["timestamp"])
            context["data_age_minutes"] = (ingestion_time - data_time).total_seconds() / 60

            # Classify data freshness
            if context["data_age_minutes"] < 15:
                context["freshness"] = "real_time"
            elif context["data_age_minutes"] < 60:
                context["freshness"] = "near_real_time"
            elif context["data_age_minutes"] < 1440:  # 24 hours
                context["freshness"] = "recent"
            else:
                context["freshness"] = "historical"

        except Exception as e:
            logger.debug(f"Error adding context: {e}")

        return context

    async def _extract_temporal_features(self, timestamp: int) -> Dict[str, Any]:
        """Extract temporal features from timestamp"""
        dt = datetime.fromtimestamp(timestamp)

        return {
            "hour": dt.hour,
            "day_of_week": dt.weekday(),
            "day_of_month": dt.day,
            "month": dt.month,
            "quarter": (dt.month - 1) // 3 + 1,
            "is_weekend": dt.weekday() >= 5,
            "is_business_hours": 8 <= dt.hour <= 17,
            "season": self._get_season(dt.month)
        }

    def _get_season(self, month: int) -> str:
        """Get season from month"""
        if month in [12, 1, 2]:
            return "winter"
        elif month in [3, 4, 5]:
            return "spring"
        elif month in [6, 7, 8]:
            return "summer"
        else:
            return "autumn"

    async def _enrich_energy_data(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Add energy-specific enrichments"""
        enrichment = {}

        try:
            value = record.get("derived_metrics", {}).get("converted_value", record.get("value", 0))
            temporal = record.get("temporal", {})

            # Energy usage patterns
            if temporal.get("is_business_hours"):
                enrichment["usage_pattern"] = "business_hours"
            elif temporal.get("is_weekend"):
                enrichment["usage_pattern"] = "weekend"
            else:
                enrichment["usage_pattern"] = "off_hours"

            # Demand classification
            if value > 100:
                enrichment["demand_level"] = "high"
            elif value > 50:
                enrichment["demand_level"] = "medium"
            elif value > 10:
                enrichment["demand_level"] = "low"
            else:
                enrichment["demand_level"] = "minimal"

            # Peak/off-peak classification
            hour = temporal.get("hour", 0)
            if 17 <= hour <= 21:  # Evening peak
                enrichment["tariff_period"] = "peak"
            elif hour >= 22 or hour <= 6:  # Night off-peak (wraps midnight)
                enrichment["tariff_period"] = "off_peak"
            else:
                enrichment["tariff_period"] = "standard"

        except Exception as e:
            logger.debug(f"Error enriching energy data: {e}")

        return enrichment
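
    # The off-peak window wraps midnight, so it needs `hour >= 22 or
    # hour <= 6`; a chained `22 <= hour <= 6` comparison can never be true.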

    async def _calculate_quality_scores(self, data: List[Dict[str, Any]], quality_report: Dict[str, Any]) -> Dict[str, float]:
        """Calculate overall quality scores"""
        if not data:
            return {"overall": 0.0, "completeness": 0.0, "accuracy": 0.0, "consistency": 0.0, "timeliness": 0.0}

        # Completeness score
        total_expected_fields = len(data) * 4  # sensor_id, timestamp, value, unit
        total_present_fields = sum(1 for record in data
                                   for field in ["sensor_id", "timestamp", "value", "unit"]
                                   if record.get(field) is not None)
        completeness = total_present_fields / total_expected_fields if total_expected_fields > 0 else 0.0

        # Accuracy score (based on validation scores)
        accuracy_scores = [record.get("data_quality", {}).get("quality_score", 0) for record in data]
        accuracy = statistics.mean(accuracy_scores) if accuracy_scores else 0.0

        # Consistency score (coefficient of variation for quality scores)
        if len(accuracy_scores) > 1:
            std_dev = statistics.stdev(accuracy_scores)
            mean_score = statistics.mean(accuracy_scores)
            consistency = 1.0 - (std_dev / mean_score) if mean_score > 0 else 0.0
        else:
            consistency = 1.0

        # Timeliness score (based on data age)
        current_time = datetime.utcnow().timestamp()
        ages = [(current_time - record.get("timestamp", current_time)) / 3600 for record in data]  # age in hours
        avg_age = statistics.mean(ages) if ages else 0
        timeliness = max(0.0, 1.0 - (avg_age / 24))  # Decrease score as data gets older than 24 hours

        # Overall score
        overall = statistics.mean([completeness, accuracy, consistency, timeliness])

        return {
            "overall": round(overall, 3),
            "completeness": round(completeness, 3),
            "accuracy": round(accuracy, 3),
            "consistency": round(consistency, 3),
            "timeliness": round(timeliness, 3)
        }
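
    # Consistency is 1 minus the coefficient of variation (stdev/mean) of
    # the per-record quality scores; timeliness decays linearly from 1.0 to
    # 0.0 as the average record age grows from 0 to 24 hours.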

    async def _store_quality_report(self, quality_report: Dict[str, Any], source_name: str):
        """Store quality report in database"""
        try:
            quality_report["_id"] = f"{source_name}_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
            await self.db.quality_reports.insert_one(quality_report)

            # Also cache in Redis for quick access
            cache_key = f"quality_report:{source_name}:latest"
            await self.redis.setex(cache_key, 3600, json.dumps(quality_report, default=str))

        except Exception as e:
            logger.error(f"Error storing quality report: {e}")

    async def _load_validation_rules(self):
        """Load validation rules configuration"""
        # Default validation rules
        self.validation_rules = {
            "energy": {
                "min_value": 0,
                "max_value": 100000,
                "required_precision": 0.01
            },
            "power": {
                "min_value": 0,
                "max_value": 50000,
                "required_precision": 0.1
            },
            "temperature": {
                "min_value": -50,
                "max_value": 100,
                "required_precision": 0.1
            }
        }

        logger.info("Loaded default validation rules")

    async def _load_enrichment_metadata(self):
        """Load enrichment metadata"""
        # Load any cached enrichment data
        try:
            cache_keys = []
            async for key in self.redis.scan_iter(match="enrichment:*"):
                cache_keys.append(key)

            logger.info(f"Loaded {len(cache_keys)} enrichment cache entries")

        except Exception as e:
            logger.debug(f"Error loading enrichment metadata: {e}")

    async def get_quality_summary(self, source_name: Optional[str] = None) -> Dict[str, Any]:
        """Get quality summary for sources"""
        try:
            match_filter = {"source": source_name} if source_name else {}

            # Get recent quality reports
            cursor = self.db.quality_reports.find(match_filter).sort("processing_time", -1).limit(50)

            reports = []
            async for report in cursor:
                report["_id"] = str(report["_id"])
                reports.append(report)

            if not reports:
                return {"message": "No quality reports found"}

            # Calculate summary statistics
            avg_quality = statistics.mean([r["quality_scores"]["overall"] for r in reports])
            total_processed = sum([r["processed_records"] for r in reports])
            total_rejected = sum([r["rejected_records"] for r in reports])

            return {
                "total_reports": len(reports),
                "average_quality": round(avg_quality, 3),
                "total_processed_records": total_processed,
                "total_rejected_records": total_rejected,
                "success_rate": round(total_processed / (total_processed + total_rejected) * 100, 2) if (total_processed + total_rejected) > 0 else 0,
                "latest_report": reports[0] if reports else None
            }

        except Exception as e:
            logger.error(f"Error getting quality summary: {e}")
            return {"error": str(e)}