Simplify data ingestion service
microservices/data-ingestion-service/src/data_validator.py: 710 lines added (new file)
@@ -0,0 +1,710 @@
"""
Data validation and enrichment for time series data.
Provides quality assessment, metadata enrichment, and data transformation capabilities.
"""

import asyncio
import json
import logging
import statistics
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Tuple
import hashlib
import re
from collections import defaultdict
import math

logger = logging.getLogger(__name__)


class DataValidator:
    """Validates, enriches, and transforms time series data"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.validation_rules = {}
        self.enrichment_cache = {}
        self.quality_thresholds = {
            "completeness": 0.8,
            "accuracy": 0.9,
            "consistency": 0.85,
            "timeliness": 0.9
        }

    async def initialize(self):
        """Initialize validator with default rules and configurations"""
        try:
            await self._load_validation_rules()
            await self._load_enrichment_metadata()
            logger.info("Data validator initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing data validator: {e}")
            raise

    async def validate_and_enrich_data(self, data: List[Dict[str, Any]],
                                       source_name: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """Validate and enrich time series data, returning processed data and quality report"""
        try:
            logger.info(f"Validating and enriching {len(data)} records from {source_name}")

            # Initialize validation report
            quality_report = {
                "source": source_name,
                "total_records": len(data),
                "processed_records": 0,
                "rejected_records": 0,
                "quality_scores": {},
                "issues_found": [],
                "processing_time": datetime.utcnow().isoformat()
            }

            enriched_data = []

            # Process each record
            for i, record in enumerate(data):
                try:
                    # Validate record
                    validation_result = await self._validate_record(record, source_name)

                    if validation_result["is_valid"]:
                        # Enrich the record
                        enriched_record = await self._enrich_record(record, source_name, validation_result)
                        enriched_data.append(enriched_record)
                        quality_report["processed_records"] += 1
                    else:
                        quality_report["rejected_records"] += 1
                        quality_report["issues_found"].extend(validation_result["issues"])
                        logger.warning(f"Record {i} rejected: {validation_result['issues']}")

                except Exception as e:
                    logger.error(f"Error processing record {i}: {e}")
                    quality_report["rejected_records"] += 1
                    quality_report["issues_found"].append(f"Processing error: {str(e)}")

            # Calculate overall quality scores
            quality_report["quality_scores"] = await self._calculate_quality_scores(enriched_data, quality_report)

            # Store quality report
            await self._store_quality_report(quality_report, source_name)

            logger.info(f"Validation complete: {quality_report['processed_records']}/{quality_report['total_records']} records processed")

            return enriched_data, quality_report

        except Exception as e:
            logger.error(f"Error in data validation and enrichment: {e}")
            raise

    async def _validate_record(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Validate a single record against quality rules"""
        validation_result = {
            "is_valid": True,
            "issues": [],
            "quality_metrics": {}
        }

        try:
            # Check required fields
            required_fields = ["sensor_id", "timestamp", "value"]
            for field in required_fields:
                if field not in record or record[field] is None:
                    validation_result["is_valid"] = False
                    validation_result["issues"].append(f"Missing required field: {field}")

            if not validation_result["is_valid"]:
                return validation_result

            # Validate timestamp
            timestamp_validation = await self._validate_timestamp(record["timestamp"])
            validation_result["quality_metrics"]["timestamp_quality"] = timestamp_validation["score"]
            if not timestamp_validation["is_valid"]:
                validation_result["issues"].extend(timestamp_validation["issues"])

            # Validate numeric value
            value_validation = await self._validate_numeric_value(record["value"], record.get("unit"))
            validation_result["quality_metrics"]["value_quality"] = value_validation["score"]
            if not value_validation["is_valid"]:
                validation_result["issues"].extend(value_validation["issues"])

            # Validate sensor ID format
            sensor_validation = await self._validate_sensor_id(record["sensor_id"])
            validation_result["quality_metrics"]["sensor_id_quality"] = sensor_validation["score"]
            if not sensor_validation["is_valid"]:
                validation_result["issues"].extend(sensor_validation["issues"])

            # Check for duplicates
            duplicate_check = await self._check_for_duplicates(record, source_name)
            validation_result["quality_metrics"]["uniqueness"] = duplicate_check["score"]
            if not duplicate_check["is_unique"]:
                validation_result["issues"].extend(duplicate_check["issues"])

            # Calculate overall validity
            if validation_result["issues"]:
                # Allow minor issues but flag major ones
                major_issues = [issue for issue in validation_result["issues"]
                                if "Missing required field" in issue or "Invalid" in issue]
                validation_result["is_valid"] = len(major_issues) == 0

        except Exception as e:
            logger.error(f"Error validating record: {e}")
            validation_result["is_valid"] = False
            validation_result["issues"].append(f"Validation error: {str(e)}")

        return validation_result

    async def _enrich_record(self, record: Dict[str, Any], source_name: str,
                             validation_result: Dict[str, Any]) -> Dict[str, Any]:
        """Enrich a record with additional metadata and derived fields"""
        try:
            enriched = record.copy()

            # Add validation metadata
            enriched["data_quality"] = {
                "quality_score": statistics.mean(validation_result["quality_metrics"].values()) if validation_result["quality_metrics"] else 0.0,
                "quality_metrics": validation_result["quality_metrics"],
                "validation_timestamp": datetime.utcnow().isoformat()
            }

            # Add source information
            enriched["source_info"] = {
                "source_name": source_name,
                "ingestion_time": datetime.utcnow().isoformat(),
                "record_id": hashlib.md5(f"{source_name}_{record.get('sensor_id', 'unknown')}_{record.get('timestamp', 0)}".encode()).hexdigest()
            }

            # Normalize timestamp format
            enriched["timestamp"] = await self._normalize_timestamp(record["timestamp"])
            enriched["timestamp_iso"] = datetime.fromtimestamp(enriched["timestamp"]).isoformat()

            # Infer and enrich sensor type
            sensor_type_info = await self._infer_sensor_type(record)
            enriched["sensor_type"] = sensor_type_info["type"]
            enriched["sensor_category"] = sensor_type_info["category"]

            # Add unit standardization
            unit_info = await self._standardize_unit(record.get("unit"))
            enriched["unit"] = unit_info["standard_unit"]
            enriched["unit_info"] = unit_info

            # Calculate derived metrics
            derived_metrics = await self._calculate_derived_metrics(enriched, source_name)
            enriched["derived_metrics"] = derived_metrics

            # Add location and context information
            context_info = await self._enrich_with_context(enriched, source_name)
            enriched["metadata"] = {**enriched.get("metadata", {}), **context_info}

            # Add temporal features
            temporal_features = await self._extract_temporal_features(enriched["timestamp"])
            enriched["temporal"] = temporal_features

            # Energy-specific enrichments
            if sensor_type_info["category"] == "energy":
                energy_enrichment = await self._enrich_energy_data(enriched)
                enriched.update(energy_enrichment)

            return enriched

        except Exception as e:
            logger.error(f"Error enriching record: {e}")
            return record
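
    # Illustrative shape of an enriched record (field names taken from the assignments
    # above; the sensor_id and values shown here are made up for the example):
    #   {
    #     "sensor_id": "meter-001", "timestamp": 1718000000, "value": 1.25, "unit": "kWh",
    #     "timestamp_iso": "...", "sensor_type": "energy", "sensor_category": "energy",
    #     "data_quality": {"quality_score": ..., "quality_metrics": {...}, "validation_timestamp": "..."},
    #     "source_info": {"source_name": "...", "ingestion_time": "...", "record_id": "<md5 hex>"},
    #     "unit_info": {...}, "derived_metrics": {...}, "metadata": {...}, "temporal": {...},
    #     # plus usage_pattern / demand_level / tariff_period for energy-category sensors
    #   }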

    async def _validate_timestamp(self, timestamp) -> Dict[str, Any]:
        """Validate timestamp format and reasonableness"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            # Convert to numeric timestamp
            if isinstance(timestamp, str):
                try:
                    # Try parsing ISO format
                    dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                    ts = dt.timestamp()
                except ValueError:
                    # Try parsing as unix timestamp string
                    ts = float(timestamp)
            else:
                ts = float(timestamp)

            # Check if timestamp is reasonable (not too far in past/future)
            current_time = datetime.utcnow().timestamp()
            max_age = 365 * 24 * 3600  # 1 year
            max_future = 24 * 3600  # 1 day

            if ts < current_time - max_age:
                result["issues"].append("Timestamp too old (more than 1 year)")
                result["score"] -= 0.3
            elif ts > current_time + max_future:
                result["issues"].append("Timestamp too far in future")
                result["score"] -= 0.3

            # Check for reasonable precision (not too precise for energy data)
            if ts != int(ts) and len(str(ts).split('.')[1]) > 3:
                result["score"] -= 0.1  # Minor issue

        except (ValueError, TypeError) as e:
            result["is_valid"] = False
            result["issues"].append(f"Invalid timestamp format: {e}")
            result["score"] = 0.0

        return result

    async def _validate_numeric_value(self, value, unit: Optional[str] = None) -> Dict[str, Any]:
        """Validate numeric value reasonableness"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            numeric_value = float(value)

            # Check for negative values (usually invalid for energy data)
            if numeric_value < 0:
                result["issues"].append("Negative energy value")
                result["score"] -= 0.4

            # Check for unreasonably large values
            unit_str = (unit or "").lower()
            if "wh" in unit_str:
                # Energy values
                if numeric_value > 100000:  # >100kWh seems excessive for single reading
                    result["issues"].append("Unusually high energy value")
                    result["score"] -= 0.2
            elif "w" in unit_str:
                # Power values
                if numeric_value > 50000:  # >50kW seems excessive
                    result["issues"].append("Unusually high power value")
                    result["score"] -= 0.2

            # Check for zero values (might indicate sensor issues)
            if numeric_value == 0:
                result["score"] -= 0.1

            # Check for NaN or infinity
            if math.isnan(numeric_value) or math.isinf(numeric_value):
                result["is_valid"] = False
                result["issues"].append("Invalid numeric value (NaN or Infinity)")
                result["score"] = 0.0

        except (ValueError, TypeError) as e:
            result["is_valid"] = False
            result["issues"].append(f"Non-numeric value: {e}")
            result["score"] = 0.0

        return result

    async def _validate_sensor_id(self, sensor_id: str) -> Dict[str, Any]:
        """Validate sensor ID format and consistency"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            if not isinstance(sensor_id, str) or len(sensor_id) == 0:
                result["is_valid"] = False
                result["issues"].append("Empty or invalid sensor ID")
                result["score"] = 0.0
                return result

            # Check length
            if len(sensor_id) < 3:
                result["issues"].append("Very short sensor ID")
                result["score"] -= 0.2
            elif len(sensor_id) > 50:
                result["issues"].append("Very long sensor ID")
                result["score"] -= 0.1

            # Check for reasonable characters
            if not re.match(r'^[a-zA-Z0-9_\-\.]+$', sensor_id):
                result["issues"].append("Sensor ID contains unusual characters")
                result["score"] -= 0.1

        except Exception as e:
            result["issues"].append(f"Sensor ID validation error: {e}")
            result["score"] -= 0.1

        return result

    async def _check_for_duplicates(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Check for duplicate records"""
        result = {"is_unique": True, "issues": [], "score": 1.0}

        try:
            # Create record signature
            signature = hashlib.md5(
                f"{source_name}_{record.get('sensor_id')}_{record.get('timestamp')}_{record.get('value')}".encode()
            ).hexdigest()

            # Check cache for recent duplicates
            cache_key = f"record_signature:{signature}"
            exists = await self.redis.exists(cache_key)

            if exists:
                result["is_unique"] = False
                result["issues"].append("Duplicate record detected")
                result["score"] = 0.0
            else:
                # Store signature with short expiration (1 hour)
                await self.redis.setex(cache_key, 3600, "1")

        except Exception as e:
            logger.debug(f"Error checking duplicates: {e}")
            # Don't fail validation for cache errors

        return result

    async def _normalize_timestamp(self, timestamp) -> int:
        """Normalize timestamp to unix timestamp"""
        try:
            if isinstance(timestamp, str):
                try:
                    # Try ISO format first
                    dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                    return int(dt.timestamp())
                except ValueError:
                    # Try as unix timestamp string
                    return int(float(timestamp))
            else:
                return int(float(timestamp))
        except (ValueError, TypeError):
            # Fallback to current time
            return int(datetime.utcnow().timestamp())

    async def _infer_sensor_type(self, record: Dict[str, Any]) -> Dict[str, str]:
        """Infer sensor type from record data"""
        sensor_id = record.get("sensor_id", "").lower()
        unit = (record.get("unit", "") or "").lower()
        value = record.get("value", 0)
        metadata = record.get("metadata", {})

        # Energy sensors
        if "wh" in unit or "energy" in sensor_id or "consumption" in sensor_id:
            return {"type": "energy", "category": "energy"}
        elif "w" in unit and "wh" not in unit:
            return {"type": "power", "category": "energy"}

        # Environmental sensors
        elif "temp" in sensor_id or "°c" in unit or "celsius" in unit:
            return {"type": "temperature", "category": "environmental"}
        elif "humid" in sensor_id or "%" in unit:
            return {"type": "humidity", "category": "environmental"}
        elif "co2" in sensor_id or "ppm" in unit:
            return {"type": "co2", "category": "environmental"}

        # Motion/occupancy sensors
        elif "motion" in sensor_id or "occupancy" in sensor_id or ("motion" in str(metadata).lower()):
            return {"type": "motion", "category": "occupancy"}

        # Generation sensors
        elif "generation" in sensor_id or "solar" in sensor_id or "generation" in str(metadata).lower():
            return {"type": "generation", "category": "energy"}

        # Default to energy if unclear
        else:
            return {"type": "energy", "category": "energy"}

    async def _standardize_unit(self, unit: Optional[str]) -> Dict[str, Any]:
        """Standardize unit format"""
        if not unit:
            return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}

        unit_lower = unit.lower().strip()

        # Energy units
        if unit_lower in ["kwh", "kw-h", "kw_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}
        elif unit_lower in ["wh", "w-h", "w_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 0.001, "unit_type": "energy"}
        elif unit_lower in ["mwh", "mw-h", "mw_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 1000.0, "unit_type": "energy"}

        # Power units
        elif unit_lower in ["kw", "kilowatt", "kilowatts"]:
            return {"standard_unit": "kW", "conversion_factor": 1.0, "unit_type": "power"}
        elif unit_lower in ["w", "watt", "watts"]:
            return {"standard_unit": "kW", "conversion_factor": 0.001, "unit_type": "power"}
        elif unit_lower in ["mw", "megawatt", "megawatts"]:
            return {"standard_unit": "kW", "conversion_factor": 1000.0, "unit_type": "power"}

        # Temperature units
        elif unit_lower in ["°c", "celsius", "c"]:
            return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature"}
        elif unit_lower in ["°f", "fahrenheit", "f"]:
            return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature", "requires_conversion": True}

        # Default
        else:
            return {"standard_unit": unit, "conversion_factor": 1.0, "unit_type": "unknown"}

    async def _calculate_derived_metrics(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Calculate derived metrics from the record"""
        derived = {}

        try:
            value = float(record.get("value", 0))
            unit_info = record.get("unit_info", {})

            # Apply unit conversion if needed
            if unit_info.get("conversion_factor", 1.0) != 1.0:
                derived["original_value"] = value
                derived["converted_value"] = value * unit_info["conversion_factor"]

            # Energy-specific calculations
            if unit_info.get("unit_type") == "energy":
                # Estimate cost (simplified)
                cost_per_kwh = 0.12  # Example rate
                derived["estimated_cost"] = value * cost_per_kwh

                # Estimate CO2 emissions (simplified)
                co2_per_kwh = 0.4  # kg CO2 per kWh (example grid factor)
                derived["estimated_co2_kg"] = value * co2_per_kwh

            # Add value range classification
            derived["value_range"] = await self._classify_value_range(value, unit_info.get("unit_type"))

        except Exception as e:
            logger.debug(f"Error calculating derived metrics: {e}")

        return derived
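
    # Worked example (illustrative): a raw reading of 1500 with unit "Wh" is standardized
    # by _standardize_unit() to {"standard_unit": "kWh", "conversion_factor": 0.001,
    # "unit_type": "energy"}, so the block above records original_value = 1500 and
    # converted_value = 1.5. Note that estimated_cost and estimated_co2_kg are computed
    # from the raw value with the example factors (0.12 per kWh, 0.4 kg CO2 per kWh),
    # not from the converted value.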

    async def _classify_value_range(self, value: float, unit_type: str) -> str:
        """Classify value into ranges for better understanding"""
        if unit_type == "energy":
            if value < 1:
                return "very_low"
            elif value < 10:
                return "low"
            elif value < 50:
                return "medium"
            elif value < 200:
                return "high"
            else:
                return "very_high"
        elif unit_type == "power":
            if value < 0.5:
                return "very_low"
            elif value < 5:
                return "low"
            elif value < 20:
                return "medium"
            elif value < 100:
                return "high"
            else:
                return "very_high"
        else:
            return "unknown"

    async def _enrich_with_context(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Enrich record with contextual information"""
        context = {}

        try:
            # Add geographical context if available
            context["data_source"] = "real_community"
            context["source_type"] = "ftp_ingestion"

            # Add data freshness (compare both datetimes in UTC)
            ingestion_time = datetime.utcnow()
            data_time = datetime.utcfromtimestamp(record["timestamp"])
            context["data_age_minutes"] = (ingestion_time - data_time).total_seconds() / 60

            # Classify data freshness
            if context["data_age_minutes"] < 15:
                context["freshness"] = "real_time"
            elif context["data_age_minutes"] < 60:
                context["freshness"] = "near_real_time"
            elif context["data_age_minutes"] < 1440:  # 24 hours
                context["freshness"] = "recent"
            else:
                context["freshness"] = "historical"

        except Exception as e:
            logger.debug(f"Error adding context: {e}")

        return context

    async def _extract_temporal_features(self, timestamp: int) -> Dict[str, Any]:
        """Extract temporal features from timestamp"""
        dt = datetime.fromtimestamp(timestamp)

        return {
            "hour": dt.hour,
            "day_of_week": dt.weekday(),
            "day_of_month": dt.day,
            "month": dt.month,
            "quarter": (dt.month - 1) // 3 + 1,
            "is_weekend": dt.weekday() >= 5,
            "is_business_hours": 8 <= dt.hour <= 17,
            "season": self._get_season(dt.month)
        }

    def _get_season(self, month: int) -> str:
        """Get season from month"""
        if month in [12, 1, 2]:
            return "winter"
        elif month in [3, 4, 5]:
            return "spring"
        elif month in [6, 7, 8]:
            return "summer"
        else:
            return "autumn"

    async def _enrich_energy_data(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Add energy-specific enrichments"""
        enrichment = {}

        try:
            value = record.get("derived_metrics", {}).get("converted_value", record.get("value", 0))
            temporal = record.get("temporal", {})

            # Energy usage patterns
            if temporal.get("is_business_hours"):
                enrichment["usage_pattern"] = "business_hours"
            elif temporal.get("is_weekend"):
                enrichment["usage_pattern"] = "weekend"
            else:
                enrichment["usage_pattern"] = "off_hours"

            # Demand classification
            if value > 100:
                enrichment["demand_level"] = "high"
            elif value > 50:
                enrichment["demand_level"] = "medium"
            elif value > 10:
                enrichment["demand_level"] = "low"
            else:
                enrichment["demand_level"] = "minimal"

            # Peak/off-peak classification
            hour = temporal.get("hour", 0)
            if 17 <= hour <= 21:  # Evening peak
                enrichment["tariff_period"] = "peak"
            elif hour >= 22 or hour <= 6:  # Night off-peak
                enrichment["tariff_period"] = "off_peak"
            else:
                enrichment["tariff_period"] = "standard"

        except Exception as e:
            logger.debug(f"Error enriching energy data: {e}")

        return enrichment
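
    # Illustrative classification with the thresholds above: a reading of 60 kWh at
    # hour 18 on a weekday yields usage_pattern="off_hours" (18 falls outside 8-17),
    # demand_level="medium" (between 50 and 100) and tariff_period="peak" (17-21);
    # the same reading at hour 23 would be tariff_period="off_peak".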

    async def _calculate_quality_scores(self, data: List[Dict[str, Any]], quality_report: Dict[str, Any]) -> Dict[str, float]:
        """Calculate overall quality scores"""
        if not data:
            return {"overall": 0.0, "completeness": 0.0, "accuracy": 0.0, "consistency": 0.0, "timeliness": 0.0}

        # Completeness score
        total_expected_fields = len(data) * 4  # sensor_id, timestamp, value, unit
        total_present_fields = sum(1 for record in data
                                   for field in ["sensor_id", "timestamp", "value", "unit"]
                                   if record.get(field) is not None)
        completeness = total_present_fields / total_expected_fields if total_expected_fields > 0 else 0.0

        # Accuracy score (based on validation scores)
        accuracy_scores = [record.get("data_quality", {}).get("quality_score", 0) for record in data]
        accuracy = statistics.mean(accuracy_scores) if accuracy_scores else 0.0

        # Consistency score (coefficient of variation for quality scores)
        if len(accuracy_scores) > 1:
            std_dev = statistics.stdev(accuracy_scores)
            mean_score = statistics.mean(accuracy_scores)
            consistency = 1.0 - (std_dev / mean_score) if mean_score > 0 else 0.0
        else:
            consistency = 1.0

        # Timeliness score (based on data age)
        current_time = datetime.utcnow().timestamp()
        ages = [(current_time - record.get("timestamp", current_time)) / 3600 for record in data]  # age in hours
        avg_age = statistics.mean(ages) if ages else 0
        timeliness = max(0.0, 1.0 - (avg_age / 24))  # Decrease score as data gets older than 24 hours

        # Overall score
        overall = statistics.mean([completeness, accuracy, consistency, timeliness])

        return {
            "overall": round(overall, 3),
            "completeness": round(completeness, 3),
            "accuracy": round(accuracy, 3),
            "consistency": round(consistency, 3),
            "timeliness": round(timeliness, 3)
        }
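
    # Worked example (illustrative): 10 enriched records with 36 of the 40 expected
    # fields present give completeness = 36/40 = 0.9; per-record quality scores with
    # mean 0.8 and standard deviation 0.08 give accuracy = 0.8 and
    # consistency = 1 - 0.08/0.8 = 0.9; data that is on average 6 hours old gives
    # timeliness = 1 - 6/24 = 0.75; overall = mean(0.9, 0.8, 0.9, 0.75) ≈ 0.838.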

    async def _store_quality_report(self, quality_report: Dict[str, Any], source_name: str):
        """Store quality report in database"""
        try:
            quality_report["_id"] = f"{source_name}_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
            await self.db.quality_reports.insert_one(quality_report)

            # Also cache in Redis for quick access
            cache_key = f"quality_report:{source_name}:latest"
            await self.redis.setex(cache_key, 3600, json.dumps(quality_report, default=str))

        except Exception as e:
            logger.error(f"Error storing quality report: {e}")

    async def _load_validation_rules(self):
        """Load validation rules configuration"""
        # Default validation rules
        self.validation_rules = {
            "energy": {
                "min_value": 0,
                "max_value": 100000,
                "required_precision": 0.01
            },
            "power": {
                "min_value": 0,
                "max_value": 50000,
                "required_precision": 0.1
            },
            "temperature": {
                "min_value": -50,
                "max_value": 100,
                "required_precision": 0.1
            }
        }

        logger.info("Loaded default validation rules")

    async def _load_enrichment_metadata(self):
        """Load enrichment metadata"""
        # Load any cached enrichment data
        try:
            cache_keys = []
            async for key in self.redis.scan_iter(match="enrichment:*"):
                cache_keys.append(key)

            logger.info(f"Loaded {len(cache_keys)} enrichment cache entries")

        except Exception as e:
            logger.debug(f"Error loading enrichment metadata: {e}")

    async def get_quality_summary(self, source_name: Optional[str] = None) -> Dict[str, Any]:
        """Get quality summary for sources"""
        try:
            match_filter = {"source": source_name} if source_name else {}

            # Get recent quality reports
            cursor = self.db.quality_reports.find(match_filter).sort("processing_time", -1).limit(50)

            reports = []
            async for report in cursor:
                report["_id"] = str(report["_id"])
                reports.append(report)

            if not reports:
                return {"message": "No quality reports found"}

            # Calculate summary statistics
            avg_quality = statistics.mean([r["quality_scores"]["overall"] for r in reports])
            total_processed = sum([r["processed_records"] for r in reports])
            total_rejected = sum([r["rejected_records"] for r in reports])

            return {
                "total_reports": len(reports),
                "average_quality": round(avg_quality, 3),
                "total_processed_records": total_processed,
                "total_rejected_records": total_rejected,
                "success_rate": round(total_processed / (total_processed + total_rejected) * 100, 2) if (total_processed + total_rejected) > 0 else 0,
                "latest_report": reports[0] if reports else None
            }

        except Exception as e:
            logger.error(f"Error getting quality summary: {e}")
            return {"error": str(e)}