- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
"""
|
|
Data validation and enrichment for time series data.
|
|
Provides quality assessment, metadata enrichment, and data transformation capabilities.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import statistics
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
import hashlib
|
|
import re
|
|
from collections import defaultdict
|
|
import math
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|

class DataValidator:
    """Validates, enriches, and transforms time series data"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.validation_rules = {}
        self.enrichment_cache = {}
        self.quality_thresholds = {
            "completeness": 0.8,
            "accuracy": 0.9,
            "consistency": 0.85,
            "timeliness": 0.9
        }

    async def initialize(self):
        """Initialize validator with default rules and configurations"""
        try:
            await self._load_validation_rules()
            await self._load_enrichment_metadata()
            logger.info("Data validator initialized successfully")
        except Exception as e:
            logger.error(f"Error initializing data validator: {e}")
            raise

    async def validate_and_enrich_data(self, data: List[Dict[str, Any]],
                                       source_name: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
        """Validate and enrich time series data, returning processed data and quality report"""
        try:
            logger.info(f"Validating and enriching {len(data)} records from {source_name}")

            # Initialize validation report
            quality_report = {
                "source": source_name,
                "total_records": len(data),
                "processed_records": 0,
                "rejected_records": 0,
                "quality_scores": {},
                "issues_found": [],
                "processing_time": datetime.utcnow().isoformat()
            }

            enriched_data = []

            # Process each record
            for i, record in enumerate(data):
                try:
                    # Validate record
                    validation_result = await self._validate_record(record, source_name)

                    if validation_result["is_valid"]:
                        # Enrich the record
                        enriched_record = await self._enrich_record(record, source_name, validation_result)
                        enriched_data.append(enriched_record)
                        quality_report["processed_records"] += 1
                    else:
                        quality_report["rejected_records"] += 1
                        quality_report["issues_found"].extend(validation_result["issues"])
                        logger.warning(f"Record {i} rejected: {validation_result['issues']}")

                except Exception as e:
                    logger.error(f"Error processing record {i}: {e}")
                    quality_report["rejected_records"] += 1
                    quality_report["issues_found"].append(f"Processing error: {str(e)}")

            # Calculate overall quality scores
            quality_report["quality_scores"] = await self._calculate_quality_scores(enriched_data, quality_report)

            # Store quality report
            await self._store_quality_report(quality_report, source_name)

            logger.info(f"Validation complete: {quality_report['processed_records']}/{quality_report['total_records']} records processed")

            return enriched_data, quality_report

        except Exception as e:
            logger.error(f"Error in data validation and enrichment: {e}")
            raise

    async def _validate_record(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Validate a single record against quality rules"""
        validation_result = {
            "is_valid": True,
            "issues": [],
            "quality_metrics": {}
        }

        try:
            # Check required fields
            required_fields = ["sensor_id", "timestamp", "value"]
            for field in required_fields:
                if field not in record or record[field] is None:
                    validation_result["is_valid"] = False
                    validation_result["issues"].append(f"Missing required field: {field}")

            if not validation_result["is_valid"]:
                return validation_result

            # Validate timestamp
            timestamp_validation = await self._validate_timestamp(record["timestamp"])
            validation_result["quality_metrics"]["timestamp_quality"] = timestamp_validation["score"]
            if not timestamp_validation["is_valid"]:
                validation_result["issues"].extend(timestamp_validation["issues"])

            # Validate numeric value
            value_validation = await self._validate_numeric_value(record["value"], record.get("unit"))
            validation_result["quality_metrics"]["value_quality"] = value_validation["score"]
            if not value_validation["is_valid"]:
                validation_result["issues"].extend(value_validation["issues"])

            # Validate sensor ID format
            sensor_validation = await self._validate_sensor_id(record["sensor_id"])
            validation_result["quality_metrics"]["sensor_id_quality"] = sensor_validation["score"]
            if not sensor_validation["is_valid"]:
                validation_result["issues"].extend(sensor_validation["issues"])

            # Check for duplicates
            duplicate_check = await self._check_for_duplicates(record, source_name)
            validation_result["quality_metrics"]["uniqueness"] = duplicate_check["score"]
            if not duplicate_check["is_unique"]:
                validation_result["issues"].extend(duplicate_check["issues"])

            # Calculate overall validity
            if validation_result["issues"]:
                # Allow minor issues but flag major ones
                major_issues = [issue for issue in validation_result["issues"]
                                if "Missing required field" in issue or "Invalid" in issue]
                validation_result["is_valid"] = len(major_issues) == 0

        except Exception as e:
            logger.error(f"Error validating record: {e}")
            validation_result["is_valid"] = False
            validation_result["issues"].append(f"Validation error: {str(e)}")

        return validation_result
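
    # Note: records carrying only minor issues (e.g. a duplicate flag) still
    # pass _validate_record; rejection requires a missing required field or
    # an issue whose text contains "Invalid".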

    async def _enrich_record(self, record: Dict[str, Any], source_name: str,
                             validation_result: Dict[str, Any]) -> Dict[str, Any]:
        """Enrich a record with additional metadata and derived fields"""
        try:
            enriched = record.copy()

            # Add validation metadata
            enriched["data_quality"] = {
                "quality_score": statistics.mean(validation_result["quality_metrics"].values()) if validation_result["quality_metrics"] else 0.0,
                "quality_metrics": validation_result["quality_metrics"],
                "validation_timestamp": datetime.utcnow().isoformat()
            }

            # Add source information
            enriched["source_info"] = {
                "source_name": source_name,
                "ingestion_time": datetime.utcnow().isoformat(),
                "record_id": hashlib.md5(f"{source_name}_{record.get('sensor_id', 'unknown')}_{record.get('timestamp', 0)}".encode()).hexdigest()
            }

            # Normalize timestamp format (rendered in UTC, matching the utcnow() calls above)
            enriched["timestamp"] = await self._normalize_timestamp(record["timestamp"])
            enriched["timestamp_iso"] = datetime.utcfromtimestamp(enriched["timestamp"]).isoformat()

            # Infer and enrich sensor type
            sensor_type_info = await self._infer_sensor_type(record)
            enriched["sensor_type"] = sensor_type_info["type"]
            enriched["sensor_category"] = sensor_type_info["category"]

            # Add unit standardization
            unit_info = await self._standardize_unit(record.get("unit"))
            enriched["unit"] = unit_info["standard_unit"]
            enriched["unit_info"] = unit_info

            # Calculate derived metrics
            derived_metrics = await self._calculate_derived_metrics(enriched, source_name)
            enriched["derived_metrics"] = derived_metrics

            # Add location and context information
            context_info = await self._enrich_with_context(enriched, source_name)
            enriched["metadata"] = {**enriched.get("metadata", {}), **context_info}

            # Add temporal features
            temporal_features = await self._extract_temporal_features(enriched["timestamp"])
            enriched["temporal"] = temporal_features

            # Energy-specific enrichments
            if sensor_type_info["category"] == "energy":
                energy_enrichment = await self._enrich_energy_data(enriched)
                enriched.update(energy_enrichment)

            return enriched

        except Exception as e:
            logger.error(f"Error enriching record: {e}")
            return record
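
    # record_id above is an MD5 digest of source, sensor, and timestamp,
    # used purely as a stable identifier for tracing and deduplication,
    # not for any security purpose.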

    async def _validate_timestamp(self, timestamp) -> Dict[str, Any]:
        """Validate timestamp format and reasonableness"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            # Convert to numeric timestamp
            if isinstance(timestamp, str):
                try:
                    # Try parsing ISO format
                    dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                    ts = dt.timestamp()
                except ValueError:
                    # Fall back to parsing as a unix timestamp string
                    ts = float(timestamp)
            else:
                ts = float(timestamp)

            # Check if timestamp is reasonable (not too far in past/future)
            current_time = datetime.utcnow().timestamp()
            max_age = 365 * 24 * 3600  # 1 year
            max_future = 24 * 3600  # 1 day

            if ts < current_time - max_age:
                result["issues"].append("Timestamp too old (more than 1 year)")
                result["score"] -= 0.3
            elif ts > current_time + max_future:
                result["issues"].append("Timestamp too far in future")
                result["score"] -= 0.3

            # Check for reasonable precision (not too precise for energy data)
            if ts != int(ts) and len(str(ts).split('.')[1]) > 3:
                result["score"] -= 0.1  # Minor issue

        except (ValueError, TypeError) as e:
            result["is_valid"] = False
            result["issues"].append(f"Invalid timestamp format: {e}")
            result["score"] = 0.0

        return result
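
    # Scores are penalties subtracted from 1.0: a parseable but year-old
    # timestamp keeps is_valid=True with score 0.7, while an unparseable
    # one is hard-invalid with score 0.0.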

    async def _validate_numeric_value(self, value, unit: Optional[str] = None) -> Dict[str, Any]:
        """Validate numeric value reasonableness"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            numeric_value = float(value)

            # Check for negative values (usually invalid for energy data)
            if numeric_value < 0:
                result["issues"].append("Negative energy value")
                result["score"] -= 0.4

            # Check for unreasonably large values
            unit_str = (unit or "").lower()
            if "wh" in unit_str:
                # Energy values
                if numeric_value > 100000:  # >100kWh seems excessive for a single reading
                    result["issues"].append("Unusually high energy value")
                    result["score"] -= 0.2
            elif "w" in unit_str:
                # Power values
                if numeric_value > 50000:  # >50kW seems excessive
                    result["issues"].append("Unusually high power value")
                    result["score"] -= 0.2

            # Check for zero values (might indicate sensor issues)
            if numeric_value == 0:
                result["score"] -= 0.1

            # Check for NaN or infinity
            if math.isnan(numeric_value) or math.isinf(numeric_value):
                result["is_valid"] = False
                result["issues"].append("Invalid numeric value (NaN or Infinity)")
                result["score"] = 0.0

        except (ValueError, TypeError) as e:
            result["is_valid"] = False
            result["issues"].append(f"Non-numeric value: {e}")
            result["score"] = 0.0

        return result

    async def _validate_sensor_id(self, sensor_id: str) -> Dict[str, Any]:
        """Validate sensor ID format and consistency"""
        result = {"is_valid": True, "issues": [], "score": 1.0}

        try:
            if not isinstance(sensor_id, str) or len(sensor_id) == 0:
                result["is_valid"] = False
                result["issues"].append("Empty or invalid sensor ID")
                result["score"] = 0.0
                return result

            # Check length
            if len(sensor_id) < 3:
                result["issues"].append("Very short sensor ID")
                result["score"] -= 0.2
            elif len(sensor_id) > 50:
                result["issues"].append("Very long sensor ID")
                result["score"] -= 0.1

            # Check for reasonable characters
            if not re.match(r'^[a-zA-Z0-9_\-\.]+$', sensor_id):
                result["issues"].append("Sensor ID contains unusual characters")
                result["score"] -= 0.1

        except Exception as e:
            result["issues"].append(f"Sensor ID validation error: {e}")
            result["score"] -= 0.1

        return result

    async def _check_for_duplicates(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Check for duplicate records"""
        result = {"is_unique": True, "issues": [], "score": 1.0}

        try:
            # Create record signature
            signature = hashlib.md5(
                f"{source_name}_{record.get('sensor_id')}_{record.get('timestamp')}_{record.get('value')}".encode()
            ).hexdigest()

            # Check cache for recent duplicates
            cache_key = f"record_signature:{signature}"
            exists = await self.redis.exists(cache_key)

            if exists:
                result["is_unique"] = False
                result["issues"].append("Duplicate record detected")
                result["score"] = 0.0
            else:
                # Store signature with short expiration (1 hour)
                await self.redis.setex(cache_key, 3600, "1")

        except Exception as e:
            logger.debug(f"Error checking duplicates: {e}")
            # Don't fail validation for cache errors

        return result
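
    # Deduplication is best-effort: signatures expire from Redis after one
    # hour, so an identical record re-ingested later is not flagged, and
    # cache errors deliberately do not fail validation.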

    async def _normalize_timestamp(self, timestamp) -> int:
        """Normalize timestamp to unix timestamp"""
        try:
            if isinstance(timestamp, str):
                try:
                    # Try ISO format first
                    dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                    return int(dt.timestamp())
                except ValueError:
                    # Try as unix timestamp string
                    return int(float(timestamp))
            else:
                return int(float(timestamp))
        except (ValueError, TypeError):
            # Fallback to current time
            return int(datetime.utcnow().timestamp())

    async def _infer_sensor_type(self, record: Dict[str, Any]) -> Dict[str, str]:
        """Infer sensor type from record data"""
        sensor_id = record.get("sensor_id", "").lower()
        unit = (record.get("unit", "") or "").lower()
        value = record.get("value", 0)
        metadata = record.get("metadata", {})

        # Energy sensors
        if "wh" in unit or "energy" in sensor_id or "consumption" in sensor_id:
            return {"type": "energy", "category": "energy"}
        elif "w" in unit and "wh" not in unit:
            return {"type": "power", "category": "energy"}

        # Environmental sensors
        elif "temp" in sensor_id or "°c" in unit or "celsius" in unit:
            return {"type": "temperature", "category": "environmental"}
        elif "humid" in sensor_id or "%" in unit:
            return {"type": "humidity", "category": "environmental"}
        elif "co2" in sensor_id or "ppm" in unit:
            return {"type": "co2", "category": "environmental"}

        # Motion/occupancy sensors
        elif "motion" in sensor_id or "occupancy" in sensor_id or ("motion" in str(metadata).lower()):
            return {"type": "motion", "category": "occupancy"}

        # Generation sensors
        elif "generation" in sensor_id or "solar" in sensor_id or "generation" in str(metadata).lower():
            return {"type": "generation", "category": "energy"}

        # Default to energy if unclear
        else:
            return {"type": "energy", "category": "energy"}
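
    # Inference is heuristic substring matching: unit "kWh" hits the energy
    # branch because "wh" is tested before the bare "w" power branch.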

    async def _standardize_unit(self, unit: Optional[str]) -> Dict[str, Any]:
        """Standardize unit format"""
        if not unit:
            return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}

        unit_lower = unit.lower().strip()

        # Energy units
        if unit_lower in ["kwh", "kw-h", "kw_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}
        elif unit_lower in ["wh", "w-h", "w_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 0.001, "unit_type": "energy"}
        elif unit_lower in ["mwh", "mw-h", "mw_h"]:
            return {"standard_unit": "kWh", "conversion_factor": 1000.0, "unit_type": "energy"}

        # Power units
        elif unit_lower in ["kw", "kilowatt", "kilowatts"]:
            return {"standard_unit": "kW", "conversion_factor": 1.0, "unit_type": "power"}
        elif unit_lower in ["w", "watt", "watts"]:
            return {"standard_unit": "kW", "conversion_factor": 0.001, "unit_type": "power"}
        elif unit_lower in ["mw", "megawatt", "megawatts"]:
            return {"standard_unit": "kW", "conversion_factor": 1000.0, "unit_type": "power"}

        # Temperature units
        elif unit_lower in ["°c", "celsius", "c"]:
            return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature"}
        elif unit_lower in ["°f", "fahrenheit", "f"]:
            return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature", "requires_conversion": True}

        # Default
        else:
            return {"standard_unit": unit, "conversion_factor": 1.0, "unit_type": "unknown"}
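
    # Example: _standardize_unit("Wh") returns conversion_factor 0.001, so a
    # 1500 Wh reading becomes 1.5 kWh downstream. Fahrenheit is flagged with
    # requires_conversion because °F→°C is affine, not a pure scale factor.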

    async def _calculate_derived_metrics(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Calculate derived metrics from the record"""
        derived = {}

        try:
            value = float(record.get("value", 0))
            unit_info = record.get("unit_info", {})

            # Apply unit conversion if needed
            if unit_info.get("conversion_factor", 1.0) != 1.0:
                derived["original_value"] = value
                derived["converted_value"] = value * unit_info["conversion_factor"]

            # Energy-specific calculations (on the kWh-standardized value)
            if unit_info.get("unit_type") == "energy":
                kwh_value = derived.get("converted_value", value)

                # Estimate cost (simplified)
                cost_per_kwh = 0.12  # Example rate
                derived["estimated_cost"] = kwh_value * cost_per_kwh

                # Estimate CO2 emissions (simplified)
                co2_per_kwh = 0.4  # kg CO2 per kWh (example grid factor)
                derived["estimated_co2_kg"] = kwh_value * co2_per_kwh

            # Add value range classification
            derived["value_range"] = await self._classify_value_range(value, unit_info.get("unit_type"))

        except Exception as e:
            logger.debug(f"Error calculating derived metrics: {e}")

        return derived
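
    # Worked example with the illustrative rates above: a 10 kWh reading
    # yields estimated_cost = 10 * 0.12 = 1.20 and
    # estimated_co2_kg = 10 * 0.4 = 4.0. Both constants are placeholders,
    # not live tariff or grid-intensity data.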

    async def _classify_value_range(self, value: float, unit_type: Optional[str]) -> str:
        """Classify value into ranges for better understanding"""
        if unit_type == "energy":
            if value < 1:
                return "very_low"
            elif value < 10:
                return "low"
            elif value < 50:
                return "medium"
            elif value < 200:
                return "high"
            else:
                return "very_high"
        elif unit_type == "power":
            if value < 0.5:
                return "very_low"
            elif value < 5:
                return "low"
            elif value < 20:
                return "medium"
            elif value < 100:
                return "high"
            else:
                return "very_high"
        else:
            return "unknown"

    async def _enrich_with_context(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
        """Enrich record with contextual information"""
        context = {}

        try:
            # Add geographical context if available
            context["data_source"] = "real_community"
            context["source_type"] = "ftp_ingestion"

            # Add data freshness (both sides in UTC to avoid timezone skew)
            ingestion_time = datetime.utcnow()
            data_time = datetime.utcfromtimestamp(record["timestamp"])
            context["data_age_minutes"] = (ingestion_time - data_time).total_seconds() / 60

            # Classify data freshness
            if context["data_age_minutes"] < 15:
                context["freshness"] = "real_time"
            elif context["data_age_minutes"] < 60:
                context["freshness"] = "near_real_time"
            elif context["data_age_minutes"] < 1440:  # 24 hours
                context["freshness"] = "recent"
            else:
                context["freshness"] = "historical"

        except Exception as e:
            logger.debug(f"Error adding context: {e}")

        return context

    async def _extract_temporal_features(self, timestamp: int) -> Dict[str, Any]:
        """Extract temporal features from timestamp"""
        dt = datetime.fromtimestamp(timestamp)

        return {
            "hour": dt.hour,
            "day_of_week": dt.weekday(),
            "day_of_month": dt.day,
            "month": dt.month,
            "quarter": (dt.month - 1) // 3 + 1,
            "is_weekend": dt.weekday() >= 5,
            "is_business_hours": 8 <= dt.hour <= 17,
            "season": self._get_season(dt.month)
        }

    def _get_season(self, month: int) -> str:
        """Get season from month"""
        if month in [12, 1, 2]:
            return "winter"
        elif month in [3, 4, 5]:
            return "spring"
        elif month in [6, 7, 8]:
            return "summer"
        else:
            return "autumn"

    async def _enrich_energy_data(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Add energy-specific enrichments"""
        enrichment = {}

        try:
            value = record.get("derived_metrics", {}).get("converted_value", record.get("value", 0))
            temporal = record.get("temporal", {})

            # Energy usage patterns
            if temporal.get("is_business_hours"):
                enrichment["usage_pattern"] = "business_hours"
            elif temporal.get("is_weekend"):
                enrichment["usage_pattern"] = "weekend"
            else:
                enrichment["usage_pattern"] = "off_hours"

            # Demand classification
            if value > 100:
                enrichment["demand_level"] = "high"
            elif value > 50:
                enrichment["demand_level"] = "medium"
            elif value > 10:
                enrichment["demand_level"] = "low"
            else:
                enrichment["demand_level"] = "minimal"

            # Peak/off-peak classification
            hour = temporal.get("hour", 0)
            if 17 <= hour <= 21:  # Evening peak
                enrichment["tariff_period"] = "peak"
            elif hour >= 22 or hour <= 6:  # Night off-peak (wraps midnight)
                enrichment["tariff_period"] = "off_peak"
            else:
                enrichment["tariff_period"] = "standard"

        except Exception as e:
            logger.debug(f"Error enriching energy data: {e}")

        return enrichment
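
    # The off-peak window wraps midnight, so it needs `hour >= 22 or
    # hour <= 6`; a chained `22 <= hour <= 6` comparison can never be true.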

    async def _calculate_quality_scores(self, data: List[Dict[str, Any]], quality_report: Dict[str, Any]) -> Dict[str, float]:
        """Calculate overall quality scores"""
        if not data:
            return {"overall": 0.0, "completeness": 0.0, "accuracy": 0.0, "consistency": 0.0, "timeliness": 0.0}

        # Completeness score
        total_expected_fields = len(data) * 4  # sensor_id, timestamp, value, unit
        total_present_fields = sum(1 for record in data
                                   for field in ["sensor_id", "timestamp", "value", "unit"]
                                   if record.get(field) is not None)
        completeness = total_present_fields / total_expected_fields if total_expected_fields > 0 else 0.0

        # Accuracy score (based on validation scores)
        accuracy_scores = [record.get("data_quality", {}).get("quality_score", 0) for record in data]
        accuracy = statistics.mean(accuracy_scores) if accuracy_scores else 0.0

        # Consistency score (coefficient of variation for quality scores)
        if len(accuracy_scores) > 1:
            std_dev = statistics.stdev(accuracy_scores)
            mean_score = statistics.mean(accuracy_scores)
            consistency = 1.0 - (std_dev / mean_score) if mean_score > 0 else 0.0
        else:
            consistency = 1.0

        # Timeliness score (based on data age)
        current_time = datetime.utcnow().timestamp()
        ages = [(current_time - record.get("timestamp", current_time)) / 3600 for record in data]  # age in hours
        avg_age = statistics.mean(ages) if ages else 0
        timeliness = max(0.0, 1.0 - (avg_age / 24))  # Decrease score as data gets older than 24 hours

        # Overall score
        overall = statistics.mean([completeness, accuracy, consistency, timeliness])

        return {
            "overall": round(overall, 3),
            "completeness": round(completeness, 3),
            "accuracy": round(accuracy, 3),
            "consistency": round(consistency, 3),
            "timeliness": round(timeliness, 3)
        }
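
    # Consistency is 1 minus the coefficient of variation (stdev/mean) of
    # the per-record quality scores; timeliness decays linearly from 1.0 to
    # 0.0 as the average record age grows from 0 to 24 hours.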

    async def _store_quality_report(self, quality_report: Dict[str, Any], source_name: str):
        """Store quality report in database"""
        try:
            quality_report["_id"] = f"{source_name}_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
            await self.db.quality_reports.insert_one(quality_report)

            # Also cache in Redis for quick access
            cache_key = f"quality_report:{source_name}:latest"
            await self.redis.setex(cache_key, 3600, json.dumps(quality_report, default=str))

        except Exception as e:
            logger.error(f"Error storing quality report: {e}")

    async def _load_validation_rules(self):
        """Load validation rules configuration"""
        # Default validation rules
        self.validation_rules = {
            "energy": {
                "min_value": 0,
                "max_value": 100000,
                "required_precision": 0.01
            },
            "power": {
                "min_value": 0,
                "max_value": 50000,
                "required_precision": 0.1
            },
            "temperature": {
                "min_value": -50,
                "max_value": 100,
                "required_precision": 0.1
            }
        }

        logger.info("Loaded default validation rules")

    async def _load_enrichment_metadata(self):
        """Load enrichment metadata"""
        # Load any cached enrichment data
        try:
            cache_keys = []
            async for key in self.redis.scan_iter(match="enrichment:*"):
                cache_keys.append(key)

            logger.info(f"Loaded {len(cache_keys)} enrichment cache entries")

        except Exception as e:
            logger.debug(f"Error loading enrichment metadata: {e}")

    async def get_quality_summary(self, source_name: Optional[str] = None) -> Dict[str, Any]:
        """Get quality summary for sources"""
        try:
            match_filter = {"source": source_name} if source_name else {}

            # Get recent quality reports
            cursor = self.db.quality_reports.find(match_filter).sort("processing_time", -1).limit(50)

            reports = []
            async for report in cursor:
                report["_id"] = str(report["_id"])
                reports.append(report)

            if not reports:
                return {"message": "No quality reports found"}

            # Calculate summary statistics
            avg_quality = statistics.mean([r["quality_scores"]["overall"] for r in reports])
            total_processed = sum([r["processed_records"] for r in reports])
            total_rejected = sum([r["rejected_records"] for r in reports])

            return {
                "total_reports": len(reports),
                "average_quality": round(avg_quality, 3),
                "total_processed_records": total_processed,
                "total_rejected_records": total_rejected,
                "success_rate": round(total_processed / (total_processed + total_rejected) * 100, 2) if (total_processed + total_rejected) > 0 else 0,
                "latest_report": reports[0] if reports else None
            }

        except Exception as e:
            logger.error(f"Error getting quality summary: {e}")
            return {"error": str(e)}