Simplify data ingestion service

rafaeldpsilva
2025-09-10 15:21:53 +01:00
parent fa694443e7
commit 13556347b0
18 changed files with 826 additions and 1560 deletions


@@ -0,0 +1,710 @@
"""
Data validation and enrichment for time series data.
Provides quality assessment, metadata enrichment, and data transformation capabilities.
"""
import json
import logging
import statistics
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
import hashlib
import re
import math
logger = logging.getLogger(__name__)
class DataValidator:
"""Validates, enriches, and transforms time series data"""
def __init__(self, db, redis_client):
self.db = db
self.redis = redis_client
self.validation_rules = {}
self.enrichment_cache = {}
self.quality_thresholds = {
"completeness": 0.8,
"accuracy": 0.9,
"consistency": 0.85,
"timeliness": 0.9
}
async def initialize(self):
"""Initialize validator with default rules and configurations"""
try:
await self._load_validation_rules()
await self._load_enrichment_metadata()
logger.info("Data validator initialized successfully")
except Exception as e:
logger.error(f"Error initializing data validator: {e}")
raise
async def validate_and_enrich_data(self, data: List[Dict[str, Any]],
source_name: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
"""Validate and enrich time series data, returning processed data and quality report"""
try:
logger.info(f"Validating and enriching {len(data)} records from {source_name}")
# Initialize validation report
quality_report = {
"source": source_name,
"total_records": len(data),
"processed_records": 0,
"rejected_records": 0,
"quality_scores": {},
"issues_found": [],
"processing_time": datetime.utcnow().isoformat()
}
enriched_data = []
# Process each record
for i, record in enumerate(data):
try:
# Validate record
validation_result = await self._validate_record(record, source_name)
if validation_result["is_valid"]:
# Enrich the record
enriched_record = await self._enrich_record(record, source_name, validation_result)
enriched_data.append(enriched_record)
quality_report["processed_records"] += 1
else:
quality_report["rejected_records"] += 1
quality_report["issues_found"].extend(validation_result["issues"])
logger.warning(f"Record {i} rejected: {validation_result['issues']}")
except Exception as e:
logger.error(f"Error processing record {i}: {e}")
quality_report["rejected_records"] += 1
quality_report["issues_found"].append(f"Processing error: {str(e)}")
# Calculate overall quality scores
quality_report["quality_scores"] = await self._calculate_quality_scores(enriched_data, quality_report)
# Store quality report
await self._store_quality_report(quality_report, source_name)
logger.info(f"Validation complete: {quality_report['processed_records']}/{quality_report['total_records']} records processed")
return enriched_data, quality_report
except Exception as e:
logger.error(f"Error in data validation and enrichment: {e}")
raise
async def _validate_record(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
"""Validate a single record against quality rules"""
validation_result = {
"is_valid": True,
"issues": [],
"quality_metrics": {}
}
try:
# Check required fields
required_fields = ["sensor_id", "timestamp", "value"]
for field in required_fields:
if field not in record or record[field] is None:
validation_result["is_valid"] = False
validation_result["issues"].append(f"Missing required field: {field}")
if not validation_result["is_valid"]:
return validation_result
# Validate timestamp
timestamp_validation = await self._validate_timestamp(record["timestamp"])
validation_result["quality_metrics"]["timestamp_quality"] = timestamp_validation["score"]
if not timestamp_validation["is_valid"]:
validation_result["issues"].extend(timestamp_validation["issues"])
# Validate numeric value
value_validation = await self._validate_numeric_value(record["value"], record.get("unit"))
validation_result["quality_metrics"]["value_quality"] = value_validation["score"]
if not value_validation["is_valid"]:
validation_result["issues"].extend(value_validation["issues"])
# Validate sensor ID format
sensor_validation = await self._validate_sensor_id(record["sensor_id"])
validation_result["quality_metrics"]["sensor_id_quality"] = sensor_validation["score"]
if not sensor_validation["is_valid"]:
validation_result["issues"].extend(sensor_validation["issues"])
# Check for duplicates
duplicate_check = await self._check_for_duplicates(record, source_name)
validation_result["quality_metrics"]["uniqueness"] = duplicate_check["score"]
if not duplicate_check["is_unique"]:
validation_result["issues"].extend(duplicate_check["issues"])
# Calculate overall validity
if validation_result["issues"]:
# Allow minor issues but flag major ones
major_issues = [issue for issue in validation_result["issues"]
if "Missing required field" in issue or "Invalid" in issue]
validation_result["is_valid"] = len(major_issues) == 0
except Exception as e:
logger.error(f"Error validating record: {e}")
validation_result["is_valid"] = False
validation_result["issues"].append(f"Validation error: {str(e)}")
return validation_result
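# Illustrative shape of the returned dict (values hypothetical):
# {"is_valid": True, "issues": [],
#  "quality_metrics": {"timestamp_quality": 1.0, "value_quality": 0.9,
#                      "sensor_id_quality": 1.0, "uniqueness": 1.0}}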
async def _enrich_record(self, record: Dict[str, Any], source_name: str,
validation_result: Dict[str, Any]) -> Dict[str, Any]:
"""Enrich a record with additional metadata and derived fields"""
try:
enriched = record.copy()
# Add validation metadata
enriched["data_quality"] = {
"quality_score": statistics.mean(validation_result["quality_metrics"].values()) if validation_result["quality_metrics"] else 0.0,
"quality_metrics": validation_result["quality_metrics"],
"validation_timestamp": datetime.utcnow().isoformat()
}
# Add source information
enriched["source_info"] = {
"source_name": source_name,
"ingestion_time": datetime.utcnow().isoformat(),
"record_id": hashlib.md5(f"{source_name}_{record.get('sensor_id', 'unknown')}_{record.get('timestamp', 0)}".encode()).hexdigest()
}
# Normalize timestamp format
enriched["timestamp"] = await self._normalize_timestamp(record["timestamp"])
enriched["timestamp_iso"] = datetime.fromtimestamp(enriched["timestamp"]).isoformat()
# Infer and enrich sensor type
sensor_type_info = await self._infer_sensor_type(record)
enriched["sensor_type"] = sensor_type_info["type"]
enriched["sensor_category"] = sensor_type_info["category"]
# Add unit standardization
unit_info = await self._standardize_unit(record.get("unit"))
enriched["unit"] = unit_info["standard_unit"]
enriched["unit_info"] = unit_info
# Calculate derived metrics
derived_metrics = await self._calculate_derived_metrics(enriched, source_name)
enriched["derived_metrics"] = derived_metrics
# Add location and context information
context_info = await self._enrich_with_context(enriched, source_name)
enriched["metadata"] = {**enriched.get("metadata", {}), **context_info}
# Add temporal features
temporal_features = await self._extract_temporal_features(enriched["timestamp"])
enriched["temporal"] = temporal_features
# Energy-specific enrichments
if sensor_type_info["category"] == "energy":
energy_enrichment = await self._enrich_energy_data(enriched)
enriched.update(energy_enrichment)
return enriched
except Exception as e:
logger.error(f"Error enriching record: {e}")
return record
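# On success the record gains these top-level keys on top of whatever the
# source supplied: data_quality, source_info, timestamp/timestamp_iso,
# sensor_type, sensor_category, unit/unit_info, derived_metrics, metadata,
# temporal, and (for energy sensors) usage_pattern, demand_level and
# tariff_period.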
async def _validate_timestamp(self, timestamp) -> Dict[str, Any]:
"""Validate timestamp format and reasonableness"""
result = {"is_valid": True, "issues": [], "score": 1.0}
try:
# Convert to numeric timestamp
if isinstance(timestamp, str):
try:
# Try parsing ISO format
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
ts = dt.timestamp()
except (ValueError, TypeError):
# Try parsing as unix timestamp string
ts = float(timestamp)
else:
ts = float(timestamp)
# Check if timestamp is reasonable (not too far in past/future)
current_time = datetime.utcnow().timestamp()
max_age = 365 * 24 * 3600 # 1 year
max_future = 24 * 3600 # 1 day
if ts < current_time - max_age:
result["issues"].append("Timestamp too old (more than 1 year)")
result["score"] -= 0.3
elif ts > current_time + max_future:
result["issues"].append("Timestamp too far in future")
result["score"] -= 0.3
# Check for reasonable precision (sub-millisecond is suspicious for energy data)
if round(ts, 3) != ts:
result["score"] -= 0.1  # Minor issue
except (ValueError, TypeError) as e:
result["is_valid"] = False
result["issues"].append(f"Invalid timestamp format: {e}")
result["score"] = 0.0
return result
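# Examples (hypothetical inputs): "2025-09-10T14:21:53Z" parses via the ISO
# branch; "1694354513" falls through to the unix-string branch; a reading
# stamped two years ago stays valid but loses 0.3 from its score.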
async def _validate_numeric_value(self, value, unit: Optional[str] = None) -> Dict[str, Any]:
"""Validate numeric value reasonableness"""
result = {"is_valid": True, "issues": [], "score": 1.0}
try:
numeric_value = float(value)
# Check for negative values (usually invalid for energy data)
if numeric_value < 0:
result["issues"].append("Negative energy value")
result["score"] -= 0.4
# Check for unreasonably large values
unit_str = (unit or "").lower()
if "wh" in unit_str:
# Energy values
if numeric_value > 100000: # >100kWh seems excessive for single reading
result["issues"].append("Unusually high energy value")
result["score"] -= 0.2
elif "w" in unit_str:
# Power values
if numeric_value > 50000: # >50kW seems excessive
result["issues"].append("Unusually high power value")
result["score"] -= 0.2
# Check for zero values (might indicate sensor issues)
if numeric_value == 0:
result["score"] -= 0.1
# Check for NaN or infinity
if math.isnan(numeric_value) or math.isinf(numeric_value):
result["is_valid"] = False
result["issues"].append("Invalid numeric value (NaN or Infinity)")
result["score"] = 0.0
except (ValueError, TypeError) as e:
result["is_valid"] = False
result["issues"].append(f"Non-numeric value: {e}")
result["score"] = 0.0
return result
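# Examples: value=-1.2 with unit "kWh" scores 0.6 yet remains valid at this
# stage (negative is treated as a minor issue); float("nan") is rejected
# outright with score 0.0.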
async def _validate_sensor_id(self, sensor_id: str) -> Dict[str, Any]:
"""Validate sensor ID format and consistency"""
result = {"is_valid": True, "issues": [], "score": 1.0}
try:
if not isinstance(sensor_id, str) or len(sensor_id) == 0:
result["is_valid"] = False
result["issues"].append("Empty or invalid sensor ID")
result["score"] = 0.0
return result
# Check length
if len(sensor_id) < 3:
result["issues"].append("Very short sensor ID")
result["score"] -= 0.2
elif len(sensor_id) > 50:
result["issues"].append("Very long sensor ID")
result["score"] -= 0.1
# Check for reasonable characters
if not re.match(r'^[a-zA-Z0-9_\-\.]+$', sensor_id):
result["issues"].append("Sensor ID contains unusual characters")
result["score"] -= 0.1
except Exception as e:
result["issues"].append(f"Sensor ID validation error: {e}")
result["score"] -= 0.1
return result
async def _check_for_duplicates(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
"""Check for duplicate records"""
result = {"is_unique": True, "issues": [], "score": 1.0}
try:
# Create record signature
signature = hashlib.md5(
f"{source_name}_{record.get('sensor_id')}_{record.get('timestamp')}_{record.get('value')}".encode()
).hexdigest()
# Check cache for recent duplicates
cache_key = f"record_signature:{signature}"
exists = await self.redis.exists(cache_key)
if exists:
result["is_unique"] = False
result["issues"].append("Duplicate record detected")
result["score"] = 0.0
else:
# Store signature with short expiration (1 hour)
await self.redis.setex(cache_key, 3600, "1")
except Exception as e:
logger.debug(f"Error checking duplicates: {e}")
# Don't fail validation for cache errors
return result
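# Example: two records from the same source with identical sensor_id,
# timestamp and value hash to the same MD5 signature, so the second one seen
# within the one-hour Redis TTL is flagged as a duplicate.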
async def _normalize_timestamp(self, timestamp) -> int:
"""Normalize timestamp to unix timestamp"""
try:
if isinstance(timestamp, str):
try:
# Try ISO format first
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
return int(dt.timestamp())
except (ValueError, TypeError):
# Try as unix timestamp string
return int(float(timestamp))
else:
return int(float(timestamp))
except (ValueError, TypeError):
# Fallback to current time
return int(datetime.utcnow().timestamp())
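# Examples: an ISO string such as "2025-09-10T14:21:53+00:00" becomes its
# integer epoch value; "1694354513.7" truncates to 1694354513; anything
# unparseable silently falls back to the current time.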
async def _infer_sensor_type(self, record: Dict[str, Any]) -> Dict[str, str]:
"""Infer sensor type from record data"""
sensor_id = record.get("sensor_id", "").lower()
unit = (record.get("unit", "") or "").lower()
value = record.get("value", 0)
metadata = record.get("metadata", {})
# Energy sensors
if "wh" in unit or "energy" in sensor_id or "consumption" in sensor_id:
return {"type": "energy", "category": "energy"}
elif "w" in unit and "wh" not in unit:
return {"type": "power", "category": "energy"}
# Environmental sensors
elif "temp" in sensor_id or "°c" in unit or "celsius" in unit:
return {"type": "temperature", "category": "environmental"}
elif "humid" in sensor_id or "%" in unit:
return {"type": "humidity", "category": "environmental"}
elif "co2" in sensor_id or "ppm" in unit:
return {"type": "co2", "category": "environmental"}
# Motion/occupancy sensors
elif "motion" in sensor_id or "occupancy" in sensor_id or ("motion" in str(metadata).lower()):
return {"type": "motion", "category": "occupancy"}
# Generation sensors
elif "generation" in sensor_id or "solar" in sensor_id or "generation" in str(metadata).lower():
return {"type": "generation", "category": "energy"}
# Default to energy if unclear
else:
return {"type": "energy", "category": "energy"}
async def _standardize_unit(self, unit: Optional[str]) -> Dict[str, Any]:
"""Standardize unit format"""
if not unit:
return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}
unit_lower = unit.lower().strip()
# Energy units
if unit_lower in ["kwh", "kw-h", "kw_h"]:
return {"standard_unit": "kWh", "conversion_factor": 1.0, "unit_type": "energy"}
elif unit_lower in ["wh", "w-h", "w_h"]:
return {"standard_unit": "kWh", "conversion_factor": 0.001, "unit_type": "energy"}
elif unit_lower in ["mwh", "mw-h", "mw_h"]:
return {"standard_unit": "kWh", "conversion_factor": 1000.0, "unit_type": "energy"}
# Power units
elif unit_lower in ["kw", "kilowatt", "kilowatts"]:
return {"standard_unit": "kW", "conversion_factor": 1.0, "unit_type": "power"}
elif unit_lower in ["w", "watt", "watts"]:
return {"standard_unit": "kW", "conversion_factor": 0.001, "unit_type": "power"}
elif unit_lower in ["mw", "megawatt", "megawatts"]:
return {"standard_unit": "kW", "conversion_factor": 1000.0, "unit_type": "power"}
# Temperature units
elif unit_lower in ["°c", "celsius", "c"]:
return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature"}
elif unit_lower in ["°f", "fahrenheit", "f"]:
return {"standard_unit": "°C", "conversion_factor": 1.0, "unit_type": "temperature", "requires_conversion": True}
# Default
else:
return {"standard_unit": unit, "conversion_factor": 1.0, "unit_type": "unknown"}
async def _calculate_derived_metrics(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
"""Calculate derived metrics from the record"""
derived = {}
try:
value = float(record.get("value", 0))
unit_info = record.get("unit_info", {})
# Apply unit conversion if needed
if unit_info.get("conversion_factor", 1.0) != 1.0:
derived["original_value"] = value
derived["converted_value"] = value * unit_info["conversion_factor"]
# Energy-specific calculations
if unit_info.get("unit_type") == "energy":
# Estimate cost (simplified)
cost_per_kwh = 0.12 # Example rate
derived["estimated_cost"] = value * cost_per_kwh
# Estimate CO2 emissions (simplified)
co2_per_kwh = 0.4 # kg CO2 per kWh (example grid factor)
derived["estimated_co2_kg"] = value * co2_per_kwh
# Add value range classification
derived["value_range"] = await self._classify_value_range(value, unit_info.get("unit_type"))
except Exception as e:
logger.debug(f"Error calculating derived metrics: {e}")
return derived
async def _classify_value_range(self, value: float, unit_type: str) -> str:
"""Classify value into ranges for better understanding"""
if unit_type == "energy":
if value < 1:
return "very_low"
elif value < 10:
return "low"
elif value < 50:
return "medium"
elif value < 200:
return "high"
else:
return "very_high"
elif unit_type == "power":
if value < 0.5:
return "very_low"
elif value < 5:
return "low"
elif value < 20:
return "medium"
elif value < 100:
return "high"
else:
return "very_high"
else:
return "unknown"
async def _enrich_with_context(self, record: Dict[str, Any], source_name: str) -> Dict[str, Any]:
"""Enrich record with contextual information"""
context = {}
try:
# Add geographical context if available
context["data_source"] = "real_community"
context["source_type"] = "ftp_ingestion"
# Add data freshness
ingestion_time = datetime.utcnow()
# utcfromtimestamp keeps the subtraction against utcnow() in the same timezone
data_time = datetime.utcfromtimestamp(record["timestamp"])
context["data_age_minutes"] = (ingestion_time - data_time).total_seconds() / 60
# Classify data freshness
if context["data_age_minutes"] < 15:
context["freshness"] = "real_time"
elif context["data_age_minutes"] < 60:
context["freshness"] = "near_real_time"
elif context["data_age_minutes"] < 1440: # 24 hours
context["freshness"] = "recent"
else:
context["freshness"] = "historical"
except Exception as e:
logger.debug(f"Error adding context: {e}")
return context
async def _extract_temporal_features(self, timestamp: int) -> Dict[str, Any]:
"""Extract temporal features from timestamp"""
dt = datetime.fromtimestamp(timestamp)
return {
"hour": dt.hour,
"day_of_week": dt.weekday(),
"day_of_month": dt.day,
"month": dt.month,
"quarter": (dt.month - 1) // 3 + 1,
"is_weekend": dt.weekday() >= 5,
"is_business_hours": 8 <= dt.hour <= 17,
"season": self._get_season(dt.month)
}
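# Example: a Wednesday 14:00 reading yields hour=14, day_of_week=2,
# is_weekend=False, is_business_hours=True and, in July, season="summer".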
def _get_season(self, month: int) -> str:
"""Get season from month"""
if month in [12, 1, 2]:
return "winter"
elif month in [3, 4, 5]:
return "spring"
elif month in [6, 7, 8]:
return "summer"
else:
return "autumn"
async def _enrich_energy_data(self, record: Dict[str, Any]) -> Dict[str, Any]:
"""Add energy-specific enrichments"""
enrichment = {}
try:
value = record.get("derived_metrics", {}).get("converted_value", record.get("value", 0))
temporal = record.get("temporal", {})
# Energy usage patterns
if temporal.get("is_business_hours"):
enrichment["usage_pattern"] = "business_hours"
elif temporal.get("is_weekend"):
enrichment["usage_pattern"] = "weekend"
else:
enrichment["usage_pattern"] = "off_hours"
# Demand classification
if value > 100:
enrichment["demand_level"] = "high"
elif value > 50:
enrichment["demand_level"] = "medium"
elif value > 10:
enrichment["demand_level"] = "low"
else:
enrichment["demand_level"] = "minimal"
# Peak/off-peak classification
hour = temporal.get("hour", 0)
if 17 <= hour <= 21: # Evening peak
enrichment["tariff_period"] = "peak"
elif hour >= 22 or hour <= 6: # Night off-peak (wraps past midnight)
enrichment["tariff_period"] = "off_peak"
else:
enrichment["tariff_period"] = "standard"
except Exception as e:
logger.debug(f"Error enriching energy data: {e}")
return enrichment
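# Example: a 14:00 weekday reading of 60 kWh maps to usage_pattern
# "business_hours", demand_level "medium" and tariff_period "standard".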
async def _calculate_quality_scores(self, data: List[Dict[str, Any]], quality_report: Dict[str, Any]) -> Dict[str, float]:
"""Calculate overall quality scores"""
if not data:
return {"overall": 0.0, "completeness": 0.0, "accuracy": 0.0, "consistency": 0.0, "timeliness": 0.0}
# Completeness score
total_expected_fields = len(data) * 4 # sensor_id, timestamp, value, unit
total_present_fields = sum(1 for record in data
for field in ["sensor_id", "timestamp", "value", "unit"]
if record.get(field) is not None)
completeness = total_present_fields / total_expected_fields if total_expected_fields > 0 else 0.0
# Accuracy score (based on validation scores)
accuracy_scores = [record.get("data_quality", {}).get("quality_score", 0) for record in data]
accuracy = statistics.mean(accuracy_scores) if accuracy_scores else 0.0
# Consistency score (coefficient of variation for quality scores)
if len(accuracy_scores) > 1:
std_dev = statistics.stdev(accuracy_scores)
mean_score = statistics.mean(accuracy_scores)
consistency = max(0.0, 1.0 - (std_dev / mean_score)) if mean_score > 0 else 0.0
else:
consistency = 1.0
# Timeliness score (based on data age)
current_time = datetime.utcnow().timestamp()
ages = [(current_time - record.get("timestamp", current_time)) / 3600 for record in data] # age in hours
avg_age = statistics.mean(ages) if ages else 0
timeliness = max(0.0, 1.0 - (avg_age / 24)) # Decrease score as data gets older than 24 hours
# Overall score
overall = statistics.mean([completeness, accuracy, consistency, timeliness])
return {
"overall": round(overall, 3),
"completeness": round(completeness, 3),
"accuracy": round(accuracy, 3),
"consistency": round(consistency, 3),
"timeliness": round(timeliness, 3)
}
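# Worked example (hypothetical batch): completeness=1.0, accuracy=0.9,
# consistency=0.9 and timeliness=0.8 give overall = mean(1.0, 0.9, 0.9, 0.8)
# = 0.9.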
async def _store_quality_report(self, quality_report: Dict[str, Any], source_name: str):
"""Store quality report in database"""
try:
quality_report["_id"] = f"{source_name}_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}"
await self.db.quality_reports.insert_one(quality_report)
# Also cache in Redis for quick access
cache_key = f"quality_report:{source_name}:latest"
await self.redis.setex(cache_key, 3600, json.dumps(quality_report, default=str))
except Exception as e:
logger.error(f"Error storing quality report: {e}")
async def _load_validation_rules(self):
"""Load validation rules configuration"""
# Default validation rules
self.validation_rules = {
"energy": {
"min_value": 0,
"max_value": 100000,
"required_precision": 0.01
},
"power": {
"min_value": 0,
"max_value": 50000,
"required_precision": 0.1
},
"temperature": {
"min_value": -50,
"max_value": 100,
"required_precision": 0.1
}
}
logger.info("Loaded default validation rules")
async def _load_enrichment_metadata(self):
"""Load enrichment metadata"""
# Load any cached enrichment data
try:
cache_keys = []
async for key in self.redis.scan_iter(match="enrichment:*"):
cache_keys.append(key)
logger.info(f"Loaded {len(cache_keys)} enrichment cache entries")
except Exception as e:
logger.debug(f"Error loading enrichment metadata: {e}")
async def get_quality_summary(self, source_name: Optional[str] = None) -> Dict[str, Any]:
"""Get quality summary for sources"""
try:
match_filter = {"source": source_name} if source_name else {}
# Get recent quality reports
cursor = self.db.quality_reports.find(match_filter).sort("processing_time", -1).limit(50)
reports = []
async for report in cursor:
report["_id"] = str(report["_id"])
reports.append(report)
if not reports:
return {"message": "No quality reports found"}
# Calculate summary statistics
avg_quality = statistics.mean([r["quality_scores"]["overall"] for r in reports])
total_processed = sum([r["processed_records"] for r in reports])
total_rejected = sum([r["rejected_records"] for r in reports])
return {
"total_reports": len(reports),
"average_quality": round(avg_quality, 3),
"total_processed_records": total_processed,
"total_rejected_records": total_rejected,
"success_rate": round(total_processed / (total_processed + total_rejected) * 100, 2) if (total_processed + total_rejected) > 0 else 0,
"latest_report": reports[0] if reports else None
}
except Exception as e:
logger.error(f"Error getting quality summary: {e}")
return {"error": str(e)}