Add data-ingestion-service for SA4CPS FTP integration

- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
rafaeldpsilva
2025-09-10 14:43:30 +01:00
parent d4f280de93
commit 5fdce00e5d
16 changed files with 6353 additions and 0 deletions
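
Reviewer note: the ErrorHandler in the file below publishes alert notifications to the Redis channel "alerts:data-ingestion". A minimal subscriber sketch, assuming redis.asyncio and a local Redis instance (illustrative only, not part of this commit):

import asyncio
import json

import redis.asyncio as redis


async def listen_for_alerts(redis_url: str = "redis://localhost:6379/0"):
    # Subscribe to the channel used by ErrorHandler.create_alert below.
    client = redis.from_url(redis_url, decode_responses=True)
    pubsub = client.pubsub()
    await pubsub.subscribe("alerts:data-ingestion")
    async for message in pubsub.listen():
        if message["type"] == "message":
            alert = json.loads(message["data"])
            print(f"[{alert['severity']}] {alert['title']}: {alert['description']}")


if __name__ == "__main__":
    asyncio.run(listen_for_alerts())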


@@ -0,0 +1,545 @@
"""
Monitoring and alerting system for the data ingestion service.
Handles error tracking, performance monitoring, and alert generation.
"""
import asyncio
import logging
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import json
import traceback
import uuid
from collections import defaultdict, deque
import time
import psutil
import os
logger = logging.getLogger(__name__)
class PerformanceMonitor:
    """Monitors service performance metrics"""

    def __init__(self, redis_client):
        self.redis = redis_client
        self.metrics_buffer = defaultdict(deque)
        self.max_buffer_size = 1000
        self.last_flush = datetime.utcnow()
        self.flush_interval = 60  # seconds

        # Performance counters
        self.request_count = 0
        self.error_count = 0
        self.processing_times = deque(maxlen=100)
        self.memory_usage = deque(maxlen=100)
        self.cpu_usage = deque(maxlen=100)

    async def record_request(self, endpoint: str, duration: float, success: bool = True):
        """Record request metrics"""
        try:
            self.request_count += 1
            if not success:
                self.error_count += 1
            self.processing_times.append(duration)

            # Store in buffer
            metric_data = {
                "timestamp": datetime.utcnow().isoformat(),
                "endpoint": endpoint,
                "duration_ms": duration * 1000,
                "success": success,
                "request_id": str(uuid.uuid4())
            }
            self.metrics_buffer["requests"].append(metric_data)

            # Trim buffer if needed
            if len(self.metrics_buffer["requests"]) > self.max_buffer_size:
                self.metrics_buffer["requests"].popleft()

            # Auto-flush if interval exceeded (total_seconds avoids the
            # day-wrapping behaviour of timedelta.seconds)
            if (datetime.utcnow() - self.last_flush).total_seconds() > self.flush_interval:
                await self.flush_metrics()
        except Exception as e:
            logger.error(f"Error recording request metric: {e}")

    async def record_system_metrics(self):
        """Record system-level performance metrics"""
        try:
            # CPU usage
            cpu_percent = psutil.cpu_percent()
            self.cpu_usage.append(cpu_percent)

            # Memory usage
            process = psutil.Process()
            memory_info = process.memory_info()
            memory_mb = memory_info.rss / 1024 / 1024
            self.memory_usage.append(memory_mb)

            # Disk usage
            disk_usage = psutil.disk_usage('/')

            system_metrics = {
                "timestamp": datetime.utcnow().isoformat(),
                "cpu_percent": cpu_percent,
                "memory_mb": memory_mb,
                "disk_free_gb": disk_usage.free / 1024 / 1024 / 1024,
                "disk_percent": (disk_usage.used / disk_usage.total) * 100
            }
            self.metrics_buffer["system"].append(system_metrics)

            # Trim buffer
            if len(self.metrics_buffer["system"]) > self.max_buffer_size:
                self.metrics_buffer["system"].popleft()
        except Exception as e:
            logger.error(f"Error recording system metrics: {e}")

    async def record_data_processing_metrics(self, source_name: str, files_processed: int,
                                             records_processed: int, processing_time: float):
        """Record data processing performance metrics"""
        try:
            processing_metrics = {
                "timestamp": datetime.utcnow().isoformat(),
                "source_name": source_name,
                "files_processed": files_processed,
                "records_processed": records_processed,
                "processing_time_seconds": processing_time,
                "records_per_second": records_processed / max(processing_time, 0.001),
                "files_per_hour": files_processed * 3600 / max(processing_time, 0.001)
            }
            self.metrics_buffer["processing"].append(processing_metrics)

            # Trim buffer
            if len(self.metrics_buffer["processing"]) > self.max_buffer_size:
                self.metrics_buffer["processing"].popleft()
        except Exception as e:
            logger.error(f"Error recording processing metrics: {e}")

    async def flush_metrics(self):
        """Flush metrics buffer to Redis"""
        try:
            if not self.metrics_buffer:
                return

            # Create batch update
            pipe = self.redis.pipeline()
            for metric_type, metrics in self.metrics_buffer.items():
                # Convert deque to list and serialize
                metrics_data = [dict(m) if isinstance(m, dict) else m for m in metrics]

                # Store in Redis with timestamp key
                timestamp_key = datetime.utcnow().strftime("%Y%m%d_%H%M")
                redis_key = f"metrics:{metric_type}:{timestamp_key}"
                pipe.lpush(redis_key, json.dumps(metrics_data))
                pipe.expire(redis_key, 86400 * 7)  # Keep for 7 days
            await pipe.execute()

            # Clear buffer
            self.metrics_buffer.clear()
            self.last_flush = datetime.utcnow()
            logger.debug("Performance metrics flushed to Redis")
        except Exception as e:
            logger.error(f"Error flushing metrics: {e}")

    async def get_performance_summary(self) -> Dict[str, Any]:
        """Get current performance summary"""
        try:
            return {
                "request_count": self.request_count,
                "error_count": self.error_count,
                "error_rate": (self.error_count / max(self.request_count, 1)) * 100,
                "avg_processing_time_ms": sum(self.processing_times) / max(len(self.processing_times), 1) * 1000,
                "current_memory_mb": self.memory_usage[-1] if self.memory_usage else 0,
                "current_cpu_percent": self.cpu_usage[-1] if self.cpu_usage else 0,
                "metrics_buffer_size": sum(len(buffer) for buffer in self.metrics_buffer.values()),
                "last_flush": self.last_flush.isoformat()
            }
        except Exception as e:
            logger.error(f"Error getting performance summary: {e}")
            return {}
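

# Retrieval sketch (illustrative, not part of the committed module): flushed
# metrics land in per-minute Redis lists such as "metrics:requests:20250910_1443",
# where each entry is a JSON-encoded batch of metric dicts.
#
#     key = f"metrics:requests:{datetime.utcnow().strftime('%Y%m%d_%H%M')}"
#     raw_batches = await redis_client.lrange(key, 0, -1)
#     batches = [json.loads(batch) for batch in raw_batches]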
class ErrorHandler:
    """Centralized error handling and logging"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.error_counts = defaultdict(int)
        self.error_history = deque(maxlen=100)
        self.alert_thresholds = {
            "error_rate": 10,      # errors per minute
            "memory_usage": 500,   # MB
            "cpu_usage": 80,       # percent
            "disk_usage": 90,      # percent
            "response_time": 5000  # milliseconds
        }

    async def log_error(self, error: Exception, context: Dict[str, Any] = None,
                        source_id: str = None, source_name: str = None):
        """Log error with context information"""
        try:
            error_type = type(error).__name__
            error_message = str(error)
            stack_trace = traceback.format_exc()

            # Update error counters
            self.error_counts[error_type] += 1

            # Create error record
            error_record = {
                "timestamp": datetime.utcnow(),
                "service": "data-ingestion-service",
                "level": "ERROR",
                "source_id": source_id,
                "source_name": source_name,
                "error_type": error_type,
                "error_message": error_message,
                "stack_trace": stack_trace,
                "context": context or {}
            }

            # Store in database
            await self.db.error_logs.insert_one(error_record)

            # Add to history
            self.error_history.append({
                "timestamp": error_record["timestamp"].isoformat(),
                "type": error_type,
                "message": error_message[:100]  # Truncate for memory
            })

            # Check for alert conditions
            await self.check_alert_conditions(error_record)

            # Log to standard logger
            logger.error(f"[{source_name or 'system'}] {error_type}: {error_message}",
                         extra={"context": context, "source_id": source_id})
        except Exception as e:
            # Fallback logging if error handler fails
            logger.critical(f"Error handler failed: {e}")
            logger.error(f"Original error: {error}")

    async def log_warning(self, message: str, context: Dict[str, Any] = None,
                          source_id: str = None, source_name: str = None):
        """Log warning message"""
        try:
            warning_record = {
                "timestamp": datetime.utcnow(),
                "service": "data-ingestion-service",
                "level": "WARNING",
                "source_id": source_id,
                "source_name": source_name,
                "error_type": "WARNING",
                "error_message": message,
                "context": context or {}
            }
            await self.db.error_logs.insert_one(warning_record)
            logger.warning(f"[{source_name or 'system'}] {message}",
                           extra={"context": context, "source_id": source_id})
        except Exception as e:
            logger.error(f"Error logging warning: {e}")

    async def check_alert_conditions(self, error_record: Dict[str, Any]):
        """Check if error conditions warrant alerts"""
        try:
            # Count recent errors (last 1 minute)
            one_minute_ago = datetime.utcnow() - timedelta(minutes=1)
            recent_errors = await self.db.error_logs.count_documents({
                "timestamp": {"$gte": one_minute_ago},
                "level": "ERROR"
            })

            # Check error rate threshold
            if recent_errors >= self.alert_thresholds["error_rate"]:
                await self.create_alert(
                    alert_type="error_rate",
                    title="High Error Rate Detected",
                    description=f"Detected {recent_errors} errors in the last minute",
                    severity="high",
                    metadata={"error_count": recent_errors, "threshold": self.alert_thresholds["error_rate"]}
                )
        except Exception as e:
            logger.error(f"Error checking alert conditions: {e}")

    async def create_alert(self, alert_type: str, title: str, description: str,
                           severity: str, source_id: str = None, metadata: Dict[str, Any] = None):
        """Create monitoring alert"""
        try:
            alert_record = {
                "alert_id": str(uuid.uuid4()),
                "alert_type": alert_type,
                "source_id": source_id,
                "title": title,
                "description": description,
                "severity": severity,
                "timestamp": datetime.utcnow(),
                "resolved": False,
                "metadata": metadata or {}
            }

            # Build the Redis notification before insert_one mutates the record
            # with a non-JSON-serializable ObjectId (_id)
            alert_notification = {
                **alert_record,
                "timestamp": alert_record["timestamp"].isoformat()
            }

            await self.db.monitoring_alerts.insert_one(alert_record)

            # Also publish to Redis for real-time notifications
            await self.redis.publish("alerts:data-ingestion", json.dumps(alert_notification))
            logger.warning(f"Alert created: {title} ({severity})")
        except Exception as e:
            logger.error(f"Error creating alert: {e}")

    async def get_error_summary(self) -> Dict[str, Any]:
        """Get error summary statistics"""
        try:
            # Get error counts by type
            error_types = dict(self.error_counts)

            # Get recent error rate
            one_hour_ago = datetime.utcnow() - timedelta(hours=1)
            recent_errors = await self.db.error_logs.count_documents({
                "timestamp": {"$gte": one_hour_ago},
                "level": "ERROR"
            })

            # Get recent alerts
            recent_alerts = await self.db.monitoring_alerts.count_documents({
                "timestamp": {"$gte": one_hour_ago},
                "resolved": False
            })

            return {
                "total_errors": sum(error_types.values()),
                "error_types": error_types,
                "recent_errors_1h": recent_errors,
                "active_alerts": recent_alerts,
                "error_history": list(self.error_history)[-10:],  # Last 10 errors
                "last_error": self.error_history[-1] if self.error_history else None
            }
        except Exception as e:
            logger.error(f"Error getting error summary: {e}")
            return {}
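

# Usage sketch (illustrative, not part of the committed module): wrapping a
# processing step so failures are stored with source context. The parser name
# and surrounding variables are placeholders.
#
#     try:
#         records = parse_slg_v2(file_bytes)
#     except Exception as exc:
#         await error_handler.log_error(
#             exc,
#             context={"file": file_name},
#             source_id=source_id,
#             source_name="SA4CPS FTP",
#         )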
class ServiceMonitor:
    """Main service monitoring coordinator"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.performance_monitor = PerformanceMonitor(redis_client)
        self.error_handler = ErrorHandler(db, redis_client)
        self.monitoring_active = False
        self.monitoring_interval = 30  # seconds

    async def start_monitoring(self):
        """Start background monitoring tasks"""
        self.monitoring_active = True
        logger.info("Service monitoring started")

        # Start monitoring loop
        asyncio.create_task(self._monitoring_loop())

    async def stop_monitoring(self):
        """Stop background monitoring"""
        self.monitoring_active = False
        await self.performance_monitor.flush_metrics()
        logger.info("Service monitoring stopped")

    async def _monitoring_loop(self):
        """Main monitoring loop"""
        while self.monitoring_active:
            try:
                # Record system metrics
                await self.performance_monitor.record_system_metrics()

                # Check system health
                await self._check_system_health()

                # Cleanup old data
                await self._cleanup_old_monitoring_data()

                # Wait for next cycle
                await asyncio.sleep(self.monitoring_interval)
            except Exception as e:
                await self.error_handler.log_error(e, {"task": "monitoring_loop"})
                await asyncio.sleep(self.monitoring_interval)

    async def _check_system_health(self):
        """Check system health and create alerts if needed"""
        try:
            # Check memory usage
            current_memory = self.performance_monitor.memory_usage[-1] if self.performance_monitor.memory_usage else 0
            if current_memory > self.error_handler.alert_thresholds["memory_usage"]:
                await self.error_handler.create_alert(
                    alert_type="high_memory",
                    title="High Memory Usage",
                    description=f"Memory usage at {current_memory:.1f}MB",
                    severity="warning",
                    metadata={"current_memory_mb": current_memory}
                )

            # Check CPU usage
            current_cpu = self.performance_monitor.cpu_usage[-1] if self.performance_monitor.cpu_usage else 0
            if current_cpu > self.error_handler.alert_thresholds["cpu_usage"]:
                await self.error_handler.create_alert(
                    alert_type="high_cpu",
                    title="High CPU Usage",
                    description=f"CPU usage at {current_cpu:.1f}%",
                    severity="warning",
                    metadata={"current_cpu_percent": current_cpu}
                )
        except Exception as e:
            logger.error(f"Error checking system health: {e}")

    async def _cleanup_old_monitoring_data(self):
        """Clean up old monitoring data"""
        try:
            # Clean up old error logs (older than 30 days)
            thirty_days_ago = datetime.utcnow() - timedelta(days=30)
            deleted_errors = await self.db.error_logs.delete_many({
                "timestamp": {"$lt": thirty_days_ago}
            })

            # Clean up resolved alerts (older than 7 days)
            seven_days_ago = datetime.utcnow() - timedelta(days=7)
            deleted_alerts = await self.db.monitoring_alerts.delete_many({
                "timestamp": {"$lt": seven_days_ago},
                "resolved": True
            })

            if deleted_errors.deleted_count > 0 or deleted_alerts.deleted_count > 0:
                logger.info(f"Cleaned up {deleted_errors.deleted_count} old error logs and "
                            f"{deleted_alerts.deleted_count} resolved alerts")
        except Exception as e:
            logger.error(f"Error cleaning up old monitoring data: {e}")

    async def get_service_status(self) -> Dict[str, Any]:
        """Get comprehensive service status"""
        try:
            performance_summary = await self.performance_monitor.get_performance_summary()
            error_summary = await self.error_handler.get_error_summary()

            # Get database status
            db_status = await self._get_database_status()

            # Overall health assessment
            health_score = await self._calculate_health_score(performance_summary, error_summary)

            return {
                "service": "data-ingestion-service",
                "timestamp": datetime.utcnow().isoformat(),
                "health_score": health_score,
                "monitoring_active": self.monitoring_active,
                "performance": performance_summary,
                "errors": error_summary,
                "database": db_status
            }
        except Exception as e:
            logger.error(f"Error getting service status: {e}")
            return {"error": str(e)}

    async def _get_database_status(self) -> Dict[str, Any]:
        """Get database connection and performance status"""
        try:
            # Test MongoDB connection
            start_time = time.time()
            await self.db.command("ping")
            mongo_latency = (time.time() - start_time) * 1000

            # Test Redis connection
            start_time = time.time()
            await self.redis.ping()
            redis_latency = (time.time() - start_time) * 1000

            # Get collection counts
            collections_info = {}
            for collection_name in ["data_sources", "processed_files", "error_logs", "monitoring_alerts"]:
                try:
                    count = await self.db[collection_name].count_documents({})
                    collections_info[collection_name] = count
                except Exception:
                    collections_info[collection_name] = "unknown"

            return {
                "mongodb": {
                    "connected": True,
                    "latency_ms": round(mongo_latency, 2)
                },
                "redis": {
                    "connected": True,
                    "latency_ms": round(redis_latency, 2)
                },
                "collections": collections_info
            }
        except Exception as e:
            return {
                "mongodb": {"connected": False, "error": str(e)},
                "redis": {"connected": False, "error": str(e)},
                "collections": {}
            }

    async def _calculate_health_score(self, performance: Dict[str, Any], errors: Dict[str, Any]) -> float:
        """Calculate overall health score (0-100)"""
        try:
            score = 100.0

            # Deduct for high error rate
            error_rate = performance.get("error_rate", 0)
            if error_rate > 5:
                score -= min(error_rate * 2, 30)

            # Deduct for high resource usage
            memory_mb = performance.get("current_memory_mb", 0)
            if memory_mb > 300:
                score -= min((memory_mb - 300) / 10, 20)

            cpu_percent = performance.get("current_cpu_percent", 0)
            if cpu_percent > 70:
                score -= min((cpu_percent - 70) / 2, 15)

            # Deduct for recent errors
            recent_errors = errors.get("recent_errors_1h", 0)
            if recent_errors > 0:
                score -= min(recent_errors * 5, 25)

            # Deduct for active alerts
            active_alerts = errors.get("active_alerts", 0)
            if active_alerts > 0:
                score -= min(active_alerts * 10, 20)

            return max(0.0, round(score, 1))
        except Exception as e:
            logger.error(f"Error calculating health score: {e}")
            return 50.0  # Default moderate health score
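

# Worked example for _calculate_health_score (illustrative): with an error rate
# of 8%, 400 MB memory, 80% CPU, 2 errors in the last hour and 1 active alert,
# the deductions are 16 + 10 + 5 + 10 + 10 = 51, giving a score of 49.0.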
# Export monitoring components
__all__ = [
'ServiceMonitor', 'PerformanceMonitor', 'ErrorHandler'
]
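
A wiring sketch for the coordinator above, assuming motor and redis.asyncio clients and that the file is saved as monitoring.py (illustrative, not part of the committed files):

import asyncio

import redis.asyncio as redis
from motor.motor_asyncio import AsyncIOMotorClient

from monitoring import ServiceMonitor  # module name assumed


async def main():
    # Connection URLs, database name, and endpoint are placeholders for the
    # service's real configuration.
    mongo = AsyncIOMotorClient("mongodb://localhost:27017")
    redis_client = redis.from_url("redis://localhost:6379/0")

    monitor = ServiceMonitor(mongo["data_ingestion"], redis_client)
    await monitor.start_monitoring()
    try:
        # Record one request, report status once, then shut down cleanly.
        await monitor.performance_monitor.record_request("/ingest", duration=0.12, success=True)
        status = await monitor.get_service_status()
        print(status["health_score"], status["database"]["mongodb"])
    finally:
        await monitor.stop_monitoring()


if __name__ == "__main__":
    asyncio.run(main())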