- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
"""
|
|
Monitoring and alerting system for the data ingestion service.
|
|
Handles error tracking, performance monitoring, and alert generation.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Any, Optional
|
|
import json
|
|
import traceback
|
|
import uuid
|
|
from collections import defaultdict, deque
|
|
import time
|
|
import psutil
|
|
import os
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|


class PerformanceMonitor:
    """Monitors service performance metrics."""

    def __init__(self, redis_client):
        self.redis = redis_client
        self.metrics_buffer = defaultdict(deque)
        self.max_buffer_size = 1000
        self.last_flush = datetime.utcnow()
        self.flush_interval = 60  # seconds

        # Performance counters
        self.request_count = 0
        self.error_count = 0
        self.processing_times = deque(maxlen=100)
        self.memory_usage = deque(maxlen=100)
        self.cpu_usage = deque(maxlen=100)

    async def record_request(self, endpoint: str, duration: float, success: bool = True):
        """Record request metrics."""
        try:
            self.request_count += 1
            if not success:
                self.error_count += 1

            self.processing_times.append(duration)

            # Store in buffer
            metric_data = {
                "timestamp": datetime.utcnow().isoformat(),
                "endpoint": endpoint,
                "duration_ms": duration * 1000,
                "success": success,
                "request_id": str(uuid.uuid4())
            }

            self.metrics_buffer["requests"].append(metric_data)

            # Trim buffer if needed
            if len(self.metrics_buffer["requests"]) > self.max_buffer_size:
                self.metrics_buffer["requests"].popleft()

            # Auto-flush if interval exceeded (total_seconds() avoids the
            # day-wrapping behaviour of timedelta.seconds)
            if (datetime.utcnow() - self.last_flush).total_seconds() > self.flush_interval:
                await self.flush_metrics()

        except Exception as e:
            logger.error(f"Error recording request metric: {e}")

    async def record_system_metrics(self):
        """Record system-level performance metrics."""
        try:
            # CPU usage
            cpu_percent = psutil.cpu_percent()
            self.cpu_usage.append(cpu_percent)

            # Memory usage of the current process
            process = psutil.Process()
            memory_info = process.memory_info()
            memory_mb = memory_info.rss / 1024 / 1024
            self.memory_usage.append(memory_mb)

            # Disk usage
            disk_usage = psutil.disk_usage('/')

            system_metrics = {
                "timestamp": datetime.utcnow().isoformat(),
                "cpu_percent": cpu_percent,
                "memory_mb": memory_mb,
                "disk_free_gb": disk_usage.free / 1024 / 1024 / 1024,
                "disk_percent": (disk_usage.used / disk_usage.total) * 100
            }

            self.metrics_buffer["system"].append(system_metrics)

            # Trim buffer
            if len(self.metrics_buffer["system"]) > self.max_buffer_size:
                self.metrics_buffer["system"].popleft()

        except Exception as e:
            logger.error(f"Error recording system metrics: {e}")

    async def record_data_processing_metrics(self, source_name: str, files_processed: int,
                                             records_processed: int, processing_time: float):
        """Record data processing performance metrics."""
        try:
            processing_metrics = {
                "timestamp": datetime.utcnow().isoformat(),
                "source_name": source_name,
                "files_processed": files_processed,
                "records_processed": records_processed,
                "processing_time_seconds": processing_time,
                "records_per_second": records_processed / max(processing_time, 0.001),
                "files_per_hour": files_processed * 3600 / max(processing_time, 0.001)
            }

            self.metrics_buffer["processing"].append(processing_metrics)

            # Trim buffer
            if len(self.metrics_buffer["processing"]) > self.max_buffer_size:
                self.metrics_buffer["processing"].popleft()

        except Exception as e:
            logger.error(f"Error recording processing metrics: {e}")

    async def flush_metrics(self):
        """Flush the metrics buffer to Redis."""
        try:
            if not self.metrics_buffer:
                return

            # Create batch update
            pipe = self.redis.pipeline()

            for metric_type, metrics in self.metrics_buffer.items():
                # Convert deque to list and serialize
                metrics_data = [dict(m) if isinstance(m, dict) else m for m in metrics]

                # Store in Redis with timestamp key
                timestamp_key = datetime.utcnow().strftime("%Y%m%d_%H%M")
                redis_key = f"metrics:{metric_type}:{timestamp_key}"

                pipe.lpush(redis_key, json.dumps(metrics_data))
                pipe.expire(redis_key, 86400 * 7)  # Keep for 7 days

            await pipe.execute()

            # Clear buffer
            self.metrics_buffer.clear()
            self.last_flush = datetime.utcnow()

            logger.debug("Performance metrics flushed to Redis")

        except Exception as e:
            logger.error(f"Error flushing metrics: {e}")

    async def get_performance_summary(self) -> Dict[str, Any]:
        """Get current performance summary."""
        try:
            return {
                "request_count": self.request_count,
                "error_count": self.error_count,
                "error_rate": (self.error_count / max(self.request_count, 1)) * 100,
                "avg_processing_time_ms": sum(self.processing_times) / max(len(self.processing_times), 1) * 1000,
                "current_memory_mb": self.memory_usage[-1] if self.memory_usage else 0,
                "current_cpu_percent": self.cpu_usage[-1] if self.cpu_usage else 0,
                "metrics_buffer_size": sum(len(buffer) for buffer in self.metrics_buffer.values()),
                "last_flush": self.last_flush.isoformat()
            }
        except Exception as e:
            logger.error(f"Error getting performance summary: {e}")
            return {}
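

# A minimal usage sketch (illustrative, not part of the original service):
# wraps an arbitrary coroutine, times it, and feeds the outcome into
# PerformanceMonitor.record_request. The endpoint name and wrapped coroutine
# are hypothetical placeholders.
async def _timed_request_example(monitor: "PerformanceMonitor", endpoint: str, coro):
    start = time.time()
    success = True
    try:
        return await coro
    except Exception:
        success = False
        raise
    finally:
        await monitor.record_request(endpoint, time.time() - start, success=success)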


class ErrorHandler:
    """Centralized error handling and logging."""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.error_counts = defaultdict(int)
        self.error_history = deque(maxlen=100)
        self.alert_thresholds = {
            "error_rate": 10,       # errors per minute
            "memory_usage": 500,    # MB
            "cpu_usage": 80,        # percent
            "disk_usage": 90,       # percent
            "response_time": 5000   # milliseconds
        }

    async def log_error(self, error: Exception, context: Dict[str, Any] = None,
                        source_id: str = None, source_name: str = None):
        """Log an error with context information."""
        try:
            error_type = type(error).__name__
            error_message = str(error)
            stack_trace = traceback.format_exc()

            # Update error counters
            self.error_counts[error_type] += 1

            # Create error record
            error_record = {
                "timestamp": datetime.utcnow(),
                "service": "data-ingestion-service",
                "level": "ERROR",
                "source_id": source_id,
                "source_name": source_name,
                "error_type": error_type,
                "error_message": error_message,
                "stack_trace": stack_trace,
                "context": context or {}
            }

            # Store in database
            await self.db.error_logs.insert_one(error_record)

            # Add to in-memory history
            self.error_history.append({
                "timestamp": error_record["timestamp"].isoformat(),
                "type": error_type,
                "message": error_message[:100]  # Truncate for memory
            })

            # Check for alert conditions
            await self.check_alert_conditions(error_record)

            # Log to standard logger
            logger.error(f"[{source_name or 'system'}] {error_type}: {error_message}",
                         extra={"context": context, "source_id": source_id})

        except Exception as e:
            # Fallback logging if the error handler itself fails
            logger.critical(f"Error handler failed: {e}")
            logger.error(f"Original error: {error}")

    async def log_warning(self, message: str, context: Dict[str, Any] = None,
                          source_id: str = None, source_name: str = None):
        """Log a warning message."""
        try:
            warning_record = {
                "timestamp": datetime.utcnow(),
                "service": "data-ingestion-service",
                "level": "WARNING",
                "source_id": source_id,
                "source_name": source_name,
                "error_type": "WARNING",
                "error_message": message,
                "context": context or {}
            }

            await self.db.error_logs.insert_one(warning_record)
            logger.warning(f"[{source_name or 'system'}] {message}",
                           extra={"context": context, "source_id": source_id})

        except Exception as e:
            logger.error(f"Error logging warning: {e}")

    async def check_alert_conditions(self, error_record: Dict[str, Any]):
        """Check whether error conditions warrant an alert."""
        try:
            # Count recent errors (last 1 minute)
            one_minute_ago = datetime.utcnow() - timedelta(minutes=1)
            recent_errors = await self.db.error_logs.count_documents({
                "timestamp": {"$gte": one_minute_ago},
                "level": "ERROR"
            })

            # Check error rate threshold
            if recent_errors >= self.alert_thresholds["error_rate"]:
                await self.create_alert(
                    alert_type="error_rate",
                    title="High Error Rate Detected",
                    description=f"Detected {recent_errors} errors in the last minute",
                    severity="high",
                    metadata={"error_count": recent_errors, "threshold": self.alert_thresholds["error_rate"]}
                )

        except Exception as e:
            logger.error(f"Error checking alert conditions: {e}")

    async def create_alert(self, alert_type: str, title: str, description: str,
                           severity: str, source_id: str = None, metadata: Dict[str, Any] = None):
        """Create a monitoring alert."""
        try:
            alert_record = {
                "alert_id": str(uuid.uuid4()),
                "alert_type": alert_type,
                "source_id": source_id,
                "title": title,
                "description": description,
                "severity": severity,
                "timestamp": datetime.utcnow(),
                "resolved": False,
                "metadata": metadata or {}
            }

            await self.db.monitoring_alerts.insert_one(alert_record)

            # Also publish to Redis for real-time notifications. Serialize with
            # default=str so that non-JSON values (e.g. the _id that insert_one
            # adds to alert_record) do not break json.dumps.
            alert_notification = {
                **alert_record,
                "timestamp": alert_record["timestamp"].isoformat()
            }

            await self.redis.publish("alerts:data-ingestion", json.dumps(alert_notification, default=str))

            logger.warning(f"Alert created: {title} ({severity})")

        except Exception as e:
            logger.error(f"Error creating alert: {e}")

    async def get_error_summary(self) -> Dict[str, Any]:
        """Get error summary statistics."""
        try:
            # Error counts by type
            error_types = dict(self.error_counts)

            # Recent error rate
            one_hour_ago = datetime.utcnow() - timedelta(hours=1)
            recent_errors = await self.db.error_logs.count_documents({
                "timestamp": {"$gte": one_hour_ago},
                "level": "ERROR"
            })

            # Recent unresolved alerts
            recent_alerts = await self.db.monitoring_alerts.count_documents({
                "timestamp": {"$gte": one_hour_ago},
                "resolved": False
            })

            return {
                "total_errors": sum(error_types.values()),
                "error_types": error_types,
                "recent_errors_1h": recent_errors,
                "active_alerts": recent_alerts,
                "error_history": list(self.error_history)[-10:],  # Last 10 errors
                "last_error": self.error_history[-1] if self.error_history else None
            }

        except Exception as e:
            logger.error(f"Error getting error summary: {e}")
            return {}
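

# A minimal subscriber sketch (illustrative, not part of the original service):
# listens on the "alerts:data-ingestion" channel that create_alert publishes to
# and logs each alert payload. It assumes the Redis client is redis.asyncio
# (redis-py); swap in your own handler where the logger call is.
async def _alert_subscriber_example(redis_client):
    pubsub = redis_client.pubsub()
    await pubsub.subscribe("alerts:data-ingestion")
    async for message in pubsub.listen():
        if message["type"] != "message":
            continue  # skip subscribe confirmations
        alert = json.loads(message["data"])
        logger.info(f"Alert received: {alert.get('title')} ({alert.get('severity')})")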


class ServiceMonitor:
    """Main service monitoring coordinator."""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.performance_monitor = PerformanceMonitor(redis_client)
        self.error_handler = ErrorHandler(db, redis_client)
        self.monitoring_active = False
        self.monitoring_interval = 30  # seconds
        self._monitoring_task = None

    async def start_monitoring(self):
        """Start background monitoring tasks."""
        self.monitoring_active = True
        logger.info("Service monitoring started")

        # Keep a reference to the task so it is not garbage-collected
        self._monitoring_task = asyncio.create_task(self._monitoring_loop())

    async def stop_monitoring(self):
        """Stop background monitoring."""
        self.monitoring_active = False
        await self.performance_monitor.flush_metrics()
        logger.info("Service monitoring stopped")

    async def _monitoring_loop(self):
        """Main monitoring loop."""
        while self.monitoring_active:
            try:
                # Record system metrics
                await self.performance_monitor.record_system_metrics()

                # Check system health
                await self._check_system_health()

                # Clean up old data
                await self._cleanup_old_monitoring_data()

                # Wait for next cycle
                await asyncio.sleep(self.monitoring_interval)

            except Exception as e:
                await self.error_handler.log_error(e, {"task": "monitoring_loop"})
                await asyncio.sleep(self.monitoring_interval)

    async def _check_system_health(self):
        """Check system health and create alerts if needed."""
        try:
            # Check memory usage
            current_memory = self.performance_monitor.memory_usage[-1] if self.performance_monitor.memory_usage else 0
            if current_memory > self.error_handler.alert_thresholds["memory_usage"]:
                await self.error_handler.create_alert(
                    alert_type="high_memory",
                    title="High Memory Usage",
                    description=f"Memory usage at {current_memory:.1f}MB",
                    severity="warning",
                    metadata={"current_memory_mb": current_memory}
                )

            # Check CPU usage
            current_cpu = self.performance_monitor.cpu_usage[-1] if self.performance_monitor.cpu_usage else 0
            if current_cpu > self.error_handler.alert_thresholds["cpu_usage"]:
                await self.error_handler.create_alert(
                    alert_type="high_cpu",
                    title="High CPU Usage",
                    description=f"CPU usage at {current_cpu:.1f}%",
                    severity="warning",
                    metadata={"current_cpu_percent": current_cpu}
                )

        except Exception as e:
            logger.error(f"Error checking system health: {e}")

    async def _cleanup_old_monitoring_data(self):
        """Clean up old monitoring data."""
        try:
            # Delete error logs older than 30 days
            thirty_days_ago = datetime.utcnow() - timedelta(days=30)
            deleted_errors = await self.db.error_logs.delete_many({
                "timestamp": {"$lt": thirty_days_ago}
            })

            # Delete resolved alerts older than 7 days
            seven_days_ago = datetime.utcnow() - timedelta(days=7)
            deleted_alerts = await self.db.monitoring_alerts.delete_many({
                "timestamp": {"$lt": seven_days_ago},
                "resolved": True
            })

            if deleted_errors.deleted_count > 0 or deleted_alerts.deleted_count > 0:
                logger.info(f"Cleaned up {deleted_errors.deleted_count} old error logs and "
                            f"{deleted_alerts.deleted_count} resolved alerts")

        except Exception as e:
            logger.error(f"Error cleaning up old monitoring data: {e}")

    async def get_service_status(self) -> Dict[str, Any]:
        """Get comprehensive service status."""
        try:
            performance_summary = await self.performance_monitor.get_performance_summary()
            error_summary = await self.error_handler.get_error_summary()

            # Database status
            db_status = await self._get_database_status()

            # Overall health assessment
            health_score = await self._calculate_health_score(performance_summary, error_summary)

            return {
                "service": "data-ingestion-service",
                "timestamp": datetime.utcnow().isoformat(),
                "health_score": health_score,
                "monitoring_active": self.monitoring_active,
                "performance": performance_summary,
                "errors": error_summary,
                "database": db_status
            }

        except Exception as e:
            logger.error(f"Error getting service status: {e}")
            return {"error": str(e)}

    async def _get_database_status(self) -> Dict[str, Any]:
        """Get database connection and performance status."""
        try:
            # Test MongoDB connection
            start_time = time.time()
            await self.db.command("ping")
            mongo_latency = (time.time() - start_time) * 1000

            # Test Redis connection
            start_time = time.time()
            await self.redis.ping()
            redis_latency = (time.time() - start_time) * 1000

            # Get collection counts
            collections_info = {}
            for collection_name in ["data_sources", "processed_files", "error_logs", "monitoring_alerts"]:
                try:
                    count = await self.db[collection_name].count_documents({})
                    collections_info[collection_name] = count
                except Exception:
                    collections_info[collection_name] = "unknown"

            return {
                "mongodb": {
                    "connected": True,
                    "latency_ms": round(mongo_latency, 2)
                },
                "redis": {
                    "connected": True,
                    "latency_ms": round(redis_latency, 2)
                },
                "collections": collections_info
            }

        except Exception as e:
            return {
                "mongodb": {"connected": False, "error": str(e)},
                "redis": {"connected": False, "error": str(e)},
                "collections": {}
            }

    async def _calculate_health_score(self, performance: Dict[str, Any], errors: Dict[str, Any]) -> float:
        """Calculate an overall health score (0-100)."""
        try:
            score = 100.0

            # Deduct for high error rate
            error_rate = performance.get("error_rate", 0)
            if error_rate > 5:
                score -= min(error_rate * 2, 30)

            # Deduct for high resource usage
            memory_mb = performance.get("current_memory_mb", 0)
            if memory_mb > 300:
                score -= min((memory_mb - 300) / 10, 20)

            cpu_percent = performance.get("current_cpu_percent", 0)
            if cpu_percent > 70:
                score -= min((cpu_percent - 70) / 2, 15)

            # Deduct for recent errors
            recent_errors = errors.get("recent_errors_1h", 0)
            if recent_errors > 0:
                score -= min(recent_errors * 5, 25)

            # Deduct for active alerts
            active_alerts = errors.get("active_alerts", 0)
            if active_alerts > 0:
                score -= min(active_alerts * 10, 20)

            return max(0.0, round(score, 1))

        except Exception as e:
            logger.error(f"Error calculating health score: {e}")
            return 50.0  # Default moderate health score
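

# A worked example of the scoring above (illustrative, not part of the original
# module). For error_rate=8, memory=350 MB, cpu=75%, 2 recent errors and 1
# active alert, the deductions are 16 + 5 + 2.5 + 10 + 10, giving 56.5.
# Passing None for db/redis is safe because _calculate_health_score uses neither.
async def _example_health_score() -> float:
    monitor = ServiceMonitor(db=None, redis_client=None)
    return await monitor._calculate_health_score(
        {"error_rate": 8, "current_memory_mb": 350, "current_cpu_percent": 75},
        {"recent_errors_1h": 2, "active_alerts": 1},
    )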


# Export monitoring components
__all__ = [
    'ServiceMonitor', 'PerformanceMonitor', 'ErrorHandler'
]
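

# A hedged wiring sketch (illustrative, not part of the original service).
# It assumes the service is backed by motor (async MongoDB driver) and
# redis.asyncio; the connection URLs and the "data_ingestion" database name
# are placeholders.
if __name__ == "__main__":
    import redis.asyncio as aioredis
    from motor.motor_asyncio import AsyncIOMotorClient

    async def _demo():
        db = AsyncIOMotorClient("mongodb://localhost:27017")["data_ingestion"]
        redis_client = aioredis.from_url("redis://localhost:6379", decode_responses=True)
        monitor = ServiceMonitor(db, redis_client)
        await monitor.start_monitoring()
        try:
            # Let one monitoring cycle complete, then print a status snapshot.
            await asyncio.sleep(monitor.monitoring_interval + 5)
            print(json.dumps(await monitor.get_service_status(), indent=2, default=str))
        finally:
            await monitor.stop_monitoring()

    asyncio.run(_demo())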