""" Monitoring and alerting system for the data ingestion service. Handles error tracking, performance monitoring, and alert generation. """ import asyncio import logging from datetime import datetime, timedelta from typing import List, Dict, Any, Optional import json import traceback import uuid from collections import defaultdict, deque import time import psutil import os logger = logging.getLogger(__name__) class PerformanceMonitor: """Monitors service performance metrics""" def __init__(self, redis_client): self.redis = redis_client self.metrics_buffer = defaultdict(deque) self.max_buffer_size = 1000 self.last_flush = datetime.utcnow() self.flush_interval = 60 # seconds # Performance counters self.request_count = 0 self.error_count = 0 self.processing_times = deque(maxlen=100) self.memory_usage = deque(maxlen=100) self.cpu_usage = deque(maxlen=100) async def record_request(self, endpoint: str, duration: float, success: bool = True): """Record request metrics""" try: self.request_count += 1 if not success: self.error_count += 1 self.processing_times.append(duration) # Store in buffer metric_data = { "timestamp": datetime.utcnow().isoformat(), "endpoint": endpoint, "duration_ms": duration * 1000, "success": success, "request_id": str(uuid.uuid4()) } self.metrics_buffer["requests"].append(metric_data) # Trim buffer if needed if len(self.metrics_buffer["requests"]) > self.max_buffer_size: self.metrics_buffer["requests"].popleft() # Auto-flush if interval exceeded if (datetime.utcnow() - self.last_flush).seconds > self.flush_interval: await self.flush_metrics() except Exception as e: logger.error(f"Error recording request metric: {e}") async def record_system_metrics(self): """Record system-level performance metrics""" try: # CPU usage cpu_percent = psutil.cpu_percent() self.cpu_usage.append(cpu_percent) # Memory usage process = psutil.Process() memory_info = process.memory_info() memory_mb = memory_info.rss / 1024 / 1024 self.memory_usage.append(memory_mb) # Disk usage disk_usage = psutil.disk_usage('/') system_metrics = { "timestamp": datetime.utcnow().isoformat(), "cpu_percent": cpu_percent, "memory_mb": memory_mb, "disk_free_gb": disk_usage.free / 1024 / 1024 / 1024, "disk_percent": (disk_usage.used / disk_usage.total) * 100 } self.metrics_buffer["system"].append(system_metrics) # Trim buffer if len(self.metrics_buffer["system"]) > self.max_buffer_size: self.metrics_buffer["system"].popleft() except Exception as e: logger.error(f"Error recording system metrics: {e}") async def record_data_processing_metrics(self, source_name: str, files_processed: int, records_processed: int, processing_time: float): """Record data processing performance metrics""" try: processing_metrics = { "timestamp": datetime.utcnow().isoformat(), "source_name": source_name, "files_processed": files_processed, "records_processed": records_processed, "processing_time_seconds": processing_time, "records_per_second": records_processed / max(processing_time, 0.001), "files_per_hour": files_processed * 3600 / max(processing_time, 0.001) } self.metrics_buffer["processing"].append(processing_metrics) # Trim buffer if len(self.metrics_buffer["processing"]) > self.max_buffer_size: self.metrics_buffer["processing"].popleft() except Exception as e: logger.error(f"Error recording processing metrics: {e}") async def flush_metrics(self): """Flush metrics buffer to Redis""" try: if not self.metrics_buffer: return # Create batch update pipe = self.redis.pipeline() for metric_type, metrics in self.metrics_buffer.items(): # Convert deque to list and serialize metrics_data = [dict(m) if isinstance(m, dict) else m for m in metrics] # Store in Redis with timestamp key timestamp_key = datetime.utcnow().strftime("%Y%m%d_%H%M") redis_key = f"metrics:{metric_type}:{timestamp_key}" pipe.lpush(redis_key, json.dumps(metrics_data)) pipe.expire(redis_key, 86400 * 7) # Keep for 7 days await pipe.execute() # Clear buffer self.metrics_buffer.clear() self.last_flush = datetime.utcnow() logger.debug("Performance metrics flushed to Redis") except Exception as e: logger.error(f"Error flushing metrics: {e}") async def get_performance_summary(self) -> Dict[str, Any]: """Get current performance summary""" try: return { "request_count": self.request_count, "error_count": self.error_count, "error_rate": (self.error_count / max(self.request_count, 1)) * 100, "avg_processing_time_ms": sum(self.processing_times) / max(len(self.processing_times), 1) * 1000, "current_memory_mb": self.memory_usage[-1] if self.memory_usage else 0, "current_cpu_percent": self.cpu_usage[-1] if self.cpu_usage else 0, "metrics_buffer_size": sum(len(buffer) for buffer in self.metrics_buffer.values()), "last_flush": self.last_flush.isoformat() } except Exception as e: logger.error(f"Error getting performance summary: {e}") return {} class ErrorHandler: """Centralized error handling and logging""" def __init__(self, db, redis_client): self.db = db self.redis = redis_client self.error_counts = defaultdict(int) self.error_history = deque(maxlen=100) self.alert_thresholds = { "error_rate": 10, # errors per minute "memory_usage": 500, # MB "cpu_usage": 80, # percent "disk_usage": 90, # percent "response_time": 5000 # milliseconds } async def log_error(self, error: Exception, context: Dict[str, Any] = None, source_id: str = None, source_name: str = None): """Log error with context information""" try: error_type = type(error).__name__ error_message = str(error) stack_trace = traceback.format_exc() # Update error counters self.error_counts[error_type] += 1 # Create error record error_record = { "timestamp": datetime.utcnow(), "service": "data-ingestion-service", "level": "ERROR", "source_id": source_id, "source_name": source_name, "error_type": error_type, "error_message": error_message, "stack_trace": stack_trace, "context": context or {} } # Store in database await self.db.error_logs.insert_one(error_record) # Add to history self.error_history.append({ "timestamp": error_record["timestamp"].isoformat(), "type": error_type, "message": error_message[:100] # Truncate for memory }) # Check for alert conditions await self.check_alert_conditions(error_record) # Log to standard logger logger.error(f"[{source_name or 'system'}] {error_type}: {error_message}", extra={"context": context, "source_id": source_id}) except Exception as e: # Fallback logging if error handler fails logger.critical(f"Error handler failed: {e}") logger.error(f"Original error: {error}") async def log_warning(self, message: str, context: Dict[str, Any] = None, source_id: str = None, source_name: str = None): """Log warning message""" try: warning_record = { "timestamp": datetime.utcnow(), "service": "data-ingestion-service", "level": "WARNING", "source_id": source_id, "source_name": source_name, "error_type": "WARNING", "error_message": message, "context": context or {} } await self.db.error_logs.insert_one(warning_record) logger.warning(f"[{source_name or 'system'}] {message}", extra={"context": context, "source_id": source_id}) except Exception as e: logger.error(f"Error logging warning: {e}") async def check_alert_conditions(self, error_record: Dict[str, Any]): """Check if error conditions warrant alerts""" try: # Count recent errors (last 1 minute) one_minute_ago = datetime.utcnow() - timedelta(minutes=1) recent_errors = await self.db.error_logs.count_documents({ "timestamp": {"$gte": one_minute_ago}, "level": "ERROR" }) # Check error rate threshold if recent_errors >= self.alert_thresholds["error_rate"]: await self.create_alert( alert_type="error_rate", title="High Error Rate Detected", description=f"Detected {recent_errors} errors in the last minute", severity="high", metadata={"error_count": recent_errors, "threshold": self.alert_thresholds["error_rate"]} ) except Exception as e: logger.error(f"Error checking alert conditions: {e}") async def create_alert(self, alert_type: str, title: str, description: str, severity: str, source_id: str = None, metadata: Dict[str, Any] = None): """Create monitoring alert""" try: alert_record = { "alert_id": str(uuid.uuid4()), "alert_type": alert_type, "source_id": source_id, "title": title, "description": description, "severity": severity, "timestamp": datetime.utcnow(), "resolved": False, "metadata": metadata or {} } await self.db.monitoring_alerts.insert_one(alert_record) # Also publish to Redis for real-time notifications alert_notification = { **alert_record, "timestamp": alert_record["timestamp"].isoformat() } await self.redis.publish("alerts:data-ingestion", json.dumps(alert_notification)) logger.warning(f"Alert created: {title} ({severity})") except Exception as e: logger.error(f"Error creating alert: {e}") async def get_error_summary(self) -> Dict[str, Any]: """Get error summary statistics""" try: # Get error counts by type error_types = dict(self.error_counts) # Get recent error rate one_hour_ago = datetime.utcnow() - timedelta(hours=1) recent_errors = await self.db.error_logs.count_documents({ "timestamp": {"$gte": one_hour_ago}, "level": "ERROR" }) # Get recent alerts recent_alerts = await self.db.monitoring_alerts.count_documents({ "timestamp": {"$gte": one_hour_ago}, "resolved": False }) return { "total_errors": sum(error_types.values()), "error_types": error_types, "recent_errors_1h": recent_errors, "active_alerts": recent_alerts, "error_history": list(self.error_history)[-10:], # Last 10 errors "last_error": self.error_history[-1] if self.error_history else None } except Exception as e: logger.error(f"Error getting error summary: {e}") return {} class ServiceMonitor: """Main service monitoring coordinator""" def __init__(self, db, redis_client): self.db = db self.redis = redis_client self.performance_monitor = PerformanceMonitor(redis_client) self.error_handler = ErrorHandler(db, redis_client) self.monitoring_active = False self.monitoring_interval = 30 # seconds async def start_monitoring(self): """Start background monitoring tasks""" self.monitoring_active = True logger.info("Service monitoring started") # Start monitoring loop asyncio.create_task(self._monitoring_loop()) async def stop_monitoring(self): """Stop background monitoring""" self.monitoring_active = False await self.performance_monitor.flush_metrics() logger.info("Service monitoring stopped") async def _monitoring_loop(self): """Main monitoring loop""" while self.monitoring_active: try: # Record system metrics await self.performance_monitor.record_system_metrics() # Check system health await self._check_system_health() # Cleanup old data await self._cleanup_old_monitoring_data() # Wait for next cycle await asyncio.sleep(self.monitoring_interval) except Exception as e: await self.error_handler.log_error(e, {"task": "monitoring_loop"}) await asyncio.sleep(self.monitoring_interval) async def _check_system_health(self): """Check system health and create alerts if needed""" try: # Check memory usage current_memory = self.performance_monitor.memory_usage[-1] if self.performance_monitor.memory_usage else 0 if current_memory > self.error_handler.alert_thresholds["memory_usage"]: await self.error_handler.create_alert( alert_type="high_memory", title="High Memory Usage", description=f"Memory usage at {current_memory:.1f}MB", severity="warning", metadata={"current_memory_mb": current_memory} ) # Check CPU usage current_cpu = self.performance_monitor.cpu_usage[-1] if self.performance_monitor.cpu_usage else 0 if current_cpu > self.error_handler.alert_thresholds["cpu_usage"]: await self.error_handler.create_alert( alert_type="high_cpu", title="High CPU Usage", description=f"CPU usage at {current_cpu:.1f}%", severity="warning", metadata={"current_cpu_percent": current_cpu} ) except Exception as e: logger.error(f"Error checking system health: {e}") async def _cleanup_old_monitoring_data(self): """Clean up old monitoring data""" try: # Clean up old error logs (older than 30 days) thirty_days_ago = datetime.utcnow() - timedelta(days=30) deleted_errors = await self.db.error_logs.delete_many({ "timestamp": {"$lt": thirty_days_ago} }) # Clean up resolved alerts (older than 7 days) seven_days_ago = datetime.utcnow() - timedelta(days=7) deleted_alerts = await self.db.monitoring_alerts.delete_many({ "timestamp": {"$lt": seven_days_ago}, "resolved": True }) if deleted_errors.deleted_count > 0 or deleted_alerts.deleted_count > 0: logger.info(f"Cleaned up {deleted_errors.deleted_count} old error logs and " f"{deleted_alerts.deleted_count} resolved alerts") except Exception as e: logger.error(f"Error cleaning up old monitoring data: {e}") async def get_service_status(self) -> Dict[str, Any]: """Get comprehensive service status""" try: performance_summary = await self.performance_monitor.get_performance_summary() error_summary = await self.error_handler.get_error_summary() # Get database status db_status = await self._get_database_status() # Overall health assessment health_score = await self._calculate_health_score(performance_summary, error_summary) return { "service": "data-ingestion-service", "timestamp": datetime.utcnow().isoformat(), "health_score": health_score, "monitoring_active": self.monitoring_active, "performance": performance_summary, "errors": error_summary, "database": db_status } except Exception as e: logger.error(f"Error getting service status: {e}") return {"error": str(e)} async def _get_database_status(self) -> Dict[str, Any]: """Get database connection and performance status""" try: # Test MongoDB connection start_time = time.time() await self.db.command("ping") mongo_latency = (time.time() - start_time) * 1000 # Test Redis connection start_time = time.time() await self.redis.ping() redis_latency = (time.time() - start_time) * 1000 # Get collection counts collections_info = {} for collection_name in ["data_sources", "processed_files", "error_logs", "monitoring_alerts"]: try: count = await self.db[collection_name].count_documents({}) collections_info[collection_name] = count except: collections_info[collection_name] = "unknown" return { "mongodb": { "connected": True, "latency_ms": round(mongo_latency, 2) }, "redis": { "connected": True, "latency_ms": round(redis_latency, 2) }, "collections": collections_info } except Exception as e: return { "mongodb": {"connected": False, "error": str(e)}, "redis": {"connected": False, "error": str(e)}, "collections": {} } async def _calculate_health_score(self, performance: Dict[str, Any], errors: Dict[str, Any]) -> float: """Calculate overall health score (0-100)""" try: score = 100.0 # Deduct for high error rate error_rate = performance.get("error_rate", 0) if error_rate > 5: score -= min(error_rate * 2, 30) # Deduct for high resource usage memory_mb = performance.get("current_memory_mb", 0) if memory_mb > 300: score -= min((memory_mb - 300) / 10, 20) cpu_percent = performance.get("current_cpu_percent", 0) if cpu_percent > 70: score -= min((cpu_percent - 70) / 2, 15) # Deduct for recent errors recent_errors = errors.get("recent_errors_1h", 0) if recent_errors > 0: score -= min(recent_errors * 5, 25) # Deduct for active alerts active_alerts = errors.get("active_alerts", 0) if active_alerts > 0: score -= min(active_alerts * 10, 20) return max(0.0, round(score, 1)) except Exception as e: logger.error(f"Error calculating health score: {e}") return 50.0 # Default moderate health score # Export monitoring components __all__ = [ 'ServiceMonitor', 'PerformanceMonitor', 'ErrorHandler' ]