""" Data models for the data ingestion service. Defines Pydantic models for request/response validation and database schemas. """ from pydantic import BaseModel, Field, validator from typing import List, Dict, Any, Optional, Union from datetime import datetime from enum import Enum class DataFormat(str, Enum): """Supported data formats for ingestion""" CSV = "csv" JSON = "json" TXT = "txt" EXCEL = "excel" XML = "xml" SLG_V2 = "slg_v2" class SourceStatus(str, Enum): """Status of a data source""" ACTIVE = "active" INACTIVE = "inactive" ERROR = "error" MAINTENANCE = "maintenance" class FTPConfig(BaseModel): """FTP server configuration""" host: str port: int = Field(default=21, ge=1, le=65535) username: str = "anonymous" password: str = "" use_ssl: bool = False passive_mode: bool = True remote_path: str = "/" timeout: int = Field(default=30, ge=5, le=300) @validator('host') def validate_host(cls, v): if not v or len(v.strip()) == 0: raise ValueError('Host cannot be empty') return v.strip() class TopicConfig(BaseModel): """Redis topic configuration""" topic_name: str description: str = "" data_types: List[str] = Field(default_factory=lambda: ["all"]) format: str = "sensor_reading" enabled: bool = True class DataSourceCreate(BaseModel): """Request model for creating a new data source""" name: str = Field(..., min_length=1, max_length=100) description: str = "" source_type: str = Field(default="ftp", regex="^(ftp|sftp|http|https)$") ftp_config: FTPConfig file_patterns: List[str] = Field(default_factory=lambda: ["*.csv"]) data_format: DataFormat = DataFormat.CSV topics: List[TopicConfig] = Field(default_factory=list) polling_interval_minutes: int = Field(default=5, ge=1, le=1440) max_file_size_mb: int = Field(default=100, ge=1, le=1000) enabled: bool = True class DataSourceUpdate(BaseModel): """Request model for updating a data source""" name: Optional[str] = Field(None, min_length=1, max_length=100) description: Optional[str] = None ftp_config: Optional[FTPConfig] = None file_patterns: Optional[List[str]] = None data_format: Optional[DataFormat] = None topics: Optional[List[TopicConfig]] = None polling_interval_minutes: Optional[int] = Field(None, ge=1, le=1440) max_file_size_mb: Optional[int] = Field(None, ge=1, le=1000) enabled: Optional[bool] = None class DataSourceResponse(BaseModel): """Response model for data source information""" id: str name: str description: str source_type: str ftp_config: FTPConfig file_patterns: List[str] data_format: DataFormat topics: List[TopicConfig] polling_interval_minutes: int max_file_size_mb: int enabled: bool status: SourceStatus created_at: datetime updated_at: datetime last_check: Optional[datetime] = None last_success: Optional[datetime] = None error_count: int = 0 total_files_processed: int = 0 class Config: json_encoders = { datetime: lambda v: v.isoformat() } class FileProcessingRequest(BaseModel): """Request model for manual file processing""" source_id: str filename: str force_reprocess: bool = False class FileProcessingResponse(BaseModel): """Response model for file processing results""" success: bool message: str records_processed: int records_rejected: int processing_time_seconds: float file_size_bytes: int topics_published: List[str] class IngestionStats(BaseModel): """Response model for ingestion statistics""" files_processed_today: int records_processed_today: int active_sources: int total_sources: int average_processing_time: float success_rate_percentage: float last_24h_volume_mb: float class QualityMetrics(BaseModel): """Data quality 
metrics""" completeness: float = Field(..., ge=0.0, le=1.0) accuracy: float = Field(..., ge=0.0, le=1.0) consistency: float = Field(..., ge=0.0, le=1.0) timeliness: float = Field(..., ge=0.0, le=1.0) overall: float = Field(..., ge=0.0, le=1.0) class QualityReport(BaseModel): """Data quality report""" source: str total_records: int processed_records: int rejected_records: int quality_scores: QualityMetrics issues_found: List[str] processing_time: datetime class Config: json_encoders = { datetime: lambda v: v.isoformat() } class HealthStatus(BaseModel): """Service health status""" status: str timestamp: datetime uptime_seconds: float active_sources: int total_processed_files: int redis_connected: bool mongodb_connected: bool last_error: Optional[str] = None class Config: json_encoders = { datetime: lambda v: v.isoformat() } class SensorReading(BaseModel): """Individual sensor reading model""" sensor_id: str timestamp: Union[int, float, str] value: Union[int, float] unit: Optional[str] = None metadata: Dict[str, Any] = Field(default_factory=dict) class ProcessedFile(BaseModel): """Processed file record""" source_id: str source_name: str filename: str file_signature: str file_size: int modified_time: datetime processed_at: datetime class TopicInfo(BaseModel): """Topic information response""" topic_name: str description: str data_types: List[str] format: str message_count: int last_published: Optional[datetime] = None created_at: datetime class Config: json_encoders = { datetime: lambda v: v.isoformat() } class PublishingStats(BaseModel): """Publishing statistics response""" total_messages_published: int active_topics: int topic_stats: Dict[str, int] last_updated: datetime class Config: json_encoders = { datetime: lambda v: v.isoformat() } class ErrorLog(BaseModel): """Error logging model""" service: str = "data-ingestion-service" timestamp: datetime level: str source_id: Optional[str] = None source_name: Optional[str] = None error_type: str error_message: str stack_trace: Optional[str] = None context: Dict[str, Any] = Field(default_factory=dict) class Config: json_encoders = { datetime: lambda v: v.isoformat() } class MonitoringAlert(BaseModel): """Monitoring alert model""" alert_id: str alert_type: str # "error", "warning", "info" source_id: Optional[str] = None title: str description: str severity: str = Field(..., regex="^(low|medium|high|critical)$") timestamp: datetime resolved: bool = False resolved_at: Optional[datetime] = None metadata: Dict[str, Any] = Field(default_factory=dict) class Config: json_encoders = { datetime: lambda v: v.isoformat() } # Database schema definitions for MongoDB collections class DataSourceSchema: """MongoDB schema for data sources""" collection_name = "data_sources" @staticmethod def get_indexes(): return [ {"keys": [("name", 1)], "unique": True}, {"keys": [("status", 1)]}, {"keys": [("enabled", 1)]}, {"keys": [("created_at", -1)]}, {"keys": [("last_check", -1)]} ] class ProcessedFileSchema: """MongoDB schema for processed files""" collection_name = "processed_files" @staticmethod def get_indexes(): return [ {"keys": [("source_id", 1), ("file_signature", 1)], "unique": True}, {"keys": [("processed_at", -1)]}, {"keys": [("source_name", 1)]}, {"keys": [("filename", 1)]} ] class QualityReportSchema: """MongoDB schema for quality reports""" collection_name = "quality_reports" @staticmethod def get_indexes(): return [ {"keys": [("source", 1)]}, {"keys": [("processing_time", -1)]}, {"keys": [("quality_scores.overall", -1)]} ] class IngestionStatsSchema: """MongoDB 

class IngestionStatsSchema:
    """MongoDB schema for ingestion statistics"""
    collection_name = "ingestion_stats"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("date", 1)], "unique": True},
            {"keys": [("timestamp", -1)]}
        ]


class ErrorLogSchema:
    """MongoDB schema for error logs"""
    collection_name = "error_logs"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("error_type", 1)]},
            {"keys": [("level", 1)]}
        ]


class MonitoringAlertSchema:
    """MongoDB schema for monitoring alerts"""
    collection_name = "monitoring_alerts"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("alert_id", 1)], "unique": True},
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("alert_type", 1)]},
            {"keys": [("resolved", 1)]}
        ]


# Validation helpers

def validate_timestamp(timestamp: Union[int, float, str]) -> int:
    """Validate and convert a timestamp to a unix timestamp (seconds)."""
    if isinstance(timestamp, str):
        try:
            # Try ISO 8601 format first
            dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            return int(dt.timestamp())
        except ValueError:
            try:
                # Fall back to a unix timestamp passed as a string
                return int(float(timestamp))
            except ValueError:
                raise ValueError(f"Invalid timestamp format: {timestamp}")
    elif isinstance(timestamp, (int, float)):
        return int(timestamp)
    else:
        raise ValueError(f"Timestamp must be int, float, or string, got {type(timestamp)}")


def validate_sensor_id(sensor_id: str) -> str:
    """Validate sensor ID format"""
    if not isinstance(sensor_id, str) or len(sensor_id.strip()) == 0:
        raise ValueError("Sensor ID must be a non-empty string")

    # Remove extra whitespace
    sensor_id = sensor_id.strip()

    # Check length
    if len(sensor_id) > 100:
        raise ValueError("Sensor ID too long (max 100 characters)")

    return sensor_id


def validate_numeric_value(value: Union[int, float, str]) -> float:
    """Validate and convert a numeric value"""
    try:
        numeric_value = float(value)
    except (ValueError, TypeError):
        raise ValueError(f"Invalid numeric value: {value}")

    # Range check is performed outside the try block so its message is not
    # masked by the generic conversion error above
    if not (-1e10 <= numeric_value <= 1e10):  # Reasonable range
        raise ValueError(f"Value out of reasonable range: {numeric_value}")

    return numeric_value


# Export all models for easy importing
__all__ = [
    # Enums
    'DataFormat', 'SourceStatus',
    # Config models
    'FTPConfig', 'TopicConfig',
    # Request/Response models
    'DataSourceCreate', 'DataSourceUpdate', 'DataSourceResponse',
    'FileProcessingRequest', 'FileProcessingResponse',
    'IngestionStats', 'QualityMetrics', 'QualityReport',
    'HealthStatus', 'SensorReading', 'ProcessedFile',
    'TopicInfo', 'PublishingStats', 'ErrorLog', 'MonitoringAlert',
    # Schema definitions
    'DataSourceSchema', 'ProcessedFileSchema', 'QualityReportSchema',
    'IngestionStatsSchema', 'ErrorLogSchema', 'MonitoringAlertSchema',
    # Validation helpers
    'validate_timestamp', 'validate_sensor_id', 'validate_numeric_value'
]
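
# Minimal usage sketch for the validation helpers, run only when the module is
# executed directly; the sensor id, timestamp, and value below are illustrative.
if __name__ == "__main__":
    reading = SensorReading(
        sensor_id=validate_sensor_id("  sensor-001  "),
        timestamp=validate_timestamp("2024-01-01T12:00:00Z"),
        value=validate_numeric_value("23.5"),
        unit="C",
    )
    print(reading.json())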