Add data-ingestion-service for SA4CPS FTP integration
- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation (see the sketch below)
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
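For context, the sketch below illustrates the publish step mentioned above: a parsed reading serialized with the SensorReading model from models.py (added in this commit) and pushed to a Redis topic via the redis-py client. It is illustrative only and not part of the commit; the host, topic name, and values are invented.

import redis  # redis-py client, assumed to be installed
from models import SensorReading

# Connect to the Redis instance the dashboard subscribes to (address is illustrative)
r = redis.Redis(host="localhost", port=6379)

# A reading as the data processor might emit it after parsing an .slg_v2 file
reading = SensorReading(sensor_id="sa4cps_temp_01", timestamp=1700000000, value=21.5, unit="C")

# Publish the JSON payload; dashboard subscribers receive it in near real time
r.publish("sa4cps.sensor_readings", reading.json())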
microservices/data-ingestion-service/models.py (new file, 391 lines added)
@@ -0,0 +1,391 @@
"""
Data models for the data ingestion service.
Defines Pydantic models for request/response validation and database schemas.
"""

from pydantic import BaseModel, Field, validator
from typing import List, Dict, Any, Optional, Union
from datetime import datetime
from enum import Enum

class DataFormat(str, Enum):
    """Supported data formats for ingestion"""
    CSV = "csv"
    JSON = "json"
    TXT = "txt"
    EXCEL = "excel"
    XML = "xml"
    SLG_V2 = "slg_v2"

class SourceStatus(str, Enum):
    """Status of a data source"""
    ACTIVE = "active"
    INACTIVE = "inactive"
    ERROR = "error"
    MAINTENANCE = "maintenance"

class FTPConfig(BaseModel):
    """FTP server configuration"""
    host: str
    port: int = Field(default=21, ge=1, le=65535)
    username: str = "anonymous"
    password: str = ""
    use_ssl: bool = False
    passive_mode: bool = True
    remote_path: str = "/"
    timeout: int = Field(default=30, ge=5, le=300)

    @validator('host')
    def validate_host(cls, v):
        if not v or len(v.strip()) == 0:
            raise ValueError('Host cannot be empty')
        return v.strip()

class TopicConfig(BaseModel):
    """Redis topic configuration"""
    topic_name: str
    description: str = ""
    data_types: List[str] = Field(default_factory=lambda: ["all"])
    format: str = "sensor_reading"
    enabled: bool = True

class DataSourceCreate(BaseModel):
    """Request model for creating a new data source"""
    name: str = Field(..., min_length=1, max_length=100)
    description: str = ""
    source_type: str = Field(default="ftp", regex="^(ftp|sftp|http|https)$")
    ftp_config: FTPConfig
    file_patterns: List[str] = Field(default_factory=lambda: ["*.csv"])
    data_format: DataFormat = DataFormat.CSV
    topics: List[TopicConfig] = Field(default_factory=list)
    polling_interval_minutes: int = Field(default=5, ge=1, le=1440)
    max_file_size_mb: int = Field(default=100, ge=1, le=1000)
    enabled: bool = True

class DataSourceUpdate(BaseModel):
    """Request model for updating a data source"""
    name: Optional[str] = Field(None, min_length=1, max_length=100)
    description: Optional[str] = None
    ftp_config: Optional[FTPConfig] = None
    file_patterns: Optional[List[str]] = None
    data_format: Optional[DataFormat] = None
    topics: Optional[List[TopicConfig]] = None
    polling_interval_minutes: Optional[int] = Field(None, ge=1, le=1440)
    max_file_size_mb: Optional[int] = Field(None, ge=1, le=1000)
    enabled: Optional[bool] = None

class DataSourceResponse(BaseModel):
    """Response model for data source information"""
    id: str
    name: str
    description: str
    source_type: str
    ftp_config: FTPConfig
    file_patterns: List[str]
    data_format: DataFormat
    topics: List[TopicConfig]
    polling_interval_minutes: int
    max_file_size_mb: int
    enabled: bool
    status: SourceStatus
    created_at: datetime
    updated_at: datetime
    last_check: Optional[datetime] = None
    last_success: Optional[datetime] = None
    error_count: int = 0
    total_files_processed: int = 0

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class FileProcessingRequest(BaseModel):
    """Request model for manual file processing"""
    source_id: str
    filename: str
    force_reprocess: bool = False

class FileProcessingResponse(BaseModel):
    """Response model for file processing results"""
    success: bool
    message: str
    records_processed: int
    records_rejected: int
    processing_time_seconds: float
    file_size_bytes: int
    topics_published: List[str]

class IngestionStats(BaseModel):
    """Response model for ingestion statistics"""
    files_processed_today: int
    records_processed_today: int
    active_sources: int
    total_sources: int
    average_processing_time: float
    success_rate_percentage: float
    last_24h_volume_mb: float

class QualityMetrics(BaseModel):
    """Data quality metrics"""
    completeness: float = Field(..., ge=0.0, le=1.0)
    accuracy: float = Field(..., ge=0.0, le=1.0)
    consistency: float = Field(..., ge=0.0, le=1.0)
    timeliness: float = Field(..., ge=0.0, le=1.0)
    overall: float = Field(..., ge=0.0, le=1.0)

class QualityReport(BaseModel):
    """Data quality report"""
    source: str
    total_records: int
    processed_records: int
    rejected_records: int
    quality_scores: QualityMetrics
    issues_found: List[str]
    processing_time: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class HealthStatus(BaseModel):
    """Service health status"""
    status: str
    timestamp: datetime
    uptime_seconds: float
    active_sources: int
    total_processed_files: int
    redis_connected: bool
    mongodb_connected: bool
    last_error: Optional[str] = None

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class SensorReading(BaseModel):
    """Individual sensor reading model"""
    sensor_id: str
    timestamp: Union[int, float, str]
    value: Union[int, float]
    unit: Optional[str] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

class ProcessedFile(BaseModel):
    """Processed file record"""
    source_id: str
    source_name: str
    filename: str
    file_signature: str
    file_size: int
    modified_time: datetime
    processed_at: datetime

class TopicInfo(BaseModel):
    """Topic information response"""
    topic_name: str
    description: str
    data_types: List[str]
    format: str
    message_count: int
    last_published: Optional[datetime] = None
    created_at: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class PublishingStats(BaseModel):
    """Publishing statistics response"""
    total_messages_published: int
    active_topics: int
    topic_stats: Dict[str, int]
    last_updated: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class ErrorLog(BaseModel):
    """Error logging model"""
    service: str = "data-ingestion-service"
    timestamp: datetime
    level: str
    source_id: Optional[str] = None
    source_name: Optional[str] = None
    error_type: str
    error_message: str
    stack_trace: Optional[str] = None
    context: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class MonitoringAlert(BaseModel):
    """Monitoring alert model"""
    alert_id: str
    alert_type: str  # "error", "warning", "info"
    source_id: Optional[str] = None
    title: str
    description: str
    severity: str = Field(..., regex="^(low|medium|high|critical)$")
    timestamp: datetime
    resolved: bool = False
    resolved_at: Optional[datetime] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

# Database schema definitions for MongoDB collections

class DataSourceSchema:
    """MongoDB schema for data sources"""
    collection_name = "data_sources"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("name", 1)], "unique": True},
            {"keys": [("status", 1)]},
            {"keys": [("enabled", 1)]},
            {"keys": [("created_at", -1)]},
            {"keys": [("last_check", -1)]}
        ]

class ProcessedFileSchema:
    """MongoDB schema for processed files"""
    collection_name = "processed_files"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("source_id", 1), ("file_signature", 1)], "unique": True},
            {"keys": [("processed_at", -1)]},
            {"keys": [("source_name", 1)]},
            {"keys": [("filename", 1)]}
        ]

class QualityReportSchema:
    """MongoDB schema for quality reports"""
    collection_name = "quality_reports"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("source", 1)]},
            {"keys": [("processing_time", -1)]},
            {"keys": [("quality_scores.overall", -1)]}
        ]

class IngestionStatsSchema:
    """MongoDB schema for ingestion statistics"""
    collection_name = "ingestion_stats"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("date", 1)], "unique": True},
            {"keys": [("timestamp", -1)]}
        ]

class ErrorLogSchema:
    """MongoDB schema for error logs"""
    collection_name = "error_logs"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("error_type", 1)]},
            {"keys": [("level", 1)]}
        ]

class MonitoringAlertSchema:
    """MongoDB schema for monitoring alerts"""
    collection_name = "monitoring_alerts"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("alert_id", 1)], "unique": True},
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("alert_type", 1)]},
            {"keys": [("resolved", 1)]}
        ]

# Validation helpers
def validate_timestamp(timestamp: Union[int, float, str]) -> int:
    """Validate and convert timestamp to unix timestamp"""
    if isinstance(timestamp, str):
        try:
            # Try ISO format first
            dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            return int(dt.timestamp())
        except ValueError:
            try:
                # Try as unix timestamp string
                return int(float(timestamp))
            except ValueError:
                raise ValueError(f"Invalid timestamp format: {timestamp}")
    elif isinstance(timestamp, (int, float)):
        return int(timestamp)
    else:
        raise ValueError(f"Timestamp must be int, float, or string, got {type(timestamp)}")

def validate_sensor_id(sensor_id: str) -> str:
    """Validate sensor ID format"""
    if not isinstance(sensor_id, str) or len(sensor_id.strip()) == 0:
        raise ValueError("Sensor ID must be a non-empty string")

    # Remove extra whitespace
    sensor_id = sensor_id.strip()

    # Check length
    if len(sensor_id) > 100:
        raise ValueError("Sensor ID too long (max 100 characters)")

    return sensor_id

def validate_numeric_value(value: Union[int, float, str]) -> float:
    """Validate and convert numeric value"""
    try:
        numeric_value = float(value)
    except (ValueError, TypeError):
        raise ValueError(f"Invalid numeric value: {value}")
    # Range-check outside the try block so this more specific error is not
    # swallowed and re-raised as a generic "invalid numeric value" error
    if not (-1e10 <= numeric_value <= 1e10):  # Reasonable range
        raise ValueError(f"Value out of reasonable range: {numeric_value}")
    return numeric_value

# Export all models for easy importing
__all__ = [
    # Enums
    'DataFormat', 'SourceStatus',

    # Config models
    'FTPConfig', 'TopicConfig',

    # Request/Response models
    'DataSourceCreate', 'DataSourceUpdate', 'DataSourceResponse',
    'FileProcessingRequest', 'FileProcessingResponse',
    'IngestionStats', 'QualityMetrics', 'QualityReport',
    'HealthStatus', 'SensorReading', 'ProcessedFile',
    'TopicInfo', 'PublishingStats', 'ErrorLog', 'MonitoringAlert',

    # Schema definitions
    'DataSourceSchema', 'ProcessedFileSchema', 'QualityReportSchema',
    'IngestionStatsSchema', 'ErrorLogSchema', 'MonitoringAlertSchema',

    # Validation helpers
    'validate_timestamp', 'validate_sensor_id', 'validate_numeric_value'
]
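For a quick sense of how these models and helpers fit together, here is a usage sketch (not part of the commit): it describes an FTP source for .slg_v2 files and normalizes one raw record with the validation helpers. All names, hosts, and values are invented; pydantic v1 is assumed, matching the validator/regex style used above.

from models import (DataSourceCreate, FTPConfig, TopicConfig, DataFormat,
                    validate_timestamp, validate_sensor_id, validate_numeric_value)

# Describe an FTP source that delivers .slg_v2 files and feeds one Redis topic
source = DataSourceCreate(
    name="sa4cps-ftp",
    description="SA4CPS building sensors",
    ftp_config=FTPConfig(host="ftp.example.org", remote_path="/exports"),
    file_patterns=["*.slg_v2"],
    data_format=DataFormat.SLG_V2,
    topics=[TopicConfig(topic_name="sa4cps.sensor_readings")],
    polling_interval_minutes=10,
)

# Normalize one raw record before building a SensorReading from it
ts = validate_timestamp("2024-01-01T12:00:00Z")   # ISO string -> unix seconds
sid = validate_sensor_id("  sa4cps_temp_01 ")     # trims whitespace, checks length
val = validate_numeric_value("21.5")              # string -> float, range-checked

print(source.json(indent=2), ts, sid, val)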