- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
"""
|
|
Data models for the data ingestion service.
|
|
Defines Pydantic models for request/response validation and database schemas.
|
|
"""
|
|
|
|
from pydantic import BaseModel, Field, validator
|
|
from typing import List, Dict, Any, Optional, Union
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
|
|
class DataFormat(str, Enum):
|
|
"""Supported data formats for ingestion"""
|
|
CSV = "csv"
|
|
JSON = "json"
|
|
TXT = "txt"
|
|
EXCEL = "excel"
|
|
XML = "xml"
|
|
SLG_V2 = "slg_v2"
|
|
|
|
class SourceStatus(str, Enum):
|
|
"""Status of a data source"""
|
|
ACTIVE = "active"
|
|
INACTIVE = "inactive"
|
|
ERROR = "error"
|
|
MAINTENANCE = "maintenance"
|
|
|
|
class FTPConfig(BaseModel):
|
|
"""FTP server configuration"""
|
|
host: str
|
|
port: int = Field(default=21, ge=1, le=65535)
|
|
username: str = "anonymous"
|
|
password: str = ""
|
|
use_ssl: bool = False
|
|
passive_mode: bool = True
|
|
remote_path: str = "/"
|
|
timeout: int = Field(default=30, ge=5, le=300)
|
|
|
|
@validator('host')
|
|
def validate_host(cls, v):
|
|
if not v or len(v.strip()) == 0:
|
|
raise ValueError('Host cannot be empty')
|
|
return v.strip()
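
# Illustrative only (not part of the original module): a minimal sketch of how the
# host validator behaves when an FTPConfig is constructed; the hostname and path
# below are placeholders, not real SA4CPS endpoints.
#
#     cfg = FTPConfig(host="  ftp.example.org  ", remote_path="/sa4cps/exports")
#     cfg.host            # -> "ftp.example.org" (whitespace stripped by validate_host)
#     FTPConfig(host="")  # -> raises ValidationError ("Host cannot be empty")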


class TopicConfig(BaseModel):
    """Redis topic configuration"""
    topic_name: str
    description: str = ""
    data_types: List[str] = Field(default_factory=lambda: ["all"])
    format: str = "sensor_reading"
    enabled: bool = True


class DataSourceCreate(BaseModel):
    """Request model for creating a new data source"""
    name: str = Field(..., min_length=1, max_length=100)
    description: str = ""
    source_type: str = Field(default="ftp", regex="^(ftp|sftp|http|https)$")
    ftp_config: FTPConfig
    file_patterns: List[str] = Field(default_factory=lambda: ["*.csv"])
    data_format: DataFormat = DataFormat.CSV
    topics: List[TopicConfig] = Field(default_factory=list)
    polling_interval_minutes: int = Field(default=5, ge=1, le=1440)
    max_file_size_mb: int = Field(default=100, ge=1, le=1000)
    enabled: bool = True
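
# Illustrative only: a hedged sketch of a create request for an SA4CPS-style
# .slg_v2 source; the name, host, path, and topic are placeholder example values.
#
#     source = DataSourceCreate(
#         name="sa4cps-ftp",
#         ftp_config=FTPConfig(host="ftp.example.org", remote_path="/sa4cps/exports"),
#         file_patterns=["*.slg_v2"],
#         data_format=DataFormat.SLG_V2,
#         topics=[TopicConfig(topic_name="sa4cps.readings")],
#     )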


class DataSourceUpdate(BaseModel):
    """Request model for updating a data source"""
    name: Optional[str] = Field(None, min_length=1, max_length=100)
    description: Optional[str] = None
    ftp_config: Optional[FTPConfig] = None
    file_patterns: Optional[List[str]] = None
    data_format: Optional[DataFormat] = None
    topics: Optional[List[TopicConfig]] = None
    polling_interval_minutes: Optional[int] = Field(None, ge=1, le=1440)
    max_file_size_mb: Optional[int] = Field(None, ge=1, le=1000)
    enabled: Optional[bool] = None


class DataSourceResponse(BaseModel):
    """Response model for data source information"""
    id: str
    name: str
    description: str
    source_type: str
    ftp_config: FTPConfig
    file_patterns: List[str]
    data_format: DataFormat
    topics: List[TopicConfig]
    polling_interval_minutes: int
    max_file_size_mb: int
    enabled: bool
    status: SourceStatus
    created_at: datetime
    updated_at: datetime
    last_check: Optional[datetime] = None
    last_success: Optional[datetime] = None
    error_count: int = 0
    total_files_processed: int = 0

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class FileProcessingRequest(BaseModel):
    """Request model for manual file processing"""
    source_id: str
    filename: str
    force_reprocess: bool = False


class FileProcessingResponse(BaseModel):
    """Response model for file processing results"""
    success: bool
    message: str
    records_processed: int
    records_rejected: int
    processing_time_seconds: float
    file_size_bytes: int
    topics_published: List[str]


class IngestionStats(BaseModel):
    """Response model for ingestion statistics"""
    files_processed_today: int
    records_processed_today: int
    active_sources: int
    total_sources: int
    average_processing_time: float
    success_rate_percentage: float
    last_24h_volume_mb: float


class QualityMetrics(BaseModel):
    """Data quality metrics"""
    completeness: float = Field(..., ge=0.0, le=1.0)
    accuracy: float = Field(..., ge=0.0, le=1.0)
    consistency: float = Field(..., ge=0.0, le=1.0)
    timeliness: float = Field(..., ge=0.0, le=1.0)
    overall: float = Field(..., ge=0.0, le=1.0)


class QualityReport(BaseModel):
    """Data quality report"""
    source: str
    total_records: int
    processed_records: int
    rejected_records: int
    quality_scores: QualityMetrics
    issues_found: List[str]
    processing_time: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class HealthStatus(BaseModel):
    """Service health status"""
    status: str
    timestamp: datetime
    uptime_seconds: float
    active_sources: int
    total_processed_files: int
    redis_connected: bool
    mongodb_connected: bool
    last_error: Optional[str] = None

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class SensorReading(BaseModel):
    """Individual sensor reading model"""
    sensor_id: str
    timestamp: Union[int, float, str]
    value: Union[int, float]
    unit: Optional[str] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)
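
# Illustrative only: a sketch of how a parsed reading might look before it is
# published to a Redis topic; the sensor ID and unit are example values, and the
# actual published payload shape is defined by the publishing code, not by this
# model.
#
#     reading = SensorReading(sensor_id="hall_A.power", timestamp=1704067200,
#                             value=12.5, unit="kW")
#     reading.dict()  # -> plain dict suitable for JSON serialization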


class ProcessedFile(BaseModel):
    """Processed file record"""
    source_id: str
    source_name: str
    filename: str
    file_signature: str
    file_size: int
    modified_time: datetime
    processed_at: datetime


class TopicInfo(BaseModel):
    """Topic information response"""
    topic_name: str
    description: str
    data_types: List[str]
    format: str
    message_count: int
    last_published: Optional[datetime] = None
    created_at: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class PublishingStats(BaseModel):
    """Publishing statistics response"""
    total_messages_published: int
    active_topics: int
    topic_stats: Dict[str, int]
    last_updated: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class ErrorLog(BaseModel):
    """Error logging model"""
    service: str = "data-ingestion-service"
    timestamp: datetime
    level: str
    source_id: Optional[str] = None
    source_name: Optional[str] = None
    error_type: str
    error_message: str
    stack_trace: Optional[str] = None
    context: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class MonitoringAlert(BaseModel):
    """Monitoring alert model"""
    alert_id: str
    alert_type: str  # "error", "warning", "info"
    source_id: Optional[str] = None
    title: str
    description: str
    severity: str = Field(..., regex="^(low|medium|high|critical)$")
    timestamp: datetime
    resolved: bool = False
    resolved_at: Optional[datetime] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


# Database schema definitions for MongoDB collections

class DataSourceSchema:
    """MongoDB schema for data sources"""
    collection_name = "data_sources"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("name", 1)], "unique": True},
            {"keys": [("status", 1)]},
            {"keys": [("enabled", 1)]},
            {"keys": [("created_at", -1)]},
            {"keys": [("last_check", -1)]}
        ]


class ProcessedFileSchema:
    """MongoDB schema for processed files"""
    collection_name = "processed_files"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("source_id", 1), ("file_signature", 1)], "unique": True},
            {"keys": [("processed_at", -1)]},
            {"keys": [("source_name", 1)]},
            {"keys": [("filename", 1)]}
        ]


class QualityReportSchema:
    """MongoDB schema for quality reports"""
    collection_name = "quality_reports"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("source", 1)]},
            {"keys": [("processing_time", -1)]},
            {"keys": [("quality_scores.overall", -1)]}
        ]


class IngestionStatsSchema:
    """MongoDB schema for ingestion statistics"""
    collection_name = "ingestion_stats"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("date", 1)], "unique": True},
            {"keys": [("timestamp", -1)]}
        ]


class ErrorLogSchema:
    """MongoDB schema for error logs"""
    collection_name = "error_logs"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("error_type", 1)]},
            {"keys": [("level", 1)]}
        ]


class MonitoringAlertSchema:
    """MongoDB schema for monitoring alerts"""
    collection_name = "monitoring_alerts"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("alert_id", 1)], "unique": True},
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("alert_type", 1)]},
            {"keys": [("resolved", 1)]}
        ]
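
# Illustrative only: one way the schema classes above could be applied at service
# startup, assuming pymongo is used elsewhere in the service (it is not imported
# by this module). Each index spec maps directly onto collection.create_index().
#
#     from pymongo import MongoClient
#     db = MongoClient()["ingestion"]
#     for schema in (DataSourceSchema, ProcessedFileSchema, MonitoringAlertSchema):
#         for spec in schema.get_indexes():
#             db[schema.collection_name].create_index(
#                 spec["keys"], unique=spec.get("unique", False))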


# Validation helpers

def validate_timestamp(timestamp: Union[int, float, str]) -> int:
    """Validate and convert timestamp to unix timestamp"""
    if isinstance(timestamp, str):
        try:
            # Try ISO format first
            dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            return int(dt.timestamp())
        except ValueError:
            try:
                # Try as unix timestamp string
                return int(float(timestamp))
            except ValueError:
                raise ValueError(f"Invalid timestamp format: {timestamp}")
    elif isinstance(timestamp, (int, float)):
        return int(timestamp)
    else:
        raise ValueError(f"Timestamp must be int, float, or string, got {type(timestamp)}")


def validate_sensor_id(sensor_id: str) -> str:
    """Validate sensor ID format"""
    if not isinstance(sensor_id, str) or len(sensor_id.strip()) == 0:
        raise ValueError("Sensor ID must be a non-empty string")

    # Remove extra whitespace
    sensor_id = sensor_id.strip()

    # Check length
    if len(sensor_id) > 100:
        raise ValueError("Sensor ID too long (max 100 characters)")

    return sensor_id

def validate_numeric_value(value: Union[int, float, str]) -> float:
    """Validate and convert numeric value"""
    try:
        numeric_value = float(value)
    except (ValueError, TypeError):
        raise ValueError(f"Invalid numeric value: {value}")
    # Range check sits outside the try block so its specific message is not
    # swallowed and re-raised as "Invalid numeric value".
    if not (-1e10 <= numeric_value <= 1e10):  # Reasonable range
        raise ValueError(f"Value out of reasonable range: {numeric_value}")
    return numeric_value


# Export all models for easy importing
__all__ = [
    # Enums
    'DataFormat', 'SourceStatus',

    # Config models
    'FTPConfig', 'TopicConfig',

    # Request/Response models
    'DataSourceCreate', 'DataSourceUpdate', 'DataSourceResponse',
    'FileProcessingRequest', 'FileProcessingResponse',
    'IngestionStats', 'QualityMetrics', 'QualityReport',
    'HealthStatus', 'SensorReading', 'ProcessedFile',
    'TopicInfo', 'PublishingStats', 'ErrorLog', 'MonitoringAlert',

    # Schema definitions
    'DataSourceSchema', 'ProcessedFileSchema', 'QualityReportSchema',
    'IngestionStatsSchema', 'ErrorLogSchema', 'MonitoringAlertSchema',

    # Validation helpers
    'validate_timestamp', 'validate_sensor_id', 'validate_numeric_value'
]