sac4cps-backend/microservices/data-ingestion-service/models.py
"""
Data models for the data ingestion service.
Defines Pydantic models for request/response validation and database schemas.
"""
from pydantic import BaseModel, Field, validator
from typing import List, Dict, Any, Optional, Union
from datetime import datetime
from enum import Enum
class DataFormat(str, Enum):
"""Supported data formats for ingestion"""
CSV = "csv"
JSON = "json"
TXT = "txt"
EXCEL = "excel"
XML = "xml"
SLG_V2 = "slg_v2"
class SourceStatus(str, Enum):
"""Status of a data source"""
ACTIVE = "active"
INACTIVE = "inactive"
ERROR = "error"
MAINTENANCE = "maintenance"
class FTPConfig(BaseModel):
"""FTP server configuration"""
host: str
port: int = Field(default=21, ge=1, le=65535)
username: str = "anonymous"
password: str = ""
use_ssl: bool = False
passive_mode: bool = True
remote_path: str = "/"
timeout: int = Field(default=30, ge=5, le=300)
@validator('host')
def validate_host(cls, v):
if not v or len(v.strip()) == 0:
raise ValueError('Host cannot be empty')
return v.strip()
class TopicConfig(BaseModel):
"""Redis topic configuration"""
topic_name: str
description: str = ""
data_types: List[str] = Field(default_factory=lambda: ["all"])
format: str = "sensor_reading"
enabled: bool = True
class DataSourceCreate(BaseModel):
"""Request model for creating a new data source"""
name: str = Field(..., min_length=1, max_length=100)
description: str = ""
source_type: str = Field(default="ftp", regex="^(ftp|sftp|http|https)$")
ftp_config: FTPConfig
file_patterns: List[str] = Field(default_factory=lambda: ["*.csv"])
data_format: DataFormat = DataFormat.CSV
topics: List[TopicConfig] = Field(default_factory=list)
polling_interval_minutes: int = Field(default=5, ge=1, le=1440)
max_file_size_mb: int = Field(default=100, ge=1, le=1000)
enabled: bool = True
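
# Illustrative only: a minimal sketch of how a DataSourceCreate payload for an
# SA4CPS .slg_v2 FTP source might be assembled. The host, paths and topic names
# below are hypothetical placeholders, not values used by the service.
def _example_slg_v2_source() -> DataSourceCreate:
    """Build an example DataSourceCreate for an .slg_v2 FTP source (illustrative)."""
    return DataSourceCreate(
        name="sa4cps-ftp-example",
        description="Example SA4CPS FTP source (hypothetical values)",
        source_type="ftp",
        ftp_config=FTPConfig(host="ftp.example.org", remote_path="/sa4cps/exports"),
        file_patterns=["*.slg_v2"],
        data_format=DataFormat.SLG_V2,
        topics=[TopicConfig(topic_name="sa4cps.readings", description="Parsed sensor readings")],
        polling_interval_minutes=15,
    )
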
class DataSourceUpdate(BaseModel):
"""Request model for updating a data source"""
name: Optional[str] = Field(None, min_length=1, max_length=100)
description: Optional[str] = None
ftp_config: Optional[FTPConfig] = None
file_patterns: Optional[List[str]] = None
data_format: Optional[DataFormat] = None
topics: Optional[List[TopicConfig]] = None
polling_interval_minutes: Optional[int] = Field(None, ge=1, le=1440)
max_file_size_mb: Optional[int] = Field(None, ge=1, le=1000)
enabled: Optional[bool] = None
class DataSourceResponse(BaseModel):
"""Response model for data source information"""
id: str
name: str
description: str
source_type: str
ftp_config: FTPConfig
file_patterns: List[str]
data_format: DataFormat
topics: List[TopicConfig]
polling_interval_minutes: int
max_file_size_mb: int
enabled: bool
status: SourceStatus
created_at: datetime
updated_at: datetime
last_check: Optional[datetime] = None
last_success: Optional[datetime] = None
error_count: int = 0
total_files_processed: int = 0
class Config:
json_encoders = {
datetime: lambda v: v.isoformat()
}
class FileProcessingRequest(BaseModel):
"""Request model for manual file processing"""
source_id: str
filename: str
force_reprocess: bool = False
class FileProcessingResponse(BaseModel):
"""Response model for file processing results"""
success: bool
message: str
records_processed: int
records_rejected: int
processing_time_seconds: float
file_size_bytes: int
topics_published: List[str]
class IngestionStats(BaseModel):
"""Response model for ingestion statistics"""
files_processed_today: int
records_processed_today: int
active_sources: int
total_sources: int
average_processing_time: float
success_rate_percentage: float
last_24h_volume_mb: float
class QualityMetrics(BaseModel):
"""Data quality metrics"""
completeness: float = Field(..., ge=0.0, le=1.0)
accuracy: float = Field(..., ge=0.0, le=1.0)
consistency: float = Field(..., ge=0.0, le=1.0)
timeliness: float = Field(..., ge=0.0, le=1.0)
overall: float = Field(..., ge=0.0, le=1.0)
class QualityReport(BaseModel):
"""Data quality report"""
source: str
total_records: int
processed_records: int
rejected_records: int
quality_scores: QualityMetrics
issues_found: List[str]
processing_time: datetime
class Config:
json_encoders = {
datetime: lambda v: v.isoformat()
}
class HealthStatus(BaseModel):
"""Service health status"""
status: str
timestamp: datetime
uptime_seconds: float
active_sources: int
total_processed_files: int
redis_connected: bool
mongodb_connected: bool
last_error: Optional[str] = None
class Config:
json_encoders = {
datetime: lambda v: v.isoformat()
}
class SensorReading(BaseModel):
"""Individual sensor reading model"""
sensor_id: str
timestamp: Union[int, float, str]
value: Union[int, float]
unit: Optional[str] = None
metadata: Dict[str, Any] = Field(default_factory=dict)
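
# Illustrative only: one way a parsed row might be represented as a
# SensorReading. The sensor id, unit and metadata keys are hypothetical; the
# actual field contents depend on the .slg_v2 parser in the data processor.
def _example_sensor_reading() -> SensorReading:
    """Build an example SensorReading (illustrative, hypothetical values)."""
    return SensorReading(
        sensor_id="meter-042",
        timestamp=1736505600,  # unix seconds; float and ISO strings are also accepted
        value=12.7,
        unit="kWh",
        metadata={"source_file": "example.slg_v2", "column": "energy"},
    )
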
class ProcessedFile(BaseModel):
"""Processed file record"""
source_id: str
source_name: str
filename: str
file_signature: str
file_size: int
modified_time: datetime
processed_at: datetime
class TopicInfo(BaseModel):
"""Topic information response"""
topic_name: str
description: str
data_types: List[str]
format: str
message_count: int
last_published: Optional[datetime] = None
created_at: datetime
class Config:
json_encoders = {
datetime: lambda v: v.isoformat()
}
class PublishingStats(BaseModel):
"""Publishing statistics response"""
total_messages_published: int
active_topics: int
topic_stats: Dict[str, int]
last_updated: datetime
class Config:
json_encoders = {
datetime: lambda v: v.isoformat()
}
class ErrorLog(BaseModel):
"""Error logging model"""
service: str = "data-ingestion-service"
timestamp: datetime
level: str
source_id: Optional[str] = None
source_name: Optional[str] = None
error_type: str
error_message: str
stack_trace: Optional[str] = None
context: Dict[str, Any] = Field(default_factory=dict)
class Config:
json_encoders = {
datetime: lambda v: v.isoformat()
}
class MonitoringAlert(BaseModel):
"""Monitoring alert model"""
alert_id: str
alert_type: str # "error", "warning", "info"
source_id: Optional[str] = None
title: str
description: str
severity: str = Field(..., regex="^(low|medium|high|critical)$")
timestamp: datetime
resolved: bool = False
resolved_at: Optional[datetime] = None
metadata: Dict[str, Any] = Field(default_factory=dict)
class Config:
json_encoders = {
datetime: lambda v: v.isoformat()
}
# Database schema definitions for MongoDB collections
class DataSourceSchema:
"""MongoDB schema for data sources"""
collection_name = "data_sources"
@staticmethod
def get_indexes():
return [
{"keys": [("name", 1)], "unique": True},
{"keys": [("status", 1)]},
{"keys": [("enabled", 1)]},
{"keys": [("created_at", -1)]},
{"keys": [("last_check", -1)]}
]
class ProcessedFileSchema:
"""MongoDB schema for processed files"""
collection_name = "processed_files"
@staticmethod
def get_indexes():
return [
{"keys": [("source_id", 1), ("file_signature", 1)], "unique": True},
{"keys": [("processed_at", -1)]},
{"keys": [("source_name", 1)]},
{"keys": [("filename", 1)]}
]
class QualityReportSchema:
"""MongoDB schema for quality reports"""
collection_name = "quality_reports"
@staticmethod
def get_indexes():
return [
{"keys": [("source", 1)]},
{"keys": [("processing_time", -1)]},
{"keys": [("quality_scores.overall", -1)]}
]
class IngestionStatsSchema:
"""MongoDB schema for ingestion statistics"""
collection_name = "ingestion_stats"
@staticmethod
def get_indexes():
return [
{"keys": [("date", 1)], "unique": True},
{"keys": [("timestamp", -1)]}
]
class ErrorLogSchema:
"""MongoDB schema for error logs"""
collection_name = "error_logs"
@staticmethod
def get_indexes():
return [
{"keys": [("timestamp", -1)]},
{"keys": [("source_id", 1)]},
{"keys": [("error_type", 1)]},
{"keys": [("level", 1)]}
]
class MonitoringAlertSchema:
"""MongoDB schema for monitoring alerts"""
collection_name = "monitoring_alerts"
@staticmethod
def get_indexes():
return [
{"keys": [("alert_id", 1)], "unique": True},
{"keys": [("timestamp", -1)]},
{"keys": [("source_id", 1)]},
{"keys": [("alert_type", 1)]},
{"keys": [("resolved", 1)]}
]
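
# Illustrative only: a minimal sketch of applying the index definitions above at
# startup, assuming a synchronous pymongo Database handle. The real service may
# create its indexes elsewhere (e.g. with an async driver such as motor).
def _example_ensure_indexes(db) -> None:
    """Create the declared indexes on each collection (illustrative sketch)."""
    schemas = [
        DataSourceSchema, ProcessedFileSchema, QualityReportSchema,
        IngestionStatsSchema, ErrorLogSchema, MonitoringAlertSchema,
    ]
    for schema in schemas:
        collection = db[schema.collection_name]
        for index in schema.get_indexes():
            collection.create_index(index["keys"], unique=index.get("unique", False))
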
# Validation helpers
def validate_timestamp(timestamp: Union[int, float, str]) -> int:
"""Validate and convert timestamp to unix timestamp"""
if isinstance(timestamp, str):
try:
# Try ISO format first
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
return int(dt.timestamp())
except ValueError:
try:
# Try as unix timestamp string
return int(float(timestamp))
except ValueError:
raise ValueError(f"Invalid timestamp format: {timestamp}")
elif isinstance(timestamp, (int, float)):
return int(timestamp)
else:
raise ValueError(f"Timestamp must be int, float, or string, got {type(timestamp)}")
def validate_sensor_id(sensor_id: str) -> str:
"""Validate sensor ID format"""
if not isinstance(sensor_id, str) or len(sensor_id.strip()) == 0:
raise ValueError("Sensor ID must be a non-empty string")
# Remove extra whitespace
sensor_id = sensor_id.strip()
# Check length
if len(sensor_id) > 100:
raise ValueError("Sensor ID too long (max 100 characters)")
return sensor_id
def validate_numeric_value(value: Union[int, float, str]) -> float:
    """Validate and convert numeric value"""
    try:
        numeric_value = float(value)
    except (ValueError, TypeError):
        raise ValueError(f"Invalid numeric value: {value}")
    # Range check sits outside the try/except so an out-of-range value is
    # reported with the correct message instead of "invalid numeric value"
    if not (-1e10 <= numeric_value <= 1e10):  # Reasonable range
        raise ValueError(f"Value out of reasonable range: {numeric_value}")
    return numeric_value
# Export all models for easy importing
__all__ = [
# Enums
'DataFormat', 'SourceStatus',
# Config models
'FTPConfig', 'TopicConfig',
# Request/Response models
'DataSourceCreate', 'DataSourceUpdate', 'DataSourceResponse',
'FileProcessingRequest', 'FileProcessingResponse',
'IngestionStats', 'QualityMetrics', 'QualityReport',
'HealthStatus', 'SensorReading', 'ProcessedFile',
'TopicInfo', 'PublishingStats', 'ErrorLog', 'MonitoringAlert',
# Schema definitions
'DataSourceSchema', 'ProcessedFileSchema', 'QualityReportSchema',
'IngestionStatsSchema', 'ErrorLogSchema', 'MonitoringAlertSchema',
# Validation helpers
'validate_timestamp', 'validate_sensor_id', 'validate_numeric_value'
]