Add data-ingestion-service for SA4CPS FTP integration
- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation (see the sketch below)
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
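For context, the sketch below illustrates the publish step mentioned above: a parsed reading serialized with the SensorReading model from models.py (added in this commit) and pushed to a Redis topic via the redis-py client. It is illustrative only and not part of the commit; the host, topic name, and values are invented.

import redis  # redis-py client, assumed to be installed
from models import SensorReading

# Connect to the Redis instance the dashboard subscribes to (address is illustrative)
r = redis.Redis(host="localhost", port=6379)

# A reading as the data processor might emit it after parsing an .slg_v2 file
reading = SensorReading(sensor_id="sa4cps_temp_01", timestamp=1700000000, value=21.5, unit="C")

# Publish the JSON payload; dashboard subscribers receive it in near real time
r.publish("sa4cps.sensor_readings", reading.json())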
microservices/data-ingestion-service/models.py (new file, 391 lines added)
@@ -0,0 +1,391 @@
"""
Data models for the data ingestion service.
Defines Pydantic models for request/response validation and database schemas.
"""

from pydantic import BaseModel, Field, validator
from typing import List, Dict, Any, Optional, Union
from datetime import datetime
from enum import Enum

class DataFormat(str, Enum):
    """Supported data formats for ingestion"""
    CSV = "csv"
    JSON = "json"
    TXT = "txt"
    EXCEL = "excel"
    XML = "xml"
    SLG_V2 = "slg_v2"

class SourceStatus(str, Enum):
    """Status of a data source"""
    ACTIVE = "active"
    INACTIVE = "inactive"
    ERROR = "error"
    MAINTENANCE = "maintenance"

class FTPConfig(BaseModel):
    """FTP server configuration"""
    host: str
    port: int = Field(default=21, ge=1, le=65535)
    username: str = "anonymous"
    password: str = ""
    use_ssl: bool = False
    passive_mode: bool = True
    remote_path: str = "/"
    timeout: int = Field(default=30, ge=5, le=300)

    @validator('host')
    def validate_host(cls, v):
        if not v or len(v.strip()) == 0:
            raise ValueError('Host cannot be empty')
        return v.strip()

class TopicConfig(BaseModel):
    """Redis topic configuration"""
    topic_name: str
    description: str = ""
    data_types: List[str] = Field(default_factory=lambda: ["all"])
    format: str = "sensor_reading"
    enabled: bool = True

class DataSourceCreate(BaseModel):
    """Request model for creating a new data source"""
    name: str = Field(..., min_length=1, max_length=100)
    description: str = ""
    source_type: str = Field(default="ftp", regex="^(ftp|sftp|http|https)$")
    ftp_config: FTPConfig
    file_patterns: List[str] = Field(default_factory=lambda: ["*.csv"])
    data_format: DataFormat = DataFormat.CSV
    topics: List[TopicConfig] = Field(default_factory=list)
    polling_interval_minutes: int = Field(default=5, ge=1, le=1440)
    max_file_size_mb: int = Field(default=100, ge=1, le=1000)
    enabled: bool = True

class DataSourceUpdate(BaseModel):
    """Request model for updating a data source"""
    name: Optional[str] = Field(None, min_length=1, max_length=100)
    description: Optional[str] = None
    ftp_config: Optional[FTPConfig] = None
    file_patterns: Optional[List[str]] = None
    data_format: Optional[DataFormat] = None
    topics: Optional[List[TopicConfig]] = None
    polling_interval_minutes: Optional[int] = Field(None, ge=1, le=1440)
    max_file_size_mb: Optional[int] = Field(None, ge=1, le=1000)
    enabled: Optional[bool] = None

class DataSourceResponse(BaseModel):
    """Response model for data source information"""
    id: str
    name: str
    description: str
    source_type: str
    ftp_config: FTPConfig
    file_patterns: List[str]
    data_format: DataFormat
    topics: List[TopicConfig]
    polling_interval_minutes: int
    max_file_size_mb: int
    enabled: bool
    status: SourceStatus
    created_at: datetime
    updated_at: datetime
    last_check: Optional[datetime] = None
    last_success: Optional[datetime] = None
    error_count: int = 0
    total_files_processed: int = 0

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class FileProcessingRequest(BaseModel):
    """Request model for manual file processing"""
    source_id: str
    filename: str
    force_reprocess: bool = False

class FileProcessingResponse(BaseModel):
    """Response model for file processing results"""
    success: bool
    message: str
    records_processed: int
    records_rejected: int
    processing_time_seconds: float
    file_size_bytes: int
    topics_published: List[str]

class IngestionStats(BaseModel):
    """Response model for ingestion statistics"""
    files_processed_today: int
    records_processed_today: int
    active_sources: int
    total_sources: int
    average_processing_time: float
    success_rate_percentage: float
    last_24h_volume_mb: float

class QualityMetrics(BaseModel):
    """Data quality metrics"""
    completeness: float = Field(..., ge=0.0, le=1.0)
    accuracy: float = Field(..., ge=0.0, le=1.0)
    consistency: float = Field(..., ge=0.0, le=1.0)
    timeliness: float = Field(..., ge=0.0, le=1.0)
    overall: float = Field(..., ge=0.0, le=1.0)

class QualityReport(BaseModel):
    """Data quality report"""
    source: str
    total_records: int
    processed_records: int
    rejected_records: int
    quality_scores: QualityMetrics
    issues_found: List[str]
    processing_time: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class HealthStatus(BaseModel):
    """Service health status"""
    status: str
    timestamp: datetime
    uptime_seconds: float
    active_sources: int
    total_processed_files: int
    redis_connected: bool
    mongodb_connected: bool
    last_error: Optional[str] = None

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class SensorReading(BaseModel):
    """Individual sensor reading model"""
    sensor_id: str
    timestamp: Union[int, float, str]
    value: Union[int, float]
    unit: Optional[str] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

class ProcessedFile(BaseModel):
    """Processed file record"""
    source_id: str
    source_name: str
    filename: str
    file_signature: str
    file_size: int
    modified_time: datetime
    processed_at: datetime

class TopicInfo(BaseModel):
    """Topic information response"""
    topic_name: str
    description: str
    data_types: List[str]
    format: str
    message_count: int
    last_published: Optional[datetime] = None
    created_at: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class PublishingStats(BaseModel):
    """Publishing statistics response"""
    total_messages_published: int
    active_topics: int
    topic_stats: Dict[str, int]
    last_updated: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class ErrorLog(BaseModel):
    """Error logging model"""
    service: str = "data-ingestion-service"
    timestamp: datetime
    level: str
    source_id: Optional[str] = None
    source_name: Optional[str] = None
    error_type: str
    error_message: str
    stack_trace: Optional[str] = None
    context: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

class MonitoringAlert(BaseModel):
    """Monitoring alert model"""
    alert_id: str
    alert_type: str  # "error", "warning", "info"
    source_id: Optional[str] = None
    title: str
    description: str
    severity: str = Field(..., regex="^(low|medium|high|critical)$")
    timestamp: datetime
    resolved: bool = False
    resolved_at: Optional[datetime] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }

# Database schema definitions for MongoDB collections

class DataSourceSchema:
    """MongoDB schema for data sources"""
    collection_name = "data_sources"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("name", 1)], "unique": True},
            {"keys": [("status", 1)]},
            {"keys": [("enabled", 1)]},
            {"keys": [("created_at", -1)]},
            {"keys": [("last_check", -1)]}
        ]

class ProcessedFileSchema:
    """MongoDB schema for processed files"""
    collection_name = "processed_files"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("source_id", 1), ("file_signature", 1)], "unique": True},
            {"keys": [("processed_at", -1)]},
            {"keys": [("source_name", 1)]},
            {"keys": [("filename", 1)]}
        ]

class QualityReportSchema:
    """MongoDB schema for quality reports"""
    collection_name = "quality_reports"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("source", 1)]},
            {"keys": [("processing_time", -1)]},
            {"keys": [("quality_scores.overall", -1)]}
        ]

class IngestionStatsSchema:
    """MongoDB schema for ingestion statistics"""
    collection_name = "ingestion_stats"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("date", 1)], "unique": True},
            {"keys": [("timestamp", -1)]}
        ]

class ErrorLogSchema:
    """MongoDB schema for error logs"""
    collection_name = "error_logs"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("error_type", 1)]},
            {"keys": [("level", 1)]}
        ]

class MonitoringAlertSchema:
    """MongoDB schema for monitoring alerts"""
    collection_name = "monitoring_alerts"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("alert_id", 1)], "unique": True},
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("alert_type", 1)]},
            {"keys": [("resolved", 1)]}
        ]

# Validation helpers
def validate_timestamp(timestamp: Union[int, float, str]) -> int:
    """Validate and convert timestamp to unix timestamp"""
    if isinstance(timestamp, str):
        try:
            # Try ISO format first
            dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            return int(dt.timestamp())
        except ValueError:
            try:
                # Try as unix timestamp string
                return int(float(timestamp))
            except ValueError:
                raise ValueError(f"Invalid timestamp format: {timestamp}")
    elif isinstance(timestamp, (int, float)):
        return int(timestamp)
    else:
        raise ValueError(f"Timestamp must be int, float, or string, got {type(timestamp)}")

def validate_sensor_id(sensor_id: str) -> str:
    """Validate sensor ID format"""
    if not isinstance(sensor_id, str) or len(sensor_id.strip()) == 0:
        raise ValueError("Sensor ID must be a non-empty string")

    # Remove extra whitespace
    sensor_id = sensor_id.strip()

    # Check length
    if len(sensor_id) > 100:
        raise ValueError("Sensor ID too long (max 100 characters)")

    return sensor_id

def validate_numeric_value(value: Union[int, float, str]) -> float:
    """Validate and convert numeric value"""
    try:
        numeric_value = float(value)
    except (ValueError, TypeError):
        raise ValueError(f"Invalid numeric value: {value}")
    # Range-check outside the try block so this more specific error is not
    # swallowed and re-raised as a generic "invalid numeric value" error
    if not (-1e10 <= numeric_value <= 1e10):  # Reasonable range
        raise ValueError(f"Value out of reasonable range: {numeric_value}")
    return numeric_value

# Export all models for easy importing
__all__ = [
    # Enums
    'DataFormat', 'SourceStatus',

    # Config models
    'FTPConfig', 'TopicConfig',

    # Request/Response models
    'DataSourceCreate', 'DataSourceUpdate', 'DataSourceResponse',
    'FileProcessingRequest', 'FileProcessingResponse',
    'IngestionStats', 'QualityMetrics', 'QualityReport',
    'HealthStatus', 'SensorReading', 'ProcessedFile',
    'TopicInfo', 'PublishingStats', 'ErrorLog', 'MonitoringAlert',

    # Schema definitions
    'DataSourceSchema', 'ProcessedFileSchema', 'QualityReportSchema',
    'IngestionStatsSchema', 'ErrorLogSchema', 'MonitoringAlertSchema',

    # Validation helpers
    'validate_timestamp', 'validate_sensor_id', 'validate_numeric_value'
]
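For a quick sense of how these models and helpers fit together, here is a usage sketch (not part of the commit): it describes an FTP source for .slg_v2 files and normalizes one raw record with the validation helpers. All names, hosts, and values are invented; pydantic v1 is assumed, matching the validator/regex style used above.

from models import (DataSourceCreate, FTPConfig, TopicConfig, DataFormat,
                    validate_timestamp, validate_sensor_id, validate_numeric_value)

# Describe an FTP source that delivers .slg_v2 files and feeds one Redis topic
source = DataSourceCreate(
    name="sa4cps-ftp",
    description="SA4CPS building sensors",
    ftp_config=FTPConfig(host="ftp.example.org", remote_path="/exports"),
    file_patterns=["*.slg_v2"],
    data_format=DataFormat.SLG_V2,
    topics=[TopicConfig(topic_name="sa4cps.sensor_readings")],
    polling_interval_minutes=10,
)

# Normalize one raw record before building a SensorReading from it
ts = validate_timestamp("2024-01-01T12:00:00Z")   # ISO string -> unix seconds
sid = validate_sensor_id("  sa4cps_temp_01 ")     # trims whitespace, checks length
val = validate_numeric_value("21.5")              # string -> float, range-checked

print(source.json(indent=2), ts, sid, val)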