- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
"""
|
|
Data models for the data ingestion service.
|
|
Defines Pydantic models for request/response validation and database schemas.
|
|
"""
|
|
|
|
from pydantic import BaseModel, Field, validator
|
|
from typing import List, Dict, Any, Optional, Union
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
|
|
class DataFormat(str, Enum):
|
|
"""Supported data formats for ingestion"""
|
|
CSV = "csv"
|
|
JSON = "json"
|
|
TXT = "txt"
|
|
EXCEL = "excel"
|
|
XML = "xml"
|
|
SLG_V2 = "slg_v2"
|
|
|
|
class SourceStatus(str, Enum):
|
|
"""Status of a data source"""
|
|
ACTIVE = "active"
|
|
INACTIVE = "inactive"
|
|
ERROR = "error"
|
|
MAINTENANCE = "maintenance"
|
|
|
|
class FTPConfig(BaseModel):
|
|
"""FTP server configuration"""
|
|
host: str
|
|
port: int = Field(default=21, ge=1, le=65535)
|
|
username: str = "anonymous"
|
|
password: str = ""
|
|
use_ssl: bool = False
|
|
passive_mode: bool = True
|
|
remote_path: str = "/"
|
|
timeout: int = Field(default=30, ge=5, le=300)
|
|
|
|
@validator('host')
|
|
def validate_host(cls, v):
|
|
if not v or len(v.strip()) == 0:
|
|
raise ValueError('Host cannot be empty')
|
|
return v.strip()
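
# Illustrative only (not part of the original module): a minimal sketch of how the
# host validator behaves when an FTPConfig is constructed; the hostname and path
# below are placeholders, not real SA4CPS endpoints.
#
#     cfg = FTPConfig(host="  ftp.example.org  ", remote_path="/sa4cps/exports")
#     cfg.host            # -> "ftp.example.org" (whitespace stripped by validate_host)
#     FTPConfig(host="")  # -> raises ValidationError ("Host cannot be empty")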


class TopicConfig(BaseModel):
    """Redis topic configuration"""
    topic_name: str
    description: str = ""
    data_types: List[str] = Field(default_factory=lambda: ["all"])
    format: str = "sensor_reading"
    enabled: bool = True


class DataSourceCreate(BaseModel):
    """Request model for creating a new data source"""
    name: str = Field(..., min_length=1, max_length=100)
    description: str = ""
    source_type: str = Field(default="ftp", regex="^(ftp|sftp|http|https)$")
    ftp_config: FTPConfig
    file_patterns: List[str] = Field(default_factory=lambda: ["*.csv"])
    data_format: DataFormat = DataFormat.CSV
    topics: List[TopicConfig] = Field(default_factory=list)
    polling_interval_minutes: int = Field(default=5, ge=1, le=1440)
    max_file_size_mb: int = Field(default=100, ge=1, le=1000)
    enabled: bool = True
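
# Illustrative only: a hedged sketch of a create request for an SA4CPS-style
# .slg_v2 source; the name, host, path, and topic are placeholder example values.
#
#     source = DataSourceCreate(
#         name="sa4cps-ftp",
#         ftp_config=FTPConfig(host="ftp.example.org", remote_path="/sa4cps/exports"),
#         file_patterns=["*.slg_v2"],
#         data_format=DataFormat.SLG_V2,
#         topics=[TopicConfig(topic_name="sa4cps.readings")],
#     )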


class DataSourceUpdate(BaseModel):
    """Request model for updating a data source"""
    name: Optional[str] = Field(None, min_length=1, max_length=100)
    description: Optional[str] = None
    ftp_config: Optional[FTPConfig] = None
    file_patterns: Optional[List[str]] = None
    data_format: Optional[DataFormat] = None
    topics: Optional[List[TopicConfig]] = None
    polling_interval_minutes: Optional[int] = Field(None, ge=1, le=1440)
    max_file_size_mb: Optional[int] = Field(None, ge=1, le=1000)
    enabled: Optional[bool] = None


class DataSourceResponse(BaseModel):
    """Response model for data source information"""
    id: str
    name: str
    description: str
    source_type: str
    ftp_config: FTPConfig
    file_patterns: List[str]
    data_format: DataFormat
    topics: List[TopicConfig]
    polling_interval_minutes: int
    max_file_size_mb: int
    enabled: bool
    status: SourceStatus
    created_at: datetime
    updated_at: datetime
    last_check: Optional[datetime] = None
    last_success: Optional[datetime] = None
    error_count: int = 0
    total_files_processed: int = 0

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class FileProcessingRequest(BaseModel):
    """Request model for manual file processing"""
    source_id: str
    filename: str
    force_reprocess: bool = False


class FileProcessingResponse(BaseModel):
    """Response model for file processing results"""
    success: bool
    message: str
    records_processed: int
    records_rejected: int
    processing_time_seconds: float
    file_size_bytes: int
    topics_published: List[str]


class IngestionStats(BaseModel):
    """Response model for ingestion statistics"""
    files_processed_today: int
    records_processed_today: int
    active_sources: int
    total_sources: int
    average_processing_time: float
    success_rate_percentage: float
    last_24h_volume_mb: float


class QualityMetrics(BaseModel):
    """Data quality metrics"""
    completeness: float = Field(..., ge=0.0, le=1.0)
    accuracy: float = Field(..., ge=0.0, le=1.0)
    consistency: float = Field(..., ge=0.0, le=1.0)
    timeliness: float = Field(..., ge=0.0, le=1.0)
    overall: float = Field(..., ge=0.0, le=1.0)


class QualityReport(BaseModel):
    """Data quality report"""
    source: str
    total_records: int
    processed_records: int
    rejected_records: int
    quality_scores: QualityMetrics
    issues_found: List[str]
    processing_time: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class HealthStatus(BaseModel):
    """Service health status"""
    status: str
    timestamp: datetime
    uptime_seconds: float
    active_sources: int
    total_processed_files: int
    redis_connected: bool
    mongodb_connected: bool
    last_error: Optional[str] = None

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class SensorReading(BaseModel):
    """Individual sensor reading model"""
    sensor_id: str
    timestamp: Union[int, float, str]
    value: Union[int, float]
    unit: Optional[str] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)
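
# Illustrative only: a sketch of how a parsed reading might look before it is
# published to a Redis topic; the sensor ID and unit are example values, and the
# actual published payload shape is defined by the publishing code, not by this
# model.
#
#     reading = SensorReading(sensor_id="hall_A.power", timestamp=1704067200,
#                             value=12.5, unit="kW")
#     reading.dict()  # -> plain dict suitable for JSON serialization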


class ProcessedFile(BaseModel):
    """Processed file record"""
    source_id: str
    source_name: str
    filename: str
    file_signature: str
    file_size: int
    modified_time: datetime
    processed_at: datetime


class TopicInfo(BaseModel):
    """Topic information response"""
    topic_name: str
    description: str
    data_types: List[str]
    format: str
    message_count: int
    last_published: Optional[datetime] = None
    created_at: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class PublishingStats(BaseModel):
    """Publishing statistics response"""
    total_messages_published: int
    active_topics: int
    topic_stats: Dict[str, int]
    last_updated: datetime

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class ErrorLog(BaseModel):
    """Error logging model"""
    service: str = "data-ingestion-service"
    timestamp: datetime
    level: str
    source_id: Optional[str] = None
    source_name: Optional[str] = None
    error_type: str
    error_message: str
    stack_trace: Optional[str] = None
    context: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


class MonitoringAlert(BaseModel):
    """Monitoring alert model"""
    alert_id: str
    alert_type: str  # "error", "warning", "info"
    source_id: Optional[str] = None
    title: str
    description: str
    severity: str = Field(..., regex="^(low|medium|high|critical)$")
    timestamp: datetime
    resolved: bool = False
    resolved_at: Optional[datetime] = None
    metadata: Dict[str, Any] = Field(default_factory=dict)

    class Config:
        json_encoders = {
            datetime: lambda v: v.isoformat()
        }


# Database schema definitions for MongoDB collections

class DataSourceSchema:
    """MongoDB schema for data sources"""
    collection_name = "data_sources"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("name", 1)], "unique": True},
            {"keys": [("status", 1)]},
            {"keys": [("enabled", 1)]},
            {"keys": [("created_at", -1)]},
            {"keys": [("last_check", -1)]}
        ]


class ProcessedFileSchema:
    """MongoDB schema for processed files"""
    collection_name = "processed_files"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("source_id", 1), ("file_signature", 1)], "unique": True},
            {"keys": [("processed_at", -1)]},
            {"keys": [("source_name", 1)]},
            {"keys": [("filename", 1)]}
        ]


class QualityReportSchema:
    """MongoDB schema for quality reports"""
    collection_name = "quality_reports"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("source", 1)]},
            {"keys": [("processing_time", -1)]},
            {"keys": [("quality_scores.overall", -1)]}
        ]


class IngestionStatsSchema:
    """MongoDB schema for ingestion statistics"""
    collection_name = "ingestion_stats"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("date", 1)], "unique": True},
            {"keys": [("timestamp", -1)]}
        ]


class ErrorLogSchema:
    """MongoDB schema for error logs"""
    collection_name = "error_logs"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("error_type", 1)]},
            {"keys": [("level", 1)]}
        ]


class MonitoringAlertSchema:
    """MongoDB schema for monitoring alerts"""
    collection_name = "monitoring_alerts"

    @staticmethod
    def get_indexes():
        return [
            {"keys": [("alert_id", 1)], "unique": True},
            {"keys": [("timestamp", -1)]},
            {"keys": [("source_id", 1)]},
            {"keys": [("alert_type", 1)]},
            {"keys": [("resolved", 1)]}
        ]
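
# Illustrative only: one way the schema classes above could be applied at service
# startup, assuming pymongo is used elsewhere in the service (it is not imported
# by this module). Each index spec maps directly onto collection.create_index().
#
#     from pymongo import MongoClient
#     db = MongoClient()["ingestion"]
#     for schema in (DataSourceSchema, ProcessedFileSchema, MonitoringAlertSchema):
#         for spec in schema.get_indexes():
#             db[schema.collection_name].create_index(
#                 spec["keys"], unique=spec.get("unique", False))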


# Validation helpers

def validate_timestamp(timestamp: Union[int, float, str]) -> int:
    """Validate and convert timestamp to unix timestamp"""
    if isinstance(timestamp, str):
        try:
            # Try ISO format first
            dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            return int(dt.timestamp())
        except ValueError:
            try:
                # Try as unix timestamp string
                return int(float(timestamp))
            except ValueError:
                raise ValueError(f"Invalid timestamp format: {timestamp}")
    elif isinstance(timestamp, (int, float)):
        return int(timestamp)
    else:
        raise ValueError(f"Timestamp must be int, float, or string, got {type(timestamp)}")


def validate_sensor_id(sensor_id: str) -> str:
    """Validate sensor ID format"""
    if not isinstance(sensor_id, str) or len(sensor_id.strip()) == 0:
        raise ValueError("Sensor ID must be a non-empty string")

    # Remove extra whitespace
    sensor_id = sensor_id.strip()

    # Check length
    if len(sensor_id) > 100:
        raise ValueError("Sensor ID too long (max 100 characters)")

    return sensor_id

def validate_numeric_value(value: Union[int, float, str]) -> float:
    """Validate and convert numeric value"""
    try:
        numeric_value = float(value)
    except (ValueError, TypeError):
        raise ValueError(f"Invalid numeric value: {value}")
    # Range check sits outside the try block so its specific message is not
    # swallowed and re-raised as "Invalid numeric value".
    if not (-1e10 <= numeric_value <= 1e10):  # Reasonable range
        raise ValueError(f"Value out of reasonable range: {numeric_value}")
    return numeric_value


# Export all models for easy importing
__all__ = [
    # Enums
    'DataFormat', 'SourceStatus',

    # Config models
    'FTPConfig', 'TopicConfig',

    # Request/Response models
    'DataSourceCreate', 'DataSourceUpdate', 'DataSourceResponse',
    'FileProcessingRequest', 'FileProcessingResponse',
    'IngestionStats', 'QualityMetrics', 'QualityReport',
    'HealthStatus', 'SensorReading', 'ProcessedFile',
    'TopicInfo', 'PublishingStats', 'ErrorLog', 'MonitoringAlert',

    # Schema definitions
    'DataSourceSchema', 'ProcessedFileSchema', 'QualityReportSchema',
    'IngestionStatsSchema', 'ErrorLogSchema', 'MonitoringAlertSchema',

    # Validation helpers
    'validate_timestamp', 'validate_sensor_id', 'validate_numeric_value'
]