"""
|
|
Database configuration and connection management for the data ingestion service.
|
|
Handles MongoDB connections, index creation, and Redis connections.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
from typing import Optional
|
|
from contextlib import asynccontextmanager
|
|
import os
|
|
from datetime import datetime
|
|
|
|
import motor.motor_asyncio
|
|
import redis.asyncio as redis
|
|
from pymongo import IndexModel
|
|
|
|
from .models import (
|
|
DataSourceSchema, ProcessedFileSchema, QualityReportSchema,
|
|
IngestionStatsSchema, ErrorLogSchema, MonitoringAlertSchema
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class DatabaseManager:
    """Manages database connections and operations"""

    def __init__(self, mongodb_url: str = None, redis_url: str = None):
        self.mongodb_url = mongodb_url or os.getenv("MONGODB_URL", "mongodb://localhost:27017")
        self.redis_url = redis_url or os.getenv("REDIS_URL", "redis://localhost:6379")

        self.mongodb_client: Optional[motor.motor_asyncio.AsyncIOMotorClient] = None
        self.db: Optional[motor.motor_asyncio.AsyncIOMotorDatabase] = None
        self.redis_client: Optional[redis.Redis] = None

        self._connection_status = {
            "mongodb": False,
            "redis": False,
            "last_check": None
        }

    async def connect(self):
        """Establish connections to MongoDB and Redis"""
        try:
            await self._connect_mongodb()
            await self._connect_redis()
            await self._create_indexes()

            logger.info("Database connections established successfully")

        except Exception as e:
            logger.error(f"Error establishing database connections: {e}")
            raise

    async def _connect_mongodb(self):
        """Connect to MongoDB"""
        try:
            # Parse the database name from the URL path, falling back to a
            # default; strip any query string (e.g. "?replicaSet=rs0") from
            # the last path segment
            db_name = "energy_dashboard"
            if self.mongodb_url.count("/") > 2:
                db_name = self.mongodb_url.rsplit("/", 1)[-1].split("?")[0] or db_name

            self.mongodb_client = motor.motor_asyncio.AsyncIOMotorClient(
                self.mongodb_url,
                serverSelectionTimeoutMS=5000,
                connectTimeoutMS=5000,
                maxPoolSize=50,
                minPoolSize=10
            )

            self.db = self.mongodb_client[db_name]

            # Test connection
            await self.mongodb_client.admin.command('ping')

            self._connection_status["mongodb"] = True
            logger.info(f"Connected to MongoDB: {self.mongodb_url}")

        except Exception as e:
            self._connection_status["mongodb"] = False
            logger.error(f"MongoDB connection failed: {e}")
            raise
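    # URL forms handled by the database-name parsing above (illustrative examples):
    #   mongodb://localhost:27017                  -> falls back to "energy_dashboard"
    #   mongodb://localhost:27017/my_db            -> "my_db"
    #   mongodb://host:27017/my_db?replicaSet=rs0  -> "my_db" (query string stripped)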
    async def _connect_redis(self):
        """Connect to Redis"""
        try:
            self.redis_client = redis.from_url(
                self.redis_url,
                encoding="utf-8",
                decode_responses=True,
                socket_timeout=5,
                socket_connect_timeout=5,
                health_check_interval=30
            )

            # Test connection
            await self.redis_client.ping()

            self._connection_status["redis"] = True
            logger.info(f"Connected to Redis: {self.redis_url}")

        except Exception as e:
            self._connection_status["redis"] = False
            logger.error(f"Redis connection failed: {e}")
            raise
    async def _create_indexes(self):
        """Create database indexes for optimal performance"""
        try:
            schemas = [
                DataSourceSchema,
                ProcessedFileSchema,
                QualityReportSchema,
                IngestionStatsSchema,
                ErrorLogSchema,
                MonitoringAlertSchema
            ]

            for schema in schemas:
                collection = self.db[schema.collection_name]
                indexes = schema.get_indexes()

                if indexes:
                    index_models = []
                    for index_spec in indexes:
                        keys = index_spec["keys"]
                        options = {k: v for k, v in index_spec.items() if k != "keys"}
                        index_models.append(IndexModel(keys, **options))

                    await collection.create_indexes(index_models)
                    logger.debug(f"Created {len(index_models)} indexes for {schema.collection_name}")

            logger.info("Database indexes created successfully")

        except Exception as e:
            logger.error(f"Error creating database indexes: {e}")
            # Don't raise here - indexes are a performance optimization, not critical
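    # `schema.get_indexes()` is assumed to return specs shaped like the example
    # below, inferred from how they are consumed above (field values are
    # illustrative, not taken from the actual schema classes):
    #
    #     [{"keys": [("timestamp", -1)], "name": "timestamp_desc", "unique": False}]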
    async def disconnect(self):
        """Close all database connections"""
        try:
            if self.redis_client:
                await self.redis_client.aclose()
                self._connection_status["redis"] = False

            if self.mongodb_client:
                self.mongodb_client.close()
                self._connection_status["mongodb"] = False

            logger.info("Database connections closed")

        except Exception as e:
            logger.error(f"Error closing database connections: {e}")
    async def health_check(self) -> dict:
        """Check health of database connections"""
        health = {
            "mongodb": False,
            "redis": False,
            "timestamp": datetime.utcnow().isoformat(),
            "details": {}
        }

        # Check MongoDB
        try:
            if self.mongodb_client:
                start_time = asyncio.get_event_loop().time()
                await self.mongodb_client.admin.command('ping')
                response_time = (asyncio.get_event_loop().time() - start_time) * 1000

                health["mongodb"] = True
                health["details"]["mongodb"] = {
                    "status": "healthy",
                    "response_time_ms": round(response_time, 2),
                    "server_info": await self.mongodb_client.server_info()
                }

        except Exception as e:
            health["details"]["mongodb"] = {
                "status": "unhealthy",
                "error": str(e)
            }

        # Check Redis
        try:
            if self.redis_client:
                start_time = asyncio.get_event_loop().time()
                await self.redis_client.ping()
                response_time = (asyncio.get_event_loop().time() - start_time) * 1000

                redis_info = await self.redis_client.info()

                health["redis"] = True
                health["details"]["redis"] = {
                    "status": "healthy",
                    "response_time_ms": round(response_time, 2),
                    "version": redis_info.get("redis_version"),
                    "connected_clients": redis_info.get("connected_clients"),
                    "used_memory_human": redis_info.get("used_memory_human")
                }

        except Exception as e:
            health["details"]["redis"] = {
                "status": "unhealthy",
                "error": str(e)
            }

        # Update connection status
        self._connection_status.update({
            "mongodb": health["mongodb"],
            "redis": health["redis"],
            "last_check": datetime.utcnow()
        })

        return health
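    # Example payload returned by health_check() when both backends are
    # reachable (values illustrative):
    #
    #     {
    #         "mongodb": True,
    #         "redis": True,
    #         "timestamp": "2024-01-01T00:00:00",
    #         "details": {
    #             "mongodb": {"status": "healthy", "response_time_ms": 1.2, "server_info": {...}},
    #             "redis": {"status": "healthy", "response_time_ms": 0.8, "version": "7.2.0", ...}
    #         }
    #     }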
    @property
    def is_connected(self) -> bool:
        """Check if all required connections are established"""
        return self._connection_status["mongodb"] and self._connection_status["redis"]

    @property
    def data_sources(self):
        """Data sources collection"""
        return self.db[DataSourceSchema.collection_name]

    @property
    def processed_files(self):
        """Processed files collection"""
        return self.db[ProcessedFileSchema.collection_name]

    @property
    def quality_reports(self):
        """Quality reports collection"""
        return self.db[QualityReportSchema.collection_name]

    @property
    def ingestion_stats(self):
        """Ingestion statistics collection"""
        return self.db[IngestionStatsSchema.collection_name]

    @property
    def error_logs(self):
        """Error logs collection"""
        return self.db[ErrorLogSchema.collection_name]

    @property
    def monitoring_alerts(self):
        """Monitoring alerts collection"""
        return self.db[MonitoringAlertSchema.collection_name]
# Global database manager instance
db_manager = DatabaseManager()


async def get_database():
    """Dependency function to get database instance"""
    if not db_manager.is_connected:
        await db_manager.connect()
    return db_manager.db


async def get_redis():
    """Dependency function to get Redis client"""
    if not db_manager.is_connected:
        await db_manager.connect()
    return db_manager.redis_client


@asynccontextmanager
async def get_db_session():
    """Context manager for database operations"""
    try:
        if not db_manager.is_connected:
            await db_manager.connect()
        yield db_manager.db
    except Exception as e:
        logger.error(f"Database session error: {e}")
        raise
    finally:
        # Connection pooling handles cleanup automatically
        pass


@asynccontextmanager
async def get_redis_session():
    """Context manager for Redis operations"""
    try:
        if not db_manager.is_connected:
            await db_manager.connect()
        yield db_manager.redis_client
    except Exception as e:
        logger.error(f"Redis session error: {e}")
        raise
    finally:
        # Connection pooling handles cleanup automatically
        pass
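# Minimal usage sketch for the context managers above (collection, topic, and
# payload names are placeholders, not part of the service):
#
#     async with get_db_session() as db:
#         await db.error_logs.insert_one({"message": "example"})
#
#     async with get_redis_session() as r:
#         await r.publish("example_topic", "payload")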
class DatabaseService:
    """High-level database service with common operations"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client

    async def create_data_source(self, source_data: dict) -> str:
        """Create a new data source"""
        try:
            source_data["created_at"] = datetime.utcnow()
            source_data["updated_at"] = datetime.utcnow()
            source_data["status"] = "active"
            source_data["error_count"] = 0
            source_data["total_files_processed"] = 0

            result = await self.db.data_sources.insert_one(source_data)
            return str(result.inserted_id)

        except Exception as e:
            logger.error(f"Error creating data source: {e}")
            raise
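    # Example `source_data` payload (illustrative; all fields other than the
    # ones set above are assumptions about the caller's schema):
    #
    #     {
    #         "name": "sa4cps_ftp",
    #         "type": "ftp",
    #         "enabled": True,
    #         "config": {"host": "ftp.example.com", "path": "/data"}
    #     }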
    async def get_data_source(self, source_id: str) -> Optional[dict]:
        """Get data source by ID"""
        try:
            source = await self.db.data_sources.find_one({"_id": ObjectId(source_id)})
            if source:
                source["_id"] = str(source["_id"])
            return source

        except Exception as e:
            logger.error(f"Error getting data source: {e}")
            return None

    async def update_data_source(self, source_id: str, update_data: dict) -> bool:
        """Update data source"""
        try:
            update_data["updated_at"] = datetime.utcnow()

            result = await self.db.data_sources.update_one(
                {"_id": ObjectId(source_id)},
                {"$set": update_data}
            )

            return result.modified_count > 0

        except Exception as e:
            logger.error(f"Error updating data source: {e}")
            return False
    async def list_data_sources(self, enabled_only: bool = False) -> list:
        """List all data sources"""
        try:
            query = {"enabled": True} if enabled_only else {}
            cursor = self.db.data_sources.find(query).sort("created_at", -1)

            sources = []
            async for source in cursor:
                source["_id"] = str(source["_id"])
                sources.append(source)

            return sources

        except Exception as e:
            logger.error(f"Error listing data sources: {e}")
            return []

    async def log_error(self, error_data: dict):
        """Log an error to the database"""
        try:
            error_data["timestamp"] = datetime.utcnow()
            await self.db.error_logs.insert_one(error_data)

        except Exception as e:
            logger.error(f"Failed to write error log entry: {e}")
    async def update_ingestion_stats(self, stats_data: dict):
        """Update daily ingestion statistics"""
        try:
            today = datetime.utcnow().strftime("%Y-%m-%d")
            stats_data["date"] = today
            stats_data["timestamp"] = datetime.utcnow()

            await self.db.ingestion_stats.update_one(
                {"date": today},
                {"$set": stats_data},
                upsert=True
            )

        except Exception as e:
            logger.error(f"Error updating ingestion stats: {e}")
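    # The upsert above keeps exactly one stats document per calendar day:
    # repeated calls on the same date overwrite that day's fields rather than
    # accumulating history. Illustrative resulting document (stats fields are
    # assumptions about the caller's payload):
    #
    #     {"date": "2024-01-01", "timestamp": datetime(...), "files_processed": 42}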
    async def get_latest_stats(self) -> Optional[dict]:
        """Get latest ingestion statistics"""
        try:
            stats = await self.db.ingestion_stats.find_one(
                sort=[("timestamp", -1)]
            )
            if stats:
                stats["_id"] = str(stats["_id"])
            return stats

        except Exception as e:
            logger.error(f"Error getting latest stats: {e}")
            return None
    async def cleanup_old_data(self, days: int = 30):
        """Clean up old data based on retention policy"""
        try:
            cutoff_date = datetime.utcnow() - timedelta(days=days)

            # Clean up old processed file records
            result1 = await self.db.processed_files.delete_many({
                "processed_at": {"$lt": cutoff_date}
            })

            # Clean up old error logs
            result2 = await self.db.error_logs.delete_many({
                "timestamp": {"$lt": cutoff_date}
            })

            # Clean up old quality reports
            result3 = await self.db.quality_reports.delete_many({
                "processing_time": {"$lt": cutoff_date}
            })

            logger.info(f"Cleaned up old data: {result1.deleted_count} processed files, "
                        f"{result2.deleted_count} error logs, {result3.deleted_count} quality reports")

        except Exception as e:
            logger.error(f"Error cleaning up old data: {e}")
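# Sketch of running the retention cleanup on a schedule (illustrative wiring;
# the interval and entry point are assumptions, not part of this module):
#
#     async def run_cleanup_loop(service: DatabaseService, interval_s: int = 86400):
#         while True:
#             await service.cleanup_old_data(days=30)
#             await asyncio.sleep(interval_s)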
# Export the database manager and service for use in other modules
__all__ = [
    'DatabaseManager', 'DatabaseService', 'db_manager',
    'get_database', 'get_redis', 'get_db_session', 'get_redis_session'
]
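

# Minimal smoke test, assuming local MongoDB and Redis instances at the
# default URLs; illustrative only, not part of the service wiring. Because of
# the relative import above, run it as a module within its package
# (python -m <package>.database), not as a standalone script.
if __name__ == "__main__":
    async def _smoke_test():
        manager = DatabaseManager()
        await manager.connect()
        print(await manager.health_check())
        await manager.disconnect()

    asyncio.run(_smoke_test())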