Simplify data ingestion service

rafaeldpsilva
2025-09-10 15:47:10 +01:00
parent 13556347b0
commit b7e734e0d2
13 changed files with 474 additions and 4440 deletions


@@ -1,433 +1,245 @@
#!/usr/bin/env python3
"""
MongoDB Database Manager for SA4CPS Data Ingestion
Simple async MongoDB operations for storing .slg_v2 file data
"""
import logging
from datetime import datetime
from typing import List, Dict, Any, Optional

from motor.motor_asyncio import AsyncIOMotorClient
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError

from config import MONGO_CONFIG
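
# Assumed shape of MONGO_CONFIG (defined in config.py, which is not part of this
# file), inferred from how it is used below:
#
#   MONGO_CONFIG = {
#       "connection_string": "mongodb://localhost:27017",
#       "database_name": "sa4cps",
#   }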
logger = logging.getLogger(__name__)


class DatabaseManager:
"""Manages database connections and operations"""
"""Manages MongoDB connections and operations for SA4CPS data"""
def __init__(self, mongodb_url: str = None, redis_url: str = None):
self.mongodb_url = mongodb_url or os.getenv("MONGODB_URL", "mongodb://localhost:27017")
self.redis_url = redis_url or os.getenv("REDIS_URL", "redis://localhost:6379")
def __init__(self):
self.client: Optional[AsyncIOMotorClient] = None
self.db = None
self.collections = {}
self.mongodb_client: Optional[motor.motor_asyncio.AsyncIOMotorClient] = None
self.db: Optional[motor.motor_asyncio.AsyncIOMotorDatabase] = None
self.redis_client: Optional[redis.Redis] = None
# MongoDB configuration
self.connection_string = MONGO_CONFIG["connection_string"]
self.database_name = MONGO_CONFIG["database_name"]
self._connection_status = {
"mongodb": False,
"redis": False,
"last_check": None
}
logger.info(f"Database manager initialized for: {self.database_name}")

    async def connect(self):
        """Connect to MongoDB and set up collections"""
        try:
            self.client = AsyncIOMotorClient(self.connection_string)

            # Test connection
            await self.client.admin.command('ping')

            # Get database and collections
            self.db = self.client[self.database_name]
            self.collections = {
                'files': self.db.sa4cps_files,
                'energy_data': self.db.sa4cps_energy_data,
                'metadata': self.db.sa4cps_metadata
            }

            # Create indexes for better performance
            await self._create_indexes()

            logger.info(f"Connected to MongoDB: {self.database_name}")

        except (ConnectionFailure, ServerSelectionTimeoutError) as e:
            logger.error(f"Failed to connect to MongoDB: {e}")
            raise

    async def close(self):
        """Close MongoDB connection"""
        if self.client:
            self.client.close()
            logger.info("MongoDB connection closed")

    async def ping(self):
        """Test database connection"""
        if not self.client:
            raise ConnectionFailure("No database connection")
        await self.client.admin.command('ping')

    async def _create_indexes(self):
        """Create database indexes for efficient queries"""
        try:
            # Index on files collection
            await self.collections['files'].create_index("filename", unique=True)
            await self.collections['files'].create_index("processed_at")

            # Index on energy data collection
            await self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
            await self.collections['energy_data'].create_index("timestamp")

            logger.info("Database indexes created successfully")

        except Exception as e:
            # Don't raise here - indexes are a performance optimization, not critical
            logger.warning(f"Failed to create indexes: {e}")

    async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool:
        """Store processed .slg_v2 file data in MongoDB"""
        current_time = datetime.now()
        try:
            # Store file metadata
            file_metadata = {
                "filename": filename,
                "record_count": len(records),
                "processed_at": current_time,
                "file_size": sum(len(str(record)) for record in records),
                "status": "processed"
            }

            # Insert or update file record
            await self.collections['files'].replace_one(
                {"filename": filename},
                file_metadata,
                upsert=True
            )

            # Add filename and processed timestamp to each record
            for record in records:
                record["filename"] = filename
                record["processed_at"] = current_time

            # Insert energy data records
            if records:
                result = await self.collections['energy_data'].insert_many(records)
                inserted_count = len(result.inserted_ids)
                logger.info(f"Stored {inserted_count} records from {filename}")
                return True

            return False

        except Exception as e:
            logger.error(f"Error storing data for {filename}: {e}")

            # Store error metadata
            error_metadata = {
                "filename": filename,
                "processed_at": current_time,
                "status": "error",
                "error_message": str(e)
            }
            await self.collections['files'].replace_one(
                {"filename": filename},
                error_metadata,
                upsert=True
            )

            return False
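
    # Note: the structure of each record in `records` comes from the .slg_v2 parser
    # and is not defined in this file. The indexes and queries below assume each
    # record is a flat dict with at least a "timestamp" field, for example
    # (illustrative values only):
    #   {"timestamp": datetime(2025, 9, 10, 15, 0), "sensor_id": "A1", "value_kwh": 1.25}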

    async def get_processed_files(self) -> List[str]:
        """Get list of successfully processed files"""
        try:
            cursor = self.collections['files'].find(
                {"status": "processed"},
                {"filename": 1, "_id": 0}
            )
            files = []
            async for doc in cursor:
                files.append(doc["filename"])
            return files
        except Exception as e:
            logger.error(f"Error getting processed files: {e}")
            return []

    async def get_file_info(self, filename: str) -> Optional[Dict[str, Any]]:
        """Get information about a specific file"""
        try:
            return await self.collections['files'].find_one({"filename": filename})
        except Exception as e:
            logger.error(f"Error getting file info for {filename}: {e}")
            return None

    async def get_stats(self) -> Dict[str, Any]:
        """Get database statistics"""
        try:
            stats = {
                "database": self.database_name,
                "timestamp": datetime.now().isoformat()
            }

            # Count documents in each collection
            for name, collection in self.collections.items():
                try:
                    count = await collection.count_documents({})
                    stats[f"{name}_count"] = count
                except Exception as e:
                    stats[f"{name}_count"] = f"error: {e}"

            # Get recent files
            try:
                recent_files = []
                cursor = self.collections['files'].find(
                    {},
                    {"filename": 1, "processed_at": 1, "record_count": 1, "status": 1, "_id": 0}
                ).sort("processed_at", -1).limit(5)
                async for doc in cursor:
                    if doc.get("processed_at"):
                        doc["processed_at"] = doc["processed_at"].isoformat()
                    recent_files.append(doc)
                stats["recent_files"] = recent_files
            except Exception as e:
                stats["recent_files"] = f"error: {e}"

            return stats

        except Exception as e:
            logger.error(f"Error getting database stats: {e}")
            return {"error": str(e), "timestamp": datetime.now().isoformat()}

    async def get_energy_data(self,
                              filename: Optional[str] = None,
                              start_time: Optional[datetime] = None,
                              end_time: Optional[datetime] = None,
                              limit: int = 100) -> List[Dict[str, Any]]:
        """Retrieve energy data with optional filtering"""
        try:
            query = {}

            if filename:
                query["filename"] = filename

            if start_time or end_time:
                time_query = {}
                if start_time:
                    time_query["$gte"] = start_time
                if end_time:
                    time_query["$lte"] = end_time
                query["timestamp"] = time_query

            cursor = self.collections['energy_data'].find(query).sort("timestamp", -1).limit(limit)

            data = []
            async for doc in cursor:
                # Convert ObjectId to string and datetime to ISO string
                if "_id" in doc:
                    doc["_id"] = str(doc["_id"])
                if "timestamp" in doc and hasattr(doc["timestamp"], "isoformat"):
                    doc["timestamp"] = doc["timestamp"].isoformat()
                if "processed_at" in doc and hasattr(doc["processed_at"], "isoformat"):
                    doc["processed_at"] = doc["processed_at"].isoformat()
                data.append(doc)

            return data

        except Exception as e:
            logger.error(f"Error retrieving energy data: {e}")
            return []
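

# Minimal usage sketch, not part of the service code above: it assumes a MongoDB
# instance reachable via MONGO_CONFIG, and the sample filename and record are
# hypothetical placeholders to show the call sequence.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        db = DatabaseManager()
        await db.connect()
        try:
            sample = [{"timestamp": datetime.now(), "value": 1.0}]
            await db.store_file_data("example.slg_v2", sample)
            print(await db.get_stats())
            print(await db.get_energy_data(filename="example.slg_v2", limit=10))
        finally:
            await db.close()

    asyncio.run(_demo())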