Simplify data ingestion service
@@ -1,433 +1,245 @@
#!/usr/bin/env python3
"""
Database configuration and connection management for the data ingestion service.
Handles MongoDB connections, index creation, and Redis connections.
MongoDB Database Manager for SA4CPS Data Ingestion
Simple async MongoDB operations for storing .slg_v2 file data
"""

import asyncio
import logging
from typing import Optional
from contextlib import asynccontextmanager
import os
from datetime import datetime
from typing import List, Dict, Any, Optional
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError

import motor.motor_asyncio
import redis.asyncio as redis
from pymongo import IndexModel

from .models import (
DataSourceSchema, ProcessedFileSchema, QualityReportSchema,
IngestionStatsSchema, ErrorLogSchema, MonitoringAlertSchema
)
from config import MONGO_CONFIG

logger = logging.getLogger(__name__)
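The rewritten module reads its connection settings from config.MONGO_CONFIG instead of environment variables. The config module is not part of this diff, so the snippet below is only a minimal sketch of the mapping the new code expects; the two keys are the ones actually read, and the values are placeholders:

# Hypothetical config.py contents -- illustrative only; just these two keys
# are consumed by the new DatabaseManager.
MONGO_CONFIG = {
    "connection_string": "mongodb://localhost:27017",
    "database_name": "sa4cps",   # assumed name, not shown in this commit
}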


class DatabaseManager:
"""Manages database connections and operations"""
"""Manages MongoDB connections and operations for SA4CPS data"""

def __init__(self, mongodb_url: str = None, redis_url: str = None):
self.mongodb_url = mongodb_url or os.getenv("MONGODB_URL", "mongodb://localhost:27017")
self.redis_url = redis_url or os.getenv("REDIS_URL", "redis://localhost:6379")
def __init__(self):
self.client: Optional[AsyncIOMotorClient] = None
self.db = None
self.collections = {}

self.mongodb_client: Optional[motor.motor_asyncio.AsyncIOMotorClient] = None
self.db: Optional[motor.motor_asyncio.AsyncIOMotorDatabase] = None
self.redis_client: Optional[redis.Redis] = None
# MongoDB configuration
self.connection_string = MONGO_CONFIG["connection_string"]
self.database_name = MONGO_CONFIG["database_name"]

self._connection_status = {
"mongodb": False,
"redis": False,
"last_check": None
}
logger.info(f"Database manager initialized for: {self.database_name}")

async def connect(self):
"""Establish connections to MongoDB and Redis"""
try:
await self._connect_mongodb()
await self._connect_redis()
await self._create_indexes()

logger.info("Database connections established successfully")

except Exception as e:
logger.error(f"Error establishing database connections: {e}")
raise

async def _connect_mongodb(self):
"""Connect to MongoDB"""
try:
# Parse database name from URL or use default
db_name = "energy_dashboard"
if self.mongodb_url.count("/") > 2:
db_name = self.mongodb_url.split("/")[-1]

self.mongodb_client = motor.motor_asyncio.AsyncIOMotorClient(
self.mongodb_url,
serverSelectionTimeoutMS=5000,
connectTimeoutMS=5000,
maxPoolSize=50,
minPoolSize=10
)

self.db = self.mongodb_client[db_name]
self.client = AsyncIOMotorClient(self.connection_string)

# Test connection
await self.mongodb_client.admin.command('ping')
await self.client.admin.command('ping')

self._connection_status["mongodb"] = True
logger.info(f"Connected to MongoDB: {self.mongodb_url}")
# Get database and collections
self.db = self.client[self.database_name]
self.collections = {
'files': self.db.sa4cps_files,
'energy_data': self.db.sa4cps_energy_data,
'metadata': self.db.sa4cps_metadata
}

except Exception as e:
self._connection_status["mongodb"] = False
logger.error(f"MongoDB connection failed: {e}")
# Create indexes for better performance
await self._create_indexes()

logger.info(f"Connected to MongoDB: {self.database_name}")

except (ConnectionFailure, ServerSelectionTimeoutError) as e:
logger.error(f"Failed to connect to MongoDB: {e}")
raise

async def _connect_redis(self):
"""Connect to Redis"""
try:
self.redis_client = redis.from_url(
self.redis_url,
encoding="utf-8",
decode_responses=True,
socket_timeout=5,
socket_connect_timeout=5,
health_check_interval=30
)

# Test connection
await self.redis_client.ping()

self._connection_status["redis"] = True
logger.info(f"Connected to Redis: {self.redis_url}")

except Exception as e:
self._connection_status["redis"] = False
logger.error(f"Redis connection failed: {e}")
raise
async def close(self):
"""Close MongoDB connection"""
if self.client:
self.client.close()
logger.info("MongoDB connection closed")

async def ping(self):
"""Test database connection"""
if not self.client:
raise ConnectionFailure("No database connection")

await self.client.admin.command('ping')
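With the Redis plumbing removed, the new manager's lifecycle reduces to connect, use, close. A minimal caller sketch, assuming the connect()/ping()/close() methods shown in this diff (the surrounding script is illustrative):

# Hypothetical caller; DatabaseManager is the simplified class in this file.
import asyncio

async def main():
    manager = DatabaseManager()
    await manager.connect()   # opens the Motor client, selects the DB, builds indexes
    await manager.ping()      # raises if the connection is unavailable
    await manager.close()

asyncio.run(main())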

async def _create_indexes(self):
"""Create database indexes for optimal performance"""
"""Create database indexes for efficient queries"""
try:
schemas = [
DataSourceSchema,
ProcessedFileSchema,
QualityReportSchema,
IngestionStatsSchema,
ErrorLogSchema,
MonitoringAlertSchema
]
# Index on files collection
await self.collections['files'].create_index("filename", unique=True)
await self.collections['files'].create_index("processed_at")

for schema in schemas:
collection = self.db[schema.collection_name]
indexes = schema.get_indexes()

if indexes:
index_models = []
for index_spec in indexes:
keys = index_spec["keys"]
options = {k: v for k, v in index_spec.items() if k != "keys"}
index_models.append(IndexModel(keys, **options))

await collection.create_indexes(index_models)
logger.debug(f"Created {len(index_models)} indexes for {schema.collection_name}")
# Index on energy data collection
await self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
await self.collections['energy_data'].create_index("timestamp")

logger.info("Database indexes created successfully")

except Exception as e:
logger.error(f"Error creating database indexes: {e}")
# Don't raise here - indexes are performance optimization, not critical
logger.warning(f"Failed to create indexes: {e}")
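The new index set matches the two query paths in this module: per-file lookups and time-ordered scans. A query shaped like the sketch below can be served by the compound ("filename", "timestamp") index created above (manager is a connected DatabaseManager instance; the concrete filename is illustrative):

# Hypothetical query sketch; relies on the ("filename", 1), ("timestamp", 1) index.
cursor = manager.collections["energy_data"].find(
    {"filename": "site_a_20240101.slg_v2"}
).sort("timestamp", 1)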

async def disconnect(self):
"""Close all database connections"""
async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool:
"""Store processed .slg_v2 file data in MongoDB"""
try:
if self.redis_client:
await self.redis_client.aclose()
self._connection_status["redis"] = False
current_time = datetime.now()

if self.mongodb_client:
self.mongodb_client.close()
self._connection_status["mongodb"] = False

logger.info("Database connections closed")

except Exception as e:
logger.error(f"Error closing database connections: {e}")

async def health_check(self) -> dict:
"""Check health of database connections"""
health = {
"mongodb": False,
"redis": False,
"timestamp": datetime.utcnow().isoformat(),
"details": {}
}

# Check MongoDB
try:
if self.mongodb_client:
start_time = asyncio.get_event_loop().time()
await self.mongodb_client.admin.command('ping')
response_time = (asyncio.get_event_loop().time() - start_time) * 1000

health["mongodb"] = True
health["details"]["mongodb"] = {
"status": "healthy",
"response_time_ms": round(response_time, 2),
"server_info": await self.mongodb_client.server_info()
}

except Exception as e:
health["details"]["mongodb"] = {
"status": "unhealthy",
"error": str(e)
# Store file metadata
file_metadata = {
"filename": filename,
"record_count": len(records),
"processed_at": current_time,
"file_size": sum(len(str(record)) for record in records),
"status": "processed"
}

# Check Redis
try:
if self.redis_client:
start_time = asyncio.get_event_loop().time()
await self.redis_client.ping()
response_time = (asyncio.get_event_loop().time() - start_time) * 1000

redis_info = await self.redis_client.info()

health["redis"] = True
health["details"]["redis"] = {
"status": "healthy",
"response_time_ms": round(response_time, 2),
"version": redis_info.get("redis_version"),
"connected_clients": redis_info.get("connected_clients"),
"used_memory_human": redis_info.get("used_memory_human")
}

except Exception as e:
health["details"]["redis"] = {
"status": "unhealthy",
"error": str(e)
}

# Update connection status
self._connection_status.update({
"mongodb": health["mongodb"],
"redis": health["redis"],
"last_check": datetime.utcnow()
})

return health

@property
def is_connected(self) -> bool:
"""Check if all required connections are established"""
return self._connection_status["mongodb"] and self._connection_status["redis"]

@property
def data_sources(self):
"""Data sources collection"""
return self.db[DataSourceSchema.collection_name]

@property
def processed_files(self):
"""Processed files collection"""
return self.db[ProcessedFileSchema.collection_name]

@property
def quality_reports(self):
"""Quality reports collection"""
return self.db[QualityReportSchema.collection_name]

@property
def ingestion_stats(self):
"""Ingestion statistics collection"""
return self.db[IngestionStatsSchema.collection_name]

@property
def error_logs(self):
"""Error logs collection"""
return self.db[ErrorLogSchema.collection_name]

@property
def monitoring_alerts(self):
"""Monitoring alerts collection"""
return self.db[MonitoringAlertSchema.collection_name]

# Global database manager instance
db_manager = DatabaseManager()

async def get_database():
"""Dependency function to get database instance"""
if not db_manager.is_connected:
await db_manager.connect()
return db_manager.db

async def get_redis():
"""Dependency function to get Redis client"""
if not db_manager.is_connected:
await db_manager.connect()
return db_manager.redis_client

@asynccontextmanager
async def get_db_session():
"""Context manager for database operations"""
try:
if not db_manager.is_connected:
await db_manager.connect()
yield db_manager.db
except Exception as e:
logger.error(f"Database session error: {e}")
raise
finally:
# Connection pooling handles cleanup automatically
pass

@asynccontextmanager
async def get_redis_session():
"""Context manager for Redis operations"""
try:
if not db_manager.is_connected:
await db_manager.connect()
yield db_manager.redis_client
except Exception as e:
logger.error(f"Redis session error: {e}")
raise
finally:
# Connection pooling handles cleanup automatically
pass
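The removed get_db_session()/get_redis_session() helpers read like framework dependency hooks; the web framework is not shown in this commit, so the following is only an assumed usage sketch of how such a context manager could have been consumed:

# Assumed usage of the removed helper; the calling code is not part of this diff.
async def count_sources():
    async with get_db_session() as db:
        return await db[DataSourceSchema.collection_name].count_documents({})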

class DatabaseService:
"""High-level database service with common operations"""

def __init__(self, db, redis_client):
self.db = db
self.redis = redis_client

async def create_data_source(self, source_data: dict) -> str:
"""Create a new data source"""
try:
source_data["created_at"] = datetime.utcnow()
source_data["updated_at"] = datetime.utcnow()
source_data["status"] = "active"
source_data["error_count"] = 0
source_data["total_files_processed"] = 0

result = await self.db.data_sources.insert_one(source_data)
return str(result.inserted_id)

except Exception as e:
logger.error(f"Error creating data source: {e}")
raise

async def get_data_source(self, source_id: str) -> Optional[dict]:
"""Get data source by ID"""
try:
from bson import ObjectId
source = await self.db.data_sources.find_one({"_id": ObjectId(source_id)})
if source:
source["_id"] = str(source["_id"])
return source

except Exception as e:
logger.error(f"Error getting data source: {e}")
return None

async def update_data_source(self, source_id: str, update_data: dict) -> bool:
"""Update data source"""
try:
from bson import ObjectId
update_data["updated_at"] = datetime.utcnow()

result = await self.db.data_sources.update_one(
{"_id": ObjectId(source_id)},
{"$set": update_data}
)

return result.modified_count > 0

except Exception as e:
logger.error(f"Error updating data source: {e}")
return False

async def list_data_sources(self, enabled_only: bool = False) -> list:
"""List all data sources"""
try:
query = {"enabled": True} if enabled_only else {}
cursor = self.db.data_sources.find(query).sort("created_at", -1)

sources = []
async for source in cursor:
source["_id"] = str(source["_id"])
sources.append(source)

return sources

except Exception as e:
logger.error(f"Error listing data sources: {e}")
return []

async def log_error(self, error_data: dict):
"""Log an error to the database"""
try:
error_data["timestamp"] = datetime.utcnow()
await self.db.error_logs.insert_one(error_data)

except Exception as e:
logger.error(f"Error logging error: {e}")

async def update_ingestion_stats(self, stats_data: dict):
"""Update daily ingestion statistics"""
try:
today = datetime.utcnow().strftime("%Y-%m-%d")
stats_data["date"] = today
stats_data["timestamp"] = datetime.utcnow()

await self.db.ingestion_stats.update_one(
{"date": today},
{"$set": stats_data},
# Insert or update file record
await self.collections['files'].replace_one(
{"filename": filename},
file_metadata,
upsert=True
)

# Add filename and processed timestamp to each record
for record in records:
record["filename"] = filename
record["processed_at"] = current_time

# Insert energy data records
if records:
result = await self.collections['energy_data'].insert_many(records)
inserted_count = len(result.inserted_ids)
logger.info(f"Stored {inserted_count} records from {filename}")
return True

return False

except Exception as e:
logger.error(f"Error updating ingestion stats: {e}")

async def get_latest_stats(self) -> Optional[dict]:
"""Get latest ingestion statistics"""
try:
stats = await self.db.ingestion_stats.find_one(
sort=[("timestamp", -1)]
logger.error(f"Error storing data for {filename}: {e}")

# Store error metadata
error_metadata = {
"filename": filename,
"processed_at": current_time,
"status": "error",
"error_message": str(e)
}

await self.collections['files'].replace_one(
{"filename": filename},
error_metadata,
upsert=True
)
if stats:
stats["_id"] = str(stats["_id"])

return False
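store_file_data() takes the already-parsed rows of a .slg_v2 file as plain dicts, writes one metadata document per file into sa4cps_files and one document per record into sa4cps_energy_data, and returns False when the record list is empty or the write fails. A call sketch (inside an async function, with a connected manager; record fields other than "timestamp" are assumptions, since the diff only shows that records are dicts and that "timestamp" is indexed):

# Hypothetical ingestion sketch; "value" and "unit" are assumed field names.
records = [
    {"timestamp": datetime(2024, 1, 1, 0, 0), "value": 12.4, "unit": "kWh"},
    {"timestamp": datetime(2024, 1, 1, 0, 15), "value": 11.9, "unit": "kWh"},
]
stored = await manager.store_file_data("site_a_20240101.slg_v2", records)
if not stored:
    logger.warning("no records stored for site_a_20240101.slg_v2")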

async def get_processed_files(self) -> List[str]:
"""Get list of successfully processed files"""
try:
cursor = self.collections['files'].find(
{"status": "processed"},
{"filename": 1, "_id": 0}
)

files = []
async for doc in cursor:
files.append(doc["filename"])

return files

except Exception as e:
logger.error(f"Error getting processed files: {e}")
return []

async def get_file_info(self, filename: str) -> Optional[Dict[str, Any]]:
"""Get information about a specific file"""
try:
return await self.collections['files'].find_one({"filename": filename})
except Exception as e:
logger.error(f"Error getting file info for {filename}: {e}")
return None

async def get_stats(self) -> Dict[str, Any]:
"""Get database statistics"""
try:
stats = {
"database": self.database_name,
"timestamp": datetime.now().isoformat()
}

# Count documents in each collection
for name, collection in self.collections.items():
try:
count = await collection.count_documents({})
stats[f"{name}_count"] = count
except Exception as e:
stats[f"{name}_count"] = f"error: {e}"

# Get recent files
try:
recent_files = []
cursor = self.collections['files'].find(
{},
{"filename": 1, "processed_at": 1, "record_count": 1, "status": 1, "_id": 0}
).sort("processed_at", -1).limit(5)

async for doc in cursor:
if doc.get("processed_at"):
doc["processed_at"] = doc["processed_at"].isoformat()
recent_files.append(doc)

stats["recent_files"] = recent_files

except Exception as e:
stats["recent_files"] = f"error: {e}"

return stats

except Exception as e:
logger.error(f"Error getting latest stats: {e}")
return None
logger.error(f"Error getting database stats: {e}")
return {"error": str(e), "timestamp": datetime.now().isoformat()}
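Based on the code above, a successful get_stats() result is a flat dict plus a short recent-files list, roughly shaped as follows (all values illustrative):

# Illustrative get_stats() output; values are made up, keys follow the code above.
{
    "database": "sa4cps",            # database_name from MONGO_CONFIG (assumed value)
    "timestamp": "2024-01-01T12:00:00",
    "files_count": 42,
    "energy_data_count": 103680,
    "metadata_count": 0,
    "recent_files": [
        {"filename": "site_a_20240101.slg_v2",
         "processed_at": "2024-01-01T11:58:03",
         "record_count": 96,
         "status": "processed"},
    ],
}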

async def cleanup_old_data(self, days: int = 30):
"""Clean up old data based on retention policy"""
async def get_energy_data(self,
filename: Optional[str] = None,
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
limit: int = 100) -> List[Dict[str, Any]]:
"""Retrieve energy data with optional filtering"""
try:
cutoff_date = datetime.utcnow() - datetime.timedelta(days=days)
query = {}

# Clean up old processed files records
result1 = await self.db.processed_files.delete_many({
"processed_at": {"$lt": cutoff_date}
})
if filename:
query["filename"] = filename

# Clean up old error logs
result2 = await self.db.error_logs.delete_many({
"timestamp": {"$lt": cutoff_date}
})
if start_time or end_time:
time_query = {}
if start_time:
time_query["$gte"] = start_time
if end_time:
time_query["$lte"] = end_time
query["timestamp"] = time_query

# Clean up old quality reports
result3 = await self.db.quality_reports.delete_many({
"processing_time": {"$lt": cutoff_date}
})
cursor = self.collections['energy_data'].find(query).sort("timestamp", -1).limit(limit)

logger.info(f"Cleaned up old data: {result1.deleted_count} processed files, "
f"{result2.deleted_count} error logs, {result3.deleted_count} quality reports")
data = []
async for doc in cursor:
# Convert ObjectId to string and datetime to ISO string
if "_id" in doc:
doc["_id"] = str(doc["_id"])
if "timestamp" in doc and hasattr(doc["timestamp"], "isoformat"):
doc["timestamp"] = doc["timestamp"].isoformat()
if "processed_at" in doc and hasattr(doc["processed_at"], "isoformat"):
doc["processed_at"] = doc["processed_at"].isoformat()

data.append(doc)

return data

except Exception as e:
logger.error(f"Error cleaning up old data: {e}")

# Export the database manager and service for use in other modules
__all__ = [
'DatabaseManager', 'DatabaseService', 'db_manager',
'get_database', 'get_redis', 'get_db_session', 'get_redis_session'
]
logger.error(f"Error retrieving energy data: {e}")
return []
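get_energy_data() is the corresponding read path. A windowed-query sketch (inside an async function, with a connected manager; the filename and the "value" field are illustrative):

# Hypothetical read sketch against the simplified manager.
from datetime import datetime, timedelta

end = datetime.utcnow()
rows = await manager.get_energy_data(
    filename="site_a_20240101.slg_v2",
    start_time=end - timedelta(hours=24),
    end_time=end,
    limit=500,
)
for row in rows:
    print(row["timestamp"], row.get("value"))   # timestamps come back as ISO strings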