Switch to PyMongo, update config and requirements, fix FTP extension typo

- Replace Motor (async) with PyMongo (sync) in database manager
- Update environment variable names for FTP and MongoDB config
- Remove unused dependencies from requirements.txt
- Fix file extension typo: .slg_v2 → .sgl_v2 throughout code and docs
- Add debug prints for MongoDB env vars in config
- Update FTP monitor to use correct file extension and PyMongo
- Adjust FastAPI descriptions for new extension
rafaeldpsilva
2025-09-11 11:45:19 +01:00
parent b2a5b3d229
commit 2932e0a424
6 changed files with 152 additions and 156 deletions
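The config and requirements changes listed in the commit message are not part of the file diff shown below. As a rough sketch only of the pattern the message describes (env-var-driven MongoDB settings plus debug prints), with hypothetical variable names and defaults that are not taken from the commit:

import os
from pymongo import MongoClient

# Hypothetical env var names; the commit's actual names are not visible in this diff.
MONGO_CONFIG = {
    "connection_string": os.getenv("MONGO_URI", "mongodb://localhost:27017"),
    "database_name": os.getenv("MONGO_DB", "sa4cps"),
}

# Debug prints for the MongoDB env vars, as the commit message mentions.
print(f"MongoDB connection string: {MONGO_CONFIG['connection_string']}")
print(f"MongoDB database name: {MONGO_CONFIG['database_name']}")

# Sync client with a short server-selection timeout, matching the diff below.
client = MongoClient(MONGO_CONFIG["connection_string"], serverSelectionTimeoutMS=5000)
client.admin.command("ping")  # cheap connectivity check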


@@ -1,14 +1,12 @@
#!/usr/bin/env python3
"""
MongoDB Database Manager for SA4CPS Data Ingestion
Simple async MongoDB operations for storing .slg_v2 file data
Simple sync MongoDB operations for storing .sgl_v2 file data
"""
import asyncio
import logging
from datetime import datetime
from typing import List, Dict, Any, Optional
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
from config import MONGO_CONFIG
@@ -18,26 +16,27 @@ logger = logging.getLogger(__name__)
class DatabaseManager:
"""Manages MongoDB connections and operations for SA4CPS data"""
def __init__(self):
self.client: Optional[AsyncIOMotorClient] = None
self.client: Optional[MongoClient] = None
self.db = None
self.collections = {}
# MongoDB configuration
self.connection_string = MONGO_CONFIG["connection_string"]
self.database_name = MONGO_CONFIG["database_name"]
logger.info(f"Database manager initialized for: {self.database_name}")
async def connect(self):
"""Connect to MongoDB"""
try:
self.client = AsyncIOMotorClient(self.connection_string)
logger.info(f"Connecting to MongoDB at: {self.connection_string}")
self.client = MongoClient(self.connection_string, serverSelectionTimeoutMS=5000)
# Test connection
await self.client.admin.command('ping')
await self.ping()
# Get database and collections
self.db = self.client[self.database_name]
self.collections = {
@@ -45,50 +44,58 @@ class DatabaseManager:
'energy_data': self.db.sa4cps_energy_data,
'metadata': self.db.sa4cps_metadata
}
# Create indexes for better performance
await self._create_indexes()
logger.info(f"Connected to MongoDB: {self.database_name}")
self._create_indexes()
logger.info(f"Connected to MongoDB database: {self.database_name}")
except (ConnectionFailure, ServerSelectionTimeoutError) as e:
logger.error(f"Failed to connect to MongoDB: {e}")
raise
async def close(self):
"""Close MongoDB connection"""
if self.client:
self.client.close()
logger.info("MongoDB connection closed")
async def ping(self):
"""Test database connection"""
if not self.client:
raise ConnectionFailure("No database connection")
await self.client.admin.command('ping')
async def _create_indexes(self):
try:
# The ping command is cheap and does not require auth.
self.client.admin.command('ping')
logger.info("MongoDB ping successful")
except ConnectionFailure as e:
logger.error(f"MongoDB ping failed - Server not available: {e}")
raise
except Exception as e:
logger.error(f"MongoDB ping failed with error: {e}")
raise ConnectionFailure(f"Ping failed: {e}")
def _create_indexes(self):
"""Create database indexes for efficient queries"""
try:
# Index on files collection
await self.collections['files'].create_index("filename", unique=True)
await self.collections['files'].create_index("processed_at")
self.collections['files'].create_index("filename", unique=True)
self.collections['files'].create_index("processed_at")
# Index on energy data collection
await self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
await self.collections['energy_data'].create_index("timestamp")
self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
self.collections['energy_data'].create_index("timestamp")
logger.info("Database indexes created successfully")
except Exception as e:
logger.warning(f"Failed to create indexes: {e}")
async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool:
"""Store processed .slg_v2 file data in MongoDB"""
"""Store processed .sgl_v2 file data in MongoDB"""
try:
current_time = datetime.now()
# Store file metadata
file_metadata = {
"filename": filename,
@@ -97,31 +104,31 @@ class DatabaseManager:
"file_size": sum(len(str(record)) for record in records),
"status": "processed"
}
# Insert or update file record
await self.collections['files'].replace_one(
self.collections['files'].replace_one(
{"filename": filename},
file_metadata,
upsert=True
)
# Add filename and processed timestamp to each record
for record in records:
record["filename"] = filename
record["processed_at"] = current_time
# Insert energy data records
if records:
result = await self.collections['energy_data'].insert_many(records)
result = self.collections['energy_data'].insert_many(records)
inserted_count = len(result.inserted_ids)
logger.info(f"Stored {inserted_count} records from {filename}")
return True
return False
except Exception as e:
logger.error(f"Error storing data for {filename}: {e}")
# Store error metadata
error_metadata = {
"filename": filename,
@@ -129,15 +136,15 @@ class DatabaseManager:
"status": "error",
"error_message": str(e)
}
await self.collections['files'].replace_one(
self.collections['files'].replace_one(
{"filename": filename},
error_metadata,
upsert=True
)
return False
async def get_processed_files(self) -> List[str]:
"""Get list of successfully processed files"""
try:
@@ -145,25 +152,25 @@ class DatabaseManager:
{"status": "processed"},
{"filename": 1, "_id": 0}
)
files = []
async for doc in cursor:
for doc in cursor:
files.append(doc["filename"])
return files
except Exception as e:
logger.error(f"Error getting processed files: {e}")
return []
async def get_file_info(self, filename: str) -> Optional[Dict[str, Any]]:
"""Get information about a specific file"""
try:
return await self.collections['files'].find_one({"filename": filename})
return self.collections['files'].find_one({"filename": filename})
except Exception as e:
logger.error(f"Error getting file info for {filename}: {e}")
return None
async def get_stats(self) -> Dict[str, Any]:
"""Get database statistics"""
try:
@@ -171,15 +178,15 @@ class DatabaseManager:
"database": self.database_name,
"timestamp": datetime.now().isoformat()
}
# Count documents in each collection
for name, collection in self.collections.items():
try:
count = await collection.count_documents({})
count = collection.count_documents({})
stats[f"{name}_count"] = count
except Exception as e:
stats[f"{name}_count"] = f"error: {e}"
# Get recent files
try:
recent_files = []
@@ -187,24 +194,24 @@ class DatabaseManager:
{},
{"filename": 1, "processed_at": 1, "record_count": 1, "status": 1, "_id": 0}
).sort("processed_at", -1).limit(5)
async for doc in cursor:
for doc in cursor:
if doc.get("processed_at"):
doc["processed_at"] = doc["processed_at"].isoformat()
recent_files.append(doc)
stats["recent_files"] = recent_files
except Exception as e:
stats["recent_files"] = f"error: {e}"
return stats
except Exception as e:
logger.error(f"Error getting database stats: {e}")
return {"error": str(e), "timestamp": datetime.now().isoformat()}
async def get_energy_data(self,
filename: Optional[str] = None,
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
@@ -212,10 +219,10 @@ class DatabaseManager:
"""Retrieve energy data with optional filtering"""
try:
query = {}
if filename:
query["filename"] = filename
if start_time or end_time:
time_query = {}
if start_time:
@@ -223,11 +230,11 @@ class DatabaseManager:
if end_time:
time_query["$lte"] = end_time
query["timestamp"] = time_query
cursor = self.collections['energy_data'].find(query).sort("timestamp", -1).limit(limit)
data = []
async for doc in cursor:
for doc in cursor:
# Convert ObjectId to string and datetime to ISO string
if "_id" in doc:
doc["_id"] = str(doc["_id"])
@@ -235,11 +242,11 @@ class DatabaseManager:
doc["timestamp"] = doc["timestamp"].isoformat()
if "processed_at" in doc and hasattr(doc["processed_at"], "isoformat"):
doc["processed_at"] = doc["processed_at"].isoformat()
data.append(doc)
return data
except Exception as e:
logger.error(f"Error retrieving energy data: {e}")
return []
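After this change the underlying driver calls are synchronous, but the DatabaseManager methods keep their async signatures, so callers still await them (each call now blocks the event loop while PyMongo talks to the server). A minimal usage sketch, assuming the class lives in a module named database and using illustrative record fields:

import asyncio
from datetime import datetime

from database import DatabaseManager  # module name assumed


async def main():
    db = DatabaseManager()
    await db.connect()  # still awaited, though PyMongo does the work synchronously

    # Illustrative record; real .sgl_v2 fields are defined by the FTP ingestion code.
    records = [{"timestamp": datetime.now(), "value": 1.5}]
    await db.store_file_data("example.sgl_v2", records)

    stats = await db.get_stats()
    print(stats)

    recent = await db.get_energy_data(filename="example.sgl_v2", limit=10)
    print(f"Retrieved {len(recent)} records")

    await db.close()


asyncio.run(main())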