Add scan cache tracking and improve health checks

- Track scanned FTP directories in MongoDB to avoid redundant scans
- Add endpoints to view and clear scan cache
- Improve health check logic for better startup and error reporting
- Add readiness endpoint for deployment probes
- Add test script for health check improvements
- Increase logging verbosity for debugging
This commit is contained in:
rafaeldpsilva
2025-09-22 15:12:40 +01:00
parent c3ba7c0dc0
commit 41b8753a92
6 changed files with 440 additions and 95 deletions

View File

@@ -1,8 +1,3 @@
"""
MongoDB Database Manager for SA4CPS Data Ingestion
Simple sync MongoDB operations for storing .sgl_v2 file data
"""
import logging
from datetime import datetime
from typing import List, Dict, Any, Optional
@@ -15,37 +10,32 @@ logger = logging.getLogger(__name__)
class DatabaseManager:
"""Manages MongoDB connections and operations for SA4CPS data"""
def __init__(self):
    """Set up a disconnected manager and read the MongoDB settings.

    No connection is opened here: the client and database handle start
    as None and the collection map starts empty.
    """
    # Nothing is connected yet.
    self.client: Optional[MongoClient] = None
    self.db, self.collections = None, {}

    # Connection settings are taken from the MONGO_CONFIG mapping.
    mongo_settings = MONGO_CONFIG
    self.connection_string = mongo_settings["connection_string"]
    self.database_name = mongo_settings["database_name"]

    logger.info(f"Database manager initialized for: {self.database_name}")
async def connect(self):
"""Connect to MongoDB"""
try:
logger.info(f"Connecting to MongoDB at: {self.connection_string}")
self.client = MongoClient(self.connection_string, serverSelectionTimeoutMS=5000)
# Test connection
await self.ping()
# Get database and collections
self.db = self.client[self.database_name]
self.collections = {
'files': self.db.sa4cps_files,
'energy_data': self.db.sa4cps_energy_data,
'metadata': self.db.sa4cps_metadata
'metadata': self.db.sa4cps_metadata,
'scanned_directories': self.db.sa4cps_scanned_directories
}
# Create indexes for better performance
self._create_indexes()
logger.info(f"Connected to MongoDB database: {self.database_name}")
@@ -66,9 +56,21 @@ class DatabaseManager:
raise ConnectionFailure("No database connection")
try:
# The ping command is cheap and does not require auth.
self.client.admin.command('ping')
# Use async approach with timeout
import asyncio
import concurrent.futures
# Run the ping command in a thread pool to avoid blocking
loop = asyncio.get_event_loop()
with concurrent.futures.ThreadPoolExecutor() as pool:
await asyncio.wait_for(
loop.run_in_executor(pool, self.client.admin.command, 'ping'),
timeout=3.0 # 3 second timeout for ping
)
logger.debug("MongoDB ping successful")
except asyncio.TimeoutError:
logger.error("MongoDB ping timeout after 3 seconds")
raise ConnectionFailure("MongoDB ping timeout")
except ConnectionFailure as e:
logger.error(f"MongoDB ping failed - Server not available: {e}")
raise
@@ -77,22 +79,22 @@ class DatabaseManager:
raise ConnectionFailure(f"Ping failed: {e}")
def _create_indexes(self):
    """Create the indexes used by the files, energy-data and scan-tracking collections.

    Indexes are requested one after another in a fixed order. Any
    exception is logged as a warning and not re-raised, so the first
    failure also skips the indexes that come after it.
    """
    # (key into self.collections, index keys, extra create_index options)
    index_plan = (
        # Files collection
        ('files', "filename", {"unique": True}),
        ('files', "processed_at", {}),
        # Energy data collection
        ('energy_data', [("filename", 1), ("timestamp", 1)], {}),
        ('energy_data', "timestamp", {}),
        # Scanned-directory tracking collection
        ('scanned_directories', "directory_path", {"unique": True}),
        ('scanned_directories', "last_scanned", {}),
        ('scanned_directories', "scan_status", {}),
    )
    try:
        for collection_key, index_keys, options in index_plan:
            self.collections[collection_key].create_index(index_keys, **options)
        logger.info("Database indexes created successfully")
    except Exception as e:
        logger.warning(f"Failed to create indexes: {e}")
async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool:
"""Store processed .sgl_v2 file data in MongoDB"""
try:
current_time = datetime.now()
@@ -175,6 +177,77 @@ class DatabaseManager:
logger.error(f"Error getting file info for {filename}: {e}")
return None
# Directory scanning tracking methods
async def is_directory_scanned(self, directory_path: str, since_timestamp: datetime = None) -> bool:
    """Report whether a completed scan is on record for a directory.

    Args:
        directory_path: Directory to look up in the scan-tracking collection.
        since_timestamp: When given, only a record whose ``last_scanned``
            is at or after this value counts as a match.

    Returns:
        True when a matching record exists. False when none exists or
        when the lookup raised (the error is logged).
    """
    try:
        # The recency constraint is only part of the query when a cut-off was supplied.
        recency = {"last_scanned": {"$gte": since_timestamp}} if since_timestamp else {}
        criteria = {
            "directory_path": directory_path,
            "scan_status": "complete",
            **recency,
        }
        match = self.collections['scanned_directories'].find_one(criteria)
    except Exception as e:
        logger.error(f"Error checking directory scan status for {directory_path}: {e}")
        return False
    return match is not None
async def mark_directory_scanned(self, directory_path: str, file_count: int, ftp_last_modified: datetime = None) -> bool:
    """Record a completed scan of a directory, stamped with the current time.

    Args:
        directory_path: Directory that was scanned; also the lookup key
            of the stored record.
        file_count: Number of files to store on the record.
        ftp_last_modified: Optional modification time to store alongside
            the record; omitted from the document when not given.

    Returns:
        True once the record has been written. False if anything raised
        (the error is logged).
    """
    try:
        record = dict(
            directory_path=directory_path,
            last_scanned=datetime.now(),
            file_count=file_count,
            scan_status="complete",
        )
        if ftp_last_modified:
            record.update(ftp_last_modified=ftp_last_modified)

        # upsert=True: replace the existing document for this path, or insert one.
        tracking = self.collections['scanned_directories']
        tracking.replace_one({"directory_path": directory_path}, record, upsert=True)

        logger.debug(f"Marked directory as scanned: {directory_path} ({file_count} files)")
        return True
    except Exception as e:
        logger.error(f"Error marking directory as scanned {directory_path}: {e}")
        return False
async def get_scanned_directories(self) -> List[Dict[str, Any]]:
    """Return every document in the scan-tracking collection as a list.

    Returns:
        The documents yielded by an unfiltered ``find()``, or an empty
        list if the query raised (the error is logged).
    """
    try:
        tracking = self.collections['scanned_directories']
        # Drain the cursor into a list while still inside the try block.
        return [*tracking.find()]
    except Exception as e:
        logger.error(f"Error getting scanned directories: {e}")
        return []
async def should_skip_directory(self, directory_path: str, ftp_last_modified: datetime = None) -> bool:
    """Decide whether a directory can be skipped on this scan pass.

    Args:
        directory_path: Directory to look up in the scan-tracking collection.
        ftp_last_modified: Optional modification time for the directory.

    Returns:
        False when no completed scan is recorded. When both
        ``ftp_last_modified`` and a stored ``last_scanned`` are present,
        True only if the modification time is not later than the last
        scan. True otherwise. False if anything raised (the error is
        logged), so the directory is scanned rather than skipped.
    """
    try:
        criteria = {"directory_path": directory_path, "scan_status": "complete"}
        previous = self.collections['scanned_directories'].find_one(criteria)

        # No completed scan on record: it has to be scanned.
        if not previous:
            return False

        # With both timestamps available, skip only if nothing changed since the scan.
        if ftp_last_modified:
            last_scanned = previous.get("last_scanned")
            if last_scanned:
                return ftp_last_modified <= last_scanned

        # Already scanned and no timestamps to compare: treat as historical data.
        return True
    except Exception as e:
        logger.error(f"Error determining if directory should be skipped {directory_path}: {e}")
        return False
async def get_stats(self) -> Dict[str, Any]:
"""Get database statistics"""
try: