Add scan cache tracking and improve health checks
- Track scanned FTP directories in MongoDB to avoid redundant scans (see the usage sketch below)
- Add endpoints to view and clear scan cache
- Improve health check logic for better startup and error reporting
- Add readiness endpoint for deployment probes
- Add test script for health check improvements
- Increase logging verbosity for debugging
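A minimal sketch of how a scan loop could use the new cache methods, assuming the ingestion service holds a connected DatabaseManager. The list_remote_files helper and the (path, mtime) pairs are hypothetical stand-ins for the service's FTP client, not part of this commit:

from datetime import datetime
from typing import List, Optional, Tuple

async def list_remote_files(path: str) -> List[str]:
    """Hypothetical stand-in for the service's FTP listing helper."""
    return []

async def scan_ftp_directories(db, directories: List[Tuple[str, Optional[datetime]]]):
    """Scan only directories the cache does not mark as complete."""
    for path, ftp_mtime in directories:
        # should_skip_directory() compares the FTP modification time
        # against the last complete scan recorded in MongoDB.
        if await db.should_skip_directory(path, ftp_mtime):
            continue
        files = await list_remote_files(path)
        # ... ingest each file via store_file_data() ...
        # Record the scan so later passes skip this directory.
        await db.mark_directory_scanned(path, len(files), ftp_mtime)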
@@ -1,8 +1,3 @@
-"""
-MongoDB Database Manager for SA4CPS Data Ingestion
-Simple sync MongoDB operations for storing .sgl_v2 file data
-"""
-
 import logging
 from datetime import datetime
 from typing import List, Dict, Any, Optional
@@ -15,37 +10,32 @@ logger = logging.getLogger(__name__)
 
 
 class DatabaseManager:
     """Manages MongoDB connections and operations for SA4CPS data"""
 
     def __init__(self):
         self.client: Optional[MongoClient] = None
         self.db = None
         self.collections = {}
 
         # MongoDB configuration
         self.connection_string = MONGO_CONFIG["connection_string"]
         self.database_name = MONGO_CONFIG["database_name"]
 
         logger.info(f"Database manager initialized for: {self.database_name}")
 
     async def connect(self):
         """Connect to MongoDB"""
         try:
             logger.info(f"Connecting to MongoDB at: {self.connection_string}")
             self.client = MongoClient(self.connection_string, serverSelectionTimeoutMS=5000)
 
             # Test connection
             await self.ping()
 
             # Get database and collections
             self.db = self.client[self.database_name]
             self.collections = {
                 'files': self.db.sa4cps_files,
                 'energy_data': self.db.sa4cps_energy_data,
-                'metadata': self.db.sa4cps_metadata
+                'metadata': self.db.sa4cps_metadata,
+                'scanned_directories': self.db.sa4cps_scanned_directories
             }
 
             # Create indexes for better performance
             self._create_indexes()
 
             logger.info(f"Connected to MongoDB database: {self.database_name}")
@@ -66,9 +56,21 @@ class DatabaseManager:
             raise ConnectionFailure("No database connection")
 
         try:
-            # The ping command is cheap and does not require auth.
-            self.client.admin.command('ping')
+            # Use async approach with timeout
+            import asyncio
+            import concurrent.futures
+
+            # Run the ping command in a thread pool to avoid blocking
+            loop = asyncio.get_event_loop()
+            with concurrent.futures.ThreadPoolExecutor() as pool:
+                await asyncio.wait_for(
+                    loop.run_in_executor(pool, self.client.admin.command, 'ping'),
+                    timeout=3.0  # 3 second timeout for ping
+                )
+            logger.debug("MongoDB ping successful")
+        except asyncio.TimeoutError:
+            logger.error("MongoDB ping timeout after 3 seconds")
+            raise ConnectionFailure("MongoDB ping timeout")
         except ConnectionFailure as e:
             logger.error(f"MongoDB ping failed - Server not available: {e}")
             raise
@@ -77,22 +79,22 @@ class DatabaseManager:
             raise ConnectionFailure(f"Ping failed: {e}")
 
     def _create_indexes(self):
         """Create database indexes for efficient queries"""
         try:
             # Index on files collection
             self.collections['files'].create_index("filename", unique=True)
             self.collections['files'].create_index("processed_at")
 
             # Index on energy data collection
             self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
             self.collections['energy_data'].create_index("timestamp")
 
+            self.collections['scanned_directories'].create_index("directory_path", unique=True)
+            self.collections['scanned_directories'].create_index("last_scanned")
+            self.collections['scanned_directories'].create_index("scan_status")
+
             logger.info("Database indexes created successfully")
         except Exception as e:
             logger.warning(f"Failed to create indexes: {e}")
 
     async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool:
         """Store processed .sgl_v2 file data in MongoDB"""
         try:
             current_time = datetime.now()
@@ -175,6 +177,77 @@ class DatabaseManager:
             logger.error(f"Error getting file info for {filename}: {e}")
             return None
 
+    # Directory scanning tracking methods
+    async def is_directory_scanned(self, directory_path: str, since_timestamp: datetime = None) -> bool:
+        """Check if directory has been scanned recently"""
+        try:
+            query = {"directory_path": directory_path, "scan_status": "complete"}
+            if since_timestamp:
+                query["last_scanned"] = {"$gte": since_timestamp}
+
+            result = self.collections['scanned_directories'].find_one(query)
+            return result is not None
+        except Exception as e:
+            logger.error(f"Error checking directory scan status for {directory_path}: {e}")
+            return False
+
+    async def mark_directory_scanned(self, directory_path: str, file_count: int, ftp_last_modified: datetime = None) -> bool:
+        """Mark directory as scanned with current timestamp"""
+        try:
+            scan_record = {
+                "directory_path": directory_path,
+                "last_scanned": datetime.now(),
+                "file_count": file_count,
+                "scan_status": "complete"
+            }
+
+            if ftp_last_modified:
+                scan_record["ftp_last_modified"] = ftp_last_modified
+
+            # Use upsert to update existing or create new record
+            self.collections['scanned_directories'].replace_one(
+                {"directory_path": directory_path},
+                scan_record,
+                upsert=True
+            )
+
+            logger.debug(f"Marked directory as scanned: {directory_path} ({file_count} files)")
+            return True
+
+        except Exception as e:
+            logger.error(f"Error marking directory as scanned {directory_path}: {e}")
+            return False
+
+    async def get_scanned_directories(self) -> List[Dict[str, Any]]:
+        """Get all scanned directory records"""
+        try:
+            cursor = self.collections['scanned_directories'].find()
+            return list(cursor)
+        except Exception as e:
+            logger.error(f"Error getting scanned directories: {e}")
+            return []
+
+    async def should_skip_directory(self, directory_path: str, ftp_last_modified: datetime = None) -> bool:
+        """Determine if directory should be skipped based on scan history and modification time"""
+        try:
+            scan_record = self.collections['scanned_directories'].find_one(
+                {"directory_path": directory_path, "scan_status": "complete"}
+            )
+
+            if not scan_record:
+                return False  # Never scanned, should scan
+
+            # If we have FTP modification time and it's newer than our last scan, don't skip
+            if ftp_last_modified and scan_record.get("last_scanned"):
+                return ftp_last_modified <= scan_record["last_scanned"]
+
+            # If directory was scanned successfully, skip it (assuming it's historical data)
+            return True
+
+        except Exception as e:
+            logger.error(f"Error determining if directory should be skipped {directory_path}: {e}")
+            return False
+
     async def get_stats(self) -> Dict[str, Any]:
         """Get database statistics"""
         try:
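For completeness, a hedged example of how a deployment probe might call the new readiness endpoint. The host, port, and /ready path are assumptions for illustration; the route itself is defined outside this file:

import urllib.request

READINESS_URL = "http://localhost:8000/ready"  # hypothetical host, port, and path

def check_ready(url: str = READINESS_URL, timeout: float = 3.0) -> bool:
    """Return True only if the service answers the probe with HTTP 200."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.status == 200
    except OSError:
        # Connection refused, timeout, or HTTP error: treat as not ready.
        return False

if __name__ == "__main__":
    raise SystemExit(0 if check_ready() else 1)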