Add dynamic energy data collections per building

Energy data is now stored in a separate collection for each
SLGs/Community/Building directory. Collections are created and indexed
dynamically on first use, and file metadata and API queries have been
updated to support the new structure.
rafaeldpsilva
2025-10-08 14:05:00 +01:00
parent 8ff20935fc
commit 38fb3e6e96
3 changed files with 199 additions and 39 deletions
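
As a quick orientation for reviewers, here is a minimal sketch of the naming scheme this commit introduces. The helper collection_name_for is illustrative only and not part of the diff; the real logic lives in DatabaseManager._extract_level3_path and _sanitize_collection_name below.

# Illustrative sketch, not part of the diff: maps a full FTP path to the
# per-building collection name, mirroring the behaviour implemented below.
def collection_name_for(directory_path: str) -> str:
    parts = directory_path.strip('/').split('/')
    if len(parts) >= 3 and parts[0] == 'SLGs':
        # /SLGs/Community/Building/... -> energy_data__Community_Building
        return f"energy_data__{parts[1]}_{parts[2]}"
    # Paths outside the expected structure share a default collection
    return "energy_data__other"

assert collection_name_for("/SLGs/CommunityA/Building1/SubDir") == "energy_data__CommunityA_Building1"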

View File

@@ -15,6 +15,7 @@ class DatabaseManager:
self.client: Optional[MongoClient] = None
self.db = None
self.collections = {}
self.energy_collections_cache = {} # Cache for dynamically created energy data collections
self.connection_string = MONGO_CONFIG["connection_string"]
self.database_name = MONGO_CONFIG["database_name"]
@@ -31,12 +32,11 @@ class DatabaseManager:
self.db = self.client[self.database_name]
self.collections = {
'files': self.db.sa4cps_files,
'energy_data': self.db.sa4cps_energy_data,
'metadata': self.db.sa4cps_metadata,
'scanned_directories': self.db.sa4cps_scanned_directories
}
self._create_indexes()
self._create_base_indexes()
logger.info(f"Connected to MongoDB database: {self.database_name}")
@@ -78,13 +78,12 @@ class DatabaseManager:
logger.error(f"MongoDB ping failed with error: {e}")
raise ConnectionFailure(f"Ping failed: {e}")
def _create_indexes(self):
def _create_base_indexes(self):
"""Create indexes for base collections (not energy data collections)"""
try:
self.collections['files'].create_index("filename", unique=True)
self.collections['files'].create_index("processed_at")
self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
self.collections['energy_data'].create_index("timestamp")
self.collections['files'].create_index("directory_path")
self.collections['scanned_directories'].create_index("directory_path", unique=True)
self.collections['scanned_directories'].create_index("last_scanned")
@@ -94,13 +93,97 @@ class DatabaseManager:
except Exception as e:
logger.warning(f"Failed to create indexes: {e}")
async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool:
def _extract_level3_path(self, directory_path: str) -> Optional[str]:
"""Extract level 3 directory path (SLGs/Community/Building) from full path"""
# Expected structure: /SLGs/Community/Building/...
parts = directory_path.strip('/').split('/')
if len(parts) >= 3 and parts[0] == 'SLGs':
# Return SLGs/Community/Building
return '/'.join(parts[:3])
return None
def _sanitize_collection_name(self, level3_path: str) -> str:
"""Convert level 3 directory path to valid MongoDB collection name
Example: SLGs/CommunityA/Building1 -> energy_data__CommunityA_Building1
"""
parts = level3_path.strip('/').split('/')
if len(parts) >= 3 and parts[0] == 'SLGs':
# Use Community_Building as the collection suffix
collection_suffix = f"{parts[1]}_{parts[2]}"
collection_name = f"energy_data__{collection_suffix}"
return collection_name
# Fallback: sanitize the entire path
sanitized = level3_path.replace('/', '_').replace('.', '_').replace(' ', '_')
sanitized = sanitized.strip('_')
return f"energy_data__{sanitized}"
def _get_energy_collection(self, directory_path: str):
"""Get or create energy data collection for a specific level 3 directory path"""
level3_path = self._extract_level3_path(directory_path)
if not level3_path:
logger.warning(f"Could not extract level 3 path from: {directory_path}, using default collection")
# Fallback to a default collection for non-standard paths
collection_name = "energy_data__other"
else:
collection_name = self._sanitize_collection_name(level3_path)
# Check cache first
if collection_name in self.energy_collections_cache:
return self.energy_collections_cache[collection_name]
# Create/get collection
collection = self.db[collection_name]
# Create indexes for this energy collection
try:
collection.create_index([("filename", 1), ("timestamp", 1)])
collection.create_index("timestamp")
collection.create_index("meter_id")
logger.debug(f"Created indexes for collection: {collection_name}")
except Exception as e:
logger.warning(f"Failed to create indexes for {collection_name}: {e}")
# Cache the collection
self.energy_collections_cache[collection_name] = collection
logger.info(f"Initialized energy data collection: {collection_name} for path: {directory_path}")
return collection
def _list_energy_collections(self) -> List[str]:
"""List all energy data collections in the database"""
try:
all_collections = self.db.list_collection_names()
# Filter collections that start with 'energy_data__'
energy_collections = [c for c in all_collections if c.startswith('energy_data__')]
return energy_collections
except Exception as e:
logger.error(f"Error listing energy collections: {e}")
return []
async def store_file_data(self, filename: str, records: List[Dict[str, Any]], directory_path: str = None) -> bool:
try:
current_time = datetime.now()
# Determine which collection to use based on directory path
if directory_path:
energy_collection = self._get_energy_collection(directory_path)
level3_path = self._extract_level3_path(directory_path)
else:
logger.warning(f"No directory path provided for {filename}, using default collection")
energy_collection = self._get_energy_collection("/SLGs/unknown/unknown")
level3_path = None
# Store file metadata
file_metadata = {
"filename": filename,
"directory_path": directory_path,
"level3_path": level3_path,
"record_count": len(records),
"processed_at": current_time,
"file_size": sum(len(str(record)) for record in records),
@@ -118,12 +201,13 @@ class DatabaseManager:
for record in records:
record["filename"] = filename
record["processed_at"] = current_time
record["directory_path"] = directory_path
# Insert energy data records
# Insert energy data records into the appropriate collection
if records:
result = self.collections['energy_data'].insert_many(records)
result = energy_collection.insert_many(records)
inserted_count = len(result.inserted_ids)
logger.debug(f"Stored {inserted_count} records from {filename}")
logger.debug(f"Stored {inserted_count} records from {filename} to {energy_collection.name}")
return True
return False
@@ -134,6 +218,7 @@ class DatabaseManager:
# Store error metadata
error_metadata = {
"filename": filename,
"directory_path": directory_path,
"processed_at": current_time,
"status": "error",
"error_message": str(e)
@@ -178,8 +263,14 @@ class DatabaseManager:
return None
# Directory scanning tracking methods
# Note: Only level 4+ directories (/SLGs/Community/Building/SubDir) are tracked
# to avoid unnecessary caching of high-level organizational directories
async def is_directory_scanned(self, directory_path: str, since_timestamp: datetime = None) -> bool:
"""Check if directory has been scanned recently"""
"""Check if directory has been scanned recently
Note: Only level 4+ directories are tracked in the database
"""
try:
query = {"directory_path": directory_path, "scan_status": "complete"}
if since_timestamp:
@@ -249,14 +340,14 @@ class DatabaseManager:
return False
async def get_stats(self) -> Dict[str, Any]:
"""Get database statistics"""
"""Get database statistics including all energy collections"""
try:
stats = {
"database": self.database_name,
"timestamp": datetime.now().isoformat()
}
# Count documents in each collection
# Count documents in base collections
for name, collection in self.collections.items():
try:
count = collection.count_documents({})
@@ -264,12 +355,35 @@ class DatabaseManager:
except Exception as e:
stats[f"{name}_count"] = f"error: {e}"
# Get all energy collections and their counts
try:
energy_collections = self._list_energy_collections()
energy_stats = []
total_energy_records = 0
for collection_name in energy_collections:
collection = self.db[collection_name]
count = collection.count_documents({})
total_energy_records += count
energy_stats.append({
"collection": collection_name,
"record_count": count
})
stats["energy_collections"] = energy_stats
stats["total_energy_collections"] = len(energy_collections)
stats["total_energy_records"] = total_energy_records
except Exception as e:
stats["energy_collections"] = f"error: {e}"
# Get recent files
try:
recent_files = []
cursor = self.collections['files'].find(
{},
{"filename": 1, "processed_at": 1, "record_count": 1, "status": 1, "_id": 0}
{"filename": 1, "processed_at": 1, "record_count": 1, "status": 1, "directory_path": 1, "level3_path": 1, "_id": 0}
).sort("processed_at", -1).limit(5)
for doc in cursor:
@@ -292,8 +406,17 @@ class DatabaseManager:
filename: Optional[str] = None,
start_time: Optional[datetime] = None,
end_time: Optional[datetime] = None,
directory_path: Optional[str] = None,
limit: int = 100) -> List[Dict[str, Any]]:
"""Retrieve energy data with optional filtering"""
"""Retrieve energy data with optional filtering
Args:
filename: Filter by specific filename
start_time: Filter by start timestamp
end_time: Filter by end timestamp
directory_path: Filter by specific directory path (level 3). If None, queries all collections
limit: Maximum number of records to return
"""
try:
query = {}
@@ -308,22 +431,48 @@ class DatabaseManager:
time_query["$lte"] = end_time
query["timestamp"] = time_query
cursor = self.collections['energy_data'].find(query).sort("timestamp", -1).limit(limit)
data = []
for doc in cursor:
# Convert ObjectId to string and datetime to ISO string
if "_id" in doc:
doc["_id"] = str(doc["_id"])
if "timestamp" in doc and hasattr(doc["timestamp"], "isoformat"):
doc["timestamp"] = doc["timestamp"].isoformat()
if "processed_at" in doc and hasattr(doc["processed_at"], "isoformat"):
doc["processed_at"] = doc["processed_at"].isoformat()
data.append(doc)
# If directory_path is specified, query only that collection
if directory_path:
collection = self._get_energy_collection(directory_path)
cursor = collection.find(query).sort("timestamp", -1).limit(limit)
for doc in cursor:
data.append(self._format_energy_document(doc))
else:
# Query across all energy collections
energy_collection_names = self._list_energy_collections()
# Collect data from all collections, then sort and limit
all_data = []
per_collection_limit = max(limit, 1000) # Get more from each to ensure we have enough after sorting
for collection_name in energy_collection_names:
collection = self.db[collection_name]
cursor = collection.find(query).sort("timestamp", -1).limit(per_collection_limit)
for doc in cursor:
all_data.append(self._format_energy_document(doc))
# Sort all data by timestamp and apply final limit
all_data.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
data = all_data[:limit]
return data
except Exception as e:
logger.error(f"Error retrieving energy data: {e}")
return []
def _format_energy_document(self, doc: Dict[str, Any]) -> Dict[str, Any]:
"""Format energy document for API response"""
# Convert ObjectId to string and datetime to ISO string
if "_id" in doc:
doc["_id"] = str(doc["_id"])
if "timestamp" in doc and hasattr(doc["timestamp"], "isoformat"):
doc["timestamp"] = doc["timestamp"].isoformat()
if "processed_at" in doc and hasattr(doc["processed_at"], "isoformat"):
doc["processed_at"] = doc["processed_at"].isoformat()
return doc
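
To make the query-path change concrete, a hedged usage sketch follows. The instance name db_manager and the wrapper function are assumptions for illustration only, and get_energy_data is assumed to be awaited like the other async methods in this file.

from datetime import datetime, timedelta

async def show_recent_readings(db_manager):
    # db_manager is assumed to be a connected DatabaseManager instance.
    since = datetime.now() - timedelta(days=1)

    # Scoped query: only the energy_data__CommunityA_Building1 collection is hit.
    building_rows = await db_manager.get_energy_data(
        directory_path="/SLGs/CommunityA/Building1",
        start_time=since,
        limit=50,
    )

    # Unscoped query: every energy_data__* collection is scanned, then the
    # merged results are re-sorted by timestamp and truncated to the limit.
    all_rows = await db_manager.get_energy_data(start_time=since, limit=50)
    return building_rows, all_rows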

View File

@@ -17,6 +17,7 @@ class FTPFileInfo:
path: str
name: str
size: int
directory_path: str # Directory containing the file
modified_time: Optional[datetime] = None
@@ -79,6 +80,7 @@ class FTPMonitor:
self.status = "stopped"
return
await asyncio.wait_for(self.check_for_new_files(), timeout=300.0) # 5 minute timeout
self.status = "running"
# Optionally skip initial scan and wait for first scheduled interval
@@ -202,9 +204,13 @@ class FTPMonitor:
visited_dirs.add(normalized_path)
# Check if directory should be skipped based on previous scans
if await self.db_manager.should_skip_directory(normalized_path):
logger.info(f"Skipping previously scanned directory: {normalized_path}")
# Determine directory depth (level 4 = /SLGs/Community/Building/SubDir)
path_parts = normalized_path.strip('/').split('/')
directory_level = len(path_parts)
# Check if directory should be skipped based on previous scans (only for level 4+)
if directory_level >= 4 and await self.db_manager.should_skip_directory(normalized_path):
logger.info(f"Skipping previously scanned level {directory_level} directory: {normalized_path}")
skipped_dirs += 1
continue
@@ -221,7 +227,7 @@ class FTPMonitor:
# Count files found in this directory
files_found_in_dir = 0
for line in dir_list:
parts = line.split()
if len(parts) >= 9:
@@ -257,7 +263,8 @@ class FTPMonitor:
files.append(FTPFileInfo(
path=full_path,
name=filename,
size=size
size=size,
directory_path=normalized_path
))
files_found_in_dir += 1
@@ -265,10 +272,13 @@ class FTPMonitor:
logger.warning(f"Could not parse file info for: {filename}")
ftp.cwd(original_dir)
# Mark directory as scanned
await self.db_manager.mark_directory_scanned(normalized_path, files_found_in_dir)
logger.debug(f"Completed scanning: {normalized_path} ({files_found_in_dir} files found)")
# Mark directory as scanned (only for level 4+ directories)
if directory_level >= 4:
await self.db_manager.mark_directory_scanned(normalized_path, files_found_in_dir)
logger.debug(f"Completed scanning level {directory_level} directory: {normalized_path} ({files_found_in_dir} files found)")
else:
logger.debug(f"Completed scanning level {directory_level} directory (not saved to cache): {normalized_path} ({files_found_in_dir} files found)")
except Exception as e:
logger.warning(f"Error scanning directory {normalized_path}: {e}")
@@ -277,7 +287,7 @@ class FTPMonitor:
logger.info(f"Iterative scan completed. Scanned: {scanned_dirs} directories, Skipped: {skipped_dirs} directories (Total visited: {len(visited_dirs)})")
async def _process_file(self, ftp: FTP, file_info: FTPFileInfo) -> bool:
logger.debug(f"Processing file: {file_info.path} ({file_info.size} bytes)")
logger.debug(f"Processing file: {file_info.path} ({file_info.size} bytes) from directory: {file_info.directory_path}")
try:
with tempfile.NamedTemporaryFile(mode='wb', suffix='.slg_v2', delete=False) as temp_file:
@@ -289,8 +299,9 @@ class FTPMonitor:
records = await self.processor.process_file(temp_path, file_info.name)
if records:
await self.db_manager.store_file_data(file_info.name, records)
logger.debug(f"Stored {len(records)} records from {file_info.name}")
# Pass directory path to store_file_data for collection selection
await self.db_manager.store_file_data(file_info.name, records, file_info.directory_path)
logger.debug(f"Stored {len(records)} records from {file_info.name} to collection for {file_info.directory_path}")
return True
else:
logger.warning(f"No valid records found in {file_info.name}")

View File

@@ -32,7 +32,7 @@ class MockDatabaseManager:
"""Mock get list of processed files"""
return list(self.processed_files)
async def store_file_data(self, filename: str, records: List) -> bool:
async def store_file_data(self, filename: str, records: List, directory_path: str = None) -> bool:
"""Mock store file data"""
self.processed_files.add(filename)
self.stored_files[filename] = records