Switch to PyMongo, update config and requirements, fix FTP extension typo

- Replace Motor (async) with PyMongo (sync) in database manager
- Update environment variable names for FTP and MongoDB config
- Remove unused dependencies from requirements.txt
- Fix file extension typo: .slg_v2 → .sgl_v2 throughout code and docs
- Add debug prints for MongoDB env vars in config
- Update FTP monitor to use correct file extension and PyMongo
- Adjust FastAPI descriptions for new extension
This commit is contained in:
rafaeldpsilva
2025-09-11 11:45:19 +01:00
parent b2a5b3d229
commit 2932e0a424
6 changed files with 152 additions and 156 deletions

View File

@@ -84,7 +84,7 @@ Set these in the `docker-compose.yml`:
environment: environment:
- FTP_SA4CPS_HOST=ftp.sa4cps.pt # FTP server hostname - FTP_SA4CPS_HOST=ftp.sa4cps.pt # FTP server hostname
- FTP_SA4CPS_PORT=21 # FTP port (default: 21) - FTP_SA4CPS_PORT=21 # FTP port (default: 21)
- FTP_SA4CPS_USERNAME=anonymous # FTP username - FTP_SA4CPS_USERNAME= # FTP username
- FTP_SA4CPS_PASSWORD= # FTP password (empty for anonymous) - FTP_SA4CPS_PASSWORD= # FTP password (empty for anonymous)
- FTP_SA4CPS_REMOTE_PATH=/ # Remote directory path - FTP_SA4CPS_REMOTE_PATH=/ # Remote directory path
``` ```

View File

@@ -3,26 +3,12 @@ fastapi==0.104.1
uvicorn==0.24.0 uvicorn==0.24.0
pydantic==2.5.0 pydantic==2.5.0
# Database dependencies # Database dependencies - using PyMongo (sync) instead of Motor (async)
motor==3.3.2
pymongo==4.6.0 pymongo==4.6.0
redis==5.0.1
# FTP handling # FTP handling
ftputil==5.0.4 ftputil==5.0.4
# Data processing
pandas==2.1.4
numpy==1.25.2
openpyxl==3.1.2
xlrd==2.0.1
# Async HTTP client
httpx==0.25.2
# Logging and monitoring
structlog==23.2.0
# Date/time utilities # Date/time utilities
python-dateutil==2.8.2 python-dateutil==2.8.2
@@ -32,4 +18,3 @@ typing-extensions==4.8.0
# Development dependencies (optional) # Development dependencies (optional)
pytest==7.4.3 pytest==7.4.3
pytest-asyncio==0.21.1 pytest-asyncio==0.21.1
pytest-cov==4.1.0

View File

@@ -9,20 +9,24 @@ from typing import Dict, Any
# FTP Configuration for SA4CPS server # FTP Configuration for SA4CPS server
FTP_CONFIG: Dict[str, Any] = { FTP_CONFIG: Dict[str, Any] = {
"host": os.getenv("SA4CPS_FTP_HOST", "ftp.sa4cps.pt"), "host": os.getenv("FTP_SA4CPS_HOST", "ftp.sa4cps.pt"),
"username": os.getenv("SA4CPS_FTP_USER", "curvascarga@sa4cps.pt"), "username": os.getenv("FTP_SA4CPS_USERNAME", "curvascarga@sa4cps.pt"),
"password": os.getenv("SA4CPS_FTP_PASS", ""), # Set via environment variable "password": os.getenv("FTP_SA4CPS_PASSWORD", 'n$WFtz9+bleN'), # Set via environment variable
"base_path": os.getenv("SA4CPS_FTP_PATH", "/SLGs/Faial/PT0010000000015181AA/"), "base_path": os.getenv("FTP_SA4CPS_REMOTE_PATH", "/SLGs/Faial/"),
"check_interval": int(os.getenv("SA4CPS_CHECK_INTERVAL", "21600")) # 6 hours default "check_interval": int(os.getenv("FTP_CHECK_INTERVAL", "21600")) # 6 hours default
} }
# MongoDB Configuration # MongoDB Configuration
# Debug environment variables
print(f"DEBUG: MONGO_URL env var = {os.getenv('MONGO_URL', 'NOT SET')}")
print(f"DEBUG: All env vars starting with MONGO: {[k for k in os.environ.keys() if k.startswith('MONGO')]}")
MONGO_CONFIG: Dict[str, Any] = { MONGO_CONFIG: Dict[str, Any] = {
"connection_string": os.getenv( "connection_string": os.getenv(
"MONGODB_URL", "MONGO_URL",
"mongodb://admin:admin@localhost:27018/sa4cps_energy?authSource=admin" "mongodb://admin:password123@localhost:27017/digitalmente_ingestion?authSource=admin"
), ),
"database_name": os.getenv("MONGODB_DATABASE", "sa4cps_energy") "database_name": os.getenv("MONGODB_DATABASE", "digitalmente_ingestion")
} }
# Logging Configuration # Logging Configuration

View File

@@ -1,14 +1,12 @@
#!/usr/bin/env python3
""" """
MongoDB Database Manager for SA4CPS Data Ingestion MongoDB Database Manager for SA4CPS Data Ingestion
Simple async MongoDB operations for storing .slg_v2 file data Simple sync MongoDB operations for storing .sgl_v2 file data
""" """
import asyncio
import logging import logging
from datetime import datetime from datetime import datetime
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from motor.motor_asyncio import AsyncIOMotorClient from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
from config import MONGO_CONFIG from config import MONGO_CONFIG
@@ -20,7 +18,7 @@ class DatabaseManager:
"""Manages MongoDB connections and operations for SA4CPS data""" """Manages MongoDB connections and operations for SA4CPS data"""
def __init__(self): def __init__(self):
self.client: Optional[AsyncIOMotorClient] = None self.client: Optional[MongoClient] = None
self.db = None self.db = None
self.collections = {} self.collections = {}
@@ -33,10 +31,11 @@ class DatabaseManager:
async def connect(self): async def connect(self):
"""Connect to MongoDB""" """Connect to MongoDB"""
try: try:
self.client = AsyncIOMotorClient(self.connection_string) logger.info(f"Connecting to MongoDB at: {self.connection_string}")
self.client = MongoClient(self.connection_string, serverSelectionTimeoutMS=5000)
# Test connection # Test connection
await self.client.admin.command('ping') await self.ping()
# Get database and collections # Get database and collections
self.db = self.client[self.database_name] self.db = self.client[self.database_name]
@@ -47,9 +46,9 @@ class DatabaseManager:
} }
# Create indexes for better performance # Create indexes for better performance
await self._create_indexes() self._create_indexes()
logger.info(f"Connected to MongoDB: {self.database_name}") logger.info(f"Connected to MongoDB database: {self.database_name}")
except (ConnectionFailure, ServerSelectionTimeoutError) as e: except (ConnectionFailure, ServerSelectionTimeoutError) as e:
logger.error(f"Failed to connect to MongoDB: {e}") logger.error(f"Failed to connect to MongoDB: {e}")
@@ -66,26 +65,34 @@ class DatabaseManager:
if not self.client: if not self.client:
raise ConnectionFailure("No database connection") raise ConnectionFailure("No database connection")
await self.client.admin.command('ping') try:
# The ping command is cheap and does not require auth.
self.client.admin.command('ping')
logger.info("MongoDB ping successful")
except ConnectionFailure as e:
logger.error(f"MongoDB ping failed - Server not available: {e}")
raise
except Exception as e:
logger.error(f"MongoDB ping failed with error: {e}")
raise ConnectionFailure(f"Ping failed: {e}")
async def _create_indexes(self): def _create_indexes(self):
"""Create database indexes for efficient queries""" """Create database indexes for efficient queries"""
try: try:
# Index on files collection # Index on files collection
await self.collections['files'].create_index("filename", unique=True) self.collections['files'].create_index("filename", unique=True)
await self.collections['files'].create_index("processed_at") self.collections['files'].create_index("processed_at")
# Index on energy data collection # Index on energy data collection
await self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)]) self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
await self.collections['energy_data'].create_index("timestamp") self.collections['energy_data'].create_index("timestamp")
logger.info("Database indexes created successfully") logger.info("Database indexes created successfully")
except Exception as e: except Exception as e:
logger.warning(f"Failed to create indexes: {e}") logger.warning(f"Failed to create indexes: {e}")
async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool: async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool:
"""Store processed .slg_v2 file data in MongoDB""" """Store processed .sgl_v2 file data in MongoDB"""
try: try:
current_time = datetime.now() current_time = datetime.now()
@@ -99,7 +106,7 @@ class DatabaseManager:
} }
# Insert or update file record # Insert or update file record
await self.collections['files'].replace_one( self.collections['files'].replace_one(
{"filename": filename}, {"filename": filename},
file_metadata, file_metadata,
upsert=True upsert=True
@@ -112,7 +119,7 @@ class DatabaseManager:
# Insert energy data records # Insert energy data records
if records: if records:
result = await self.collections['energy_data'].insert_many(records) result = self.collections['energy_data'].insert_many(records)
inserted_count = len(result.inserted_ids) inserted_count = len(result.inserted_ids)
logger.info(f"Stored {inserted_count} records from {filename}") logger.info(f"Stored {inserted_count} records from {filename}")
return True return True
@@ -130,7 +137,7 @@ class DatabaseManager:
"error_message": str(e) "error_message": str(e)
} }
await self.collections['files'].replace_one( self.collections['files'].replace_one(
{"filename": filename}, {"filename": filename},
error_metadata, error_metadata,
upsert=True upsert=True
@@ -147,7 +154,7 @@ class DatabaseManager:
) )
files = [] files = []
async for doc in cursor: for doc in cursor:
files.append(doc["filename"]) files.append(doc["filename"])
return files return files
@@ -159,7 +166,7 @@ class DatabaseManager:
async def get_file_info(self, filename: str) -> Optional[Dict[str, Any]]: async def get_file_info(self, filename: str) -> Optional[Dict[str, Any]]:
"""Get information about a specific file""" """Get information about a specific file"""
try: try:
return await self.collections['files'].find_one({"filename": filename}) return self.collections['files'].find_one({"filename": filename})
except Exception as e: except Exception as e:
logger.error(f"Error getting file info for {filename}: {e}") logger.error(f"Error getting file info for {filename}: {e}")
return None return None
@@ -175,7 +182,7 @@ class DatabaseManager:
# Count documents in each collection # Count documents in each collection
for name, collection in self.collections.items(): for name, collection in self.collections.items():
try: try:
count = await collection.count_documents({}) count = collection.count_documents({})
stats[f"{name}_count"] = count stats[f"{name}_count"] = count
except Exception as e: except Exception as e:
stats[f"{name}_count"] = f"error: {e}" stats[f"{name}_count"] = f"error: {e}"
@@ -188,7 +195,7 @@ class DatabaseManager:
{"filename": 1, "processed_at": 1, "record_count": 1, "status": 1, "_id": 0} {"filename": 1, "processed_at": 1, "record_count": 1, "status": 1, "_id": 0}
).sort("processed_at", -1).limit(5) ).sort("processed_at", -1).limit(5)
async for doc in cursor: for doc in cursor:
if doc.get("processed_at"): if doc.get("processed_at"):
doc["processed_at"] = doc["processed_at"].isoformat() doc["processed_at"] = doc["processed_at"].isoformat()
recent_files.append(doc) recent_files.append(doc)
@@ -227,7 +234,7 @@ class DatabaseManager:
cursor = self.collections['energy_data'].find(query).sort("timestamp", -1).limit(limit) cursor = self.collections['energy_data'].find(query).sort("timestamp", -1).limit(limit)
data = [] data = []
async for doc in cursor: for doc in cursor:
# Convert ObjectId to string and datetime to ISO string # Convert ObjectId to string and datetime to ISO string
if "_id" in doc: if "_id" in doc:
doc["_id"] = str(doc["_id"]) doc["_id"] = str(doc["_id"])

View File

@@ -5,10 +5,10 @@ Monitors ftp.sa4cps.pt for new monthly files
""" """
import asyncio import asyncio
import ftplib from ftplib import FTP
import logging import logging
import os import os
from datetime import datetime, timedelta from datetime import datetime
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from dataclasses import dataclass from dataclasses import dataclass
import tempfile import tempfile
@@ -44,9 +44,7 @@ class FTPMonitor:
self.ftp_user = FTP_CONFIG["username"] self.ftp_user = FTP_CONFIG["username"]
self.ftp_pass = FTP_CONFIG["password"] self.ftp_pass = FTP_CONFIG["password"]
self.base_path = FTP_CONFIG["base_path"] self.base_path = FTP_CONFIG["base_path"]
self.check_interval = FTP_CONFIG["check_interval"]
# Check interval: 6 hours (files are monthly, so frequent checks aren't needed)
self.check_interval = FTP_CONFIG.get("check_interval", 6 * 3600) # 6 hours
logger.info(f"FTP Monitor initialized for {self.ftp_host}") logger.info(f"FTP Monitor initialized for {self.ftp_host}")
@@ -77,7 +75,7 @@ class FTPMonitor:
try: try:
# Connect to FTP server # Connect to FTP server
with ftplib.FTP(self.ftp_host) as ftp: with FTP(self.ftp_host) as ftp:
ftp.login(self.ftp_user, self.ftp_pass) ftp.login(self.ftp_user, self.ftp_pass)
logger.info(f"Connected to FTP server: {self.ftp_host}") logger.info(f"Connected to FTP server: {self.ftp_host}")
@@ -107,8 +105,8 @@ class FTPMonitor:
logger.error(f"FTP check failed: {e}") logger.error(f"FTP check failed: {e}")
raise raise
async def _find_slg_files(self, ftp: ftplib.FTP) -> List[FTPFileInfo]: async def _find_slg_files(self, ftp: FTP) -> List[FTPFileInfo]:
"""Find .slg_v2 files in the FTP directory structure""" """Find .sgl_v2 files in the FTP directory structure"""
files = [] files = []
try: try:
@@ -119,14 +117,16 @@ class FTPMonitor:
# Get directory listing # Get directory listing
dir_list = [] dir_list = []
ftp.retrlines('LIST', dir_list.append) ftp.retrlines('LIST', dir_list.append)
logger.info(f"Received {len(dir_list)} directory entries")
for line in dir_list: for line in dir_list:
print(line)
parts = line.split() parts = line.split()
if len(parts) >= 9: if len(parts) >= 9:
filename = parts[-1] filename = parts[-1]
# Check if it's a .slg_v2 file # Check if it's a .slg_v2 file
if filename.endswith('.slg_v2'): if filename.endswith('.sgl_v2'):
print('found file')
try: try:
size = int(parts[4]) size = int(parts[4])
full_path = f"{self.base_path.rstrip('/')}/{filename}" full_path = f"{self.base_path.rstrip('/')}/{filename}"
@@ -147,7 +147,7 @@ class FTPMonitor:
logger.error(f"Error scanning FTP directory: {e}") logger.error(f"Error scanning FTP directory: {e}")
return [] return []
async def _process_file(self, ftp: ftplib.FTP, file_info: FTPFileInfo) -> bool: async def _process_file(self, ftp: FTP, file_info: FTPFileInfo) -> bool:
"""Download and process a .slg_v2 file""" """Download and process a .slg_v2 file"""
logger.info(f"Processing file: {file_info.name} ({file_info.size} bytes)") logger.info(f"Processing file: {file_info.name} ({file_info.size} bytes)")

View File

@@ -1,6 +1,6 @@
""" """
SA4CPS Data Ingestion Service SA4CPS Data Ingestion Service
Simple FTP monitoring service for .slg_v2 files with MongoDB storage Simple FTP monitoring service for .sgl_v2 files with MongoDB storage
""" """
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
@@ -53,7 +53,7 @@ async def lifespan(app: FastAPI):
# Create FastAPI app # Create FastAPI app
app = FastAPI( app = FastAPI(
title="SA4CPS Data Ingestion Service", title="SA4CPS Data Ingestion Service",
description="Monitors FTP server for .slg_v2 files and stores data in MongoDB", description="Monitors FTP server for .sgl_v2 files and stores data in MongoDB",
version="1.0.0", version="1.0.0",
lifespan=lifespan lifespan=lifespan
) )