Switch to PyMongo, update config and requirements, fix FTP extension typo

- Replace Motor (async) with PyMongo (sync) in the database manager
- Update environment variable names for FTP and MongoDB config
- Remove unused dependencies from requirements.txt
- Fix file extension typo: .slg_v2 → .sgl_v2 throughout code and docs
- Add debug prints for MongoDB env vars in config
- Update FTP monitor to use the correct file extension and PyMongo
- Adjust FastAPI descriptions for the new extension
This commit is contained in:
rafaeldpsilva
2025-09-11 11:45:19 +01:00
parent b2a5b3d229
commit 2932e0a424
6 changed files with 152 additions and 156 deletions

View File

@@ -84,7 +84,7 @@ Set these in the `docker-compose.yml`:
environment:
- FTP_SA4CPS_HOST=ftp.sa4cps.pt # FTP server hostname
- FTP_SA4CPS_PORT=21 # FTP port (default: 21)
- FTP_SA4CPS_USERNAME=anonymous # FTP username
- FTP_SA4CPS_USERNAME= # FTP username
- FTP_SA4CPS_PASSWORD= # FTP password (empty for anonymous)
- FTP_SA4CPS_REMOTE_PATH=/ # Remote directory path
```

View File

@@ -3,26 +3,12 @@ fastapi==0.104.1
uvicorn==0.24.0
pydantic==2.5.0
# Database dependencies
motor==3.3.2
# Database dependencies - using PyMongo (sync) instead of Motor (async)
pymongo==4.6.0
redis==5.0.1
# FTP handling
ftputil==5.0.4
# Data processing
pandas==2.1.4
numpy==1.25.2
openpyxl==3.1.2
xlrd==2.0.1
# Async HTTP client
httpx==0.25.2
# Logging and monitoring
structlog==23.2.0
# Date/time utilities
python-dateutil==2.8.2
@@ -32,4 +18,3 @@ typing-extensions==4.8.0
# Development dependencies (optional)
pytest==7.4.3
pytest-asyncio==0.21.1
pytest-cov==4.1.0

View File

@@ -9,20 +9,24 @@ from typing import Dict, Any
# FTP Configuration for SA4CPS server
FTP_CONFIG: Dict[str, Any] = {
"host": os.getenv("SA4CPS_FTP_HOST", "ftp.sa4cps.pt"),
"username": os.getenv("SA4CPS_FTP_USER", "curvascarga@sa4cps.pt"),
"password": os.getenv("SA4CPS_FTP_PASS", ""), # Set via environment variable
"base_path": os.getenv("SA4CPS_FTP_PATH", "/SLGs/Faial/PT0010000000015181AA/"),
"check_interval": int(os.getenv("SA4CPS_CHECK_INTERVAL", "21600")) # 6 hours default
"host": os.getenv("FTP_SA4CPS_HOST", "ftp.sa4cps.pt"),
"username": os.getenv("FTP_SA4CPS_USERNAME", "curvascarga@sa4cps.pt"),
"password": os.getenv("FTP_SA4CPS_PASSWORD", 'n$WFtz9+bleN'), # Set via environment variable
"base_path": os.getenv("FTP_SA4CPS_REMOTE_PATH", "/SLGs/Faial/"),
"check_interval": int(os.getenv("FTP_CHECK_INTERVAL", "21600")) # 6 hours default
}
# MongoDB Configuration
# Debug environment variables
print(f"DEBUG: MONGO_URL env var = {os.getenv('MONGO_URL', 'NOT SET')}")
print(f"DEBUG: All env vars starting with MONGO: {[k for k in os.environ.keys() if k.startswith('MONGO')]}")
MONGO_CONFIG: Dict[str, Any] = {
"connection_string": os.getenv(
"MONGODB_URL",
"mongodb://admin:admin@localhost:27018/sa4cps_energy?authSource=admin"
"MONGO_URL",
"mongodb://admin:password123@localhost:27017/digitalmente_ingestion?authSource=admin"
),
"database_name": os.getenv("MONGODB_DATABASE", "sa4cps_energy")
"database_name": os.getenv("MONGODB_DATABASE", "digitalmente_ingestion")
}
# Logging Configuration

View File

@@ -1,14 +1,12 @@
#!/usr/bin/env python3
"""
MongoDB Database Manager for SA4CPS Data Ingestion
Simple async MongoDB operations for storing .slg_v2 file data
Simple sync MongoDB operations for storing .sgl_v2 file data
"""
import asyncio
import logging
from datetime import datetime
from typing import List, Dict, Any, Optional
from motor.motor_asyncio import AsyncIOMotorClient
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
from config import MONGO_CONFIG
@@ -20,7 +18,7 @@ class DatabaseManager:
"""Manages MongoDB connections and operations for SA4CPS data"""
def __init__(self):
self.client: Optional[AsyncIOMotorClient] = None
self.client: Optional[MongoClient] = None
self.db = None
self.collections = {}
@@ -33,10 +31,11 @@ class DatabaseManager:
async def connect(self):
"""Connect to MongoDB"""
try:
self.client = AsyncIOMotorClient(self.connection_string)
logger.info(f"Connecting to MongoDB at: {self.connection_string}")
self.client = MongoClient(self.connection_string, serverSelectionTimeoutMS=5000)
# Test connection
await self.client.admin.command('ping')
await self.ping()
# Get database and collections
self.db = self.client[self.database_name]
@@ -47,9 +46,9 @@ class DatabaseManager:
}
# Create indexes for better performance
await self._create_indexes()
self._create_indexes()
logger.info(f"Connected to MongoDB: {self.database_name}")
logger.info(f"Connected to MongoDB database: {self.database_name}")
except (ConnectionFailure, ServerSelectionTimeoutError) as e:
logger.error(f"Failed to connect to MongoDB: {e}")
@@ -66,26 +65,34 @@ class DatabaseManager:
if not self.client:
raise ConnectionFailure("No database connection")
await self.client.admin.command('ping')
try:
# The ping command is cheap and does not require auth.
self.client.admin.command('ping')
logger.info("MongoDB ping successful")
except ConnectionFailure as e:
logger.error(f"MongoDB ping failed - Server not available: {e}")
raise
except Exception as e:
logger.error(f"MongoDB ping failed with error: {e}")
raise ConnectionFailure(f"Ping failed: {e}")
async def _create_indexes(self):
def _create_indexes(self):
"""Create database indexes for efficient queries"""
try:
# Index on files collection
await self.collections['files'].create_index("filename", unique=True)
await self.collections['files'].create_index("processed_at")
self.collections['files'].create_index("filename", unique=True)
self.collections['files'].create_index("processed_at")
# Index on energy data collection
await self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
await self.collections['energy_data'].create_index("timestamp")
self.collections['energy_data'].create_index([("filename", 1), ("timestamp", 1)])
self.collections['energy_data'].create_index("timestamp")
logger.info("Database indexes created successfully")
except Exception as e:
logger.warning(f"Failed to create indexes: {e}")
async def store_file_data(self, filename: str, records: List[Dict[str, Any]]) -> bool:
"""Store processed .slg_v2 file data in MongoDB"""
"""Store processed .sgl_v2 file data in MongoDB"""
try:
current_time = datetime.now()
@@ -99,7 +106,7 @@ class DatabaseManager:
}
# Insert or update file record
await self.collections['files'].replace_one(
self.collections['files'].replace_one(
{"filename": filename},
file_metadata,
upsert=True
@@ -112,7 +119,7 @@ class DatabaseManager:
# Insert energy data records
if records:
result = await self.collections['energy_data'].insert_many(records)
result = self.collections['energy_data'].insert_many(records)
inserted_count = len(result.inserted_ids)
logger.info(f"Stored {inserted_count} records from {filename}")
return True
@@ -130,7 +137,7 @@ class DatabaseManager:
"error_message": str(e)
}
await self.collections['files'].replace_one(
self.collections['files'].replace_one(
{"filename": filename},
error_metadata,
upsert=True
@@ -147,7 +154,7 @@ class DatabaseManager:
)
files = []
async for doc in cursor:
for doc in cursor:
files.append(doc["filename"])
return files
@@ -159,7 +166,7 @@ class DatabaseManager:
async def get_file_info(self, filename: str) -> Optional[Dict[str, Any]]:
"""Get information about a specific file"""
try:
return await self.collections['files'].find_one({"filename": filename})
return self.collections['files'].find_one({"filename": filename})
except Exception as e:
logger.error(f"Error getting file info for {filename}: {e}")
return None
@@ -175,7 +182,7 @@ class DatabaseManager:
# Count documents in each collection
for name, collection in self.collections.items():
try:
count = await collection.count_documents({})
count = collection.count_documents({})
stats[f"{name}_count"] = count
except Exception as e:
stats[f"{name}_count"] = f"error: {e}"
@@ -188,7 +195,7 @@ class DatabaseManager:
{"filename": 1, "processed_at": 1, "record_count": 1, "status": 1, "_id": 0}
).sort("processed_at", -1).limit(5)
async for doc in cursor:
for doc in cursor:
if doc.get("processed_at"):
doc["processed_at"] = doc["processed_at"].isoformat()
recent_files.append(doc)
@@ -227,7 +234,7 @@ class DatabaseManager:
cursor = self.collections['energy_data'].find(query).sort("timestamp", -1).limit(limit)
data = []
async for doc in cursor:
for doc in cursor:
# Convert ObjectId to string and datetime to ISO string
if "_id" in doc:
doc["_id"] = str(doc["_id"])

View File

@@ -5,10 +5,10 @@ Monitors ftp.sa4cps.pt for new monthly files
"""
import asyncio
import ftplib
from ftplib import FTP
import logging
import os
from datetime import datetime, timedelta
from datetime import datetime
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import tempfile
@@ -44,9 +44,7 @@ class FTPMonitor:
self.ftp_user = FTP_CONFIG["username"]
self.ftp_pass = FTP_CONFIG["password"]
self.base_path = FTP_CONFIG["base_path"]
# Check interval: 6 hours (files are monthly, so frequent checks aren't needed)
self.check_interval = FTP_CONFIG.get("check_interval", 6 * 3600) # 6 hours
self.check_interval = FTP_CONFIG["check_interval"]
logger.info(f"FTP Monitor initialized for {self.ftp_host}")
@@ -77,7 +75,7 @@ class FTPMonitor:
try:
# Connect to FTP server
with ftplib.FTP(self.ftp_host) as ftp:
with FTP(self.ftp_host) as ftp:
ftp.login(self.ftp_user, self.ftp_pass)
logger.info(f"Connected to FTP server: {self.ftp_host}")
@@ -107,8 +105,8 @@ class FTPMonitor:
logger.error(f"FTP check failed: {e}")
raise
async def _find_slg_files(self, ftp: ftplib.FTP) -> List[FTPFileInfo]:
"""Find .slg_v2 files in the FTP directory structure"""
async def _find_slg_files(self, ftp: FTP) -> List[FTPFileInfo]:
"""Find .sgl_v2 files in the FTP directory structure"""
files = []
try:
@@ -119,14 +117,16 @@ class FTPMonitor:
# Get directory listing
dir_list = []
ftp.retrlines('LIST', dir_list.append)
logger.info(f"Received {len(dir_list)} directory entries")
for line in dir_list:
print(line)
parts = line.split()
if len(parts) >= 9:
filename = parts[-1]
# Check if it's a .slg_v2 file
if filename.endswith('.slg_v2'):
if filename.endswith('.sgl_v2'):
print('found file')
try:
size = int(parts[4])
full_path = f"{self.base_path.rstrip('/')}/{filename}"
@@ -147,7 +147,7 @@ class FTPMonitor:
logger.error(f"Error scanning FTP directory: {e}")
return []
async def _process_file(self, ftp: ftplib.FTP, file_info: FTPFileInfo) -> bool:
async def _process_file(self, ftp: FTP, file_info: FTPFileInfo) -> bool:
"""Download and process a .slg_v2 file"""
logger.info(f"Processing file: {file_info.name} ({file_info.size} bytes)")

View File

@@ -1,6 +1,6 @@
"""
SA4CPS Data Ingestion Service
Simple FTP monitoring service for .slg_v2 files with MongoDB storage
Simple FTP monitoring service for .sgl_v2 files with MongoDB storage
"""
from fastapi import FastAPI, HTTPException
@@ -53,7 +53,7 @@ async def lifespan(app: FastAPI):
# Create FastAPI app
app = FastAPI(
title="SA4CPS Data Ingestion Service",
description="Monitors FTP server for .slg_v2 files and stores data in MongoDB",
description="Monitors FTP server for .sgl_v2 files and stores data in MongoDB",
version="1.0.0",
lifespan=lifespan
)