Files
sac4cps-backend/microservices/data-ingestion-service/src/ftp_monitor.py
2025-09-10 15:47:10 +01:00

209 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""
FTP Monitor for SA4CPS .slg_v2 files
Monitors ftp.sa4cps.pt for new monthly files
"""
import asyncio
import ftplib
import logging
import os
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import tempfile
from config import FTP_CONFIG
from slg_processor import SLGProcessor
logger = logging.getLogger(__name__)
@dataclass
class FTPFileInfo:
    """Information about an FTP file"""
    # Full remote path on the FTP server (base_path + filename).
    path: str
    # Bare filename, e.g. "202501.slg_v2"; used for RETR commands.
    name: str
    # File size in bytes as reported by the server's LIST output.
    size: int
    # Last-modified timestamp; optional because LIST parsing may not supply it.
    modified_time: Optional[datetime] = None
class FTPMonitor:
    """Monitors SA4CPS FTP server for new .slg_v2 files.

    Periodically connects to the configured FTP host, lists the base
    directory for ``.slg_v2`` files, downloads any file not seen before,
    parses it with SLGProcessor and hands the records to the database
    manager.
    """

    def __init__(self, db_manager):
        """Initialize the monitor.

        Args:
            db_manager: Storage backend exposing an async
                ``store_file_data(filename, records)`` coroutine.
        """
        self.db_manager = db_manager
        self.processor = SLGProcessor()
        self.last_check: Optional[datetime] = None
        # Full remote paths already ingested; guards against reprocessing.
        self.processed_files: set = set()
        self.files_processed_count = 0
        self.status = "initializing"

        # FTP connection settings
        self.ftp_host = FTP_CONFIG["host"]
        self.ftp_user = FTP_CONFIG["username"]
        self.ftp_pass = FTP_CONFIG["password"]
        self.base_path = FTP_CONFIG["base_path"]

        # Check interval: 6 hours (files are monthly, so frequent checks aren't needed)
        self.check_interval = FTP_CONFIG.get("check_interval", 6 * 3600)  # 6 hours

        logger.info(f"FTP Monitor initialized for {self.ftp_host}")

    async def start_monitoring(self):
        """Run the polling loop forever, recovering from transient errors."""
        self.status = "running"
        logger.info("Starting FTP monitoring loop")
        while True:
            try:
                await self.check_for_new_files()
                self.status = "running"
                # Wait for next check (6 hours)
                logger.info(f"Waiting {self.check_interval/3600:.1f} hours until next check")
                await asyncio.sleep(self.check_interval)
            except Exception as e:
                self.status = "error"
                logger.error(f"Error in monitoring loop: {e}")
                # Wait 30 minutes before retrying on error
                await asyncio.sleep(1800)

    async def check_for_new_files(self) -> Dict[str, Any]:
        """Connect to the FTP server, find and process unseen .slg_v2 files.

        Returns:
            Summary dict with ``files_found``, ``files_processed`` and
            ``timestamp`` keys.

        Raises:
            Exception: Re-raises any FTP/processing failure so the caller
                (the monitoring loop) can back off and flag error status.
        """
        self.last_check = datetime.now()
        logger.info(f"Checking FTP server at {self.last_check}")
        try:
            # NOTE(review): ftplib is blocking; these calls stall the event
            # loop for the duration of the transfer — consider
            # asyncio.to_thread if other tasks share this loop.
            with ftplib.FTP(self.ftp_host) as ftp:
                ftp.login(self.ftp_user, self.ftp_pass)
                logger.info(f"Connected to FTP server: {self.ftp_host}")

                new_files = await self._find_slg_files(ftp)

                processed_count = 0
                for file_info in new_files:
                    if file_info.path not in self.processed_files:
                        success = await self._process_file(ftp, file_info)
                        if success:
                            # Only mark as done on success so failures retry
                            # on the next cycle.
                            self.processed_files.add(file_info.path)
                            processed_count += 1
                            self.files_processed_count += 1

                result = {
                    "files_found": len(new_files),
                    "files_processed": processed_count,
                    "timestamp": self.last_check.isoformat()
                }
                logger.info(f"Check complete: {result}")
                return result
        except Exception as e:
            logger.error(f"FTP check failed: {e}")
            raise

    async def _find_slg_files(self, ftp: ftplib.FTP) -> List["FTPFileInfo"]:
        """List the base directory and return info for each .slg_v2 file.

        Returns an empty list on any listing/parsing error (best-effort).
        """
        files: List["FTPFileInfo"] = []
        try:
            ftp.cwd(self.base_path)
            logger.info(f"Scanning directory: {self.base_path}")

            dir_list: List[str] = []
            ftp.retrlines('LIST', dir_list.append)

            for line in dir_list:
                # Unix-style LIST: perms links owner group size month day time name
                parts = line.split()
                if len(parts) >= 9:
                    filename = parts[-1]
                    if filename.endswith('.slg_v2'):
                        try:
                            size = int(parts[4])
                            # BUGFIX: the path previously contained a garbled
                            # literal placeholder instead of the filename,
                            # giving every file the same path and defeating
                            # the processed_files dedup set.
                            full_path = f"{self.base_path.rstrip('/')}/{filename}"
                            files.append(FTPFileInfo(
                                path=full_path,
                                name=filename,
                                size=size
                            ))
                        except (ValueError, IndexError):
                            logger.warning(f"Could not parse file info for: {filename}")

            logger.info(f"Found {len(files)} .slg_v2 files")
            return files
        except Exception as e:
            logger.error(f"Error scanning FTP directory: {e}")
            return []

    async def _process_file(self, ftp: ftplib.FTP, file_info: "FTPFileInfo") -> bool:
        """Download one file to a temp path, parse it and store the records.

        Returns:
            True when at least one record was parsed and stored.
        """
        logger.info(f"Processing file: {file_info.name} ({file_info.size} bytes)")
        temp_path: Optional[str] = None
        try:
            # Download straight into the temp file handle; delete=False so
            # the file survives the `with` for the processor to read.
            with tempfile.NamedTemporaryFile(mode='wb', suffix='.slg_v2', delete=False) as temp_file:
                temp_path = temp_file.name
                ftp.retrbinary(f'RETR {file_info.name}', temp_file.write)

            records = await self.processor.process_file(temp_path, file_info.name)

            if records:
                await self.db_manager.store_file_data(file_info.name, records)
                logger.info(f"Stored {len(records)} records from {file_info.name}")
                return True
            else:
                logger.warning(f"No valid records found in {file_info.name}")
                return False
        except Exception as e:
            logger.error(f"Error processing file {file_info.name}: {e}")
            return False
        finally:
            # Clean up temporary file
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass

    def get_status(self) -> str:
        """Get current monitor status"""
        return self.status

    def get_last_check_time(self) -> Optional[str]:
        """Get last check time as ISO string"""
        return self.last_check.isoformat() if self.last_check else None

    def get_processed_count(self) -> int:
        """Get total number of files processed"""
        return self.files_processed_count

    def get_detailed_status(self) -> Dict[str, Any]:
        """Get detailed status information"""
        return {
            "status": self.status,
            "last_check": self.get_last_check_time(),
            "files_processed": self.files_processed_count,
            "processed_files_count": len(self.processed_files),
            "check_interval_hours": self.check_interval / 3600,
            "ftp_host": self.ftp_host,
            "base_path": self.base_path
        }