Simplify data ingestion service

commit b7e734e0d2
parent 13556347b0
rafaeldpsilva committed 2025-09-10 15:47:10 +01:00

13 changed files with 474 additions and 4440 deletions

@@ -1,445 +1,209 @@
#!/usr/bin/env python3
"""
FTP Monitor for SA4CPS .slg_v2 files
Monitors ftp.sa4cps.pt for new monthly files
"""

import asyncio
import ftplib
import logging
import os
import tempfile
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Any, Optional

from config import FTP_CONFIG
from slg_processor import SLGProcessor

logger = logging.getLogger(__name__)


@dataclass
class FTPFileInfo:
    """Information about an FTP file"""
    path: str
    name: str
    size: int
    modified_time: Optional[datetime] = None


class FTPMonitor:
"""Monitors FTP servers for new time series data files"""
"""Monitors SA4CPS FTP server for new .slg_v2 files"""
    def __init__(self, db_manager):
        self.db_manager = db_manager
        self.processor = SLGProcessor()
        self.last_check: Optional[datetime] = None
        self.processed_files: set = set()
        self.files_processed_count = 0
        self.status = "initializing"

        # FTP connection settings
        self.ftp_host = FTP_CONFIG["host"]
        self.ftp_user = FTP_CONFIG["username"]
        self.ftp_pass = FTP_CONFIG["password"]
        self.base_path = FTP_CONFIG["base_path"]

        # Check interval: 6 hours (files are monthly, so frequent checks aren't needed)
        self.check_interval = FTP_CONFIG.get("check_interval", 6 * 3600)

        logger.info(f"FTP Monitor initialized for {self.ftp_host}")

    async def start_monitoring(self):
        """Start the monitoring loop"""
        self.status = "running"
        logger.info("Starting FTP monitoring loop")

        while True:
            try:
                await self.check_for_new_files()
                self.status = "running"

                # Wait for next check (6 hours by default)
                logger.info(f"Waiting {self.check_interval / 3600:.1f} hours until next check")
                await asyncio.sleep(self.check_interval)

            except Exception as e:
                self.status = "error"
                logger.error(f"Error in monitoring loop: {e}")
                # Wait 30 minutes before retrying on error
                await asyncio.sleep(1800)
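
    # The loop above never returns; the service entrypoint presumably schedules
    # it as a background task, e.g. `asyncio.create_task(monitor.start_monitoring())`
    # (an assumption; the entrypoint is not part of this file).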

    async def check_for_new_files(self) -> Dict[str, Any]:
        """Check FTP server for new .slg_v2 files"""
        self.last_check = datetime.now()
        logger.info(f"Checking FTP server at {self.last_check}")

        try:
            # Connect to FTP server
            with ftplib.FTP(self.ftp_host) as ftp:
                ftp.login(self.ftp_user, self.ftp_pass)
                logger.info(f"Connected to FTP server: {self.ftp_host}")

                # Find .slg_v2 files
                new_files = await self._find_slg_files(ftp)

                # Process new files
                processed_count = 0
                for file_info in new_files:
                    if file_info.path not in self.processed_files:
                        success = await self._process_file(ftp, file_info)
                        if success:
                            self.processed_files.add(file_info.path)
                            processed_count += 1
                            self.files_processed_count += 1

                result = {
                    "files_found": len(new_files),
                    "files_processed": processed_count,
                    "timestamp": self.last_check.isoformat()
                }
                logger.info(f"Check complete: {result}")
                return result

        except Exception as e:
            logger.error(f"FTP check failed: {e}")
            raise
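
    # Note: deduplication state (self.processed_files) is held in memory only,
    # so already-ingested files are re-processed after a service restart;
    # store_file_data on the database side is assumed to tolerate re-inserts.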

    async def _find_slg_files(self, ftp: ftplib.FTP) -> List[FTPFileInfo]:
        """Find .slg_v2 files in the FTP directory structure"""
        files = []

        try:
            # Navigate to base path
            ftp.cwd(self.base_path)
            logger.info(f"Scanning directory: {self.base_path}")

            # Get directory listing
            dir_list = []
            ftp.retrlines('LIST', dir_list.append)

            for line in dir_list:
                parts = line.split()
                if len(parts) >= 9:
                    filename = parts[-1]
                    # Check if it's a .slg_v2 file
                    if filename.endswith('.slg_v2'):
                        try:
                            size = int(parts[4])
                            full_path = f"{self.base_path.rstrip('/')}/{filename}"
                            files.append(FTPFileInfo(
                                path=full_path,
                                name=filename,
                                size=size
                            ))
                        except (ValueError, IndexError):
                            logger.warning(f"Could not parse file info for: {filename}")

            logger.info(f"Found {len(files)} .slg_v2 files")
            return files

        except Exception as e:
            logger.error(f"Error scanning FTP directory: {e}")
            return []
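
    # The parser above assumes a Unix-style LIST line, e.g. (illustrative only):
    #
    #     -rw-r--r--    1 ftp   ftp    1048576 Sep 01 00:15 energy_2025-08.slg_v2
    #
    # i.e. parts[4] is the size in bytes and parts[-1] the filename. A server
    # returning a different LIST format (e.g. Windows/IIS style) would need
    # MLSD or a dedicated parser instead.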

    async def _process_file(self, ftp: ftplib.FTP, file_info: FTPFileInfo) -> bool:
        """Download and process a .slg_v2 file"""
        logger.info(f"Processing file: {file_info.name} ({file_info.size} bytes)")

        temp_path = None
        try:
            # Create temporary file for download
            with tempfile.NamedTemporaryFile(mode='wb', suffix='.slg_v2', delete=False) as temp_file:
                temp_path = temp_file.name

            # Download file (cwd is still base_path from _find_slg_files)
            with open(temp_path, 'wb') as f:
                ftp.retrbinary(f'RETR {file_info.name}', f.write)

            # Process the downloaded file
            records = await self.processor.process_file(temp_path, file_info.name)

            # Store in database
            if records:
                await self.db_manager.store_file_data(file_info.name, records)
                logger.info(f"Stored {len(records)} records from {file_info.name}")
                return True
            else:
                logger.warning(f"No valid records found in {file_info.name}")
                return False

        except Exception as e:
            logger.error(f"Error processing file {file_info.name}: {e}")
            return False
        finally:
            # Clean up temporary file
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass
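
    # Design note: ftplib calls are blocking, so a large download stalls the
    # event loop for the duration of a check. The previous implementation
    # wrapped FTP work in run_in_executor; the simplified monitor accepts the
    # stall, assuming it is the only task running in this service.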

    def get_status(self) -> str:
        """Get current monitor status"""
        return self.status

    def get_last_check_time(self) -> Optional[str]:
        """Get last check time as ISO string"""
        return self.last_check.isoformat() if self.last_check else None

    def get_processed_count(self) -> int:
        """Get total number of files processed"""
        return self.files_processed_count

    def get_detailed_status(self) -> Dict[str, Any]:
        """Get detailed status information"""
        return {
            "status": self.status,
            "last_check": self.get_last_check_time(),
            "files_processed": self.files_processed_count,
            "processed_files_count": len(self.processed_files),
            "check_interval_hours": self.check_interval / 3600,
            "ftp_host": self.ftp_host,
            "base_path": self.base_path
        }
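

# Usage sketch (illustrative only, not part of the service): how this monitor
# is expected to be wired up. `DatabaseManager` and its import path are
# assumptions; the real service constructs its own db manager.
#
#     import asyncio
#     from database import DatabaseManager  # hypothetical module
#     from ftp_monitor import FTPMonitor    # assumes this file is ftp_monitor.py
#
#     async def main():
#         monitor = FTPMonitor(DatabaseManager())
#         await monitor.start_monitoring()  # runs forever, checking every 6 h
#
#     asyncio.run(main())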