Add data-ingestion-service for SA4CPS FTP integration

- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
rafaeldpsilva
2025-09-10 14:43:30 +01:00
parent d4f280de93
commit 5fdce00e5d
16 changed files with 6353 additions and 0 deletions

@@ -0,0 +1,445 @@
"""
FTP monitoring component for detecting and downloading new time series data files.
Handles multiple FTP servers with different configurations and file patterns.
"""
import asyncio
import ftplib
import ftputil
from ftputil import FTPHost
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import logging
import io
import os
import hashlib
import json
from pathlib import Path
import re
import ssl
logger = logging.getLogger(__name__)


class FTPMonitor:
    """Monitors FTP servers for new time series data files"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.download_cache = {}  # Cache for downloaded files
        self.connection_pool = {}  # Pool of FTP connections
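
    # Illustrative shape of the "source" documents this class consumes,
    # inferred from the field accesses below (values are hypothetical,
    # not a fixed schema):
    #
    # {
    #     "_id": ObjectId("..."),
    #     "name": "sa4cps-ftp",
    #     "file_patterns": ["*.slg_v2"],
    #     "ftp_config": {
    #         "host": "ftp.example.org",   # hypothetical host
    #         "port": 21,
    #         "username": "anonymous",
    #         "password": "",
    #         "use_ssl": False,
    #         "passive_mode": True,
    #         "remote_path": "/data"
    #     }
    # }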

    async def check_for_new_files(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Check FTP server for new files matching the configured patterns"""
        try:
            ftp_config = source.get("ftp_config", {})
            file_patterns = source.get("file_patterns", ["*.csv"])
            if not ftp_config:
                logger.warning(f"No FTP config for source: {source['name']}")
                return []

            # Connect to FTP server
            ftp_host = await self._get_ftp_connection(source)
            if not ftp_host:
                return []

            new_files = []
            remote_path = ftp_config.get("remote_path", "/")
            try:
                # List files in remote directory
                file_list = await self._list_remote_files(ftp_host, remote_path)

                # Filter files by patterns and check if they're new
                for file_info in file_list:
                    filename = file_info["filename"]

                    # Check if file matches any pattern
                    if self._matches_patterns(filename, file_patterns):
                        # Check if file is new (not processed before)
                        if await self._is_new_file(source, file_info):
                            new_files.append(file_info)
                            logger.info(f"Found new file: {filename}")

                # Update last check timestamp
                await self.db.data_sources.update_one(
                    {"_id": source["_id"]},
                    {"$set": {"last_check": datetime.utcnow()}}
                )
            except Exception as e:
                logger.error(f"Error listing files from FTP: {e}")
                # Pool keys are stringified source ids
                await self._close_ftp_connection(str(source["_id"]))

            return new_files
        except Exception as e:
            logger.error(f"Error checking for new files in source {source['name']}: {e}")
            return []

    async def download_file(self, source: Dict[str, Any], file_info: Dict[str, Any]) -> bytes:
        """Download a file from FTP server"""
        try:
            ftp_host = await self._get_ftp_connection(source)
            if not ftp_host:
                raise Exception("Cannot establish FTP connection")

            filename = file_info["filename"]
            remote_path = source["ftp_config"].get("remote_path", "/")
            full_path = f"{remote_path.rstrip('/')}/{filename}"
            logger.info(f"Downloading file: {full_path}")

            # Download file content
            file_content = await self._download_file_content(ftp_host, full_path)

            # Mark file as processed
            await self._mark_file_processed(source, file_info)

            # Cache file info for future reference
            await self._cache_file_info(source, file_info, len(file_content))

            logger.info(f"Successfully downloaded {filename} ({len(file_content)} bytes)")
            return file_content
        except Exception as e:
            logger.error(f"Error downloading file {file_info.get('filename', 'unknown')}: {e}")
            raise

    async def test_connection(self, source: Dict[str, Any]) -> bool:
        """Test FTP connection for a data source"""
        try:
            ftp_config = source.get("ftp_config", {})
            if not ftp_config:
                return False

            # Try to establish connection
            ftp_host = await self._create_ftp_connection(ftp_config)
            if ftp_host:
                # Try to list remote directory
                remote_path = ftp_config.get("remote_path", "/")
                try:
                    await self._list_remote_files(ftp_host, remote_path, limit=1)
                    success = True
                except Exception:
                    success = False

                # Close connection
                try:
                    await asyncio.get_event_loop().run_in_executor(
                        None, ftp_host.close
                    )
                except Exception:
                    pass
                return success
            return False
        except Exception as e:
            logger.error(f"Error testing FTP connection: {e}")
            return False

    async def get_file_metadata(self, source: Dict[str, Any], filename: str) -> Optional[Dict[str, Any]]:
        """Get metadata for a specific file"""
        try:
            ftp_host = await self._get_ftp_connection(source)
            if not ftp_host:
                return None

            remote_path = source["ftp_config"].get("remote_path", "/")
            full_path = f"{remote_path.rstrip('/')}/{filename}"

            # Get file stats
            def get_file_stat():
                try:
                    return ftp_host.stat(full_path)
                except Exception:
                    return None

            stat_info = await asyncio.get_event_loop().run_in_executor(None, get_file_stat)
            if stat_info:
                return {
                    "filename": filename,
                    "size": stat_info.st_size,
                    "modified_time": datetime.fromtimestamp(stat_info.st_mtime),
                    "full_path": full_path
                }
            return None
        except Exception as e:
            logger.error(f"Error getting file metadata for {filename}: {e}")
            return None

    async def _get_ftp_connection(self, source: Dict[str, Any]):
        """Get or create FTP connection for a source"""
        source_id = str(source["_id"])

        # Check if we have a cached connection
        if source_id in self.connection_pool:
            connection = self.connection_pool[source_id]
            try:
                # Test if connection is still alive
                await asyncio.get_event_loop().run_in_executor(
                    None, connection.getcwd
                )
                return connection
            except Exception:
                # Connection is dead, remove from pool
                del self.connection_pool[source_id]

        # Create new connection
        ftp_config = source.get("ftp_config", {})
        connection = await self._create_ftp_connection(ftp_config)
        if connection:
            self.connection_pool[source_id] = connection
        return connection

    async def _create_ftp_connection(self, ftp_config: Dict[str, Any]):
        """Create a new FTP connection"""
        try:
            host = ftp_config.get("host")
            port = ftp_config.get("port", 21)
            username = ftp_config.get("username", "anonymous")
            password = ftp_config.get("password", "")
            use_ssl = ftp_config.get("use_ssl", False)
            passive_mode = ftp_config.get("passive_mode", True)
            if not host:
                raise ValueError("FTP host not specified")

            def create_connection():
                # Build an ftputil session factory so FTPHost drives the
                # low-level ftplib session: FTPS with a protected data
                # channel when use_ssl is set, plain FTP otherwise
                session_factory = ftputil.session.session_factory(
                    base_class=ftplib.FTP_TLS if use_ssl else ftplib.FTP,
                    port=port,
                    use_passive_mode=passive_mode,
                    encrypt_data_channel=use_ssl,
                )
                # FTPHost wraps the session with path-style file operations
                return ftputil.FTPHost(
                    host, username, password, session_factory=session_factory
                )

            # Create connection in thread pool to avoid blocking
            ftp_host = await asyncio.get_event_loop().run_in_executor(
                None, create_connection
            )
            logger.info(f"Successfully connected to FTP server: {host}:{port}")
            return ftp_host
        except Exception as e:
            logger.error(f"Error creating FTP connection to {ftp_config.get('host', 'unknown')}: {e}")
            return None

    async def _close_ftp_connection(self, source_id: str):
        """Close FTP connection for a source"""
        if source_id in self.connection_pool:
            try:
                connection = self.connection_pool[source_id]
                await asyncio.get_event_loop().run_in_executor(
                    None, connection.close
                )
            except Exception:
                pass
            finally:
                del self.connection_pool[source_id]

    async def _list_remote_files(self, ftp_host, remote_path: str, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """List files in remote FTP directory"""
        def list_files():
            files = []
            try:
                # Change to remote directory
                ftp_host.chdir(remote_path)

                # Get file list with details
                file_list = ftp_host.listdir(".")
                for filename in file_list:
                    try:
                        # Get file stats
                        file_path = f"{remote_path.rstrip('/')}/{filename}"
                        stat_info = ftp_host.stat(filename)

                        # Skip directories
                        if not ftp_host.path.isfile(filename):
                            continue

                        file_info = {
                            "filename": filename,
                            "full_path": file_path,
                            "size": stat_info.st_size,
                            "modified_time": datetime.fromtimestamp(stat_info.st_mtime),
                            "created_time": datetime.fromtimestamp(stat_info.st_ctime) if hasattr(stat_info, 'st_ctime') else None
                        }
                        files.append(file_info)
                        if limit and len(files) >= limit:
                            break
                    except Exception as e:
                        logger.warning(f"Error getting stats for file {filename}: {e}")
                        continue
            except Exception as e:
                logger.error(f"Error listing directory {remote_path}: {e}")
                raise
            return files

        return await asyncio.get_event_loop().run_in_executor(None, list_files)

    async def _download_file_content(self, ftp_host, file_path: str) -> bytes:
        """Download file content from FTP server"""
        def download():
            # ftputil's FTPHost.download() writes to a local path, so read
            # into memory through a remote file object instead
            with ftp_host.open(file_path, "rb") as remote_file:
                return remote_file.read()

        return await asyncio.get_event_loop().run_in_executor(None, download)

    def _matches_patterns(self, filename: str, patterns: List[str]) -> bool:
        """Check if filename matches any of the shell-style patterns (case-insensitive)"""
        # fnmatch handles glob metacharacters correctly and anchors the whole
        # name; a naive "*" -> ".*" regex rewrite would also match partial names
        return any(
            fnmatch.fnmatch(filename.lower(), pattern.lower())
            for pattern in patterns
        )

    async def _is_new_file(self, source: Dict[str, Any], file_info: Dict[str, Any]) -> bool:
        """Check if file is new (hasn't been processed before)"""
        try:
            filename = file_info["filename"]
            file_size = file_info["size"]
            modified_time = file_info["modified_time"]

            # Create file signature
            file_signature = hashlib.md5(
                f"{filename}_{file_size}_{modified_time.timestamp()}".encode()
            ).hexdigest()

            # Check if we've processed this file before
            processed_file = await self.db.processed_files.find_one({
                "source_id": source["_id"],
                "file_signature": file_signature
            })
            return processed_file is None
        except Exception as e:
            logger.error(f"Error checking if file is new: {e}")
            return True  # Assume it's new if we can't check
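
    # Note: the signature above keys on name, size, and mtime rather than file
    # contents, so a file rewritten in place with identical size and mtime is
    # treated as already processed. _mark_file_processed() below records the
    # same signature after a successful download.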

    async def _mark_file_processed(self, source: Dict[str, Any], file_info: Dict[str, Any]):
        """Mark file as processed"""
        try:
            filename = file_info["filename"]
            file_size = file_info["size"]
            modified_time = file_info["modified_time"]

            # Create file signature
            file_signature = hashlib.md5(
                f"{filename}_{file_size}_{modified_time.timestamp()}".encode()
            ).hexdigest()

            # Record processed file
            processed_record = {
                "source_id": source["_id"],
                "source_name": source["name"],
                "filename": filename,
                "file_signature": file_signature,
                "file_size": file_size,
                "modified_time": modified_time,
                "processed_at": datetime.utcnow()
            }
            await self.db.processed_files.insert_one(processed_record)
        except Exception as e:
            logger.error(f"Error marking file as processed: {e}")

    async def _cache_file_info(self, source: Dict[str, Any], file_info: Dict[str, Any], content_size: int):
        """Cache file information for monitoring"""
        try:
            cache_key = f"file_cache:{source['_id']}:{file_info['filename']}"
            cache_data = {
                "filename": file_info["filename"],
                "size": file_info["size"],
                "content_size": content_size,
                "downloaded_at": datetime.utcnow().isoformat(),
                "source_name": source["name"]
            }
            # Store in Redis with 7-day expiration
            await self.redis.setex(
                cache_key,
                7 * 24 * 3600,  # 7 days
                json.dumps(cache_data)
            )
        except Exception as e:
            logger.error(f"Error caching file info: {e}")

    async def get_processing_history(self, source_id: str, limit: int = 50) -> List[Dict[str, Any]]:
        """Get processing history for a data source"""
        try:
            # processed_files stores source_id as it appears in the source
            # document (an ObjectId when Mongo generated it), so convert the
            # string form back before querying
            query_id = ObjectId(source_id) if ObjectId.is_valid(source_id) else source_id
            cursor = self.db.processed_files.find(
                {"source_id": query_id}
            ).sort("processed_at", -1).limit(limit)

            history = []
            async for record in cursor:
                record["_id"] = str(record["_id"])
                record["source_id"] = str(record["source_id"])
                if "processed_at" in record:
                    record["processed_at"] = record["processed_at"].isoformat()
                if "modified_time" in record:
                    record["modified_time"] = record["modified_time"].isoformat()
                history.append(record)
            return history
        except Exception as e:
            logger.error(f"Error getting processing history: {e}")
            return []

    async def cleanup_old_records(self, days: int = 30):
        """Clean up old processed file records"""
        try:
            cutoff_date = datetime.utcnow() - timedelta(days=days)
            result = await self.db.processed_files.delete_many({
                "processed_at": {"$lt": cutoff_date}
            })
            logger.info(f"Cleaned up {result.deleted_count} old processed file records")
        except Exception as e:
            logger.error(f"Error cleaning up old records: {e}")

    async def close_all_connections(self):
        """Close all FTP connections"""
        for source_id in list(self.connection_pool.keys()):
            await self._close_ftp_connection(source_id)
        logger.info("Closed all FTP connections")
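
For context, a minimal driver sketch for this class, assuming the service wires it up with Motor for MongoDB and redis.asyncio, and that the file above is importable as ftp_monitor (all three are assumptions, not part of this commit):

# driver_sketch.py -- illustrative only; module, DB, and collection names are assumptions
import asyncio

import redis.asyncio as redis
from motor.motor_asyncio import AsyncIOMotorClient

from ftp_monitor import FTPMonitor  # assumed module name for the file above


async def main():
    db = AsyncIOMotorClient("mongodb://localhost:27017")["ingestion"]  # hypothetical DB name
    redis_client = redis.Redis()
    monitor = FTPMonitor(db, redis_client)

    # Poll every configured source once; a real service would loop on a schedule
    async for source in db.data_sources.find({}):
        for file_info in await monitor.check_for_new_files(source):
            content = await monitor.download_file(source, file_info)
            print(f"{file_info['filename']}: {len(content)} bytes")

    await monitor.close_all_connections()


asyncio.run(main())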