- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files - Add robust data processor with multi-format and unit inference support - Publish parsed data to Redis topics for real-time dashboard simulation - Include validation, monitoring, and auto-configuration scripts - Provide documentation and test scripts for SA4CPS integration
445 lines
17 KiB
Python
445 lines
17 KiB
Python
"""
|
|
FTP monitoring component for detecting and downloading new time series data files.
|
|
Handles multiple FTP servers with different configurations and file patterns.
|
|
"""
|
|
|
|
import asyncio
|
|
import ftplib
|
|
import ftputil
|
|
from ftputil import FTPHost
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Any, Optional
|
|
import logging
|
|
import io
|
|
import os
|
|
import hashlib
|
|
import json
|
|
from pathlib import Path
|
|
import re
|
|
import ssl
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class FTPMonitor:
|
|
"""Monitors FTP servers for new time series data files"""
|
|
|
|
def __init__(self, db, redis_client):
|
|
self.db = db
|
|
self.redis = redis_client
|
|
self.download_cache = {} # Cache for downloaded files
|
|
self.connection_pool = {} # Pool of FTP connections
|
|
|
|
async def check_for_new_files(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""Check FTP server for new files matching the configured patterns"""
|
|
try:
|
|
ftp_config = source.get("ftp_config", {})
|
|
file_patterns = source.get("file_patterns", ["*.csv"])
|
|
|
|
if not ftp_config:
|
|
logger.warning(f"No FTP config for source: {source['name']}")
|
|
return []
|
|
|
|
# Connect to FTP server
|
|
ftp_host = await self._get_ftp_connection(source)
|
|
if not ftp_host:
|
|
return []
|
|
|
|
new_files = []
|
|
remote_path = ftp_config.get("remote_path", "/")
|
|
|
|
try:
|
|
# List files in remote directory
|
|
file_list = await self._list_remote_files(ftp_host, remote_path)
|
|
|
|
# Filter files by patterns and check if they're new
|
|
for file_info in file_list:
|
|
filename = file_info["filename"]
|
|
|
|
# Check if file matches any pattern
|
|
if self._matches_patterns(filename, file_patterns):
|
|
|
|
# Check if file is new (not processed before)
|
|
if await self._is_new_file(source, file_info):
|
|
new_files.append(file_info)
|
|
logger.info(f"Found new file: {filename}")
|
|
|
|
# Update last check timestamp
|
|
await self.db.data_sources.update_one(
|
|
{"_id": source["_id"]},
|
|
{"$set": {"last_check": datetime.utcnow()}}
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error listing files from FTP: {e}")
|
|
await self._close_ftp_connection(source["_id"])
|
|
|
|
return new_files
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error checking for new files in source {source['name']}: {e}")
|
|
return []
|
|
|
|
async def download_file(self, source: Dict[str, Any], file_info: Dict[str, Any]) -> bytes:
|
|
"""Download a file from FTP server"""
|
|
try:
|
|
ftp_host = await self._get_ftp_connection(source)
|
|
if not ftp_host:
|
|
raise Exception("Cannot establish FTP connection")
|
|
|
|
filename = file_info["filename"]
|
|
remote_path = source["ftp_config"].get("remote_path", "/")
|
|
full_path = f"{remote_path.rstrip('/')}/{filename}"
|
|
|
|
logger.info(f"Downloading file: {full_path}")
|
|
|
|
# Download file content
|
|
file_content = await self._download_file_content(ftp_host, full_path)
|
|
|
|
# Mark file as processed
|
|
await self._mark_file_processed(source, file_info)
|
|
|
|
# Cache file info for future reference
|
|
await self._cache_file_info(source, file_info, len(file_content))
|
|
|
|
logger.info(f"Successfully downloaded {filename} ({len(file_content)} bytes)")
|
|
return file_content
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error downloading file {file_info.get('filename', 'unknown')}: {e}")
|
|
raise
|
|
|
|
async def test_connection(self, source: Dict[str, Any]) -> bool:
|
|
"""Test FTP connection for a data source"""
|
|
try:
|
|
ftp_config = source.get("ftp_config", {})
|
|
if not ftp_config:
|
|
return False
|
|
|
|
# Try to establish connection
|
|
ftp_host = await self._create_ftp_connection(ftp_config)
|
|
if ftp_host:
|
|
# Try to list remote directory
|
|
remote_path = ftp_config.get("remote_path", "/")
|
|
try:
|
|
await self._list_remote_files(ftp_host, remote_path, limit=1)
|
|
success = True
|
|
except:
|
|
success = False
|
|
|
|
# Close connection
|
|
try:
|
|
await asyncio.get_event_loop().run_in_executor(
|
|
None, ftp_host.close
|
|
)
|
|
except:
|
|
pass
|
|
|
|
return success
|
|
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error testing FTP connection: {e}")
|
|
return False
|
|
|
|
async def get_file_metadata(self, source: Dict[str, Any], filename: str) -> Optional[Dict[str, Any]]:
|
|
"""Get metadata for a specific file"""
|
|
try:
|
|
ftp_host = await self._get_ftp_connection(source)
|
|
if not ftp_host:
|
|
return None
|
|
|
|
remote_path = source["ftp_config"].get("remote_path", "/")
|
|
full_path = f"{remote_path.rstrip('/')}/{filename}"
|
|
|
|
# Get file stats
|
|
def get_file_stat():
|
|
try:
|
|
return ftp_host.stat(full_path)
|
|
except:
|
|
return None
|
|
|
|
stat_info = await asyncio.get_event_loop().run_in_executor(None, get_file_stat)
|
|
|
|
if stat_info:
|
|
return {
|
|
"filename": filename,
|
|
"size": stat_info.st_size,
|
|
"modified_time": datetime.fromtimestamp(stat_info.st_mtime),
|
|
"full_path": full_path
|
|
}
|
|
|
|
return None
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting file metadata for {filename}: {e}")
|
|
return None
|
|
|
|
async def _get_ftp_connection(self, source: Dict[str, Any]):
|
|
"""Get or create FTP connection for a source"""
|
|
source_id = str(source["_id"])
|
|
|
|
# Check if we have a cached connection
|
|
if source_id in self.connection_pool:
|
|
connection = self.connection_pool[source_id]
|
|
try:
|
|
# Test if connection is still alive
|
|
await asyncio.get_event_loop().run_in_executor(
|
|
None, lambda: connection.getcwd()
|
|
)
|
|
return connection
|
|
except:
|
|
# Connection is dead, remove from pool
|
|
del self.connection_pool[source_id]
|
|
|
|
# Create new connection
|
|
ftp_config = source.get("ftp_config", {})
|
|
connection = await self._create_ftp_connection(ftp_config)
|
|
|
|
if connection:
|
|
self.connection_pool[source_id] = connection
|
|
|
|
return connection
|
|
|
|
async def _create_ftp_connection(self, ftp_config: Dict[str, Any]):
|
|
"""Create a new FTP connection"""
|
|
try:
|
|
host = ftp_config.get("host")
|
|
port = ftp_config.get("port", 21)
|
|
username = ftp_config.get("username", "anonymous")
|
|
password = ftp_config.get("password", "")
|
|
use_ssl = ftp_config.get("use_ssl", False)
|
|
passive_mode = ftp_config.get("passive_mode", True)
|
|
|
|
if not host:
|
|
raise ValueError("FTP host not specified")
|
|
|
|
def create_connection():
|
|
if use_ssl:
|
|
# Use FTPS (FTP over SSL/TLS)
|
|
ftp = ftplib.FTP_TLS()
|
|
ftp.connect(host, port)
|
|
ftp.login(username, password)
|
|
ftp.prot_p() # Enable protection for data channel
|
|
else:
|
|
# Use regular FTP
|
|
ftp = ftplib.FTP()
|
|
ftp.connect(host, port)
|
|
ftp.login(username, password)
|
|
|
|
ftp.set_pasv(passive_mode)
|
|
|
|
# Create FTPHost wrapper for easier file operations
|
|
ftp_host = FTPHost.from_ftp_client(ftp)
|
|
return ftp_host
|
|
|
|
# Create connection in thread pool to avoid blocking
|
|
ftp_host = await asyncio.get_event_loop().run_in_executor(
|
|
None, create_connection
|
|
)
|
|
|
|
logger.info(f"Successfully connected to FTP server: {host}:{port}")
|
|
return ftp_host
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error creating FTP connection to {ftp_config.get('host', 'unknown')}: {e}")
|
|
return None
|
|
|
|
async def _close_ftp_connection(self, source_id: str):
|
|
"""Close FTP connection for a source"""
|
|
if source_id in self.connection_pool:
|
|
try:
|
|
connection = self.connection_pool[source_id]
|
|
await asyncio.get_event_loop().run_in_executor(
|
|
None, connection.close
|
|
)
|
|
except:
|
|
pass
|
|
finally:
|
|
del self.connection_pool[source_id]
|
|
|
|
async def _list_remote_files(self, ftp_host, remote_path: str, limit: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
"""List files in remote FTP directory"""
|
|
def list_files():
|
|
files = []
|
|
try:
|
|
# Change to remote directory
|
|
ftp_host.chdir(remote_path)
|
|
|
|
# Get file list with details
|
|
file_list = ftp_host.listdir(".")
|
|
|
|
for filename in file_list:
|
|
try:
|
|
# Get file stats
|
|
file_path = f"{remote_path.rstrip('/')}/{filename}"
|
|
stat_info = ftp_host.stat(filename)
|
|
|
|
# Skip directories
|
|
if not ftp_host.path.isfile(filename):
|
|
continue
|
|
|
|
file_info = {
|
|
"filename": filename,
|
|
"full_path": file_path,
|
|
"size": stat_info.st_size,
|
|
"modified_time": datetime.fromtimestamp(stat_info.st_mtime),
|
|
"created_time": datetime.fromtimestamp(stat_info.st_ctime) if hasattr(stat_info, 'st_ctime') else None
|
|
}
|
|
|
|
files.append(file_info)
|
|
|
|
if limit and len(files) >= limit:
|
|
break
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Error getting stats for file {filename}: {e}")
|
|
continue
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error listing directory {remote_path}: {e}")
|
|
raise
|
|
|
|
return files
|
|
|
|
return await asyncio.get_event_loop().run_in_executor(None, list_files)
|
|
|
|
async def _download_file_content(self, ftp_host, file_path: str) -> bytes:
|
|
"""Download file content from FTP server"""
|
|
def download():
|
|
bio = io.BytesIO()
|
|
try:
|
|
ftp_host.download(file_path, bio)
|
|
bio.seek(0)
|
|
return bio.read()
|
|
finally:
|
|
bio.close()
|
|
|
|
return await asyncio.get_event_loop().run_in_executor(None, download)
|
|
|
|
def _matches_patterns(self, filename: str, patterns: List[str]) -> bool:
|
|
"""Check if filename matches any of the specified patterns"""
|
|
for pattern in patterns:
|
|
# Convert shell pattern to regex
|
|
regex_pattern = pattern.replace("*", ".*").replace("?", ".")
|
|
if re.match(regex_pattern, filename, re.IGNORECASE):
|
|
return True
|
|
return False
|
|
|
|
async def _is_new_file(self, source: Dict[str, Any], file_info: Dict[str, Any]) -> bool:
|
|
"""Check if file is new (hasn't been processed before)"""
|
|
try:
|
|
filename = file_info["filename"]
|
|
file_size = file_info["size"]
|
|
modified_time = file_info["modified_time"]
|
|
|
|
# Create file signature
|
|
file_signature = hashlib.md5(
|
|
f"{filename}_{file_size}_{modified_time.timestamp()}".encode()
|
|
).hexdigest()
|
|
|
|
# Check if we've processed this file before
|
|
processed_file = await self.db.processed_files.find_one({
|
|
"source_id": source["_id"],
|
|
"file_signature": file_signature
|
|
})
|
|
|
|
return processed_file is None
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error checking if file is new: {e}")
|
|
return True # Assume it's new if we can't check
|
|
|
|
async def _mark_file_processed(self, source: Dict[str, Any], file_info: Dict[str, Any]):
|
|
"""Mark file as processed"""
|
|
try:
|
|
filename = file_info["filename"]
|
|
file_size = file_info["size"]
|
|
modified_time = file_info["modified_time"]
|
|
|
|
# Create file signature
|
|
file_signature = hashlib.md5(
|
|
f"{filename}_{file_size}_{modified_time.timestamp()}".encode()
|
|
).hexdigest()
|
|
|
|
# Record processed file
|
|
processed_record = {
|
|
"source_id": source["_id"],
|
|
"source_name": source["name"],
|
|
"filename": filename,
|
|
"file_signature": file_signature,
|
|
"file_size": file_size,
|
|
"modified_time": modified_time,
|
|
"processed_at": datetime.utcnow()
|
|
}
|
|
|
|
await self.db.processed_files.insert_one(processed_record)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error marking file as processed: {e}")
|
|
|
|
async def _cache_file_info(self, source: Dict[str, Any], file_info: Dict[str, Any], content_size: int):
|
|
"""Cache file information for monitoring"""
|
|
try:
|
|
cache_key = f"file_cache:{source['_id']}:{file_info['filename']}"
|
|
cache_data = {
|
|
"filename": file_info["filename"],
|
|
"size": file_info["size"],
|
|
"content_size": content_size,
|
|
"downloaded_at": datetime.utcnow().isoformat(),
|
|
"source_name": source["name"]
|
|
}
|
|
|
|
# Store in Redis with 7-day expiration
|
|
await self.redis.setex(
|
|
cache_key,
|
|
7 * 24 * 3600, # 7 days
|
|
json.dumps(cache_data)
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error caching file info: {e}")
|
|
|
|
async def get_processing_history(self, source_id: str, limit: int = 50) -> List[Dict[str, Any]]:
|
|
"""Get processing history for a data source"""
|
|
try:
|
|
cursor = self.db.processed_files.find(
|
|
{"source_id": source_id}
|
|
).sort("processed_at", -1).limit(limit)
|
|
|
|
history = []
|
|
async for record in cursor:
|
|
record["_id"] = str(record["_id"])
|
|
record["source_id"] = str(record["source_id"])
|
|
if "processed_at" in record:
|
|
record["processed_at"] = record["processed_at"].isoformat()
|
|
if "modified_time" in record:
|
|
record["modified_time"] = record["modified_time"].isoformat()
|
|
history.append(record)
|
|
|
|
return history
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting processing history: {e}")
|
|
return []
|
|
|
|
async def cleanup_old_records(self, days: int = 30):
|
|
"""Clean up old processed file records"""
|
|
try:
|
|
cutoff_date = datetime.utcnow() - timedelta(days=days)
|
|
|
|
result = await self.db.processed_files.delete_many({
|
|
"processed_at": {"$lt": cutoff_date}
|
|
})
|
|
|
|
logger.info(f"Cleaned up {result.deleted_count} old processed file records")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error cleaning up old records: {e}")
|
|
|
|
async def close_all_connections(self):
|
|
"""Close all FTP connections"""
|
|
for source_id in list(self.connection_pool.keys()):
|
|
await self._close_ftp_connection(source_id)
|
|
|
|
logger.info("Closed all FTP connections") |