Add data-ingestion-service for SA4CPS FTP integration
- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
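For orientation, the FTPMonitor added below reads its per-source settings from a data_sources record. A record for the SA4CPS FTP feed might look roughly like the sketch below; the field names come from ftp_monitor.py, while the host, credentials, and remote path are placeholders rather than the real SA4CPS endpoint:

# Illustrative source record only; host, credentials, and paths are placeholders.
sa4cps_source = {
    "_id": "<mongodb-object-id>",
    "name": "sa4cps-ftp",
    "ftp_config": {
        "host": "ftp.sa4cps.example",   # placeholder host
        "port": 21,
        "username": "anonymous",
        "password": "",
        "use_ssl": False,
        "passive_mode": True,
        "remote_path": "/exports",      # placeholder remote directory
    },
    "file_patterns": ["*.slg_v2"],
}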
microservices/data-ingestion-service/ftp_monitor.py (new file, 445 lines)
@@ -0,0 +1,445 @@
"""
FTP monitoring component for detecting and downloading new time series data files.
Handles multiple FTP servers with different configurations and file patterns.
"""

import asyncio
import fnmatch
import ftplib
import ftputil
import ftputil.session
from ftputil import FTPHost
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
import logging
import io
import os
import hashlib
import json
from pathlib import Path
import re
import ssl

logger = logging.getLogger(__name__)


class FTPMonitor:
    """Monitors FTP servers for new time series data files"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.download_cache = {}  # Cache for downloaded files
        self.connection_pool = {}  # Pool of FTP connections

    async def check_for_new_files(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Check FTP server for new files matching the configured patterns"""
        try:
            ftp_config = source.get("ftp_config", {})
            file_patterns = source.get("file_patterns", ["*.csv"])

            if not ftp_config:
                logger.warning(f"No FTP config for source: {source['name']}")
                return []

            # Connect to FTP server
            ftp_host = await self._get_ftp_connection(source)
            if not ftp_host:
                return []

            new_files = []
            remote_path = ftp_config.get("remote_path", "/")

            try:
                # List files in remote directory
                file_list = await self._list_remote_files(ftp_host, remote_path)

                # Filter files by patterns and check if they're new
                for file_info in file_list:
                    filename = file_info["filename"]

                    # Check if file matches any pattern
                    if self._matches_patterns(filename, file_patterns):

                        # Check if file is new (not processed before)
                        if await self._is_new_file(source, file_info):
                            new_files.append(file_info)
                            logger.info(f"Found new file: {filename}")

                # Update last check timestamp
                await self.db.data_sources.update_one(
                    {"_id": source["_id"]},
                    {"$set": {"last_check": datetime.utcnow()}}
                )

            except Exception as e:
                logger.error(f"Error listing files from FTP: {e}")
                # Drop the (possibly broken) pooled connection; the pool is keyed by str(_id)
                await self._close_ftp_connection(str(source["_id"]))

            return new_files

        except Exception as e:
            logger.error(f"Error checking for new files in source {source['name']}: {e}")
            return []

    async def download_file(self, source: Dict[str, Any], file_info: Dict[str, Any]) -> bytes:
        """Download a file from FTP server"""
        try:
            ftp_host = await self._get_ftp_connection(source)
            if not ftp_host:
                raise Exception("Cannot establish FTP connection")

            filename = file_info["filename"]
            remote_path = source["ftp_config"].get("remote_path", "/")
            full_path = f"{remote_path.rstrip('/')}/{filename}"

            logger.info(f"Downloading file: {full_path}")

            # Download file content
            file_content = await self._download_file_content(ftp_host, full_path)

            # Mark file as processed
            await self._mark_file_processed(source, file_info)

            # Cache file info for future reference
            await self._cache_file_info(source, file_info, len(file_content))

            logger.info(f"Successfully downloaded {filename} ({len(file_content)} bytes)")
            return file_content

        except Exception as e:
            logger.error(f"Error downloading file {file_info.get('filename', 'unknown')}: {e}")
            raise

    async def test_connection(self, source: Dict[str, Any]) -> bool:
        """Test FTP connection for a data source"""
        try:
            ftp_config = source.get("ftp_config", {})
            if not ftp_config:
                return False

            # Try to establish connection
            ftp_host = await self._create_ftp_connection(ftp_config)
            if ftp_host:
                # Try to list remote directory
                remote_path = ftp_config.get("remote_path", "/")
                try:
                    await self._list_remote_files(ftp_host, remote_path, limit=1)
                    success = True
                except:
                    success = False

                # Close connection
                try:
                    await asyncio.get_event_loop().run_in_executor(
                        None, ftp_host.close
                    )
                except:
                    pass

                return success

            return False

        except Exception as e:
            logger.error(f"Error testing FTP connection: {e}")
            return False

    async def get_file_metadata(self, source: Dict[str, Any], filename: str) -> Optional[Dict[str, Any]]:
        """Get metadata for a specific file"""
        try:
            ftp_host = await self._get_ftp_connection(source)
            if not ftp_host:
                return None

            remote_path = source["ftp_config"].get("remote_path", "/")
            full_path = f"{remote_path.rstrip('/')}/{filename}"

            # Get file stats
            def get_file_stat():
                try:
                    return ftp_host.stat(full_path)
                except:
                    return None

            stat_info = await asyncio.get_event_loop().run_in_executor(None, get_file_stat)

            if stat_info:
                return {
                    "filename": filename,
                    "size": stat_info.st_size,
                    "modified_time": datetime.fromtimestamp(stat_info.st_mtime),
                    "full_path": full_path
                }

            return None

        except Exception as e:
            logger.error(f"Error getting file metadata for {filename}: {e}")
            return None

    async def _get_ftp_connection(self, source: Dict[str, Any]):
        """Get or create FTP connection for a source"""
        source_id = str(source["_id"])

        # Check if we have a cached connection
        if source_id in self.connection_pool:
            connection = self.connection_pool[source_id]
            try:
                # Test if connection is still alive
                await asyncio.get_event_loop().run_in_executor(
                    None, lambda: connection.getcwd()
                )
                return connection
            except:
                # Connection is dead, remove from pool
                del self.connection_pool[source_id]

        # Create new connection
        ftp_config = source.get("ftp_config", {})
        connection = await self._create_ftp_connection(ftp_config)

        if connection:
            self.connection_pool[source_id] = connection

        return connection

    async def _create_ftp_connection(self, ftp_config: Dict[str, Any]):
        """Create a new FTP connection"""
        try:
            host = ftp_config.get("host")
            port = ftp_config.get("port", 21)
            username = ftp_config.get("username", "anonymous")
            password = ftp_config.get("password", "")
            use_ssl = ftp_config.get("use_ssl", False)
            passive_mode = ftp_config.get("passive_mode", True)

            if not host:
                raise ValueError("FTP host not specified")

            def create_connection():
                # Let ftputil manage the underlying ftplib session via a
                # session factory: FTPS (FTP over SSL/TLS, with a protected
                # data channel) when requested, regular FTP otherwise.
                session_factory = ftputil.session.session_factory(
                    base_class=ftplib.FTP_TLS if use_ssl else ftplib.FTP,
                    port=port,
                    use_passive_mode=passive_mode,
                    encrypt_data_channel=True,
                )

                # Create FTPHost wrapper for easier file operations
                ftp_host = FTPHost(host, username, password,
                                   session_factory=session_factory)
                return ftp_host

            # Create connection in thread pool to avoid blocking
            ftp_host = await asyncio.get_event_loop().run_in_executor(
                None, create_connection
            )

            logger.info(f"Successfully connected to FTP server: {host}:{port}")
            return ftp_host

        except Exception as e:
            logger.error(f"Error creating FTP connection to {ftp_config.get('host', 'unknown')}: {e}")
            return None

    async def _close_ftp_connection(self, source_id: str):
        """Close FTP connection for a source"""
        if source_id in self.connection_pool:
            try:
                connection = self.connection_pool[source_id]
                await asyncio.get_event_loop().run_in_executor(
                    None, connection.close
                )
            except:
                pass
            finally:
                del self.connection_pool[source_id]

    async def _list_remote_files(self, ftp_host, remote_path: str, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """List files in remote FTP directory"""
        def list_files():
            files = []
            try:
                # Change to remote directory
                ftp_host.chdir(remote_path)

                # Get file list with details
                file_list = ftp_host.listdir(".")

                for filename in file_list:
                    try:
                        # Get file stats
                        file_path = f"{remote_path.rstrip('/')}/{filename}"
                        stat_info = ftp_host.stat(filename)

                        # Skip directories
                        if not ftp_host.path.isfile(filename):
                            continue

                        file_info = {
                            "filename": filename,
                            "full_path": file_path,
                            "size": stat_info.st_size,
                            "modified_time": datetime.fromtimestamp(stat_info.st_mtime),
                            "created_time": datetime.fromtimestamp(stat_info.st_ctime) if hasattr(stat_info, 'st_ctime') else None
                        }

                        files.append(file_info)

                        if limit and len(files) >= limit:
                            break

                    except Exception as e:
                        logger.warning(f"Error getting stats for file {filename}: {e}")
                        continue

            except Exception as e:
                logger.error(f"Error listing directory {remote_path}: {e}")
                raise

            return files

        return await asyncio.get_event_loop().run_in_executor(None, list_files)

    async def _download_file_content(self, ftp_host, file_path: str) -> bytes:
        """Download file content from FTP server"""
        def download():
            # Read the remote file into memory through ftputil's file-like
            # interface (FTPHost.download expects a local filename, not a buffer).
            with ftp_host.open(file_path, "rb") as remote_file:
                return remote_file.read()

        return await asyncio.get_event_loop().run_in_executor(None, download)

    def _matches_patterns(self, filename: str, patterns: List[str]) -> bool:
        """Check if filename matches any of the specified patterns"""
        for pattern in patterns:
            # Case-insensitive shell-style match, e.g. "*.slg_v2" or "data_?.csv"
            if fnmatch.fnmatch(filename.lower(), pattern.lower()):
                return True
        return False

    async def _is_new_file(self, source: Dict[str, Any], file_info: Dict[str, Any]) -> bool:
        """Check if file is new (hasn't been processed before)"""
        try:
            filename = file_info["filename"]
            file_size = file_info["size"]
            modified_time = file_info["modified_time"]

            # Create file signature
            file_signature = hashlib.md5(
                f"{filename}_{file_size}_{modified_time.timestamp()}".encode()
            ).hexdigest()

            # Check if we've processed this file before
            processed_file = await self.db.processed_files.find_one({
                "source_id": source["_id"],
                "file_signature": file_signature
            })

            return processed_file is None

        except Exception as e:
            logger.error(f"Error checking if file is new: {e}")
            return True  # Assume it's new if we can't check

    async def _mark_file_processed(self, source: Dict[str, Any], file_info: Dict[str, Any]):
        """Mark file as processed"""
        try:
            filename = file_info["filename"]
            file_size = file_info["size"]
            modified_time = file_info["modified_time"]

            # Create file signature
            file_signature = hashlib.md5(
                f"{filename}_{file_size}_{modified_time.timestamp()}".encode()
            ).hexdigest()

            # Record processed file
            processed_record = {
                "source_id": source["_id"],
                "source_name": source["name"],
                "filename": filename,
                "file_signature": file_signature,
                "file_size": file_size,
                "modified_time": modified_time,
                "processed_at": datetime.utcnow()
            }

            await self.db.processed_files.insert_one(processed_record)

        except Exception as e:
            logger.error(f"Error marking file as processed: {e}")

    async def _cache_file_info(self, source: Dict[str, Any], file_info: Dict[str, Any], content_size: int):
        """Cache file information for monitoring"""
        try:
            cache_key = f"file_cache:{source['_id']}:{file_info['filename']}"
            cache_data = {
                "filename": file_info["filename"],
                "size": file_info["size"],
                "content_size": content_size,
                "downloaded_at": datetime.utcnow().isoformat(),
                "source_name": source["name"]
            }

            # Store in Redis with 7-day expiration
            await self.redis.setex(
                cache_key,
                7 * 24 * 3600,  # 7 days
                json.dumps(cache_data)
            )

        except Exception as e:
            logger.error(f"Error caching file info: {e}")

    async def get_processing_history(self, source_id: str, limit: int = 50) -> List[Dict[str, Any]]:
        """Get processing history for a data source"""
        try:
            cursor = self.db.processed_files.find(
                {"source_id": source_id}
            ).sort("processed_at", -1).limit(limit)

            history = []
            async for record in cursor:
                record["_id"] = str(record["_id"])
                record["source_id"] = str(record["source_id"])
                if "processed_at" in record:
                    record["processed_at"] = record["processed_at"].isoformat()
                if "modified_time" in record:
                    record["modified_time"] = record["modified_time"].isoformat()
                history.append(record)

            return history

        except Exception as e:
            logger.error(f"Error getting processing history: {e}")
            return []

    async def cleanup_old_records(self, days: int = 30):
        """Clean up old processed file records"""
        try:
            cutoff_date = datetime.utcnow() - timedelta(days=days)

            result = await self.db.processed_files.delete_many({
                "processed_at": {"$lt": cutoff_date}
            })

            logger.info(f"Cleaned up {result.deleted_count} old processed file records")

        except Exception as e:
            logger.error(f"Error cleaning up old records: {e}")

    async def close_all_connections(self):
        """Close all FTP connections"""
        for source_id in list(self.connection_pool.keys()):
            await self._close_ftp_connection(source_id)

        logger.info("Closed all FTP connections")
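For context, here is a minimal sketch of how the service might wire FTPMonitor to its database and cache and poll the configured sources. It assumes Motor for MongoDB and redis-py's asyncio client, with placeholder connection URLs and database names; it is not part of ftp_monitor.py, and the actual wiring ships in the service's startup and auto-configuration scripts added in this commit.

# Hedged wiring sketch; connection URLs and the database name are placeholders.
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient
import redis.asyncio as aioredis

from ftp_monitor import FTPMonitor


async def poll_sources_once(monitor: FTPMonitor, db) -> None:
    # Walk every configured source, ingesting any files not seen before.
    async for source in db.data_sources.find({}):
        new_files = await monitor.check_for_new_files(source)
        for file_info in new_files:
            raw_bytes = await monitor.download_file(source, file_info)
            # Hand raw_bytes to the data processor / Redis publisher here.
            print(f"downloaded {file_info['filename']}: {len(raw_bytes)} bytes")


async def main() -> None:
    db = AsyncIOMotorClient("mongodb://localhost:27017")["timeseries"]  # placeholder DB name
    redis_client = aioredis.Redis(host="localhost", port=6379)
    monitor = FTPMonitor(db, redis_client)
    try:
        await poll_sources_once(monitor, db)
    finally:
        await monitor.close_all_connections()


if __name__ == "__main__":
    asyncio.run(main())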