""" FTP monitoring component for detecting and downloading new time series data files. Handles multiple FTP servers with different configurations and file patterns. """ import asyncio import ftplib import ftputil from ftputil import FTPHost from datetime import datetime, timedelta from typing import List, Dict, Any, Optional import logging import io import os import hashlib import json from pathlib import Path import re import ssl logger = logging.getLogger(__name__) class FTPMonitor: """Monitors FTP servers for new time series data files""" def __init__(self, db, redis_client): self.db = db self.redis = redis_client self.download_cache = {} # Cache for downloaded files self.connection_pool = {} # Pool of FTP connections async def check_for_new_files(self, source: Dict[str, Any]) -> List[Dict[str, Any]]: """Check FTP server for new files matching the configured patterns""" try: ftp_config = source.get("ftp_config", {}) file_patterns = source.get("file_patterns", ["*.csv"]) if not ftp_config: logger.warning(f"No FTP config for source: {source['name']}") return [] # Connect to FTP server ftp_host = await self._get_ftp_connection(source) if not ftp_host: return [] new_files = [] remote_path = ftp_config.get("remote_path", "/") try: # List files in remote directory file_list = await self._list_remote_files(ftp_host, remote_path) # Filter files by patterns and check if they're new for file_info in file_list: filename = file_info["filename"] # Check if file matches any pattern if self._matches_patterns(filename, file_patterns): # Check if file is new (not processed before) if await self._is_new_file(source, file_info): new_files.append(file_info) logger.info(f"Found new file: {filename}") # Update last check timestamp await self.db.data_sources.update_one( {"_id": source["_id"]}, {"$set": {"last_check": datetime.utcnow()}} ) except Exception as e: logger.error(f"Error listing files from FTP: {e}") await self._close_ftp_connection(source["_id"]) return new_files except Exception as e: logger.error(f"Error checking for new files in source {source['name']}: {e}") return [] async def download_file(self, source: Dict[str, Any], file_info: Dict[str, Any]) -> bytes: """Download a file from FTP server""" try: ftp_host = await self._get_ftp_connection(source) if not ftp_host: raise Exception("Cannot establish FTP connection") filename = file_info["filename"] remote_path = source["ftp_config"].get("remote_path", "/") full_path = f"{remote_path.rstrip('/')}/{filename}" logger.info(f"Downloading file: {full_path}") # Download file content file_content = await self._download_file_content(ftp_host, full_path) # Mark file as processed await self._mark_file_processed(source, file_info) # Cache file info for future reference await self._cache_file_info(source, file_info, len(file_content)) logger.info(f"Successfully downloaded {filename} ({len(file_content)} bytes)") return file_content except Exception as e: logger.error(f"Error downloading file {file_info.get('filename', 'unknown')}: {e}") raise async def test_connection(self, source: Dict[str, Any]) -> bool: """Test FTP connection for a data source""" try: ftp_config = source.get("ftp_config", {}) if not ftp_config: return False # Try to establish connection ftp_host = await self._create_ftp_connection(ftp_config) if ftp_host: # Try to list remote directory remote_path = ftp_config.get("remote_path", "/") try: await self._list_remote_files(ftp_host, remote_path, limit=1) success = True except: success = False # Close connection try: await asyncio.get_event_loop().run_in_executor( None, ftp_host.close ) except: pass return success return False except Exception as e: logger.error(f"Error testing FTP connection: {e}") return False async def get_file_metadata(self, source: Dict[str, Any], filename: str) -> Optional[Dict[str, Any]]: """Get metadata for a specific file""" try: ftp_host = await self._get_ftp_connection(source) if not ftp_host: return None remote_path = source["ftp_config"].get("remote_path", "/") full_path = f"{remote_path.rstrip('/')}/{filename}" # Get file stats def get_file_stat(): try: return ftp_host.stat(full_path) except: return None stat_info = await asyncio.get_event_loop().run_in_executor(None, get_file_stat) if stat_info: return { "filename": filename, "size": stat_info.st_size, "modified_time": datetime.fromtimestamp(stat_info.st_mtime), "full_path": full_path } return None except Exception as e: logger.error(f"Error getting file metadata for {filename}: {e}") return None async def _get_ftp_connection(self, source: Dict[str, Any]): """Get or create FTP connection for a source""" source_id = str(source["_id"]) # Check if we have a cached connection if source_id in self.connection_pool: connection = self.connection_pool[source_id] try: # Test if connection is still alive await asyncio.get_event_loop().run_in_executor( None, lambda: connection.getcwd() ) return connection except: # Connection is dead, remove from pool del self.connection_pool[source_id] # Create new connection ftp_config = source.get("ftp_config", {}) connection = await self._create_ftp_connection(ftp_config) if connection: self.connection_pool[source_id] = connection return connection async def _create_ftp_connection(self, ftp_config: Dict[str, Any]): """Create a new FTP connection""" try: host = ftp_config.get("host") port = ftp_config.get("port", 21) username = ftp_config.get("username", "anonymous") password = ftp_config.get("password", "") use_ssl = ftp_config.get("use_ssl", False) passive_mode = ftp_config.get("passive_mode", True) if not host: raise ValueError("FTP host not specified") def create_connection(): if use_ssl: # Use FTPS (FTP over SSL/TLS) ftp = ftplib.FTP_TLS() ftp.connect(host, port) ftp.login(username, password) ftp.prot_p() # Enable protection for data channel else: # Use regular FTP ftp = ftplib.FTP() ftp.connect(host, port) ftp.login(username, password) ftp.set_pasv(passive_mode) # Create FTPHost wrapper for easier file operations ftp_host = FTPHost.from_ftp_client(ftp) return ftp_host # Create connection in thread pool to avoid blocking ftp_host = await asyncio.get_event_loop().run_in_executor( None, create_connection ) logger.info(f"Successfully connected to FTP server: {host}:{port}") return ftp_host except Exception as e: logger.error(f"Error creating FTP connection to {ftp_config.get('host', 'unknown')}: {e}") return None async def _close_ftp_connection(self, source_id: str): """Close FTP connection for a source""" if source_id in self.connection_pool: try: connection = self.connection_pool[source_id] await asyncio.get_event_loop().run_in_executor( None, connection.close ) except: pass finally: del self.connection_pool[source_id] async def _list_remote_files(self, ftp_host, remote_path: str, limit: Optional[int] = None) -> List[Dict[str, Any]]: """List files in remote FTP directory""" def list_files(): files = [] try: # Change to remote directory ftp_host.chdir(remote_path) # Get file list with details file_list = ftp_host.listdir(".") for filename in file_list: try: # Get file stats file_path = f"{remote_path.rstrip('/')}/{filename}" stat_info = ftp_host.stat(filename) # Skip directories if not ftp_host.path.isfile(filename): continue file_info = { "filename": filename, "full_path": file_path, "size": stat_info.st_size, "modified_time": datetime.fromtimestamp(stat_info.st_mtime), "created_time": datetime.fromtimestamp(stat_info.st_ctime) if hasattr(stat_info, 'st_ctime') else None } files.append(file_info) if limit and len(files) >= limit: break except Exception as e: logger.warning(f"Error getting stats for file {filename}: {e}") continue except Exception as e: logger.error(f"Error listing directory {remote_path}: {e}") raise return files return await asyncio.get_event_loop().run_in_executor(None, list_files) async def _download_file_content(self, ftp_host, file_path: str) -> bytes: """Download file content from FTP server""" def download(): bio = io.BytesIO() try: ftp_host.download(file_path, bio) bio.seek(0) return bio.read() finally: bio.close() return await asyncio.get_event_loop().run_in_executor(None, download) def _matches_patterns(self, filename: str, patterns: List[str]) -> bool: """Check if filename matches any of the specified patterns""" for pattern in patterns: # Convert shell pattern to regex regex_pattern = pattern.replace("*", ".*").replace("?", ".") if re.match(regex_pattern, filename, re.IGNORECASE): return True return False async def _is_new_file(self, source: Dict[str, Any], file_info: Dict[str, Any]) -> bool: """Check if file is new (hasn't been processed before)""" try: filename = file_info["filename"] file_size = file_info["size"] modified_time = file_info["modified_time"] # Create file signature file_signature = hashlib.md5( f"{filename}_{file_size}_{modified_time.timestamp()}".encode() ).hexdigest() # Check if we've processed this file before processed_file = await self.db.processed_files.find_one({ "source_id": source["_id"], "file_signature": file_signature }) return processed_file is None except Exception as e: logger.error(f"Error checking if file is new: {e}") return True # Assume it's new if we can't check async def _mark_file_processed(self, source: Dict[str, Any], file_info: Dict[str, Any]): """Mark file as processed""" try: filename = file_info["filename"] file_size = file_info["size"] modified_time = file_info["modified_time"] # Create file signature file_signature = hashlib.md5( f"{filename}_{file_size}_{modified_time.timestamp()}".encode() ).hexdigest() # Record processed file processed_record = { "source_id": source["_id"], "source_name": source["name"], "filename": filename, "file_signature": file_signature, "file_size": file_size, "modified_time": modified_time, "processed_at": datetime.utcnow() } await self.db.processed_files.insert_one(processed_record) except Exception as e: logger.error(f"Error marking file as processed: {e}") async def _cache_file_info(self, source: Dict[str, Any], file_info: Dict[str, Any], content_size: int): """Cache file information for monitoring""" try: cache_key = f"file_cache:{source['_id']}:{file_info['filename']}" cache_data = { "filename": file_info["filename"], "size": file_info["size"], "content_size": content_size, "downloaded_at": datetime.utcnow().isoformat(), "source_name": source["name"] } # Store in Redis with 7-day expiration await self.redis.setex( cache_key, 7 * 24 * 3600, # 7 days json.dumps(cache_data) ) except Exception as e: logger.error(f"Error caching file info: {e}") async def get_processing_history(self, source_id: str, limit: int = 50) -> List[Dict[str, Any]]: """Get processing history for a data source""" try: cursor = self.db.processed_files.find( {"source_id": source_id} ).sort("processed_at", -1).limit(limit) history = [] async for record in cursor: record["_id"] = str(record["_id"]) record["source_id"] = str(record["source_id"]) if "processed_at" in record: record["processed_at"] = record["processed_at"].isoformat() if "modified_time" in record: record["modified_time"] = record["modified_time"].isoformat() history.append(record) return history except Exception as e: logger.error(f"Error getting processing history: {e}") return [] async def cleanup_old_records(self, days: int = 30): """Clean up old processed file records""" try: cutoff_date = datetime.utcnow() - timedelta(days=days) result = await self.db.processed_files.delete_many({ "processed_at": {"$lt": cutoff_date} }) logger.info(f"Cleaned up {result.deleted_count} old processed file records") except Exception as e: logger.error(f"Error cleaning up old records: {e}") async def close_all_connections(self): """Close all FTP connections""" for source_id in list(self.connection_pool.keys()): await self._close_ftp_connection(source_id) logger.info("Closed all FTP connections")