Implement iterative FTP scan and skip logic with processed file cache

- Add iterative directory scanning to prevent infinite recursion
- Cache processed files in memory to avoid redundant database lookups
- Skip already processed files using cache and database fallback
- Add tests for skip logic and iterative scan behavior
- Change logging for MongoDB connection and file storage to debug level
- Clean up FastAPI app and remove redundant docstrings
Author: rafaeldpsilva
Date: 2025-09-12 13:43:21 +01:00
parent a703240b27
commit aa07347604
8 changed files with 906 additions and 136 deletions

View File

@@ -33,6 +33,9 @@ async def lifespan(app: FastAPI):
# Initialize service registry # Initialize service registry
await service_registry.initialize() await service_registry.initialize()
# Register all services
await service_registry.register_services(SERVICES)
# Start health check task # Start health check task
asyncio.create_task(health_check_task()) asyncio.create_task(health_check_task())
@@ -66,51 +69,51 @@ auth_middleware = AuthMiddleware()
# Service configuration # Service configuration
SERVICES = { SERVICES = {
"token-service": ServiceConfig( # "token-service": ServiceConfig(
name="token-service", # name="token-service",
base_url=os.getenv("TOKEN_SERVICE_URL", "http://energy-token-service:8001"), # base_url=os.getenv("TOKEN_SERVICE_URL", "http://token-service:8001"),
health_endpoint="/health", # health_endpoint="/health",
auth_required=False # auth_required=False
), # ),
"battery-service": ServiceConfig( # "battery-service": ServiceConfig(
name="battery-service", # name="battery-service",
base_url=os.getenv("BATTERY_SERVICE_URL", "http://energy-battery-service:8002"), # base_url=os.getenv("BATTERY_SERVICE_URL", "http://battery-service:8002"),
health_endpoint="/health", # health_endpoint="/health",
auth_required=True # auth_required=True
), # ),
"demand-response-service": ServiceConfig( # "demand-response-service": ServiceConfig(
name="demand-response-service", # name="demand-response-service",
base_url=os.getenv("DEMAND_RESPONSE_SERVICE_URL", "http://energy-demand-response-service:8003"), # base_url=os.getenv("DEMAND_RESPONSE_SERVICE_URL", "http://demand-response-service:8003"),
health_endpoint="/health", # health_endpoint="/health",
auth_required=True # auth_required=True
), # ),
"p2p-trading-service": ServiceConfig( # "p2p-trading-service": ServiceConfig(
name="p2p-trading-service", # name="p2p-trading-service",
base_url=os.getenv("P2P_TRADING_SERVICE_URL", "http://energy-p2p-trading-service:8004"), # base_url=os.getenv("P2P_TRADING_SERVICE_URL", "http://p2p-trading-service:8004"),
health_endpoint="/health", # health_endpoint="/health",
auth_required=True # auth_required=True
), # ),
"forecasting-service": ServiceConfig( # "forecasting-service": ServiceConfig(
name="forecasting-service", # name="forecasting-service",
base_url=os.getenv("FORECASTING_SERVICE_URL", "http://energy-forecasting-service:8005"), # base_url=os.getenv("FORECASTING_SERVICE_URL", "http://forecasting-service:8005"),
health_endpoint="/health", # health_endpoint="/health",
auth_required=True # auth_required=True
), # ),
"iot-control-service": ServiceConfig( # "iot-control-service": ServiceConfig(
name="iot-control-service", # name="iot-control-service",
base_url=os.getenv("IOT_CONTROL_SERVICE_URL", "http://energy-iot-control-service:8006"), # base_url=os.getenv("IOT_CONTROL_SERVICE_URL", "http://iot-control-service:8006"),
health_endpoint="/health", # health_endpoint="/health",
auth_required=True # auth_required=True
), # ),
"sensor-service": ServiceConfig( "sensor-service": ServiceConfig(
name="sensor-service", name="sensor-service",
base_url=os.getenv("SENSOR_SERVICE_URL", "http://energy-sensor-service:8007"), base_url=os.getenv("SENSOR_SERVICE_URL", "http://sensor-service:8007"),
health_endpoint="/health", health_endpoint="/health",
auth_required=True auth_required=True
), ),
"data-ingestion-service": ServiceConfig( "data-ingestion-service": ServiceConfig(
name="data-ingestion-service", name="data-ingestion-service",
base_url=os.getenv("DATA_INGESTION_SERVICE_URL", "http://energy-data-ingestion-service:8008"), base_url=os.getenv("DATA_INGESTION_SERVICE_URL", "http://data-ingestion-service:8008"),
health_endpoint="/health", health_endpoint="/health",
auth_required=False auth_required=False
) )
@@ -437,9 +440,6 @@ async def health_check_task():
logger.error(f"Error in health check task: {e}") logger.error(f"Error in health check task: {e}")
await asyncio.sleep(60) await asyncio.sleep(60)
# Initialize service registry with services
asyncio.create_task(service_registry.register_services(SERVICES))
if __name__ == "__main__": if __name__ == "__main__":
import uvicorn import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000) uvicorn.run(app, host="0.0.0.0", port=8000)
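
For context, a minimal sketch of the startup ordering this hunk changes: service registration now happens inside the FastAPI lifespan instead of via a module-level asyncio.create_task(), so it runs on the live event loop and completes before the gateway serves traffic. The ServiceRegistry and ServiceConfig classes below are simplified stand-ins, not the project's real implementations.

```python
import asyncio
from contextlib import asynccontextmanager
from dataclasses import dataclass
from fastapi import FastAPI

@dataclass
class ServiceConfig:            # hypothetical stand-in for the real ServiceConfig
    name: str
    base_url: str
    health_endpoint: str = "/health"
    auth_required: bool = True

class ServiceRegistry:          # hypothetical stand-in for the real registry
    def __init__(self) -> None:
        self.services: dict[str, ServiceConfig] = {}

    async def initialize(self) -> None:
        self.services.clear()

    async def register_services(self, configs: dict[str, ServiceConfig]) -> None:
        self.services.update(configs)

service_registry = ServiceRegistry()
SERVICES = {
    "sensor-service": ServiceConfig("sensor-service", "http://sensor-service:8007"),
}

@asynccontextmanager
async def lifespan(app: FastAPI):
    await service_registry.initialize()
    await service_registry.register_services(SERVICES)  # awaited before requests arrive
    yield

app = FastAPI(lifespan=lifespan)
```

A bare create_task() at import time, by contrast, may be scheduled before the loop is fully set up and is never awaited, which is presumably why the commit removes it.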

View File

@@ -12,8 +12,8 @@ FTP_CONFIG: Dict[str, Any] = {
"host": os.getenv("FTP_SA4CPS_HOST", "ftp.sa4cps.pt"), "host": os.getenv("FTP_SA4CPS_HOST", "ftp.sa4cps.pt"),
"username": os.getenv("FTP_SA4CPS_USERNAME", "curvascarga@sa4cps.pt"), "username": os.getenv("FTP_SA4CPS_USERNAME", "curvascarga@sa4cps.pt"),
"password": os.getenv("FTP_SA4CPS_PASSWORD", 'n$WFtz9+bleN'), # Set via environment variable "password": os.getenv("FTP_SA4CPS_PASSWORD", 'n$WFtz9+bleN'), # Set via environment variable
"base_path": os.getenv("FTP_SA4CPS_REMOTE_PATH", "/SLGs/Faial/"), "base_path": os.getenv("FTP_SA4CPS_REMOTE_PATH", "/SLGs/"),
"check_interval": int(os.getenv("FTP_CHECK_INTERVAL", "21600")) # 6 hours default "check_interval": int(os.getenv("FTP_CHECK_INTERVAL", "21600")), # 6 hours default
} }
# MongoDB Configuration # MongoDB Configuration
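
A small hedged sketch of how the widened default base path interacts with deployment configuration: FTP_SA4CPS_REMOTE_PATH still takes precedence when set, so an installation can pin the scan back to a single site while the code default now covers all of /SLGs/. The example path /SLGs/Faial/ is only illustrative.

```python
import os

# Default widened from "/SLGs/Faial/" to "/SLGs/"; the env var overrides it per deployment
base_path = os.getenv("FTP_SA4CPS_REMOTE_PATH", "/SLGs/")
print(base_path)  # "/SLGs/" unless e.g. FTP_SA4CPS_REMOTE_PATH=/SLGs/Faial/ is exported
```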

View File

@@ -58,7 +58,7 @@ class DatabaseManager:
"""Close MongoDB connection""" """Close MongoDB connection"""
if self.client: if self.client:
self.client.close() self.client.close()
logger.info("MongoDB connection closed") logger.debug("MongoDB connection closed")
async def ping(self): async def ping(self):
"""Test database connection""" """Test database connection"""
@@ -68,7 +68,7 @@ class DatabaseManager:
try: try:
# The ping command is cheap and does not require auth. # The ping command is cheap and does not require auth.
self.client.admin.command('ping') self.client.admin.command('ping')
logger.info("MongoDB ping successful") logger.debug("MongoDB ping successful")
except ConnectionFailure as e: except ConnectionFailure as e:
logger.error(f"MongoDB ping failed - Server not available: {e}") logger.error(f"MongoDB ping failed - Server not available: {e}")
raise raise
@@ -121,7 +121,7 @@ class DatabaseManager:
if records: if records:
result = self.collections['energy_data'].insert_many(records) result = self.collections['energy_data'].insert_many(records)
inserted_count = len(result.inserted_ids) inserted_count = len(result.inserted_ids)
logger.info(f"Stored {inserted_count} records from {filename}") logger.debug(f"Stored {inserted_count} records from {filename}")
return True return True
return False return False
@@ -163,6 +163,10 @@ class DatabaseManager:
logger.error(f"Error getting processed files: {e}") logger.error(f"Error getting processed files: {e}")
return [] return []
async def is_file_processed(self, filename: str) -> bool:
"""Check whether a file has already been processed"""
return filename in await self.get_processed_files()
async def get_file_info(self, filename: str) -> Optional[Dict[str, Any]]: async def get_file_info(self, filename: str) -> Optional[Dict[str, Any]]:
"""Get information about a specific file""" """Get information about a specific file"""
try: try:
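
The committed is_file_processed() fetches the full processed-file list on every call. A possible alternative, sketched below and not part of this commit, is a direct indexed lookup; the collection name "processed_files", the field "filename", and the connection URI are all assumptions for illustration.

```python
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")        # placeholder URI
processed = client["sa4cps"]["processed_files"]
processed.create_index("filename", unique=True)          # makes the lookup cheap

def is_file_processed(filename: str) -> bool:
    # count_documents with limit=1 returns 0 or 1 without scanning further matches
    return processed.count_documents({"filename": filename}, limit=1) > 0
```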

View File

@@ -48,11 +48,30 @@ class FTPMonitor:
logger.info(f"FTP Monitor initialized for {self.ftp_host}") logger.info(f"FTP Monitor initialized for {self.ftp_host}")
async def initialize_processed_files_cache(self):
"""Load already processed files from database into memory cache"""
try:
processed_file_names = await self.db_manager.get_processed_files()
# Convert filenames to full paths and add to processed_files set
for filename in processed_file_names:
# We'll use just the filename as the key since we check by filename
# But we need to be consistent with how we store paths
self.processed_files.add(filename)
logger.info(f"Loaded {len(processed_file_names)} already processed files from database")
return len(processed_file_names)
except Exception as e:
logger.error(f"Error loading processed files from database: {e}")
return 0
async def start_monitoring(self): async def start_monitoring(self):
"""Start the monitoring loop""" """Start the monitoring loop"""
self.status = "running" self.status = "running"
logger.info("Starting FTP monitoring loop") logger.info("Starting FTP monitoring loop")
# Initialize cache of processed files from database
await self.initialize_processed_files_cache()
while True: while True:
try: try:
await self.check_for_new_files() await self.check_for_new_files()
@@ -84,17 +103,35 @@ class FTPMonitor:
# Process new files # Process new files
processed_count = 0 processed_count = 0
skipped_count = 0
for file_info in new_files: for file_info in new_files:
if file_info.path not in self.processed_files: # Check if file is already processed (using filename for cache consistency)
if file_info.name in self.processed_files:
logger.debug(f"Skipping already processed file (cached): {file_info.name}")
skipped_count += 1
continue
# Double-check with database (in case cache missed something)
if await self.db_manager.is_file_processed(file_info.name):
logger.debug(f"Skipping already processed file (database): {file_info.name}")
# Add to cache to avoid future database checks
self.processed_files.add(file_info.name)
skipped_count += 1
continue
# Process the file
logger.debug(f"Processing new file: {file_info.name}")
success = await self._process_file(ftp, file_info) success = await self._process_file(ftp, file_info)
if success: if success:
self.processed_files.add(file_info.path) self.processed_files.add(file_info.name)
processed_count += 1 processed_count += 1
logger.debug(f"Successfully processed file: {file_info.name} ({processed_count} total)")
self.files_processed_count += 1 self.files_processed_count += 1
result = { result = {
"files_found": len(new_files), "files_found": len(new_files),
"files_processed": processed_count, "files_processed": processed_count,
"files_skipped": skipped_count,
"timestamp": self.last_check.isoformat() "timestamp": self.last_check.isoformat()
} }
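
A condensed sketch of the skip logic introduced above: an in-memory set answers most lookups, the database is consulted only on a cache miss, and any hit found only in the database is added to the cache so later checks stay in memory. SkipTracker is an illustrative name; in the commit this logic lives inline in FTPMonitor.initialize_processed_files_cache() and check_for_new_files().

```python
class SkipTracker:
    def __init__(self, db_manager) -> None:
        self.db = db_manager
        self.processed: set[str] = set()

    async def load_cache(self) -> int:
        # Warm the cache once at startup from the database
        for name in await self.db.get_processed_files():
            self.processed.add(name)
        return len(self.processed)

    async def should_skip(self, filename: str) -> bool:
        if filename in self.processed:                    # fast path: memory
            return True
        if await self.db.is_file_processed(filename):     # fallback: database
            self.processed.add(filename)                  # remember for next time
            return True
        return False
```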
@@ -110,26 +147,82 @@ class FTPMonitor:
files = [] files = []
try: try:
# Navigate to base path await self._scan_directories_iterative(ftp, self.base_path, files)
ftp.cwd(self.base_path) logger.info(f"Found {len(files)} .slg_v2 files across all directories")
logger.info(f"Scanning directory: {self.base_path}") return files
except Exception as e:
logger.error(f"Error scanning FTP directory: {e}")
return []
async def _scan_directories_iterative(self, ftp: FTP, base_path: str, files: List[FTPFileInfo]):
"""Iteratively scan directories for .slg_v2 files using a queue approach"""
# Queue of directories to scan: (directory_path, depth)
directories_to_scan = [(base_path, 0)]
visited_dirs = set()
while directories_to_scan:
current_dir, current_depth = directories_to_scan.pop(0) # FIFO queue
# Normalize directory path
normalized_path = current_dir.rstrip('/') if current_dir != '/' else '/'
# Skip if already visited (loop prevention)
if normalized_path in visited_dirs:
logger.debug(f"Skipping already visited directory: {normalized_path}")
continue
# Mark as visited
visited_dirs.add(normalized_path)
logger.debug(f"Scanning directory: {normalized_path} (depth: {current_depth}, queue: {len(directories_to_scan)})")
try:
# Navigate to directory
original_dir = ftp.pwd()
ftp.cwd(current_dir)
# Get directory listing # Get directory listing
dir_list = [] dir_list = []
ftp.retrlines('LIST', dir_list.append) ftp.retrlines('LIST', dir_list.append)
logger.info(f"Received {len(dir_list)} directory entries") logger.debug(f"Found {len(dir_list)} entries in {normalized_path}")
# Process entries
for line in dir_list: for line in dir_list:
print(line)
parts = line.split() parts = line.split()
if len(parts) >= 9: if len(parts) >= 9:
filename = parts[-1] filename = parts[-1]
# Check if it's a .slg_v2 file permissions = parts[0]
if filename.endswith('.sgl_v2'):
print('found file') # Skip current and parent directory references
if filename in ['.', '..']:
continue
# Handle directories
if permissions.startswith('d'):
# Create full subdirectory path
if normalized_path == '/':
subdirectory_path = f"/{filename}"
else:
subdirectory_path = f"{normalized_path}/{filename}"
# Normalize subdirectory path
subdirectory_normalized = subdirectory_path.rstrip('/') if subdirectory_path != '/' else '/'
# Add to queue if not already visited
if subdirectory_normalized not in visited_dirs:
directories_to_scan.append((subdirectory_path, current_depth + 1))
logger.debug(f"Added to queue: {subdirectory_path}")
else:
logger.debug(f"Skipping already visited: {subdirectory_path}")
# Handle .slg_v2 files
elif filename.endswith('.sgl_v2'):
logger.debug(f"Found .slg_v2 file: {filename} in {normalized_path}")
try: try:
size = int(parts[4]) size = int(parts[4])
full_path = f"{self.base_path.rstrip('/')}/{filename}" if normalized_path == '/':
full_path = f"/{filename}"
else:
full_path = f"{normalized_path}/{filename}"
files.append(FTPFileInfo( files.append(FTPFileInfo(
path=full_path, path=full_path,
@@ -140,25 +233,29 @@ class FTPMonitor:
except (ValueError, IndexError): except (ValueError, IndexError):
logger.warning(f"Could not parse file info for: {filename}") logger.warning(f"Could not parse file info for: {filename}")
logger.info(f"Found {len(files)} .slg_v2 files") # Return to original directory
return files ftp.cwd(original_dir)
logger.debug(f"Completed scanning: {normalized_path}")
except Exception as e: except Exception as e:
logger.error(f"Error scanning FTP directory: {e}") logger.warning(f"Error scanning directory {normalized_path}: {e}")
return [] continue
logger.info(f"Iterative scan completed. Visited {len(visited_dirs)} directories")
async def _process_file(self, ftp: FTP, file_info: FTPFileInfo) -> bool: async def _process_file(self, ftp: FTP, file_info: FTPFileInfo) -> bool:
"""Download and process a .slg_v2 file""" """Download and process a .slg_v2 file"""
logger.info(f"Processing file: {file_info.name} ({file_info.size} bytes)") logger.debug(f"Processing file: {file_info.path} ({file_info.size} bytes)")
try: try:
# Create temporary file for download # Create temporary file for download
with tempfile.NamedTemporaryFile(mode='wb', suffix='.slg_v2', delete=False) as temp_file: with tempfile.NamedTemporaryFile(mode='wb', suffix='.slg_v2', delete=False) as temp_file:
temp_path = temp_file.name temp_path = temp_file.name
# Download file # Download file using full path
with open(temp_path, 'wb') as f: with open(temp_path, 'wb') as f:
ftp.retrbinary(f'RETR {file_info.name}', f.write) # Use the full path for RETR command
ftp.retrbinary(f'RETR {file_info.path}', f.write)
# Process the downloaded file # Process the downloaded file
records = await self.processor.process_file(temp_path, file_info.name) records = await self.processor.process_file(temp_path, file_info.name)
@@ -166,7 +263,7 @@ class FTPMonitor:
# Store in database # Store in database
if records: if records:
await self.db_manager.store_file_data(file_info.name, records) await self.db_manager.store_file_data(file_info.name, records)
logger.info(f"Stored {len(records)} records from {file_info.name}") logger.debug(f"Stored {len(records)} records from {file_info.name}")
return True return True
else: else:
logger.warning(f"No valid records found in {file_info.name}") logger.warning(f"No valid records found in {file_info.name}")
@@ -205,5 +302,5 @@ class FTPMonitor:
"processed_files_count": len(self.processed_files), "processed_files_count": len(self.processed_files),
"check_interval_hours": self.check_interval / 3600, "check_interval_hours": self.check_interval / 3600,
"ftp_host": self.ftp_host, "ftp_host": self.ftp_host,
"base_path": self.base_path "base_path": self.base_path,
} }

View File

@@ -1,56 +1,43 @@
"""
SA4CPS Data Ingestion Service
Simple FTP monitoring service for .sgl_v2 files with MongoDB storage
"""
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
import asyncio import asyncio
import logging import logging
from datetime import datetime from datetime import datetime
from typing import Dict, Any from typing import Any
from ftp_monitor import FTPMonitor from ftp_monitor import FTPMonitor
from database import DatabaseManager from database import DatabaseManager
# Configure logging
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Global services
ftp_monitor = None ftp_monitor = None
db_manager = None db_manager = None
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
"""Application lifespan management"""
global ftp_monitor, db_manager global ftp_monitor, db_manager
logger.info("Starting SA4CPS Data Ingestion Service...") logger.info("Starting SA4CPS Data Ingestion Service...")
# Initialize database connection
db_manager = DatabaseManager() db_manager = DatabaseManager()
await db_manager.connect() await db_manager.connect()
# Initialize FTP monitor
ftp_monitor = FTPMonitor(db_manager) ftp_monitor = FTPMonitor(db_manager)
# Start background monitoring task
monitoring_task = asyncio.create_task(ftp_monitor.start_monitoring()) monitoring_task = asyncio.create_task(ftp_monitor.start_monitoring())
logger.info("Service started successfully") logger.info("Service started successfully")
yield yield
# Cleanup on shutdown
logger.info("Shutting down service...") logger.info("Shutting down service...")
monitoring_task.cancel() monitoring_task.cancel()
await db_manager.close() await db_manager.close()
logger.info("Service shutdown complete") logger.info("Service shutdown complete")
# Create FastAPI app
app = FastAPI( app = FastAPI(
title="SA4CPS Data Ingestion Service", title="SA4CPS Data Ingestion Service",
description="Monitors FTP server for .sgl_v2 files and stores data in MongoDB", description="Monitors FTP server for .sgl_v2 files and stores data in MongoDB",
@@ -61,7 +48,6 @@ app = FastAPI(
@app.get("/") @app.get("/")
async def root(): async def root():
"""Root endpoint"""
return { return {
"service": "SA4CPS Data Ingestion Service", "service": "SA4CPS Data Ingestion Service",
"status": "running", "status": "running",
@@ -71,7 +57,6 @@ async def root():
@app.get("/health") @app.get("/health")
async def health_check(): async def health_check():
"""Health check endpoint"""
global ftp_monitor, db_manager global ftp_monitor, db_manager
health_status = { health_status = {
@@ -101,7 +86,6 @@ async def health_check():
@app.get("/status") @app.get("/status")
async def get_status(): async def get_status():
"""Detailed status endpoint"""
global ftp_monitor, db_manager global ftp_monitor, db_manager
if not ftp_monitor: if not ftp_monitor:
@@ -116,7 +100,6 @@ async def get_status():
@app.post("/trigger-check") @app.post("/trigger-check")
async def trigger_manual_check(): async def trigger_manual_check():
"""Manually trigger FTP check"""
global ftp_monitor global ftp_monitor
if not ftp_monitor: if not ftp_monitor:
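
A minimal sketch of the startup/shutdown flow kept by this cleanup: the monitor runs as a background task started in the lifespan and cancelled on shutdown. The monitor_loop below is a trivial stand-in for FTPMonitor.start_monitoring(); the commit cancels without awaiting, whereas awaiting the cancelled task, as shown here, is one way to make shutdown fully deterministic.

```python
import asyncio
from contextlib import asynccontextmanager
from fastapi import FastAPI

async def monitor_loop() -> None:
    while True:                      # stand-in for FTPMonitor.start_monitoring()
        await asyncio.sleep(3600)

@asynccontextmanager
async def lifespan(app: FastAPI):
    task = asyncio.create_task(monitor_loop())
    yield
    task.cancel()                    # request cancellation on shutdown
    try:
        await task                   # let the CancelledError settle before exit
    except asyncio.CancelledError:
        pass

app = FastAPI(lifespan=lifespan)
```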

View File

@@ -0,0 +1,357 @@
#!/usr/bin/env python3
"""
Test database skip functionality
Tests that already processed files are skipped to avoid reprocessing
"""
import asyncio
import sys
import os
from pathlib import Path
from unittest.mock import MagicMock, patch, AsyncMock
from typing import List
# Add src directory to path
sys.path.append(str(Path(__file__).parent.parent / "src"))
from ftp_monitor import FTPMonitor, FTPFileInfo
class MockDatabaseManager:
"""Mock database manager for testing skip functionality"""
def __init__(self):
self.processed_files = set()
self.stored_files = {}
async def is_file_processed(self, filename: str) -> bool:
"""Mock check if file is processed"""
return filename in self.processed_files
async def get_processed_files(self) -> List[str]:
"""Mock get list of processed files"""
return list(self.processed_files)
async def store_file_data(self, filename: str, records: List) -> bool:
"""Mock store file data"""
self.processed_files.add(filename)
self.stored_files[filename] = records
return True
def mark_as_processed(self, filename: str):
"""Helper method to mark file as processed for testing"""
self.processed_files.add(filename)
class MockFTP:
"""Mock FTP client"""
def __init__(self, directory_structure):
self.directory_structure = directory_structure
self.current_dir = '/'
def pwd(self):
return self.current_dir
def cwd(self, path):
if path in self.directory_structure:
self.current_dir = path
else:
raise Exception(f"Directory not found: {path}")
def retrlines(self, command, callback):
"""Mock LIST command"""
if not command.startswith('LIST'):
raise Exception(f"Unsupported command: {command}")
current_struct = self.directory_structure.get(self.current_dir, {})
# Add files
for filename in current_struct.get('files', []):
callback(f"-rw-r--r-- 1 user group 1024 Jan 01 12:00 {filename}")
async def test_skip_already_processed_files():
"""Test that already processed files are skipped"""
print("🧪 Testing skip already processed files")
print("-" * 40)
# Create mock directory with files
directory_structure = {
'/': {
'files': ['file1.sgl_v2', 'file2.sgl_v2', 'file3.sgl_v2']
}
}
# Create mock database with some files already processed
mock_db = MockDatabaseManager()
mock_db.mark_as_processed('file1.sgl_v2') # Already processed
mock_db.mark_as_processed('file3.sgl_v2') # Already processed
# file2.sgl_v2 is NOT processed
with patch('ftp_monitor.FTP_CONFIG', {
'host': 'test.example.com',
'username': 'testuser',
'password': 'testpass',
'base_path': '/',
'check_interval': 3600,
'recursive_scan': False,
'max_recursion_depth': 5
}):
# Create FTP monitor with mock database
monitor = FTPMonitor(mock_db)
# Initialize cache from database
cache_count = await monitor.initialize_processed_files_cache()
print(f" Loaded {cache_count} files from database cache")
# Verify cache was loaded correctly
assert cache_count == 2, f"Expected 2 cached files, got {cache_count}"
assert 'file1.sgl_v2' in monitor.processed_files
assert 'file3.sgl_v2' in monitor.processed_files
assert 'file2.sgl_v2' not in monitor.processed_files
mock_ftp = MockFTP(directory_structure)
# Mock the _process_file method to track which files are processed
processed_files = []
original_process_file = monitor._process_file
async def mock_process_file(ftp, file_info):
processed_files.append(file_info.name)
return True
monitor._process_file = mock_process_file
# Test file processing
result = await monitor.check_for_new_files()
print(f"✅ Processing complete")
print(f" Files found: {result['files_found']}")
print(f" Files processed: {result['files_processed']}")
print(f" Files skipped: {result['files_skipped']}")
# Verify results
assert result['files_found'] == 3, "Should find 3 files total"
assert result['files_processed'] == 1, "Should process only 1 new file"
assert result['files_skipped'] == 2, "Should skip 2 already processed files"
# Verify only file2.sgl_v2 was processed
assert len(processed_files) == 1, f"Expected 1 processed file, got {len(processed_files)}"
assert 'file2.sgl_v2' in processed_files, "Should process file2.sgl_v2"
print("✅ Skip already processed files test passed")
async def test_database_lookup_fallback():
"""Test that database lookup works when cache misses"""
print("\n🧪 Testing database lookup fallback")
print("-" * 40)
# Create mock directory with files
directory_structure = {
'/': {
'files': ['new_file.sgl_v2', 'db_only_file.sgl_v2']
}
}
# Create mock database
mock_db = MockDatabaseManager()
# Simulate a file that exists in database but not in cache
mock_db.mark_as_processed('db_only_file.sgl_v2')
with patch('ftp_monitor.FTP_CONFIG', {
'host': 'test.example.com',
'username': 'testuser',
'password': 'testpass',
'base_path': '/',
'check_interval': 3600,
'recursive_scan': False,
'max_recursion_depth': 5
}):
monitor = FTPMonitor(mock_db)
# Don't initialize cache - simulate starting with empty cache
# but database has processed files
mock_ftp = MockFTP(directory_structure)
# Mock the _process_file method
processed_files = []
async def mock_process_file(ftp, file_info):
processed_files.append(file_info.name)
return True
monitor._process_file = mock_process_file
# Test file processing
result = await monitor.check_for_new_files()
print(f"✅ Database fallback test complete")
print(f" Files found: {result['files_found']}")
print(f" Files processed: {result['files_processed']}")
print(f" Files skipped: {result['files_skipped']}")
# Verify results
assert result['files_found'] == 2, "Should find 2 files total"
assert result['files_processed'] == 1, "Should process only 1 new file"
assert result['files_skipped'] == 1, "Should skip 1 database-processed file"
# Verify only new_file.sgl_v2 was processed
assert len(processed_files) == 1, f"Expected 1 processed file, got {len(processed_files)}"
assert 'new_file.sgl_v2' in processed_files, "Should process new_file.sgl_v2"
# Verify cache was updated with database file
assert 'db_only_file.sgl_v2' in monitor.processed_files, "Cache should be updated with database file"
print("✅ Database lookup fallback test passed")
async def test_cache_initialization():
"""Test that cache is properly initialized from database"""
print("\n🧪 Testing cache initialization")
print("-" * 35)
# Create mock database with processed files
mock_db = MockDatabaseManager()
mock_db.mark_as_processed('old_file1.sgl_v2')
mock_db.mark_as_processed('old_file2.sgl_v2')
mock_db.mark_as_processed('old_file3.sgl_v2')
with patch('ftp_monitor.FTP_CONFIG', {
'host': 'test.example.com',
'username': 'testuser',
'password': 'testpass',
'base_path': '/',
'check_interval': 3600,
'recursive_scan': False,
'max_recursion_depth': 5
}):
monitor = FTPMonitor(mock_db)
# Verify cache starts empty
assert len(monitor.processed_files) == 0, "Cache should start empty"
# Initialize cache
cache_count = await monitor.initialize_processed_files_cache()
print(f"✅ Cache initialized with {cache_count} files")
# Verify cache is populated
assert cache_count == 3, f"Expected 3 cached files, got {cache_count}"
assert len(monitor.processed_files) == 3, "Cache should contain 3 files"
expected_files = {'old_file1.sgl_v2', 'old_file2.sgl_v2', 'old_file3.sgl_v2'}
assert monitor.processed_files == expected_files, "Cache should contain expected files"
print("✅ Cache initialization test passed")
async def test_performance_with_many_processed_files():
"""Test performance with many already processed files"""
print("\n🧪 Testing performance with many processed files")
print("-" * 50)
# Create many files, mostly already processed
all_files = [f"file_{i:04d}.sgl_v2" for i in range(100)]
new_files = [f"new_file_{i}.sgl_v2" for i in range(3)]
directory_structure = {
'/': {
'files': all_files + new_files
}
}
# Create mock database with most files already processed
mock_db = MockDatabaseManager()
for filename in all_files:
mock_db.mark_as_processed(filename)
with patch('ftp_monitor.FTP_CONFIG', {
'host': 'test.example.com',
'username': 'testuser',
'password': 'testpass',
'base_path': '/',
'check_interval': 3600,
'recursive_scan': False,
'max_recursion_depth': 5
}):
monitor = FTPMonitor(mock_db)
# Initialize cache
cache_count = await monitor.initialize_processed_files_cache()
print(f" Loaded {cache_count} files into cache")
mock_ftp = MockFTP(directory_structure)
# Mock the _process_file method to track processing
processed_files = []
db_lookups = 0
# Track database lookups
original_is_file_processed = mock_db.is_file_processed
async def tracked_is_file_processed(filename):
nonlocal db_lookups
db_lookups += 1
return await original_is_file_processed(filename)
mock_db.is_file_processed = tracked_is_file_processed
async def mock_process_file(ftp, file_info):
processed_files.append(file_info.name)
return True
monitor._process_file = mock_process_file
# Test file processing
result = await monitor.check_for_new_files()
print(f"✅ Performance test complete")
print(f" Files found: {result['files_found']}")
print(f" Files processed: {result['files_processed']}")
print(f" Files skipped: {result['files_skipped']}")
print(f" Database lookups: {db_lookups}")
# Verify results
assert result['files_found'] == 103, "Should find 103 files total"
assert result['files_processed'] == 3, "Should process only 3 new files"
assert result['files_skipped'] == 100, "Should skip 100 already processed files"
# Verify performance: should have minimal database lookups due to caching
assert db_lookups == 3, f"Should have only 3 database lookups (for new files), got {db_lookups}"
# Verify only new files were processed
assert len(processed_files) == 3, f"Expected 3 processed files, got {len(processed_files)}"
for new_file in new_files:
assert new_file in processed_files, f"Should process {new_file}"
print("✅ Performance test passed")
async def main():
"""Main test function"""
print("🚀 Database Skip Functionality Test Suite")
print("=" * 50)
try:
await test_skip_already_processed_files()
await test_database_lookup_fallback()
await test_cache_initialization()
await test_performance_with_many_processed_files()
print("\n" + "=" * 50)
print("✅ All database skip tests passed!")
print("💾 File duplication prevention is working correctly")
print("🚀 Performance optimizations are effective")
except Exception as e:
print(f"\n❌ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,330 @@
#!/usr/bin/env python3
"""
Test FTP Monitor iterative directory scanning
Tests the new queue-based approach that prevents infinite loops
"""
import asyncio
import sys
import os
from pathlib import Path
from unittest.mock import MagicMock, patch
from typing import List
# Add src directory to path
sys.path.append(str(Path(__file__).parent.parent / "src"))
from ftp_monitor import FTPMonitor, FTPFileInfo
class MockFTP:
"""Mock FTP client for testing iterative scanning"""
def __init__(self, directory_structure):
self.directory_structure = directory_structure
self.current_dir = '/'
self.operations_log = [] # Track all operations for debugging
def pwd(self):
return self.current_dir
def cwd(self, path):
self.operations_log.append(f"CWD: {path}")
if path in self.directory_structure:
self.current_dir = path
else:
raise Exception(f"Directory not found: {path}")
def retrlines(self, command, callback):
"""Mock LIST command"""
if not command.startswith('LIST'):
raise Exception(f"Unsupported command: {command}")
self.operations_log.append(f"LIST: {self.current_dir}")
current_struct = self.directory_structure.get(self.current_dir, {})
# Add directories
for dirname in current_struct.get('directories', {}):
callback(f"drwxr-xr-x 2 user group 4096 Jan 01 12:00 {dirname}")
# Add files
for filename in current_struct.get('files', []):
callback(f"-rw-r--r-- 1 user group 1024 Jan 01 12:00 {filename}")
async def test_simple_directory_structure():
"""Test iterative scanning with simple nested structure"""
print("🧪 Testing simple directory structure")
print("-" * 40)
directory_structure = {
'/': {
'files': ['root.sgl_v2'],
'directories': {
'level1': {},
'level2': {}
}
},
'/level1': {
'files': ['file1.sgl_v2'],
'directories': {
'nested': {}
}
},
'/level1/nested': {
'files': ['nested.sgl_v2'],
'directories': {}
},
'/level2': {
'files': ['file2.sgl_v2'],
'directories': {}
}
}
mock_db = MagicMock()
with patch('ftp_monitor.FTP_CONFIG', {
'host': 'test.example.com',
'username': 'testuser',
'password': 'testpass',
'base_path': '/',
'check_interval': 3600,
'recursive_scan': True,
'max_recursion_depth': 10
}):
monitor = FTPMonitor(mock_db)
mock_ftp = MockFTP(directory_structure)
# Test iterative scan
files = []
await monitor._scan_directories_iterative(mock_ftp, '/', files)
print(f"✅ Found {len(files)} files")
print(f" Operations: {len(mock_ftp.operations_log)}")
# Verify all files were found
file_names = [f.name for f in files]
expected_files = ['root.sgl_v2', 'file1.sgl_v2', 'nested.sgl_v2', 'file2.sgl_v2']
assert len(files) == 4, f"Expected 4 files, got {len(files)}"
for expected_file in expected_files:
assert expected_file in file_names, f"Missing file: {expected_file}"
# Check that operations are reasonable (no infinite loops)
assert len(mock_ftp.operations_log) < 20, f"Too many operations: {len(mock_ftp.operations_log)}"
print("✅ Simple structure test passed")
async def test_circular_references():
"""Test that circular references are handled correctly"""
print("\n🧪 Testing circular references")
print("-" * 40)
# Create structure with circular reference
directory_structure = {
'/': {
'files': ['root.sgl_v2'],
'directories': {
'dirA': {}
}
},
'/dirA': {
'files': ['fileA.sgl_v2'],
'directories': {
'dirB': {}
}
},
'/dirA/dirB': {
'files': ['fileB.sgl_v2'],
'directories': {
'dirA': {} # This would create A -> B -> A loop in recursive approach
}
}
}
mock_db = MagicMock()
with patch('ftp_monitor.FTP_CONFIG', {
'host': 'test.example.com',
'username': 'testuser',
'password': 'testpass',
'base_path': '/',
'check_interval': 3600,
'recursive_scan': True,
'max_recursion_depth': 5
}):
monitor = FTPMonitor(mock_db)
mock_ftp = MockFTP(directory_structure)
# Test iterative scan
files = []
await monitor._scan_directories_iterative(mock_ftp, '/', files)
print(f"✅ Handled circular references")
print(f" Files found: {len(files)}")
print(f" Operations: {len(mock_ftp.operations_log)}")
# Should find all files without getting stuck
file_names = [f.name for f in files]
expected_files = ['root.sgl_v2', 'fileA.sgl_v2', 'fileB.sgl_v2']
assert len(files) == 3, f"Expected 3 files, got {len(files)}"
for expected_file in expected_files:
assert expected_file in file_names, f"Missing file: {expected_file}"
# Should not have excessive operations (indicating no infinite loop)
assert len(mock_ftp.operations_log) < 15, f"Too many operations: {len(mock_ftp.operations_log)}"
print("✅ Circular references test passed")
async def test_deep_structure_with_limit():
"""Test deep directory structure respects depth limit"""
print("\n🧪 Testing deep structure with depth limit")
print("-" * 45)
# Create deep structure
directory_structure = {
'/': {
'files': ['root.sgl_v2'],
'directories': {
'level1': {}
}
},
'/level1': {
'files': ['file1.sgl_v2'],
'directories': {
'level2': {}
}
},
'/level1/level2': {
'files': ['file2.sgl_v2'],
'directories': {
'level3': {}
}
},
'/level1/level2/level3': {
'files': ['deep_file.sgl_v2'], # Should not be found due to depth limit
'directories': {}
}
}
mock_db = MagicMock()
# Set low depth limit
with patch('ftp_monitor.FTP_CONFIG', {
'host': 'test.example.com',
'username': 'testuser',
'password': 'testpass',
'base_path': '/',
'check_interval': 3600,
'recursive_scan': True,
'max_recursion_depth': 2 # Should stop at level 2
}):
monitor = FTPMonitor(mock_db)
mock_ftp = MockFTP(directory_structure)
# Test iterative scan with depth limit
files = []
await monitor._scan_directories_iterative(mock_ftp, '/', files)
print(f"✅ Depth limit respected")
print(f" Files found: {len(files)}")
# Should find files up to depth 2, but not deeper
file_names = [f.name for f in files]
assert 'root.sgl_v2' in file_names, "Should find root file (depth 0)"
assert 'file1.sgl_v2' in file_names, "Should find level 1 file (depth 1)"
assert 'file2.sgl_v2' in file_names, "Should find level 2 file (depth 2)"
assert 'deep_file.sgl_v2' not in file_names, "Should NOT find deep file (depth 3)"
print("✅ Depth limit test passed")
async def test_queue_behavior():
"""Test that the queue processes directories in FIFO order"""
print("\n🧪 Testing queue FIFO behavior")
print("-" * 35)
directory_structure = {
'/': {
'files': [],
'directories': {
'first': {},
'second': {},
'third': {}
}
},
'/first': {
'files': ['first.sgl_v2'],
'directories': {}
},
'/second': {
'files': ['second.sgl_v2'],
'directories': {}
},
'/third': {
'files': ['third.sgl_v2'],
'directories': {}
}
}
mock_db = MagicMock()
with patch('ftp_monitor.FTP_CONFIG', {
'host': 'test.example.com',
'username': 'testuser',
'password': 'testpass',
'base_path': '/',
'check_interval': 3600,
'recursive_scan': True,
'max_recursion_depth': 5
}):
monitor = FTPMonitor(mock_db)
mock_ftp = MockFTP(directory_structure)
# Test iterative scan
files = []
await monitor._scan_directories_iterative(mock_ftp, '/', files)
print(f"✅ Queue behavior test completed")
print(f" Files found: {len(files)}")
# Should find all files
assert len(files) == 3, f"Expected 3 files, got {len(files)}"
file_names = [f.name for f in files]
expected_files = ['first.sgl_v2', 'second.sgl_v2', 'third.sgl_v2']
for expected_file in expected_files:
assert expected_file in file_names, f"Missing file: {expected_file}"
print("✅ Queue behavior test passed")
async def main():
"""Main test function"""
print("🚀 FTP Monitor Iterative Scanning Test Suite")
print("=" * 55)
try:
await test_simple_directory_structure()
await test_circular_references()
await test_deep_structure_with_limit()
await test_queue_behavior()
print("\n" + "=" * 55)
print("✅ All iterative scanning tests passed!")
print("🔄 Queue-based approach is working correctly")
except Exception as e:
print(f"\n❌ Test failed: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -4,7 +4,7 @@ services:
# Database Services # Database Services
mongodb: mongodb:
image: mongo:5.0 image: mongo:5.0
container_name: energy-mongodb container_name: mongodb
restart: unless-stopped restart: unless-stopped
environment: environment:
MONGO_INITDB_ROOT_USERNAME: admin MONGO_INITDB_ROOT_USERNAME: admin
@@ -19,7 +19,7 @@ services:
redis: redis:
image: redis:7-alpine image: redis:7-alpine
container_name: energy-redis container_name: redis
restart: unless-stopped restart: unless-stopped
ports: ports:
- "6379:6379" - "6379:6379"
@@ -33,7 +33,7 @@ services:
build: build:
context: ./api-gateway context: ./api-gateway
dockerfile: Dockerfile dockerfile: Dockerfile
container_name: energy-api-gateway container_name: api-gateway
restart: unless-stopped restart: unless-stopped
ports: ports:
- "8000:8000" - "8000:8000"
@@ -51,7 +51,7 @@ services:
depends_on: depends_on:
- mongodb - mongodb
- redis - redis
- token-service # - token-service
- sensor-service - sensor-service
- data-ingestion-service - data-ingestion-service
# - battery-service # - battery-service
@@ -60,28 +60,28 @@ services:
- energy-network - energy-network
# Token Management Service # Token Management Service
token-service: # token-service:
build: # build:
context: ./token-service # context: ./token-service
dockerfile: Dockerfile # dockerfile: Dockerfile
container_name: energy-token-service # container_name: token-service
restart: unless-stopped # restart: unless-stopped
ports: # ports:
- "8001:8001" # - "8001:8001"
environment: # environment:
- MONGO_URL=mongodb://admin:password123@localhost:27017/energy_dashboard_tokens?authSource=admin # - MONGO_URL=mongodb://admin:password123@localhost:27017/energy_dashboard_tokens?authSource=admin
- JWT_SECRET_KEY=your-super-secret-jwt-key-change-in-production # - JWT_SECRET_KEY=your-super-secret-jwt-key-change-in-production
depends_on: # depends_on:
- mongodb # - mongodb
networks: # networks:
- energy-network # - energy-network
# Battery Management Service # Battery Management Service
# battery-service: # battery-service:
# build: # build:
# context: ./battery-service # context: ./battery-service
# dockerfile: Dockerfile # dockerfile: Dockerfile
# container_name: energy-battery-service # container_name: battery-service
# restart: unless-stopped # restart: unless-stopped
# ports: # ports:
# - "8002:8002" # - "8002:8002"
@@ -99,7 +99,7 @@ services:
# build: # build:
# context: ./demand-response-service # context: ./demand-response-service
# dockerfile: Dockerfile # dockerfile: Dockerfile
# container_name: energy-demand-response-service # container_name: demand-response-service
# restart: unless-stopped # restart: unless-stopped
# ports: # ports:
# - "8003:8003" # - "8003:8003"
@@ -118,7 +118,7 @@ services:
# build: # build:
# context: ./p2p-trading-service # context: ./p2p-trading-service
# dockerfile: Dockerfile # dockerfile: Dockerfile
# container_name: energy-p2p-trading-service # container_name: p2p-trading-service
# restart: unless-stopped # restart: unless-stopped
# ports: # ports:
# - "8004:8004" # - "8004:8004"
@@ -136,7 +136,7 @@ services:
# build: # build:
# context: ./forecasting-service # context: ./forecasting-service
# dockerfile: Dockerfile # dockerfile: Dockerfile
# container_name: energy-forecasting-service # container_name: forecasting-service
# restart: unless-stopped # restart: unless-stopped
# ports: # ports:
# - "8005:8005" # - "8005:8005"
@@ -154,7 +154,7 @@ services:
# build: # build:
# context: ./iot-control-service # context: ./iot-control-service
# dockerfile: Dockerfile # dockerfile: Dockerfile
# container_name: energy-iot-control-service # container_name: iot-control-service
# restart: unless-stopped # restart: unless-stopped
# ports: # ports:
# - "8006:8006" # - "8006:8006"
@@ -174,7 +174,7 @@ services:
build: build:
context: ./data-ingestion-service context: ./data-ingestion-service
dockerfile: Dockerfile dockerfile: Dockerfile
container_name: energy-data-ingestion-service container_name: data-ingestion-service
restart: unless-stopped restart: unless-stopped
ports: ports:
- "8008:8008" - "8008:8008"
@@ -183,8 +183,7 @@ services:
- FTP_SA4CPS_HOST=ftp.sa4cps.pt - FTP_SA4CPS_HOST=ftp.sa4cps.pt
- FTP_SA4CPS_PORT=21 - FTP_SA4CPS_PORT=21
- FTP_SA4CPS_USERNAME=curvascarga@sa4cps.pt - FTP_SA4CPS_USERNAME=curvascarga@sa4cps.pt
- FTP_SA4CPS_PASSWORD=n$WFtz9+bleN - FTP_SA4CPS_REMOTE_PATH=/SLGs/
- FTP_SA4CPS_REMOTE_PATH=/
- FTP_CHECK_INTERVAL=21600 - FTP_CHECK_INTERVAL=21600
depends_on: depends_on:
- mongodb - mongodb
@@ -196,7 +195,7 @@ services:
build: build:
context: ./sensor-service context: ./sensor-service
dockerfile: Dockerfile dockerfile: Dockerfile
container_name: energy-sensor-service container_name: sensor-service
restart: unless-stopped restart: unless-stopped
ports: ports:
- "8007:8007" - "8007:8007"
@@ -213,7 +212,7 @@ services:
# Monitoring and Management # Monitoring and Management
nginx: nginx:
image: nginx:alpine image: nginx:alpine
container_name: energy-nginx container_name: nginx
restart: unless-stopped restart: unless-stopped
ports: ports:
- "80:80" - "80:80"