Add scan cache tracking and improve health checks

- Track scanned FTP directories in MongoDB to avoid redundant scans
- Add endpoints to view and clear scan cache
- Improve health check logic for better startup and error reporting
- Add readiness endpoint for deployment probes
- Add test script for health check improvements
- Increase logging verbosity for debugging
2025-09-22 15:12:40 +01:00
parent c3ba7c0dc0
commit 41b8753a92
6 changed files with 440 additions and 95 deletions
--- a/microservices/data-ingestion-service/src/main.py
+++ b/microservices/data-ingestion-service/src/main.py
@@ -8,7 +8,7 @@ from typing import Any
 from ftp_monitor import FTPMonitor
 from database import DatabaseManager

-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)

 ftp_monitor = None
@@ -23,18 +23,36 @@ async def lifespan(app: FastAPI):

    db_manager = DatabaseManager()
    await db_manager.connect()
+    logger.info("Database connection established")

    ftp_monitor = FTPMonitor(db_manager)
+    logger.info("FTP monitor created")

    monitoring_task = asyncio.create_task(ftp_monitor.start_monitoring())
+    logger.info("FTP monitoring task started in background")

-    logger.info("Service started successfully")
+    logger.info("Service startup complete - HTTP server ready to accept requests")

    yield

    logger.info("Shutting down service...")
-    monitoring_task.cancel()
-    await db_manager.close()
+
+    # Cancel monitoring task and wait for graceful shutdown
+    if not monitoring_task.done():
+        monitoring_task.cancel()
+        try:
+            await asyncio.wait_for(monitoring_task, timeout=5.0)
+            logger.info("Monitoring task stopped gracefully")
+        except asyncio.TimeoutError:
+            logger.warning("Monitoring task shutdown timeout - forcing termination")
+        except asyncio.CancelledError:
+            logger.info("Monitoring task cancelled successfully")
+
+    # Close database connection
+    if db_manager:
+        await db_manager.close()
+        logger.info("Database connection closed")
+
    logger.info("Service shutdown complete")


@@ -66,24 +84,85 @@ async def health_check():
        "ftp_monitor": "unknown"
    }

+    service_issues = []
+
    # Check database connection
    if db_manager:
        try:
            await db_manager.ping()
            health_status["database"] = "connected"
-        except Exception:
+        except Exception as e:
            health_status["database"] = "disconnected"
-            health_status["service"] = "degraded"
+            service_issues.append("database_disconnected")
+            logger.warning(f"Database health check failed: {e}")
+    else:
+        health_status["database"] = "not_initialized"
+        health_status["service"] = "starting"

    # Check FTP monitor status
    if ftp_monitor:
-        health_status["ftp_monitor"] = ftp_monitor.get_status()
-        health_status["last_check"] = ftp_monitor.get_last_check_time()
-        health_status["files_processed"] = ftp_monitor.get_processed_count()
+        ftp_status = ftp_monitor.get_status()
+        health_status["ftp_monitor"] = ftp_status
+
+        try:
+            health_status["last_check"] = ftp_monitor.get_last_check_time()
+            health_status["files_processed"] = ftp_monitor.get_processed_count()
+        except:
+            # Don't fail health check if optional status fields fail
+            pass
+
+        # Improved service status logic - be more tolerant during startup
+        if ftp_status == "initializing":
+            # Service is initializing but can still be considered healthy for basic operations
+            if health_status["database"] == "connected":
+                health_status["service"] = "healthy"  # Database is ready, FTP is starting
+            else:
+                health_status["service"] = "starting"
+        elif ftp_status == "error":
+            service_issues.append("ftp_monitor_error")
+        elif ftp_status == "running":
+            pass  # Keep healthy status
+    else:
+        health_status["ftp_monitor"] = "not_initialized"
+        # Don't mark as starting if database is connected - service can be functional
+        if health_status["database"] != "connected":
+            health_status["service"] = "starting"
+
+    # Determine final service status
+    if service_issues:
+        health_status["service"] = "degraded"
+        health_status["issues"] = service_issues
+    elif health_status["service"] != "starting":
+        health_status["service"] = "healthy"

    return health_status


+@app.get("/readiness")
+async def readiness_check():
+    """Readiness probe for deployment tooling.
+
+    Responds with HTTP 503 when either global component is unset, when the
+    database ping raises, or when the FTP monitor reports the "error" status.
+    Otherwise returns a small JSON payload with the current monitor status.
+    """
+    global ftp_monitor, db_manager
+
+    # Both components must exist before any deeper check makes sense.
+    if not (db_manager and ftp_monitor):
+        raise HTTPException(status_code=503, detail="Service not ready - components not initialized")
+
+    # Any failure while pinging the database means we are not ready.
+    try:
+        await db_manager.ping()
+    except Exception as e:
+        raise HTTPException(status_code=503, detail=f"Service not ready - database issue: {str(e)}")
+
+    # Every monitor status other than "error" is acceptable here.
+    monitor_state = ftp_monitor.get_status()
+    if monitor_state == "error":
+        raise HTTPException(status_code=503, detail="Service not ready - FTP monitor in error state")
+
+    payload = {
+        "ready": True,
+        "timestamp": datetime.now().isoformat(),
+        "ftp_monitor_status": monitor_state
+    }
+    return payload
+
+
@app.get("/status")
 async def get_status():
    global ftp_monitor, db_manager
@@ -117,6 +196,44 @@ async def trigger_manual_check():
        raise HTTPException(status_code=500, detail=f"Check failed: {str(e)}")


+@app.get("/scan-cache")
+async def get_scan_cache():
+    """Return the scanned-directory cache together with its size.
+
+    Raises HTTP 503 when the database manager has not been created yet and
+    HTTP 500 (after logging the error) when reading the cache fails.
+    """
+    global db_manager
+
+    if not db_manager:
+        raise HTTPException(status_code=503, detail="Database not initialized")
+
+    try:
+        cached_dirs = await db_manager.get_scanned_directories()
+        dir_count = len(cached_dirs)
+        generated_at = datetime.now().isoformat()
+    except Exception as e:
+        logger.error(f"Error getting scan cache: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to get scan cache: {str(e)}")
+    else:
+        return {
+            "scanned_directories": cached_dirs,
+            "total_directories": dir_count,
+            "timestamp": generated_at
+        }
+
+
+@app.delete("/scan-cache")
+async def clear_scan_cache():
+    """Delete every entry from the scanned-directory cache.
+
+    Returns a confirmation message, the number of deleted documents and a
+    timestamp. Raises HTTP 503 when the database manager has not been
+    created yet and HTTP 500 (after logging the error) when the delete fails.
+    """
+    global db_manager
+
+    # Local import keeps this change self-contained within the handler.
+    import inspect
+
+    if not db_manager:
+        raise HTTPException(status_code=503, detail="Database not initialized")
+
+    try:
+        result = db_manager.collections['scanned_directories'].delete_many({})
+        # NOTE(review): the driver behind db_manager.collections is not visible
+        # here. Every other database call in this file is awaited, so if this
+        # collection is asynchronous, delete_many() hands back an awaitable and
+        # reading .deleted_count from it would fail. Await only when needed so
+        # a synchronous driver keeps working unchanged.
+        if inspect.isawaitable(result):
+            result = await result
+        return {
+            "message": "Scan cache cleared successfully",
+            "deleted_count": result.deleted_count,
+            "timestamp": datetime.now().isoformat()
+        }
+    except Exception as e:
+        logger.error(f"Error clearing scan cache: {e}")
+        raise HTTPException(status_code=500, detail=f"Failed to clear scan cache: {str(e)}")
+
+
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8008, reload=True)