Add data-ingestion-service for SA4CPS FTP integration
- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
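A minimal usage sketch (not part of this commit's files): assuming parsed records from DataProcessor.process_time_series_data are JSON-serializable dicts and a redis-py asyncio client is available, publishing to a dashboard topic could look roughly like this. The topic name "sa4cps:readings", the publish_readings helper, and the sample file name are illustrative only; the actual FTP monitor and Redis wiring live in the service's other modules.

import asyncio
import json

import redis.asyncio as redis

from data_processor import DataProcessor


async def publish_readings(path: str) -> None:
    # Hypothetical wiring: the parsing paths shown in this commit do not touch
    # `db`, so None is passed here purely for illustration.
    redis_client = redis.Redis(host="localhost", port=6379)
    processor = DataProcessor(db=None, redis_client=redis_client)

    with open(path, "rb") as fh:
        content = fh.read()

    # "slg_v2" is one of the entries in DataProcessor.supported_formats.
    records = await processor.process_time_series_data(content, "slg_v2")

    # Push each parsed record to an assumed dashboard topic.
    for record in records:
        await redis_client.publish("sa4cps:readings", json.dumps(record))

    await redis_client.aclose()  # redis-py 5+


if __name__ == "__main__":
    asyncio.run(publish_readings("sample.slg_v2"))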
microservices/data-ingestion-service/data_processor.py (new file, 899 lines added)
@@ -0,0 +1,899 @@
"""
Data processor for parsing and transforming time series data from various formats.
Handles CSV, JSON, and other time series data formats from real community sources.
"""

import asyncio
import pandas as pd
import json
import csv
import io
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Union
import logging
import numpy as np
from dateutil import parser as date_parser
import re
import hashlib

logger = logging.getLogger(__name__)


class DataProcessor:
    """Processes time series data from various formats"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.supported_formats = ["csv", "json", "txt", "xlsx", "slg_v2"]
        self.time_formats = [
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d %H:%M",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%SZ",
            "%d/%m/%Y %H:%M:%S",
            "%d-%m-%Y %H:%M:%S",
            "%Y/%m/%d %H:%M:%S"
        ]

    async def process_time_series_data(self, file_content: bytes, data_format: str) -> List[Dict[str, Any]]:
        """Process time series data from file content"""
        try:
            logger.info(f"Processing time series data in {data_format} format ({len(file_content)} bytes)")

            # Decode file content
            try:
                text_content = file_content.decode('utf-8')
            except UnicodeDecodeError:
                # Try other encodings
                try:
                    text_content = file_content.decode('latin1')
                except UnicodeDecodeError:
                    text_content = file_content.decode('utf-8', errors='ignore')

            # Process based on format
            if data_format.lower() == "csv":
                return await self._process_csv_data(text_content)
            elif data_format.lower() == "json":
                return await self._process_json_data(text_content)
            elif data_format.lower() == "txt":
                return await self._process_text_data(text_content)
            elif data_format.lower() == "xlsx":
                return await self._process_excel_data(file_content)
            elif data_format.lower() == "slg_v2":
                return await self._process_slg_v2_data(text_content)
            else:
                # Try to auto-detect format
                return await self._auto_detect_and_process(text_content)

        except Exception as e:
            logger.error(f"Error processing time series data: {e}")
            raise

    async def _process_csv_data(self, content: str) -> List[Dict[str, Any]]:
        """Process CSV time series data"""
        try:
            # Parse CSV content
            csv_reader = csv.DictReader(io.StringIO(content))
            rows = list(csv_reader)

            if not rows:
                logger.warning("CSV file is empty")
                return []

            logger.info(f"Found {len(rows)} rows in CSV")

            # Auto-detect column mappings (pass a list so the mapper can index columns)
            column_mapping = await self._detect_csv_columns(list(rows[0].keys()))

            processed_data = []
            for row_idx, row in enumerate(rows):
                try:
                    processed_row = await self._process_csv_row(row, column_mapping)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing CSV row {row_idx}: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} CSV records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing CSV data: {e}")
            raise

    async def _process_json_data(self, content: str) -> List[Dict[str, Any]]:
        """Process JSON time series data"""
        try:
            data = json.loads(content)

            # Handle different JSON structures
            if isinstance(data, list):
                # Array of records
                return await self._process_json_array(data)
            elif isinstance(data, dict):
                # Single record or object with nested data
                return await self._process_json_object(data)
            else:
                logger.warning(f"Unexpected JSON structure: {type(data)}")
                return []

        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON content: {e}")
            raise
        except Exception as e:
            logger.error(f"Error processing JSON data: {e}")
            raise

    async def _process_text_data(self, content: str) -> List[Dict[str, Any]]:
        """Process text-based time series data"""
        try:
            lines = content.strip().split('\n')

            # Try to detect the format of text data
            if not lines:
                return []

            # Check if it's space-separated, tab-separated, or has another delimiter
            first_line = lines[0].strip()

            # Detect delimiter
            delimiter = None
            for test_delim in ['\t', ' ', ';', '|']:
                if first_line.count(test_delim) > 0:
                    delimiter = test_delim
                    break

            if not delimiter:
                # Try to parse as single column data
                return await self._process_single_column_data(lines)

            # Parse delimited data
            processed_data = []
            header = None

            for line_idx, line in enumerate(lines):
                line = line.strip()
                if not line or line.startswith('#'):  # Skip empty lines and comments
                    continue

                parts = line.split(delimiter)
                parts = [part.strip() for part in parts if part.strip()]

                if not header:
                    # First data line - use as header or create generic headers
                    if await self._is_header_line(parts):
                        header = parts
                        continue
                    else:
                        header = [f"col_{i}" for i in range(len(parts))]

                try:
                    row_dict = dict(zip(header, parts))
                    processed_row = await self._process_generic_row(row_dict)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing text line {line_idx}: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} text records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing text data: {e}")
            raise

    async def _process_excel_data(self, content: bytes) -> List[Dict[str, Any]]:
        """Process Excel time series data"""
        try:
            # Read Excel file
            df = pd.read_excel(io.BytesIO(content))

            if df.empty:
                return []

            # Convert DataFrame to list of dictionaries
            records = df.to_dict('records')

            # Process each record
            processed_data = []
            for record in records:
                try:
                    processed_row = await self._process_generic_row(record)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing Excel record: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} Excel records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing Excel data: {e}")
            raise

    async def _detect_csv_columns(self, columns: List[str]) -> Dict[str, str]:
        """Auto-detect column mappings for CSV data"""
        mapping = {}

        # Common column name patterns
        timestamp_patterns = [
            r'time.*stamp', r'date.*time', r'datetime', r'time', r'date',
            r'timestamp', r'ts', r'hora', r'fecha', r'datum', r'zeit'
        ]

        value_patterns = [
            r'.*energy.*', r'.*power.*', r'.*consumption.*', r'.*usage.*', r'.*load.*',
            r'.*wh.*', r'.*kwh.*', r'.*mwh.*', r'.*w.*', r'.*kw.*', r'.*mw.*',
            r'value', r'val', r'measure', r'reading', r'datos', r'wert'
        ]

        sensor_patterns = [
            r'.*sensor.*', r'.*device.*', r'.*meter.*', r'.*id.*',
            r'sensor', r'device', r'meter', r'contador', r'medidor'
        ]

        unit_patterns = [
            r'.*unit.*', r'.*measure.*', r'unit', r'unidad', r'einheit'
        ]

        for col in columns:
            col_lower = col.lower()

            # Check for timestamp columns
            if any(re.match(pattern, col_lower) for pattern in timestamp_patterns):
                mapping['timestamp'] = col

            # Check for value columns
            elif any(re.match(pattern, col_lower) for pattern in value_patterns):
                mapping['value'] = col

            # Check for sensor ID columns
            elif any(re.match(pattern, col_lower) for pattern in sensor_patterns):
                mapping['sensor_id'] = col

            # Check for unit columns
            elif any(re.match(pattern, col_lower) for pattern in unit_patterns):
                mapping['unit'] = col

        # Set defaults if not found
        if 'timestamp' not in mapping:
            # Use first column as timestamp
            mapping['timestamp'] = columns[0]

        if 'value' not in mapping and len(columns) > 1:
            # Use second column or first numeric-looking column
            for col in columns[1:]:
                if col != mapping.get('timestamp'):
                    mapping['value'] = col
                    break

        logger.info(f"Detected column mapping: {mapping}")
        return mapping

    async def _process_csv_row(self, row: Dict[str, str], column_mapping: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """Process a single CSV row"""
        try:
            processed_row = {}

            # Extract timestamp
            timestamp_col = column_mapping.get('timestamp')
            if timestamp_col and timestamp_col in row:
                timestamp = await self._parse_timestamp(row[timestamp_col])
                if timestamp:
                    processed_row['timestamp'] = int(timestamp.timestamp())
                    processed_row['datetime'] = timestamp.isoformat()
                else:
                    return None

            # Extract sensor ID
            sensor_col = column_mapping.get('sensor_id')
            if sensor_col and sensor_col in row:
                processed_row['sensor_id'] = str(row[sensor_col]).strip()
            else:
                # Generate a default sensor ID
                processed_row['sensor_id'] = "unknown_sensor"

            # Extract value(s)
            value_col = column_mapping.get('value')
            if value_col and value_col in row:
                try:
                    value = await self._parse_numeric_value(row[value_col])
                    if value is not None:
                        processed_row['value'] = value
                    else:
                        return None
                except Exception:
                    return None

            # Extract unit
            unit_col = column_mapping.get('unit')
            if unit_col and unit_col in row:
                processed_row['unit'] = str(row[unit_col]).strip()
            else:
                processed_row['unit'] = await self._infer_unit(processed_row.get('value', 0))

            # Add all other columns as metadata
            metadata = {}
            for col, val in row.items():
                if col not in column_mapping.values() and val:
                    try:
                        # Try to parse as number
                        num_val = await self._parse_numeric_value(val)
                        metadata[col] = num_val if num_val is not None else str(val).strip()
                    except Exception:
                        metadata[col] = str(val).strip()

            if metadata:
                processed_row['metadata'] = metadata

            # Add processing metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'csv'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing CSV row: {e}")
            return None

    async def _process_json_array(self, data: List[Any]) -> List[Dict[str, Any]]:
        """Process JSON array of records"""
        processed_data = []

        for item in data:
            if isinstance(item, dict):
                processed_row = await self._process_json_record(item)
                if processed_row:
                    processed_data.append(processed_row)

        return processed_data

    async def _process_json_object(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Process JSON object"""
        # Check if it contains time series data
        if 'data' in data and isinstance(data['data'], list):
            return await self._process_json_array(data['data'])
        elif 'readings' in data and isinstance(data['readings'], list):
            return await self._process_json_array(data['readings'])
        elif 'values' in data and isinstance(data['values'], list):
            return await self._process_json_array(data['values'])
        else:
            # Treat as single record
            processed_row = await self._process_json_record(data)
            return [processed_row] if processed_row else []

    async def _process_json_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a single JSON record"""
        try:
            processed_row = {}

            # Extract timestamp
            timestamp = None
            for ts_field in ['timestamp', 'datetime', 'time', 'date', 'ts']:
                if ts_field in record:
                    timestamp = await self._parse_timestamp(record[ts_field])
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                # Use current time if no timestamp found
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp())
                processed_row['datetime'] = now.isoformat()

            # Extract sensor ID
            sensor_id = None
            for id_field in ['sensor_id', 'sensorId', 'device_id', 'deviceId', 'id', 'sensor', 'device']:
                if id_field in record:
                    sensor_id = str(record[id_field])
                    break

            processed_row['sensor_id'] = sensor_id or "unknown_sensor"

            # Extract value(s)
            value = None
            for val_field in ['value', 'reading', 'measurement', 'data', 'energy', 'power', 'consumption']:
                if val_field in record:
                    try:
                        value = await self._parse_numeric_value(record[val_field])
                        if value is not None:
                            break
                    except Exception:
                        continue

            if value is not None:
                processed_row['value'] = value

            # Extract unit
            unit = None
            for unit_field in ['unit', 'units', 'measure_unit', 'uom']:
                if unit_field in record:
                    unit = str(record[unit_field])
                    break

            processed_row['unit'] = unit or await self._infer_unit(processed_row.get('value', 0))

            # Add remaining fields as metadata
            metadata = {}
            processed_fields = {'timestamp', 'datetime', 'time', 'date', 'ts',
                                'sensor_id', 'sensorId', 'device_id', 'deviceId', 'id', 'sensor', 'device',
                                'value', 'reading', 'measurement', 'data', 'energy', 'power', 'consumption',
                                'unit', 'units', 'measure_unit', 'uom'}

            for key, val in record.items():
                if key not in processed_fields and val is not None:
                    metadata[key] = val

            if metadata:
                processed_row['metadata'] = metadata

            # Add processing metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'json'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing JSON record: {e}")
            return None

    async def _process_generic_row(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a generic row of data"""
        try:
            processed_row = {}

            # Try to find timestamp
            timestamp = None
            for key, val in row.items():
                if 'time' in key.lower() or 'date' in key.lower():
                    timestamp = await self._parse_timestamp(val)
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp())
                processed_row['datetime'] = now.isoformat()

            # Try to find sensor ID
            sensor_id = None
            for key, val in row.items():
                if 'sensor' in key.lower() or 'device' in key.lower() or 'id' in key.lower():
                    sensor_id = str(val)
                    break

            processed_row['sensor_id'] = sensor_id or "unknown_sensor"

            # Try to find numeric value
            value = None
            for key, val in row.items():
                if key.lower() not in ['timestamp', 'datetime', 'time', 'date', 'sensor_id', 'device_id', 'id']:
                    try:
                        value = await self._parse_numeric_value(val)
                        if value is not None:
                            break
                    except Exception:
                        continue

            if value is not None:
                processed_row['value'] = value
                processed_row['unit'] = await self._infer_unit(value)

            # Add all fields as metadata
            metadata = {k: v for k, v in row.items() if v is not None}
            if metadata:
                processed_row['metadata'] = metadata

            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'generic'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing generic row: {e}")
            return None

    async def _parse_timestamp(self, timestamp_str: Union[str, int, float]) -> Optional[datetime]:
        """Parse timestamp from various formats"""
        try:
            if isinstance(timestamp_str, (int, float)):
                # Unix timestamp
                if timestamp_str > 1e10:  # Milliseconds
                    timestamp_str = timestamp_str / 1000
                return datetime.fromtimestamp(timestamp_str)

            if isinstance(timestamp_str, str):
                timestamp_str = timestamp_str.strip()

                # Try common formats first
                for fmt in self.time_formats:
                    try:
                        return datetime.strptime(timestamp_str, fmt)
                    except ValueError:
                        continue

                # Try dateutil parser as fallback
                try:
                    return date_parser.parse(timestamp_str)
                except (ValueError, OverflowError):
                    pass

            return None

        except Exception as e:
            logger.debug(f"Error parsing timestamp '{timestamp_str}': {e}")
            return None

    async def _parse_numeric_value(self, value_str: Union[str, int, float]) -> Optional[float]:
        """Parse numeric value from string"""
        try:
            if isinstance(value_str, (int, float)):
                return float(value_str) if not (isinstance(value_str, float) and np.isnan(value_str)) else None

            if isinstance(value_str, str):
                # Clean the string
                cleaned = re.sub(r'[^\d.-]', '', value_str.strip())
                if cleaned:
                    return float(cleaned)

            return None

        except Exception:
            return None

    async def _infer_unit(self, value: float) -> str:
        """Infer unit based on value range"""
        try:
            if value is None:
                return "unknown"

            # Common energy unit ranges
            if value < 1:
                return "Wh"
            elif value < 1000:
                return "kWh"
            elif value < 1000000:
                return "MWh"
            else:
                return "GWh"

        except Exception:
            return "unknown"

    async def _is_header_line(self, parts: List[str]) -> bool:
        """Check if a line appears to be a header"""
        # If all parts are strings without numbers, likely a header
        for part in parts:
            try:
                float(part)
                return False  # Found a number, not a header
            except ValueError:
                continue
        return True

    async def _process_single_column_data(self, lines: List[str]) -> List[Dict[str, Any]]:
        """Process single column data"""
        processed_data = []

        for line_idx, line in enumerate(lines):
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            try:
                value = await self._parse_numeric_value(line)
                if value is not None:
                    now = datetime.utcnow()
                    processed_row = {
                        'sensor_id': 'single_column_sensor',
                        'timestamp': int(now.timestamp()) + line_idx,  # Spread timestamps
                        'datetime': (now + timedelta(seconds=line_idx)).isoformat(),
                        'value': value,
                        'unit': await self._infer_unit(value),
                        'processed_at': now.isoformat(),
                        'data_source': 'text_single_column',
                        'metadata': {'line_number': line_idx}
                    }
                    processed_data.append(processed_row)
            except Exception as e:
                logger.warning(f"Error processing single column line {line_idx}: {e}")
                continue

        return processed_data

    async def _auto_detect_and_process(self, content: str) -> List[Dict[str, Any]]:
        """Auto-detect format and process data"""
        try:
            # Try JSON first
            try:
                json.loads(content)
                return await self._process_json_data(content)
            except json.JSONDecodeError:
                pass

            # Try CSV
            try:
                lines = content.strip().split('\n')
                if len(lines) > 1 and (',' in lines[0] or ';' in lines[0] or '\t' in lines[0]):
                    return await self._process_csv_data(content)
            except Exception:
                pass

            # Fall back to text processing
            return await self._process_text_data(content)

        except Exception as e:
            logger.error(f"Error in auto-detection: {e}")
            raise

    async def _process_slg_v2_data(self, content: str) -> List[Dict[str, Any]]:
        """Process SA4CPS .slg_v2 format files"""
        try:
            lines = content.strip().split('\n')

            if not lines:
                logger.warning("SLG_V2 file is empty")
                return []

            logger.info(f"Processing SLG_V2 file with {len(lines)} lines")

            processed_data = []
            header = None
            metadata = {}

            for line_idx, line in enumerate(lines):
                line = line.strip()

                # Skip empty lines
                if not line:
                    continue

                # Handle comment lines and metadata
                if line.startswith('#') or line.startswith('//'):
                    # Extract metadata from comment lines
                    comment = line[1:].strip() if line.startswith('#') else line[2:].strip()
                    if ':' in comment:
                        key, value = comment.split(':', 1)
                        metadata[key.strip()] = value.strip()
                    continue

                # Handle header lines (if present)
                if line_idx == 0 or (header is None and await self._is_slg_v2_header(line)):
                    header = await self._parse_slg_v2_header(line)
                    continue

                # Process data lines
                try:
                    processed_row = await self._process_slg_v2_line(line, header, metadata, line_idx)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing SLG_V2 line {line_idx}: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} SLG_V2 records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing SLG_V2 data: {e}")
            raise

    async def _is_slg_v2_header(self, line: str) -> bool:
        """Check if a line appears to be a SLG_V2 header"""
        # Common SLG_V2 header patterns
        header_keywords = ['timestamp', 'time', 'date', 'sensor', 'id', 'value', 'reading',
                           'energy', 'power', 'voltage', 'current', 'temperature']

        line_lower = line.lower()
        # Check if line contains header-like words and few or no numbers
        has_keywords = any(keyword in line_lower for keyword in header_keywords)

        # Try to parse as numbers - if most parts fail, likely a header
        parts = line.replace(',', ' ').replace(';', ' ').replace('\t', ' ').split()
        numeric_parts = 0
        for part in parts:
            try:
                float(part.strip())
                numeric_parts += 1
            except ValueError:
                continue

        # If less than half are numeric and has keywords, likely header
        return has_keywords and (numeric_parts < len(parts) / 2)

    async def _parse_slg_v2_header(self, line: str) -> List[str]:
        """Parse SLG_V2 header line"""
        # Try different delimiters
        for delimiter in [',', ';', '\t', ' ']:
            if delimiter in line:
                parts = [part.strip() for part in line.split(delimiter) if part.strip()]
                if len(parts) > 1:
                    return parts

        # Default to splitting by whitespace
        return [part.strip() for part in line.split() if part.strip()]

    async def _process_slg_v2_line(self, line: str, header: Optional[List[str]],
                                   metadata: Dict[str, Any], line_idx: int) -> Optional[Dict[str, Any]]:
        """Process a single SLG_V2 data line"""
        try:
            # Try different delimiters to parse the line
            parts = None
            for delimiter in [',', ';', '\t', ' ']:
                if delimiter in line:
                    test_parts = [part.strip() for part in line.split(delimiter) if part.strip()]
                    if len(test_parts) > 1:
                        parts = test_parts
                        break

            if not parts:
                # Split by whitespace as fallback
                parts = [part.strip() for part in line.split() if part.strip()]

            if not parts:
                return None

            # Create row dictionary
            if header and len(parts) >= len(header):
                row_dict = dict(zip(header, parts[:len(header)]))
                # Add extra columns if any
                for i, extra_part in enumerate(parts[len(header):]):
                    row_dict[f"extra_col_{i}"] = extra_part
            else:
                # Create generic column names
                row_dict = {f"col_{i}": part for i, part in enumerate(parts)}

            # Process the row similar to generic processing but with SLG_V2 specifics
            processed_row = {}

            # Extract timestamp
            timestamp = None
            timestamp_value = None
            for key, val in row_dict.items():
                key_lower = key.lower()
                if any(ts_word in key_lower for ts_word in ['time', 'date', 'timestamp', 'ts']):
                    timestamp = await self._parse_timestamp(val)
                    timestamp_value = val
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                # Use current time with line offset for uniqueness
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp()) + line_idx
                processed_row['datetime'] = (now + timedelta(seconds=line_idx)).isoformat()

            # Extract sensor ID
            sensor_id = None
            for key, val in row_dict.items():
                key_lower = key.lower()
                if any(id_word in key_lower for id_word in ['sensor', 'device', 'meter', 'id']):
                    sensor_id = str(val).strip()
                    break

            processed_row['sensor_id'] = sensor_id or f"slg_v2_sensor_{line_idx}"

            # Extract numeric values
            values_found = []
            for key, val in row_dict.items():
                key_lower = key.lower()
                # Skip timestamp and ID fields
                if (any(skip_word in key_lower for skip_word in ['time', 'date', 'timestamp', 'ts', 'id', 'sensor', 'device', 'meter']) and
                        val == timestamp_value) or key_lower.endswith('_id'):
                    continue

                try:
                    numeric_val = await self._parse_numeric_value(val)
                    if numeric_val is not None:
                        values_found.append({
                            'key': key,
                            'value': numeric_val,
                            'unit': await self._infer_slg_v2_unit(key, numeric_val)
                        })
                except Exception:
                    continue

            # Handle multiple values
            if len(values_found) == 1:
                # Single value case
                processed_row['value'] = values_found[0]['value']
                processed_row['unit'] = values_found[0]['unit']
                processed_row['value_type'] = values_found[0]['key']
            elif len(values_found) > 1:
                # Multiple values case - create main value and store others in metadata
                main_value = values_found[0]  # Use first numeric value as main
                processed_row['value'] = main_value['value']
                processed_row['unit'] = main_value['unit']
                processed_row['value_type'] = main_value['key']

                # Store additional values in metadata
                additional_values = {}
                for val_info in values_found[1:]:
                    additional_values[val_info['key']] = {
                        'value': val_info['value'],
                        'unit': val_info['unit']
                    }
                processed_row['additional_values'] = additional_values

            # Add all data as metadata
            row_metadata = dict(row_dict)
            row_metadata.update(metadata)  # Include file-level metadata
            row_metadata['line_number'] = line_idx
            row_metadata['raw_line'] = line
            processed_row['metadata'] = row_metadata

            # Add processing info
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'slg_v2'
            processed_row['file_format'] = 'SA4CPS_SLG_V2'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing SLG_V2 line {line_idx}: {e}")
            return None

    async def _infer_slg_v2_unit(self, column_name: str, value: float) -> str:
        """Infer unit based on SLG_V2 column name and value"""
        try:
            col_lower = column_name.lower()

            # Common SA4CPS/energy monitoring units
            if any(word in col_lower for word in ['energy', 'wh', 'consumption']):
                if value < 1:
                    return "Wh"
                elif value < 1000:
                    return "kWh"
                elif value < 1000000:
                    return "MWh"
                else:
                    return "GWh"
            elif any(word in col_lower for word in ['power', 'watt', 'w']):
                if value < 1000:
                    return "W"
                elif value < 1000000:
                    return "kW"
                else:
                    return "MW"
            elif any(word in col_lower for word in ['voltage', 'volt', 'v']):
                return "V"
            elif any(word in col_lower for word in ['current', 'amp', 'a']):
                return "A"
            elif any(word in col_lower for word in ['temp', 'temperature']):
                return "°C"
            elif any(word in col_lower for word in ['freq', 'frequency']):
                return "Hz"
            elif any(word in col_lower for word in ['percent', '%']):
                return "%"
            else:
                # Default energy unit inference
                return await self._infer_unit(value)

        except Exception:
            return "unknown"

    async def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""
        try:
            # This could be enhanced to return actual processing metrics
            return {
                "supported_formats": self.supported_formats,
                "time_formats_supported": len(self.time_formats),
                "slg_v2_support": True,
                "last_updated": datetime.utcnow().isoformat()
            }
        except Exception as e:
            logger.error(f"Error getting processing stats: {e}")
            return {}