sac4cps-backend/microservices/data-ingestion-service/data_processor.py
rafaeldpsilva, commit 5fdce00e5d, 2025-09-10 14:43:30 +01:00
Add data-ingestion-service for SA4CPS FTP integration
- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration


"""
Data processor for parsing and transforming time series data from various formats.
Handles CSV, JSON, and other time series data formats from real community sources.
"""
import asyncio
import pandas as pd
import json
import csv
import io
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Union
import logging
import numpy as np
from dateutil import parser as date_parser
import re
import hashlib
logger = logging.getLogger(__name__)


class DataProcessor:
    """Processes time series data from various formats"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.supported_formats = ["csv", "json", "txt", "xlsx", "slg_v2"]
        self.time_formats = [
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d %H:%M",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%SZ",
            "%d/%m/%Y %H:%M:%S",
            "%d-%m-%Y %H:%M:%S",
            "%Y/%m/%d %H:%M:%S"
        ]

    async def process_time_series_data(self, file_content: bytes, data_format: str) -> List[Dict[str, Any]]:
        """Process time series data from file content"""
        try:
            logger.info(f"Processing time series data in {data_format} format ({len(file_content)} bytes)")
            # Decode file content
            try:
                text_content = file_content.decode('utf-8')
            except UnicodeDecodeError:
                # Try other encodings
                try:
                    text_content = file_content.decode('latin1')
                except UnicodeDecodeError:
                    text_content = file_content.decode('utf-8', errors='ignore')
            # Process based on format
            if data_format.lower() == "csv":
                return await self._process_csv_data(text_content)
            elif data_format.lower() == "json":
                return await self._process_json_data(text_content)
            elif data_format.lower() == "txt":
                return await self._process_text_data(text_content)
            elif data_format.lower() == "xlsx":
                return await self._process_excel_data(file_content)
            elif data_format.lower() == "slg_v2":
                return await self._process_slg_v2_data(text_content)
            else:
                # Try to auto-detect format
                return await self._auto_detect_and_process(text_content)
        except Exception as e:
            logger.error(f"Error processing time series data: {e}")
            raise

    async def _process_csv_data(self, content: str) -> List[Dict[str, Any]]:
        """Process CSV time series data"""
        try:
            # Parse CSV content
            csv_reader = csv.DictReader(io.StringIO(content))
            rows = list(csv_reader)
            if not rows:
                logger.warning("CSV file is empty")
                return []
            logger.info(f"Found {len(rows)} rows in CSV")
            # Auto-detect column mappings (pass a list: the mapper indexes into it,
            # which a dict_keys view does not support)
            column_mapping = await self._detect_csv_columns(list(rows[0].keys()))
            processed_data = []
            for row_idx, row in enumerate(rows):
                try:
                    processed_row = await self._process_csv_row(row, column_mapping)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing CSV row {row_idx}: {e}")
                    continue
            logger.info(f"Successfully processed {len(processed_data)} CSV records")
            return processed_data
        except Exception as e:
            logger.error(f"Error processing CSV data: {e}")
            raise

    async def _process_json_data(self, content: str) -> List[Dict[str, Any]]:
        """Process JSON time series data"""
        try:
            data = json.loads(content)
            # Handle different JSON structures
            if isinstance(data, list):
                # Array of records
                return await self._process_json_array(data)
            elif isinstance(data, dict):
                # Single record or object with nested data
                return await self._process_json_object(data)
            else:
                logger.warning(f"Unexpected JSON structure: {type(data)}")
                return []
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON content: {e}")
            raise
        except Exception as e:
            logger.error(f"Error processing JSON data: {e}")
            raise

    async def _process_text_data(self, content: str) -> List[Dict[str, Any]]:
        """Process text-based time series data"""
        try:
            lines = content.strip().split('\n')
            # Try to detect the format of text data
            if not lines:
                return []
            # Check if it's space-separated, tab-separated, or has another delimiter
            first_line = lines[0].strip()
            # Detect delimiter
            delimiter = None
            for test_delim in ['\t', ' ', ';', '|']:
                if first_line.count(test_delim) > 0:
                    delimiter = test_delim
                    break
            if not delimiter:
                # Try to parse as single column data
                return await self._process_single_column_data(lines)
            # Parse delimited data
            processed_data = []
            header = None
            for line_idx, line in enumerate(lines):
                line = line.strip()
                if not line or line.startswith('#'):  # Skip empty lines and comments
                    continue
                parts = line.split(delimiter)
                parts = [part.strip() for part in parts if part.strip()]
                if not header:
                    # First data line - use as header or create generic headers
                    if await self._is_header_line(parts):
                        header = parts
                        continue
                    else:
                        header = [f"col_{i}" for i in range(len(parts))]
                try:
                    row_dict = dict(zip(header, parts))
                    processed_row = await self._process_generic_row(row_dict)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing text line {line_idx}: {e}")
                    continue
            logger.info(f"Successfully processed {len(processed_data)} text records")
            return processed_data
        except Exception as e:
            logger.error(f"Error processing text data: {e}")
            raise

    async def _process_excel_data(self, content: bytes) -> List[Dict[str, Any]]:
        """Process Excel time series data"""
        try:
            # Read Excel file
            df = pd.read_excel(io.BytesIO(content))
            if df.empty:
                return []
            # Convert DataFrame to list of dictionaries
            records = df.to_dict('records')
            # Process each record
            processed_data = []
            for record in records:
                try:
                    processed_row = await self._process_generic_row(record)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing Excel record: {e}")
                    continue
            logger.info(f"Successfully processed {len(processed_data)} Excel records")
            return processed_data
        except Exception as e:
            logger.error(f"Error processing Excel data: {e}")
            raise

    async def _detect_csv_columns(self, columns: List[str]) -> Dict[str, str]:
        """Auto-detect column mappings for CSV data"""
        mapping = {}
        # Common column name patterns
        timestamp_patterns = [
            r'time.*stamp', r'date.*time', r'datetime', r'time', r'date',
            r'timestamp', r'ts', r'hora', r'fecha', r'datum', r'zeit'
        ]
        value_patterns = [
            r'.*energy.*', r'.*power.*', r'.*consumption.*', r'.*usage.*', r'.*load.*',
            r'.*wh.*', r'.*kwh.*', r'.*mwh.*', r'.*w.*', r'.*kw.*', r'.*mw.*',
            r'value', r'val', r'measure', r'reading', r'datos', r'wert'
        ]
        sensor_patterns = [
            r'.*sensor.*', r'.*device.*', r'.*meter.*', r'.*id.*',
            r'sensor', r'device', r'meter', r'contador', r'medidor'
        ]
        unit_patterns = [
            r'.*unit.*', r'.*measure.*', r'unit', r'unidad', r'einheit'
        ]
        for col in columns:
            col_lower = col.lower()
            # Check for timestamp columns
            if any(re.match(pattern, col_lower) for pattern in timestamp_patterns):
                mapping['timestamp'] = col
            # Check for value columns
            elif any(re.match(pattern, col_lower) for pattern in value_patterns):
                mapping['value'] = col
            # Check for sensor ID columns
            elif any(re.match(pattern, col_lower) for pattern in sensor_patterns):
                mapping['sensor_id'] = col
            # Check for unit columns
            elif any(re.match(pattern, col_lower) for pattern in unit_patterns):
                mapping['unit'] = col
        # Set defaults if not found
        if 'timestamp' not in mapping:
            # Use first column as timestamp
            mapping['timestamp'] = columns[0]
        if 'value' not in mapping and len(columns) > 1:
            # Use second column or first numeric-looking column
            for col in columns[1:]:
                if col != mapping.get('timestamp'):
                    mapping['value'] = col
                    break
        logger.info(f"Detected column mapping: {mapping}")
        return mapping
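
    # Example (illustrative): columns ['DateTime', 'Meter_ID', 'Energy_kWh', 'Unit']
    # map to {'timestamp': 'DateTime', 'sensor_id': 'Meter_ID',
    #         'value': 'Energy_kWh', 'unit': 'Unit'}.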

    async def _process_csv_row(self, row: Dict[str, str], column_mapping: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """Process a single CSV row"""
        try:
            processed_row = {}
            # Extract timestamp
            timestamp_col = column_mapping.get('timestamp')
            if timestamp_col and timestamp_col in row:
                timestamp = await self._parse_timestamp(row[timestamp_col])
                if timestamp:
                    processed_row['timestamp'] = int(timestamp.timestamp())
                    processed_row['datetime'] = timestamp.isoformat()
                else:
                    return None
            # Extract sensor ID
            sensor_col = column_mapping.get('sensor_id')
            if sensor_col and sensor_col in row:
                processed_row['sensor_id'] = str(row[sensor_col]).strip()
            else:
                # Generate a default sensor ID
                processed_row['sensor_id'] = "unknown_sensor"
            # Extract value(s)
            value_col = column_mapping.get('value')
            if value_col and value_col in row:
                try:
                    value = await self._parse_numeric_value(row[value_col])
                    if value is not None:
                        processed_row['value'] = value
                    else:
                        return None
                except Exception:
                    return None
            # Extract unit
            unit_col = column_mapping.get('unit')
            if unit_col and unit_col in row:
                processed_row['unit'] = str(row[unit_col]).strip()
            else:
                processed_row['unit'] = await self._infer_unit(processed_row.get('value', 0))
            # Add all other columns as metadata
            metadata = {}
            for col, val in row.items():
                if col not in column_mapping.values() and val:
                    try:
                        # Try to parse as number
                        num_val = await self._parse_numeric_value(val)
                        metadata[col] = num_val if num_val is not None else str(val).strip()
                    except Exception:
                        metadata[col] = str(val).strip()
            if metadata:
                processed_row['metadata'] = metadata
            # Add processing metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'csv'
            return processed_row
        except Exception as e:
            logger.error(f"Error processing CSV row: {e}")
            return None

    async def _process_json_array(self, data: List[Any]) -> List[Dict[str, Any]]:
        """Process JSON array of records"""
        processed_data = []
        for item in data:
            if isinstance(item, dict):
                processed_row = await self._process_json_record(item)
                if processed_row:
                    processed_data.append(processed_row)
        return processed_data

    async def _process_json_object(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Process JSON object"""
        # Check if it contains time series data
        if 'data' in data and isinstance(data['data'], list):
            return await self._process_json_array(data['data'])
        elif 'readings' in data and isinstance(data['readings'], list):
            return await self._process_json_array(data['readings'])
        elif 'values' in data and isinstance(data['values'], list):
            return await self._process_json_array(data['values'])
        else:
            # Treat as single record
            processed_row = await self._process_json_record(data)
            return [processed_row] if processed_row else []
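
    # Accepted JSON shapes (illustrative):
    #   [{...}, {...}]                                            - top-level array of records
    #   {"data": [...]} / {"readings": [...]} / {"values": [...]} - wrapped arrays
    #   {...}                                                     - a single record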

    async def _process_json_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a single JSON record"""
        try:
            processed_row = {}
            # Extract timestamp
            timestamp = None
            for ts_field in ['timestamp', 'datetime', 'time', 'date', 'ts']:
                if ts_field in record:
                    timestamp = await self._parse_timestamp(record[ts_field])
                    if timestamp:
                        break
            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                # Use current time if no timestamp found
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp())
                processed_row['datetime'] = now.isoformat()
            # Extract sensor ID
            sensor_id = None
            for id_field in ['sensor_id', 'sensorId', 'device_id', 'deviceId', 'id', 'sensor', 'device']:
                if id_field in record:
                    sensor_id = str(record[id_field])
                    break
            processed_row['sensor_id'] = sensor_id or "unknown_sensor"
            # Extract value(s)
            value = None
            for val_field in ['value', 'reading', 'measurement', 'data', 'energy', 'power', 'consumption']:
                if val_field in record:
                    try:
                        value = await self._parse_numeric_value(record[val_field])
                        if value is not None:
                            break
                    except Exception:
                        continue
            if value is not None:
                processed_row['value'] = value
            # Extract unit
            unit = None
            for unit_field in ['unit', 'units', 'measure_unit', 'uom']:
                if unit_field in record:
                    unit = str(record[unit_field])
                    break
            processed_row['unit'] = unit or await self._infer_unit(processed_row.get('value', 0))
            # Add remaining fields as metadata
            metadata = {}
            processed_fields = {
                'timestamp', 'datetime', 'time', 'date', 'ts',
                'sensor_id', 'sensorId', 'device_id', 'deviceId', 'id', 'sensor', 'device',
                'value', 'reading', 'measurement', 'data', 'energy', 'power', 'consumption',
                'unit', 'units', 'measure_unit', 'uom'
            }
            for key, val in record.items():
                if key not in processed_fields and val is not None:
                    metadata[key] = val
            if metadata:
                processed_row['metadata'] = metadata
            # Add processing metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'json'
            return processed_row
        except Exception as e:
            logger.error(f"Error processing JSON record: {e}")
            return None

    async def _process_generic_row(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a generic row of data"""
        try:
            processed_row = {}
            # Try to find timestamp
            timestamp = None
            for key, val in row.items():
                if 'time' in key.lower() or 'date' in key.lower():
                    timestamp = await self._parse_timestamp(val)
                    if timestamp:
                        break
            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp())
                processed_row['datetime'] = now.isoformat()
            # Try to find sensor ID
            sensor_id = None
            for key, val in row.items():
                if 'sensor' in key.lower() or 'device' in key.lower() or 'id' in key.lower():
                    sensor_id = str(val)
                    break
            processed_row['sensor_id'] = sensor_id or "unknown_sensor"
            # Try to find numeric value
            value = None
            for key, val in row.items():
                if key.lower() not in ['timestamp', 'datetime', 'time', 'date', 'sensor_id', 'device_id', 'id']:
                    try:
                        value = await self._parse_numeric_value(val)
                        if value is not None:
                            break
                    except Exception:
                        continue
            if value is not None:
                processed_row['value'] = value
                processed_row['unit'] = await self._infer_unit(value)
            # Add all fields as metadata
            metadata = {k: v for k, v in row.items() if v is not None}
            if metadata:
                processed_row['metadata'] = metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'generic'
            return processed_row
        except Exception as e:
            logger.error(f"Error processing generic row: {e}")
            return None

    async def _parse_timestamp(self, timestamp_str: Union[str, int, float]) -> Optional[datetime]:
        """Parse timestamp from various formats"""
        try:
            if isinstance(timestamp_str, (int, float)):
                # Unix timestamp
                if timestamp_str > 1e10:  # Milliseconds
                    timestamp_str = timestamp_str / 1000
                return datetime.fromtimestamp(timestamp_str)
            if isinstance(timestamp_str, str):
                timestamp_str = timestamp_str.strip()
                # Try common formats first
                for fmt in self.time_formats:
                    try:
                        return datetime.strptime(timestamp_str, fmt)
                    except ValueError:
                        continue
                # Try dateutil parser as fallback
                try:
                    return date_parser.parse(timestamp_str)
                except Exception:
                    pass
            return None
        except Exception as e:
            logger.debug(f"Error parsing timestamp '{timestamp_str}': {e}")
            return None
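
    # Inputs this parser handles (illustrative):
    #   1694352210            -> treated as a Unix epoch in seconds
    #   1694352210000         -> auto-scaled from milliseconds (values > 1e10)
    #   "2025-09-10 14:43:30" -> matched against self.time_formats
    #   other strings         -> handed to dateutil as a last resort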

    async def _parse_numeric_value(self, value_str: Union[str, int, float]) -> Optional[float]:
        """Parse numeric value from string"""
        try:
            if isinstance(value_str, (int, float)):
                return float(value_str) if not (isinstance(value_str, float) and np.isnan(value_str)) else None
            if isinstance(value_str, str):
                # Clean the string
                cleaned = re.sub(r'[^\d.-]', '', value_str.strip())
                if cleaned:
                    return float(cleaned)
            return None
        except Exception:
            return None
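
    # Example (illustrative): "1,234.56 kWh" is cleaned to "1234.56" and parsed
    # as 1234.56; NaN floats and unparseable strings yield None.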

    async def _infer_unit(self, value: float) -> str:
        """Infer unit based on value range"""
        try:
            if value is None:
                return "unknown"
            # Common energy unit ranges
            if value < 1:
                return "Wh"
            elif value < 1000:
                return "kWh"
            elif value < 1000000:
                return "MWh"
            else:
                return "GWh"
        except Exception:
            return "unknown"

    async def _is_header_line(self, parts: List[str]) -> bool:
        """Check if a line appears to be a header"""
        # If all parts are strings without numbers, likely a header
        for part in parts:
            try:
                float(part)
                return False  # Found a number, not a header
            except ValueError:
                continue
        return True

    async def _process_single_column_data(self, lines: List[str]) -> List[Dict[str, Any]]:
        """Process single column data"""
        processed_data = []
        for line_idx, line in enumerate(lines):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            try:
                value = await self._parse_numeric_value(line)
                if value is not None:
                    now = datetime.utcnow()
                    processed_row = {
                        'sensor_id': 'single_column_sensor',
                        'timestamp': int(now.timestamp()) + line_idx,  # Spread timestamps
                        'datetime': (now + timedelta(seconds=line_idx)).isoformat(),
                        'value': value,
                        'unit': await self._infer_unit(value),
                        'processed_at': now.isoformat(),
                        'data_source': 'text_single_column',
                        'metadata': {'line_number': line_idx}
                    }
                    processed_data.append(processed_row)
            except Exception as e:
                logger.warning(f"Error processing single column line {line_idx}: {e}")
                continue
        return processed_data

    async def _auto_detect_and_process(self, content: str) -> List[Dict[str, Any]]:
        """Auto-detect format and process data"""
        try:
            # Try JSON first
            try:
                json.loads(content)
                return await self._process_json_data(content)
            except json.JSONDecodeError:
                pass
            # Try CSV
            try:
                lines = content.strip().split('\n')
                if len(lines) > 1 and (',' in lines[0] or ';' in lines[0] or '\t' in lines[0]):
                    return await self._process_csv_data(content)
            except Exception:
                pass
            # Fall back to text processing
            return await self._process_text_data(content)
        except Exception as e:
            logger.error(f"Error in auto-detection: {e}")
            raise

    async def _process_slg_v2_data(self, content: str) -> List[Dict[str, Any]]:
        """Process SA4CPS .slg_v2 format files"""
        try:
            lines = content.strip().split('\n')
            if not lines:
                logger.warning("SLG_V2 file is empty")
                return []
            logger.info(f"Processing SLG_V2 file with {len(lines)} lines")
            processed_data = []
            header = None
            metadata = {}
            for line_idx, line in enumerate(lines):
                line = line.strip()
                # Skip empty lines
                if not line:
                    continue
                # Handle comment lines and metadata
                if line.startswith('#') or line.startswith('//'):
                    # Extract metadata from comment lines
                    comment = line[1:].strip() if line.startswith('#') else line[2:].strip()
                    if ':' in comment:
                        key, value = comment.split(':', 1)
                        metadata[key.strip()] = value.strip()
                    continue
                # Handle header lines (if present)
                if line_idx == 0 or (header is None and await self._is_slg_v2_header(line)):
                    header = await self._parse_slg_v2_header(line)
                    continue
                # Process data lines
                try:
                    processed_row = await self._process_slg_v2_line(line, header, metadata, line_idx)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing SLG_V2 line {line_idx}: {e}")
                    continue
            logger.info(f"Successfully processed {len(processed_data)} SLG_V2 records")
            return processed_data
        except Exception as e:
            logger.error(f"Error processing SLG_V2 data: {e}")
            raise

    async def _is_slg_v2_header(self, line: str) -> bool:
        """Check if a line appears to be a SLG_V2 header"""
        # Common SLG_V2 header patterns
        header_keywords = ['timestamp', 'time', 'date', 'sensor', 'id', 'value', 'reading',
                           'energy', 'power', 'voltage', 'current', 'temperature']
        line_lower = line.lower()
        # Check if line contains header-like words and few or no numbers
        has_keywords = any(keyword in line_lower for keyword in header_keywords)
        # Try to parse as numbers - if most parts fail, likely a header
        parts = line.replace(',', ' ').replace(';', ' ').replace('\t', ' ').split()
        numeric_parts = 0
        for part in parts:
            try:
                float(part.strip())
                numeric_parts += 1
            except ValueError:
                continue
        # If less than half are numeric and has keywords, likely header
        return has_keywords and (numeric_parts < len(parts) / 2)
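
    # Heuristic in action (illustrative):
    #   "timestamp;sensor_id;energy_kwh" -> keywords present, 0/3 parts numeric -> header
    #   "2025-09-10 14:43;mtr01;1.25"    -> no header keywords -> data line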

    async def _parse_slg_v2_header(self, line: str) -> List[str]:
        """Parse SLG_V2 header line"""
        # Try different delimiters
        for delimiter in [',', ';', '\t', ' ']:
            if delimiter in line:
                parts = [part.strip() for part in line.split(delimiter) if part.strip()]
                if len(parts) > 1:
                    return parts
        # Default to splitting by whitespace
        return [part.strip() for part in line.split() if part.strip()]

    async def _process_slg_v2_line(self, line: str, header: Optional[List[str]],
                                   metadata: Dict[str, Any], line_idx: int) -> Optional[Dict[str, Any]]:
        """Process a single SLG_V2 data line"""
        try:
            # Try different delimiters to parse the line
            parts = None
            for delimiter in [',', ';', '\t', ' ']:
                if delimiter in line:
                    test_parts = [part.strip() for part in line.split(delimiter) if part.strip()]
                    if len(test_parts) > 1:
                        parts = test_parts
                        break
            if not parts:
                # Split by whitespace as fallback
                parts = [part.strip() for part in line.split() if part.strip()]
            if not parts:
                return None
            # Create row dictionary
            if header and len(parts) >= len(header):
                row_dict = dict(zip(header, parts[:len(header)]))
                # Add extra columns if any
                for i, extra_part in enumerate(parts[len(header):]):
                    row_dict[f"extra_col_{i}"] = extra_part
            else:
                # Create generic column names
                row_dict = {f"col_{i}": part for i, part in enumerate(parts)}
            # Process the row similar to generic processing but with SLG_V2 specifics
            processed_row = {}
            # Extract timestamp
            timestamp = None
            timestamp_value = None
            for key, val in row_dict.items():
                key_lower = key.lower()
                if any(ts_word in key_lower for ts_word in ['time', 'date', 'timestamp', 'ts']):
                    timestamp = await self._parse_timestamp(val)
                    timestamp_value = val
                    if timestamp:
                        break
            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                # Use current time with line offset for uniqueness
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp()) + line_idx
                processed_row['datetime'] = (now + timedelta(seconds=line_idx)).isoformat()
            # Extract sensor ID
            sensor_id = None
            for key, val in row_dict.items():
                key_lower = key.lower()
                if any(id_word in key_lower for id_word in ['sensor', 'device', 'meter', 'id']):
                    sensor_id = str(val).strip()
                    break
            processed_row['sensor_id'] = sensor_id or f"slg_v2_sensor_{line_idx}"
            # Extract numeric values
            values_found = []
            for key, val in row_dict.items():
                key_lower = key.lower()
                # Skip timestamp and ID fields
                if (any(skip_word in key_lower for skip_word in
                        ['time', 'date', 'timestamp', 'ts', 'id', 'sensor', 'device', 'meter']) and
                        val == timestamp_value) or key_lower.endswith('_id'):
                    continue
                try:
                    numeric_val = await self._parse_numeric_value(val)
                    if numeric_val is not None:
                        values_found.append({
                            'key': key,
                            'value': numeric_val,
                            'unit': await self._infer_slg_v2_unit(key, numeric_val)
                        })
                except Exception:
                    continue
            # Handle multiple values
            if len(values_found) == 1:
                # Single value case
                processed_row['value'] = values_found[0]['value']
                processed_row['unit'] = values_found[0]['unit']
                processed_row['value_type'] = values_found[0]['key']
            elif len(values_found) > 1:
                # Multiple values case - create main value and store others in metadata
                main_value = values_found[0]  # Use first numeric value as main
                processed_row['value'] = main_value['value']
                processed_row['unit'] = main_value['unit']
                processed_row['value_type'] = main_value['key']
                # Store additional values in metadata
                additional_values = {}
                for val_info in values_found[1:]:
                    additional_values[val_info['key']] = {
                        'value': val_info['value'],
                        'unit': val_info['unit']
                    }
                processed_row['additional_values'] = additional_values
            # Add all data as metadata
            row_metadata = dict(row_dict)
            row_metadata.update(metadata)  # Include file-level metadata
            row_metadata['line_number'] = line_idx
            row_metadata['raw_line'] = line
            processed_row['metadata'] = row_metadata
            # Add processing info
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'slg_v2'
            processed_row['file_format'] = 'SA4CPS_SLG_V2'
            return processed_row
        except Exception as e:
            logger.error(f"Error processing SLG_V2 line {line_idx}: {e}")
            return None

    async def _infer_slg_v2_unit(self, column_name: str, value: float) -> str:
        """Infer unit based on SLG_V2 column name and value"""
        try:
            col_lower = column_name.lower()
            # Common SA4CPS/energy monitoring units
            if any(word in col_lower for word in ['energy', 'wh', 'consumption']):
                if value < 1:
                    return "Wh"
                elif value < 1000:
                    return "kWh"
                elif value < 1000000:
                    return "MWh"
                else:
                    return "GWh"
            elif any(word in col_lower for word in ['power', 'watt', 'w']):
                if value < 1000:
                    return "W"
                elif value < 1000000:
                    return "kW"
                else:
                    return "MW"
            elif any(word in col_lower for word in ['voltage', 'volt', 'v']):
                return "V"
            elif any(word in col_lower for word in ['current', 'amp', 'a']):
                return "A"
            elif any(word in col_lower for word in ['temp', 'temperature']):
                return "°C"
            elif any(word in col_lower for word in ['freq', 'frequency']):
                return "Hz"
            elif any(word in col_lower for word in ['percent', '%']):
                return "%"
            else:
                # Default energy unit inference
                return await self._infer_unit(value)
        except Exception:
            return "unknown"

    async def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""
        try:
            # This could be enhanced to return actual processing metrics
            return {
                "supported_formats": self.supported_formats,
                "time_formats_supported": len(self.time_formats),
                "slg_v2_support": True,
                "last_updated": datetime.utcnow().isoformat()
            }
        except Exception as e:
            logger.error(f"Error getting processing stats: {e}")
            return {}
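

# Minimal usage sketch (illustrative; not part of the service wiring). It
# assumes pure parsing needs neither the db handle nor the Redis client,
# which holds for process_time_series_data() as written, so both
# dependencies are passed as None here.
if __name__ == "__main__":
    sample_csv = b"timestamp,sensor_id,energy_kwh\n2025-09-10 14:43:30,mtr01,1.25\n"
    processor = DataProcessor(db=None, redis_client=None)
    records = asyncio.run(processor.process_time_series_data(sample_csv, "csv"))
    print(json.dumps(records, indent=2))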