""" Data processor for parsing and transforming time series data from various formats. Handles CSV, JSON, and other time series data formats from real community sources. """ import asyncio import pandas as pd import json import csv import io from datetime import datetime, timedelta from typing import List, Dict, Any, Optional, Union import logging import numpy as np from dateutil import parser as date_parser import re import hashlib logger = logging.getLogger(__name__) class DataProcessor: """Processes time series data from various formats""" def __init__(self, db, redis_client): self.db = db self.redis = redis_client self.supported_formats = ["csv", "json", "txt", "xlsx", "slg_v2"] self.time_formats = [ "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%SZ", "%d/%m/%Y %H:%M:%S", "%d-%m-%Y %H:%M:%S", "%Y/%m/%d %H:%M:%S" ] async def process_time_series_data(self, file_content: bytes, data_format: str) -> List[Dict[str, Any]]: """Process time series data from file content""" try: logger.info(f"Processing time series data in {data_format} format ({len(file_content)} bytes)") # Decode file content try: text_content = file_content.decode('utf-8') except UnicodeDecodeError: # Try other encodings try: text_content = file_content.decode('latin1') except UnicodeDecodeError: text_content = file_content.decode('utf-8', errors='ignore') # Process based on format if data_format.lower() == "csv": return await self._process_csv_data(text_content) elif data_format.lower() == "json": return await self._process_json_data(text_content) elif data_format.lower() == "txt": return await self._process_text_data(text_content) elif data_format.lower() == "xlsx": return await self._process_excel_data(file_content) elif data_format.lower() == "slg_v2": return await self._process_slg_v2_data(text_content) else: # Try to auto-detect format return await self._auto_detect_and_process(text_content) except Exception as e: logger.error(f"Error processing time series data: {e}") raise async def _process_csv_data(self, content: str) -> List[Dict[str, Any]]: """Process CSV time series data""" try: # Parse CSV content csv_reader = csv.DictReader(io.StringIO(content)) rows = list(csv_reader) if not rows: logger.warning("CSV file is empty") return [] logger.info(f"Found {len(rows)} rows in CSV") # Auto-detect column mappings column_mapping = await self._detect_csv_columns(rows[0].keys()) processed_data = [] for row_idx, row in enumerate(rows): try: processed_row = await self._process_csv_row(row, column_mapping) if processed_row: processed_data.append(processed_row) except Exception as e: logger.warning(f"Error processing CSV row {row_idx}: {e}") continue logger.info(f"Successfully processed {len(processed_data)} CSV records") return processed_data except Exception as e: logger.error(f"Error processing CSV data: {e}") raise async def _process_json_data(self, content: str) -> List[Dict[str, Any]]: """Process JSON time series data""" try: data = json.loads(content) # Handle different JSON structures if isinstance(data, list): # Array of records return await self._process_json_array(data) elif isinstance(data, dict): # Single record or object with nested data return await self._process_json_object(data) else: logger.warning(f"Unexpected JSON structure: {type(data)}") return [] except json.JSONDecodeError as e: logger.error(f"Invalid JSON content: {e}") raise except Exception as e: logger.error(f"Error processing JSON data: {e}") raise async def _process_text_data(self, content: str) -> List[Dict[str, Any]]: 
"""Process text-based time series data""" try: lines = content.strip().split('\n') # Try to detect the format of text data if not lines: return [] # Check if it's space-separated, tab-separated, or has another delimiter first_line = lines[0].strip() # Detect delimiter delimiter = None for test_delim in ['\t', ' ', ';', '|']: if first_line.count(test_delim) > 0: delimiter = test_delim break if not delimiter: # Try to parse as single column data return await self._process_single_column_data(lines) # Parse delimited data processed_data = [] header = None for line_idx, line in enumerate(lines): line = line.strip() if not line or line.startswith('#'): # Skip empty lines and comments continue parts = line.split(delimiter) parts = [part.strip() for part in parts if part.strip()] if not header: # First data line - use as header or create generic headers if await self._is_header_line(parts): header = parts continue else: header = [f"col_{i}" for i in range(len(parts))] try: row_dict = dict(zip(header, parts)) processed_row = await self._process_generic_row(row_dict) if processed_row: processed_data.append(processed_row) except Exception as e: logger.warning(f"Error processing text line {line_idx}: {e}") continue logger.info(f"Successfully processed {len(processed_data)} text records") return processed_data except Exception as e: logger.error(f"Error processing text data: {e}") raise async def _process_excel_data(self, content: bytes) -> List[Dict[str, Any]]: """Process Excel time series data""" try: # Read Excel file df = pd.read_excel(io.BytesIO(content)) if df.empty: return [] # Convert DataFrame to list of dictionaries records = df.to_dict('records') # Process each record processed_data = [] for record in records: try: processed_row = await self._process_generic_row(record) if processed_row: processed_data.append(processed_row) except Exception as e: logger.warning(f"Error processing Excel record: {e}") continue logger.info(f"Successfully processed {len(processed_data)} Excel records") return processed_data except Exception as e: logger.error(f"Error processing Excel data: {e}") raise async def _detect_csv_columns(self, columns: List[str]) -> Dict[str, str]: """Auto-detect column mappings for CSV data""" mapping = {} # Common column name patterns timestamp_patterns = [ r'time.*stamp', r'date.*time', r'datetime', r'time', r'date', r'timestamp', r'ts', r'hora', r'fecha', r'datum', r'zeit' ] value_patterns = [ r'.*energy.*', r'.*power.*', r'.*consumption.*', r'.*usage.*', r'.*load.*', r'.*wh.*', r'.*kwh.*', r'.*mwh.*', r'.*w.*', r'.*kw.*', r'.*mw.*', r'value', r'val', r'measure', r'reading', r'datos', r'wert' ] sensor_patterns = [ r'.*sensor.*', r'.*device.*', r'.*meter.*', r'.*id.*', r'sensor', r'device', r'meter', r'contador', r'medidor' ] unit_patterns = [ r'.*unit.*', r'.*measure.*', r'unit', r'unidad', r'einheit' ] for col in columns: col_lower = col.lower() # Check for timestamp columns if any(re.match(pattern, col_lower) for pattern in timestamp_patterns): mapping['timestamp'] = col # Check for value columns elif any(re.match(pattern, col_lower) for pattern in value_patterns): mapping['value'] = col # Check for sensor ID columns elif any(re.match(pattern, col_lower) for pattern in sensor_patterns): mapping['sensor_id'] = col # Check for unit columns elif any(re.match(pattern, col_lower) for pattern in unit_patterns): mapping['unit'] = col # Set defaults if not found if 'timestamp' not in mapping: # Use first column as timestamp mapping['timestamp'] = columns[0] if 'value' not in mapping and 

    async def _detect_csv_columns(self, columns: List[str]) -> Dict[str, str]:
        """Auto-detect column mappings for CSV data."""
        # Materialize the columns so they can be indexed and sliced below
        # (DictReader fieldnames may arrive as a dict_keys view).
        columns = list(columns)
        mapping = {}

        # Common column name patterns
        timestamp_patterns = [
            r'time.*stamp', r'date.*time', r'datetime', r'time', r'date',
            r'timestamp', r'ts', r'hora', r'fecha', r'datum', r'zeit'
        ]
        value_patterns = [
            r'.*energy.*', r'.*power.*', r'.*consumption.*', r'.*usage.*',
            r'.*load.*', r'.*wh.*', r'.*kwh.*', r'.*mwh.*', r'.*w.*',
            r'.*kw.*', r'.*mw.*', r'value', r'val', r'measure', r'reading',
            r'datos', r'wert'
        ]
        sensor_patterns = [
            r'.*sensor.*', r'.*device.*', r'.*meter.*', r'.*id.*',
            r'sensor', r'device', r'meter', r'contador', r'medidor'
        ]
        unit_patterns = [
            r'.*unit.*', r'.*measure.*', r'unit', r'unidad', r'einheit'
        ]

        for col in columns:
            col_lower = col.lower()

            # Check for timestamp columns
            if any(re.match(pattern, col_lower) for pattern in timestamp_patterns):
                mapping['timestamp'] = col
            # Check for value columns
            elif any(re.match(pattern, col_lower) for pattern in value_patterns):
                mapping['value'] = col
            # Check for sensor ID columns
            elif any(re.match(pattern, col_lower) for pattern in sensor_patterns):
                mapping['sensor_id'] = col
            # Check for unit columns
            elif any(re.match(pattern, col_lower) for pattern in unit_patterns):
                mapping['unit'] = col

        # Set defaults if not found
        if 'timestamp' not in mapping:
            # Use the first column as the timestamp
            mapping['timestamp'] = columns[0]
        if 'value' not in mapping and len(columns) > 1:
            # Use the first remaining non-timestamp column as the value
            for col in columns[1:]:
                if col != mapping.get('timestamp'):
                    mapping['value'] = col
                    break

        logger.info(f"Detected column mapping: {mapping}")
        return mapping

    async def _process_csv_row(self, row: Dict[str, str], column_mapping: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """Process a single CSV row."""
        try:
            processed_row = {}

            # Extract timestamp
            timestamp_col = column_mapping.get('timestamp')
            if timestamp_col and timestamp_col in row:
                timestamp = await self._parse_timestamp(row[timestamp_col])
                if timestamp:
                    processed_row['timestamp'] = int(timestamp.timestamp())
                    processed_row['datetime'] = timestamp.isoformat()
                else:
                    return None

            # Extract sensor ID
            sensor_col = column_mapping.get('sensor_id')
            if sensor_col and sensor_col in row:
                processed_row['sensor_id'] = str(row[sensor_col]).strip()
            else:
                # Fall back to a default sensor ID
                processed_row['sensor_id'] = "unknown_sensor"

            # Extract value(s)
            value_col = column_mapping.get('value')
            if value_col and value_col in row:
                try:
                    value = await self._parse_numeric_value(row[value_col])
                    if value is not None:
                        processed_row['value'] = value
                    else:
                        return None
                except Exception:
                    return None

            # Extract unit
            unit_col = column_mapping.get('unit')
            if unit_col and unit_col in row:
                processed_row['unit'] = str(row[unit_col]).strip()
            else:
                processed_row['unit'] = await self._infer_unit(processed_row.get('value', 0))

            # Add all other columns as metadata
            metadata = {}
            for col, val in row.items():
                if col not in column_mapping.values() and val:
                    try:
                        # Store numbers as numbers, everything else as text
                        num_val = await self._parse_numeric_value(val)
                        metadata[col] = num_val if num_val is not None else str(val).strip()
                    except Exception:
                        metadata[col] = str(val).strip()

            if metadata:
                processed_row['metadata'] = metadata

            # Add processing metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'csv'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing CSV row: {e}")
            return None

    async def _process_json_array(self, data: List[Any]) -> List[Dict[str, Any]]:
        """Process a JSON array of records."""
        processed_data = []
        for item in data:
            if isinstance(item, dict):
                processed_row = await self._process_json_record(item)
                if processed_row:
                    processed_data.append(processed_row)
        return processed_data

    async def _process_json_object(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Process a JSON object."""
        # Check whether it contains nested time series data
        if 'data' in data and isinstance(data['data'], list):
            return await self._process_json_array(data['data'])
        elif 'readings' in data and isinstance(data['readings'], list):
            return await self._process_json_array(data['readings'])
        elif 'values' in data and isinstance(data['values'], list):
            return await self._process_json_array(data['values'])
        else:
            # Treat as a single record
            processed_row = await self._process_json_record(data)
            return [processed_row] if processed_row else []
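
    # JSON shapes accepted above (illustrative payloads):
    #
    #     [{"timestamp": 1704067200, "sensor_id": "m1", "value": 12.5}, ...]
    #     {"data": [...]}, {"readings": [...]}, {"values": [...]}
    #     {"timestamp": "2024-01-01T00:00:00Z", "value": 12.5}   # single record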

    async def _process_json_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a single JSON record."""
        try:
            processed_row = {}

            # Extract timestamp
            timestamp = None
            for ts_field in ['timestamp', 'datetime', 'time', 'date', 'ts']:
                if ts_field in record:
                    timestamp = await self._parse_timestamp(record[ts_field])
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                # Use the current time if no timestamp was found
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp())
                processed_row['datetime'] = now.isoformat()

            # Extract sensor ID
            sensor_id = None
            for id_field in ['sensor_id', 'sensorId', 'device_id', 'deviceId', 'id', 'sensor', 'device']:
                if id_field in record:
                    sensor_id = str(record[id_field])
                    break
            processed_row['sensor_id'] = sensor_id or "unknown_sensor"

            # Extract value(s)
            value = None
            for val_field in ['value', 'reading', 'measurement', 'data', 'energy', 'power', 'consumption']:
                if val_field in record:
                    try:
                        value = await self._parse_numeric_value(record[val_field])
                        if value is not None:
                            break
                    except Exception:
                        continue

            if value is not None:
                processed_row['value'] = value

            # Extract unit
            unit = None
            for unit_field in ['unit', 'units', 'measure_unit', 'uom']:
                if unit_field in record:
                    unit = str(record[unit_field])
                    break
            processed_row['unit'] = unit or await self._infer_unit(processed_row.get('value', 0))

            # Add remaining fields as metadata
            metadata = {}
            processed_fields = {
                'timestamp', 'datetime', 'time', 'date', 'ts',
                'sensor_id', 'sensorId', 'device_id', 'deviceId', 'id', 'sensor', 'device',
                'value', 'reading', 'measurement', 'data', 'energy', 'power', 'consumption',
                'unit', 'units', 'measure_unit', 'uom'
            }
            for key, val in record.items():
                if key not in processed_fields and val is not None:
                    metadata[key] = val

            if metadata:
                processed_row['metadata'] = metadata

            # Add processing metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'json'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing JSON record: {e}")
            return None

    async def _process_generic_row(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a generic row of data."""
        try:
            processed_row = {}

            # Try to find a timestamp
            timestamp = None
            for key, val in row.items():
                if 'time' in key.lower() or 'date' in key.lower():
                    timestamp = await self._parse_timestamp(val)
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp())
                processed_row['datetime'] = now.isoformat()

            # Try to find a sensor ID
            sensor_id = None
            for key, val in row.items():
                if 'sensor' in key.lower() or 'device' in key.lower() or 'id' in key.lower():
                    sensor_id = str(val)
                    break
            processed_row['sensor_id'] = sensor_id or "unknown_sensor"

            # Try to find a numeric value
            value = None
            for key, val in row.items():
                if key.lower() not in ['timestamp', 'datetime', 'time', 'date', 'sensor_id', 'device_id', 'id']:
                    try:
                        value = await self._parse_numeric_value(val)
                        if value is not None:
                            break
                    except Exception:
                        continue

            if value is not None:
                processed_row['value'] = value
                processed_row['unit'] = await self._infer_unit(value)

            # Add all fields as metadata
            metadata = {k: v for k, v in row.items() if v is not None}
            if metadata:
                processed_row['metadata'] = metadata

            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'generic'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing generic row: {e}")
            return None
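
    # Timestamp inputs handled below (illustrative values):
    #
    #     1704067200          # unix seconds
    #     1704067200000       # unix milliseconds (> 1e10, divided by 1000)
    #     "2024-01-01 00:00:00", "2024-01-01T00:00:00Z", "01/01/2024 00:00:00"
    #     anything else dateutil can parse, e.g. "Jan 1 2024 00:00"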

    async def _parse_timestamp(self, timestamp_str: Union[str, int, float]) -> Optional[datetime]:
        """Parse a timestamp from various formats."""
        try:
            if isinstance(timestamp_str, (int, float)):
                # Unix timestamp
                if timestamp_str > 1e10:
                    # Milliseconds
                    timestamp_str = timestamp_str / 1000
                return datetime.fromtimestamp(timestamp_str)

            if isinstance(timestamp_str, str):
                timestamp_str = timestamp_str.strip()

                # Try common formats first
                for fmt in self.time_formats:
                    try:
                        return datetime.strptime(timestamp_str, fmt)
                    except ValueError:
                        continue

                # Try the dateutil parser as a fallback
                try:
                    return date_parser.parse(timestamp_str)
                except (ValueError, OverflowError):
                    pass

            return None

        except Exception as e:
            logger.debug(f"Error parsing timestamp '{timestamp_str}': {e}")
            return None

    async def _parse_numeric_value(self, value_str: Union[str, int, float]) -> Optional[float]:
        """Parse a numeric value from a string."""
        try:
            if isinstance(value_str, (int, float)):
                # Treat NaN as missing rather than as a value
                if isinstance(value_str, float) and np.isnan(value_str):
                    return None
                return float(value_str)

            if isinstance(value_str, str):
                # Strip everything except digits, sign, and decimal point
                cleaned = re.sub(r'[^\d.-]', '', value_str.strip())
                if cleaned:
                    return float(cleaned)

            return None

        except Exception:
            return None

    async def _infer_unit(self, value: float) -> str:
        """Infer a unit based on the value's magnitude."""
        try:
            if value is None:
                return "unknown"

            # Common energy unit ranges
            if value < 1:
                return "Wh"
            elif value < 1000:
                return "kWh"
            elif value < 1000000:
                return "MWh"
            else:
                return "GWh"
        except Exception:
            return "unknown"

    async def _is_header_line(self, parts: List[str]) -> bool:
        """Check whether a line appears to be a header."""
        # If no part parses as a number, the line is likely a header
        for part in parts:
            try:
                float(part)
                return False  # Found a number, so not a header
            except ValueError:
                continue
        return True

    async def _process_single_column_data(self, lines: List[str]) -> List[Dict[str, Any]]:
        """Process single-column data."""
        processed_data = []

        for line_idx, line in enumerate(lines):
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            try:
                value = await self._parse_numeric_value(line)
                if value is not None:
                    now = datetime.utcnow()
                    processed_row = {
                        'sensor_id': 'single_column_sensor',
                        'timestamp': int(now.timestamp()) + line_idx,  # Spread timestamps
                        'datetime': (now + timedelta(seconds=line_idx)).isoformat(),
                        'value': value,
                        'unit': await self._infer_unit(value),
                        'processed_at': now.isoformat(),
                        'data_source': 'text_single_column',
                        'metadata': {'line_number': line_idx}
                    }
                    processed_data.append(processed_row)
            except Exception as e:
                logger.warning(f"Error processing single column line {line_idx}: {e}")
                continue

        return processed_data

    async def _auto_detect_and_process(self, content: str) -> List[Dict[str, Any]]:
        """Auto-detect the format and process the data."""
        try:
            # Try JSON first
            try:
                json.loads(content)
                return await self._process_json_data(content)
            except json.JSONDecodeError:
                pass

            # Then try CSV
            try:
                lines = content.strip().split('\n')
                if len(lines) > 1 and (',' in lines[0] or ';' in lines[0] or '\t' in lines[0]):
                    return await self._process_csv_data(content)
            except Exception:
                pass

            # Fall back to text processing
            return await self._process_text_data(content)

        except Exception as e:
            logger.error(f"Error in auto-detection: {e}")
            raise
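
    # Assumed shape of a .slg_v2 file, inferred from the parser below rather
    # than from a format specification: '#' or '//' comment lines carrying
    # 'key: value' metadata, an optional header row, then delimited data.
    #
    #     # source: SA4CPS
    #     # interval: 3600
    #     timestamp;meter_id;energy
    #     2024-01-01 00:00:00;m1;12.5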

    async def _process_slg_v2_data(self, content: str) -> List[Dict[str, Any]]:
        """Process SA4CPS .slg_v2 format files."""
        try:
            lines = content.strip().split('\n')

            if not lines:
                logger.warning("SLG_V2 file is empty")
                return []

            logger.info(f"Processing SLG_V2 file with {len(lines)} lines")

            processed_data = []
            header = None
            metadata = {}

            for line_idx, line in enumerate(lines):
                line = line.strip()

                # Skip empty lines
                if not line:
                    continue

                # Handle comment lines and metadata
                if line.startswith('#') or line.startswith('//'):
                    # Extract 'key: value' metadata from comment lines
                    comment = line[1:].strip() if line.startswith('#') else line[2:].strip()
                    if ':' in comment:
                        key, value = comment.split(':', 1)
                        metadata[key.strip()] = value.strip()
                    continue

                # Handle header lines (if present)
                if line_idx == 0 or (header is None and await self._is_slg_v2_header(line)):
                    header = await self._parse_slg_v2_header(line)
                    continue

                # Process data lines
                try:
                    processed_row = await self._process_slg_v2_line(line, header, metadata, line_idx)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing SLG_V2 line {line_idx}: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} SLG_V2 records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing SLG_V2 data: {e}")
            raise

    async def _is_slg_v2_header(self, line: str) -> bool:
        """Check whether a line appears to be an SLG_V2 header."""
        # Common SLG_V2 header keywords
        header_keywords = [
            'timestamp', 'time', 'date', 'sensor', 'id', 'value', 'reading',
            'energy', 'power', 'voltage', 'current', 'temperature'
        ]

        line_lower = line.lower()

        # Check whether the line contains header-like words
        has_keywords = any(keyword in line_lower for keyword in header_keywords)

        # Count how many parts parse as numbers; headers have few or none
        parts = line.replace(',', ' ').replace(';', ' ').replace('\t', ' ').split()
        numeric_parts = 0
        for part in parts:
            try:
                float(part.strip())
                numeric_parts += 1
            except ValueError:
                continue

        # If fewer than half the parts are numeric and keywords are present,
        # the line is likely a header
        return has_keywords and (numeric_parts < len(parts) / 2)

    async def _parse_slg_v2_header(self, line: str) -> List[str]:
        """Parse an SLG_V2 header line."""
        # Try the explicit delimiters first
        for delimiter in [',', ';', '\t', ' ']:
            if delimiter in line:
                parts = [part.strip() for part in line.split(delimiter) if part.strip()]
                if len(parts) > 1:
                    return parts

        # Default to splitting on whitespace
        return [part.strip() for part in line.split() if part.strip()]
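
    # For the assumed sample above, a data line such as
    # "2024-01-01 00:00:00;m1;12.5" zipped with the header becomes
    # {'timestamp': '2024-01-01 00:00:00', 'meter_id': 'm1', 'energy': '12.5'}
    # before the field extraction below runs.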

    async def _process_slg_v2_line(self, line: str, header: Optional[List[str]], metadata: Dict[str, Any], line_idx: int) -> Optional[Dict[str, Any]]:
        """Process a single SLG_V2 data line."""
        try:
            # Try the explicit delimiters to parse the line
            parts = None
            for delimiter in [',', ';', '\t', ' ']:
                if delimiter in line:
                    test_parts = [part.strip() for part in line.split(delimiter) if part.strip()]
                    if len(test_parts) > 1:
                        parts = test_parts
                        break

            if not parts:
                # Split on whitespace as a fallback
                parts = [part.strip() for part in line.split() if part.strip()]

            if not parts:
                return None

            # Create a row dictionary
            if header and len(parts) >= len(header):
                row_dict = dict(zip(header, parts[:len(header)]))
                # Add extra columns, if any
                for i, extra_part in enumerate(parts[len(header):]):
                    row_dict[f"extra_col_{i}"] = extra_part
            else:
                # Create generic column names
                row_dict = {f"col_{i}": part for i, part in enumerate(parts)}

            # Process the row like a generic row, but with SLG_V2 specifics
            processed_row = {}

            # Extract timestamp
            timestamp = None
            timestamp_value = None
            for key, val in row_dict.items():
                key_lower = key.lower()
                if any(ts_word in key_lower for ts_word in ['time', 'date', 'timestamp', 'ts']):
                    timestamp = await self._parse_timestamp(val)
                    timestamp_value = val
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                # Use the current time with a line offset for uniqueness
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp()) + line_idx
                processed_row['datetime'] = (now + timedelta(seconds=line_idx)).isoformat()

            # Extract sensor ID
            sensor_id = None
            for key, val in row_dict.items():
                key_lower = key.lower()
                if any(id_word in key_lower for id_word in ['sensor', 'device', 'meter', 'id']):
                    sensor_id = str(val).strip()
                    break
            processed_row['sensor_id'] = sensor_id or f"slg_v2_sensor_{line_idx}"

            # Extract numeric values
            values_found = []
            for key, val in row_dict.items():
                key_lower = key.lower()

                # Skip the timestamp field and ID-like fields
                if (any(skip_word in key_lower for skip_word in ['time', 'date', 'timestamp', 'ts', 'id', 'sensor', 'device', 'meter'])
                        and val == timestamp_value) or key_lower.endswith('_id'):
                    continue

                try:
                    numeric_val = await self._parse_numeric_value(val)
                    if numeric_val is not None:
                        values_found.append({
                            'key': key,
                            'value': numeric_val,
                            'unit': await self._infer_slg_v2_unit(key, numeric_val)
                        })
                except Exception:
                    continue

            # Handle one or more values
            if len(values_found) == 1:
                # Single value case
                processed_row['value'] = values_found[0]['value']
                processed_row['unit'] = values_found[0]['unit']
                processed_row['value_type'] = values_found[0]['key']
            elif len(values_found) > 1:
                # Multiple values: the first becomes the main value,
                # the rest are stored under 'additional_values'
                main_value = values_found[0]
                processed_row['value'] = main_value['value']
                processed_row['unit'] = main_value['unit']
                processed_row['value_type'] = main_value['key']

                additional_values = {}
                for val_info in values_found[1:]:
                    additional_values[val_info['key']] = {
                        'value': val_info['value'],
                        'unit': val_info['unit']
                    }
                processed_row['additional_values'] = additional_values

            # Add all row data as metadata
            row_metadata = dict(row_dict)
            row_metadata.update(metadata)  # Include file-level metadata
            row_metadata['line_number'] = line_idx
            row_metadata['raw_line'] = line
            processed_row['metadata'] = row_metadata

            # Add processing info
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'slg_v2'
            processed_row['file_format'] = 'SA4CPS_SLG_V2'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing SLG_V2 line {line_idx}: {e}")
            return None

    async def _infer_slg_v2_unit(self, column_name: str, value: float) -> str:
        """Infer a unit from an SLG_V2 column name and value."""
        try:
            col_lower = column_name.lower()

            # Common SA4CPS/energy monitoring units
            if any(word in col_lower for word in ['energy', 'wh', 'consumption']):
                if value < 1:
                    return "Wh"
                elif value < 1000:
                    return "kWh"
                elif value < 1000000:
                    return "MWh"
                else:
                    return "GWh"
            elif any(word in col_lower for word in ['power', 'watt', 'w']):
                if value < 1000:
                    return "W"
                elif value < 1000000:
                    return "kW"
                else:
                    return "MW"
            elif any(word in col_lower for word in ['voltage', 'volt', 'v']):
                return "V"
            elif any(word in col_lower for word in ['current', 'amp', 'a']):
                return "A"
            elif any(word in col_lower for word in ['temp', 'temperature']):
                return "°C"
            elif any(word in col_lower for word in ['freq', 'frequency']):
                return "Hz"
            elif any(word in col_lower for word in ['percent', '%']):
                return "%"
            else:
                # Default to the generic energy unit inference
                return await self._infer_unit(value)

        except Exception:
            return "unknown"

    async def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics."""
        try:
            # This could be enhanced to return actual processing metrics
            return {
                "supported_formats": self.supported_formats,
                "time_formats_supported": len(self.time_formats),
                "slg_v2_support": True,
                "last_updated": datetime.utcnow().isoformat()
            }
        except Exception as e:
            logger.error(f"Error getting processing stats: {e}")
            return {}
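

# A minimal usage sketch (assumption: the db and redis handles may be None
# here, which holds because process_time_series_data never touches them;
# in the real service they are injected dependencies).
if __name__ == "__main__":
    sample_csv = (
        b"timestamp,sensor_id,energy_kwh,unit\n"
        b"2024-01-01 00:00:00,meter_1,12.5,kWh\n"
        b"2024-01-01 01:00:00,meter_1,13.1,kWh\n"
    )

    async def _demo() -> None:
        processor = DataProcessor(db=None, redis_client=None)
        records = await processor.process_time_series_data(sample_csv, "csv")
        for record in records:
            print(record)

    asyncio.run(_demo())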