- Implement FTP monitoring and ingestion for SA4CPS .slg_v2 files
- Add robust data processor with multi-format and unit inference support
- Publish parsed data to Redis topics for real-time dashboard simulation
- Include validation, monitoring, and auto-configuration scripts
- Provide documentation and test scripts for SA4CPS integration
"""
|
|
Data processor for parsing and transforming time series data from various formats.
|
|
Handles CSV, JSON, and other time series data formats from real community sources.
|
|
"""
|
|
|
|
import asyncio
|
|
import pandas as pd
|
|
import json
|
|
import csv
|
|
import io
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Any, Optional, Union
|
|
import logging
|
|
import numpy as np
|
|
from dateutil import parser as date_parser
|
|
import re
|
|
import hashlib
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class DataProcessor:
    """Processes time series data from various formats"""

    def __init__(self, db, redis_client):
        self.db = db
        self.redis = redis_client
        self.supported_formats = ["csv", "json", "txt", "xlsx", "slg_v2"]
        self.time_formats = [
            "%Y-%m-%d %H:%M:%S",
            "%Y-%m-%d %H:%M",
            "%Y-%m-%dT%H:%M:%S",
            "%Y-%m-%dT%H:%M:%SZ",
            "%d/%m/%Y %H:%M:%S",
            "%d-%m-%Y %H:%M:%S",
            "%Y/%m/%d %H:%M:%S"
        ]

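    # Usage sketch (illustrative only): `db` and `redis_client` are stored but
    # not used by the parsing paths, so stubs are enough for a local test. The
    # sample bytes below are invented.
    #
    #     import asyncio
    #
    #     async def demo():
    #         processor = DataProcessor(db=None, redis_client=None)
    #         sample = b"timestamp,sensor_id,energy_kwh\n2024-01-01 00:00:00,meter_1,12.5\n"
    #         records = await processor.process_time_series_data(sample, "csv")
    #         print(records[0]["value"], records[0]["unit"])  # 12.5 kWh
    #
    #     asyncio.run(demo())
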
    async def process_time_series_data(self, file_content: bytes, data_format: str) -> List[Dict[str, Any]]:
        """Process time series data from file content"""
        try:
            logger.info(f"Processing time series data in {data_format} format ({len(file_content)} bytes)")

            # Decode file content
            try:
                text_content = file_content.decode('utf-8')
            except UnicodeDecodeError:
                # Try other encodings
                try:
                    text_content = file_content.decode('latin1')
                except UnicodeDecodeError:
                    text_content = file_content.decode('utf-8', errors='ignore')

            # Process based on format
            if data_format.lower() == "csv":
                return await self._process_csv_data(text_content)
            elif data_format.lower() == "json":
                return await self._process_json_data(text_content)
            elif data_format.lower() == "txt":
                return await self._process_text_data(text_content)
            elif data_format.lower() == "xlsx":
                return await self._process_excel_data(file_content)
            elif data_format.lower() == "slg_v2":
                return await self._process_slg_v2_data(text_content)
            else:
                # Try to auto-detect format
                return await self._auto_detect_and_process(text_content)

        except Exception as e:
            logger.error(f"Error processing time series data: {e}")
            raise

    async def _process_csv_data(self, content: str) -> List[Dict[str, Any]]:
        """Process CSV time series data"""
        try:
            # Parse CSV content
            csv_reader = csv.DictReader(io.StringIO(content))
            rows = list(csv_reader)

            if not rows:
                logger.warning("CSV file is empty")
                return []

            logger.info(f"Found {len(rows)} rows in CSV")

            # Auto-detect column mappings (dict_keys is not subscriptable, so
            # materialize it as a list before passing it on)
            column_mapping = await self._detect_csv_columns(list(rows[0].keys()))

            processed_data = []
            for row_idx, row in enumerate(rows):
                try:
                    processed_row = await self._process_csv_row(row, column_mapping)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing CSV row {row_idx}: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} CSV records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing CSV data: {e}")
            raise

    async def _process_json_data(self, content: str) -> List[Dict[str, Any]]:
        """Process JSON time series data"""
        try:
            data = json.loads(content)

            # Handle different JSON structures
            if isinstance(data, list):
                # Array of records
                return await self._process_json_array(data)
            elif isinstance(data, dict):
                # Single record or object with nested data
                return await self._process_json_object(data)
            else:
                logger.warning(f"Unexpected JSON structure: {type(data)}")
                return []

        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON content: {e}")
            raise
        except Exception as e:
            logger.error(f"Error processing JSON data: {e}")
            raise

    async def _process_text_data(self, content: str) -> List[Dict[str, Any]]:
        """Process text-based time series data"""
        try:
            lines = content.strip().split('\n')

            # Try to detect the format of text data
            if not lines:
                return []

            # Check if it's space-separated, tab-separated, or has another delimiter
            first_line = lines[0].strip()

            # Detect delimiter
            delimiter = None
            for test_delim in ['\t', ' ', ';', '|']:
                if first_line.count(test_delim) > 0:
                    delimiter = test_delim
                    break

            if not delimiter:
                # Try to parse as single column data
                return await self._process_single_column_data(lines)

            # Parse delimited data
            processed_data = []
            header = None

            for line_idx, line in enumerate(lines):
                line = line.strip()
                if not line or line.startswith('#'):  # Skip empty lines and comments
                    continue

                parts = line.split(delimiter)
                parts = [part.strip() for part in parts if part.strip()]

                if not header:
                    # First data line - use as header or create generic headers
                    if await self._is_header_line(parts):
                        header = parts
                        continue
                    else:
                        header = [f"col_{i}" for i in range(len(parts))]

                try:
                    row_dict = dict(zip(header, parts))
                    processed_row = await self._process_generic_row(row_dict)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing text line {line_idx}: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} text records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing text data: {e}")
            raise

    async def _process_excel_data(self, content: bytes) -> List[Dict[str, Any]]:
        """Process Excel time series data"""
        try:
            # Read Excel file
            df = pd.read_excel(io.BytesIO(content))

            if df.empty:
                return []

            # Convert DataFrame to list of dictionaries
            records = df.to_dict('records')

            # Process each record
            processed_data = []
            for record in records:
                try:
                    processed_row = await self._process_generic_row(record)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing Excel record: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} Excel records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing Excel data: {e}")
            raise

    async def _detect_csv_columns(self, columns: List[str]) -> Dict[str, str]:
        """Auto-detect column mappings for CSV data"""
        mapping = {}

        # Common column name patterns
        timestamp_patterns = [
            r'time.*stamp', r'date.*time', r'datetime', r'time', r'date',
            r'timestamp', r'ts', r'hora', r'fecha', r'datum', r'zeit'
        ]

        value_patterns = [
            r'.*energy.*', r'.*power.*', r'.*consumption.*', r'.*usage.*', r'.*load.*',
            r'.*wh.*', r'.*kwh.*', r'.*mwh.*', r'.*w.*', r'.*kw.*', r'.*mw.*',
            r'value', r'val', r'measure', r'reading', r'datos', r'wert'
        ]

        sensor_patterns = [
            r'.*sensor.*', r'.*device.*', r'.*meter.*', r'.*id.*',
            r'sensor', r'device', r'meter', r'contador', r'medidor'
        ]

        unit_patterns = [
            r'.*unit.*', r'.*measure.*', r'unit', r'unidad', r'einheit'
        ]

        for col in columns:
            col_lower = col.lower()

            # Check for timestamp columns
            if any(re.match(pattern, col_lower) for pattern in timestamp_patterns):
                mapping['timestamp'] = col

            # Check for value columns
            elif any(re.match(pattern, col_lower) for pattern in value_patterns):
                mapping['value'] = col

            # Check for sensor ID columns
            elif any(re.match(pattern, col_lower) for pattern in sensor_patterns):
                mapping['sensor_id'] = col

            # Check for unit columns
            elif any(re.match(pattern, col_lower) for pattern in unit_patterns):
                mapping['unit'] = col

        # Set defaults if not found
        if 'timestamp' not in mapping:
            # Use first column as timestamp
            mapping['timestamp'] = columns[0]

        if 'value' not in mapping and len(columns) > 1:
            # Use the first remaining column as the value column
            for col in columns[1:]:
                if col != mapping.get('timestamp'):
                    mapping['value'] = col
                    break

        logger.info(f"Detected column mapping: {mapping}")
        return mapping

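    # Illustrative result of the detection above (column names invented):
    #
    #     mapping = await self._detect_csv_columns(
    #         ["Fecha", "Meter", "Consumption_kWh", "Einheit"])
    #     # -> {'timestamp': 'Fecha', 'sensor_id': 'Meter',
    #     #     'value': 'Consumption_kWh', 'unit': 'Einheit'}
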
    async def _process_csv_row(self, row: Dict[str, str], column_mapping: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """Process a single CSV row"""
        try:
            processed_row = {}

            # Extract timestamp
            timestamp_col = column_mapping.get('timestamp')
            if timestamp_col and timestamp_col in row:
                timestamp = await self._parse_timestamp(row[timestamp_col])
                if timestamp:
                    processed_row['timestamp'] = int(timestamp.timestamp())
                    processed_row['datetime'] = timestamp.isoformat()
                else:
                    return None

            # Extract sensor ID
            sensor_col = column_mapping.get('sensor_id')
            if sensor_col and sensor_col in row:
                processed_row['sensor_id'] = str(row[sensor_col]).strip()
            else:
                # Generate a default sensor ID
                processed_row['sensor_id'] = "unknown_sensor"

            # Extract value(s)
            value_col = column_mapping.get('value')
            if value_col and value_col in row:
                try:
                    value = await self._parse_numeric_value(row[value_col])
                    if value is not None:
                        processed_row['value'] = value
                    else:
                        return None
                except Exception:
                    return None

            # Extract unit
            unit_col = column_mapping.get('unit')
            if unit_col and unit_col in row:
                processed_row['unit'] = str(row[unit_col]).strip()
            else:
                processed_row['unit'] = await self._infer_unit(processed_row.get('value', 0))

            # Add all other columns as metadata
            metadata = {}
            for col, val in row.items():
                if col not in column_mapping.values() and val:
                    try:
                        # Try to parse as number
                        num_val = await self._parse_numeric_value(val)
                        metadata[col] = num_val if num_val is not None else str(val).strip()
                    except Exception:
                        metadata[col] = str(val).strip()

            if metadata:
                processed_row['metadata'] = metadata

            # Add processing metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'csv'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing CSV row: {e}")
            return None

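    # Shape of a record produced above (values illustrative):
    #
    #     {
    #         "timestamp": 1704067200,
    #         "datetime": "2024-01-01T00:00:00",
    #         "sensor_id": "meter_1",
    #         "value": 12.5,
    #         "unit": "kWh",
    #         "metadata": {...},      # any columns not covered by the mapping
    #         "processed_at": "...",  # UTC time of processing
    #         "data_source": "csv"
    #     }
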
    async def _process_json_array(self, data: List[Any]) -> List[Dict[str, Any]]:
        """Process JSON array of records"""
        processed_data = []

        for item in data:
            if isinstance(item, dict):
                processed_row = await self._process_json_record(item)
                if processed_row:
                    processed_data.append(processed_row)

        return processed_data

    async def _process_json_object(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Process JSON object"""
        # Check if it contains time series data
        if 'data' in data and isinstance(data['data'], list):
            return await self._process_json_array(data['data'])
        elif 'readings' in data and isinstance(data['readings'], list):
            return await self._process_json_array(data['readings'])
        elif 'values' in data and isinstance(data['values'], list):
            return await self._process_json_array(data['values'])
        else:
            # Treat as single record
            processed_row = await self._process_json_record(data)
            return [processed_row] if processed_row else []

    async def _process_json_record(self, record: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a single JSON record"""
        try:
            processed_row = {}

            # Extract timestamp
            timestamp = None
            for ts_field in ['timestamp', 'datetime', 'time', 'date', 'ts']:
                if ts_field in record:
                    timestamp = await self._parse_timestamp(record[ts_field])
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                # Use current time if no timestamp found
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp())
                processed_row['datetime'] = now.isoformat()

            # Extract sensor ID
            sensor_id = None
            for id_field in ['sensor_id', 'sensorId', 'device_id', 'deviceId', 'id', 'sensor', 'device']:
                if id_field in record:
                    sensor_id = str(record[id_field])
                    break

            processed_row['sensor_id'] = sensor_id or "unknown_sensor"

            # Extract value(s)
            value = None
            for val_field in ['value', 'reading', 'measurement', 'data', 'energy', 'power', 'consumption']:
                if val_field in record:
                    try:
                        value = await self._parse_numeric_value(record[val_field])
                        if value is not None:
                            break
                    except Exception:
                        continue

            if value is not None:
                processed_row['value'] = value

            # Extract unit
            unit = None
            for unit_field in ['unit', 'units', 'measure_unit', 'uom']:
                if unit_field in record:
                    unit = str(record[unit_field])
                    break

            processed_row['unit'] = unit or await self._infer_unit(processed_row.get('value', 0))

            # Add remaining fields as metadata
            metadata = {}
            processed_fields = {'timestamp', 'datetime', 'time', 'date', 'ts',
                                'sensor_id', 'sensorId', 'device_id', 'deviceId', 'id', 'sensor', 'device',
                                'value', 'reading', 'measurement', 'data', 'energy', 'power', 'consumption',
                                'unit', 'units', 'measure_unit', 'uom'}

            for key, val in record.items():
                if key not in processed_fields and val is not None:
                    metadata[key] = val

            if metadata:
                processed_row['metadata'] = metadata

            # Add processing metadata
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'json'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing JSON record: {e}")
            return None

    async def _process_generic_row(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Process a generic row of data"""
        try:
            processed_row = {}

            # Try to find timestamp
            timestamp = None
            for key, val in row.items():
                if 'time' in key.lower() or 'date' in key.lower():
                    timestamp = await self._parse_timestamp(val)
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp())
                processed_row['datetime'] = now.isoformat()

            # Try to find sensor ID
            sensor_id = None
            for key, val in row.items():
                if 'sensor' in key.lower() or 'device' in key.lower() or 'id' in key.lower():
                    sensor_id = str(val)
                    break

            processed_row['sensor_id'] = sensor_id or "unknown_sensor"

            # Try to find numeric value
            value = None
            for key, val in row.items():
                if key.lower() not in ['timestamp', 'datetime', 'time', 'date', 'sensor_id', 'device_id', 'id']:
                    try:
                        value = await self._parse_numeric_value(val)
                        if value is not None:
                            break
                    except Exception:
                        continue

            if value is not None:
                processed_row['value'] = value
                processed_row['unit'] = await self._infer_unit(value)

            # Add all fields as metadata
            metadata = {k: v for k, v in row.items() if v is not None}
            if metadata:
                processed_row['metadata'] = metadata

            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'generic'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing generic row: {e}")
            return None

    async def _parse_timestamp(self, timestamp_str: Union[str, int, float]) -> Optional[datetime]:
        """Parse timestamp from various formats"""
        try:
            if isinstance(timestamp_str, (int, float)):
                # Unix timestamp
                if timestamp_str > 1e10:  # Milliseconds
                    timestamp_str = timestamp_str / 1000
                return datetime.fromtimestamp(timestamp_str)

            if isinstance(timestamp_str, str):
                timestamp_str = timestamp_str.strip()

                # Try common formats first
                for fmt in self.time_formats:
                    try:
                        return datetime.strptime(timestamp_str, fmt)
                    except ValueError:
                        continue

                # Try dateutil parser as fallback
                try:
                    return date_parser.parse(timestamp_str)
                except (ValueError, OverflowError):
                    pass

            return None

        except Exception as e:
            logger.debug(f"Error parsing timestamp '{timestamp_str}': {e}")
            return None

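    # Inputs the parser above handles (illustrative):
    #
    #     await self._parse_timestamp(1704067200)             # Unix seconds
    #     await self._parse_timestamp(1704067200000)          # Unix milliseconds
    #     await self._parse_timestamp("2024-01-01 00:00:00")  # explicit format list
    #     await self._parse_timestamp("Jan 1, 2024 00:00")    # dateutil fallback
    #     await self._parse_timestamp("not a date")           # -> None
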
    async def _parse_numeric_value(self, value_str: Union[str, int, float]) -> Optional[float]:
        """Parse numeric value from string"""
        try:
            if isinstance(value_str, (int, float)):
                return float(value_str) if not (isinstance(value_str, float) and np.isnan(value_str)) else None

            if isinstance(value_str, str):
                # Clean the string
                cleaned = re.sub(r'[^\d.-]', '', value_str.strip())
                if cleaned:
                    return float(cleaned)

            return None

        except Exception:
            return None

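    # Illustrative behaviour of the cleaning step above:
    #
    #     await self._parse_numeric_value("1,234.5 kWh")  # -> 1234.5
    #     await self._parse_numeric_value("  42 ")        # -> 42.0
    #     await self._parse_numeric_value(float("nan"))   # -> None
    #     await self._parse_numeric_value("n/a")          # -> None
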
    async def _infer_unit(self, value: float) -> str:
        """Infer unit based on value range"""
        try:
            if value is None:
                return "unknown"

            # Common energy unit ranges
            if value < 1:
                return "Wh"
            elif value < 1000:
                return "kWh"
            elif value < 1000000:
                return "MWh"
            else:
                return "GWh"

        except Exception:
            return "unknown"

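    # The magnitude heuristic above is a guess, not a guarantee: 0.8 is labelled
    # "Wh", 12.5 "kWh", 4500.0 "MWh" and 2.5e6 "GWh". Prefer an explicit unit
    # column from the source file whenever one exists.
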
    async def _is_header_line(self, parts: List[str]) -> bool:
        """Check if a line appears to be a header"""
        # If all parts are strings without numbers, likely a header
        for part in parts:
            try:
                float(part)
                return False  # Found a number, not a header
            except ValueError:
                continue
        return True

    async def _process_single_column_data(self, lines: List[str]) -> List[Dict[str, Any]]:
        """Process single column data"""
        processed_data = []

        for line_idx, line in enumerate(lines):
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            try:
                value = await self._parse_numeric_value(line)
                if value is not None:
                    now = datetime.utcnow()
                    processed_row = {
                        'sensor_id': 'single_column_sensor',
                        'timestamp': int(now.timestamp()) + line_idx,  # Spread timestamps
                        'datetime': (now + timedelta(seconds=line_idx)).isoformat(),
                        'value': value,
                        'unit': await self._infer_unit(value),
                        'processed_at': now.isoformat(),
                        'data_source': 'text_single_column',
                        'metadata': {'line_number': line_idx}
                    }
                    processed_data.append(processed_row)
            except Exception as e:
                logger.warning(f"Error processing single column line {line_idx}: {e}")
                continue

        return processed_data

    async def _auto_detect_and_process(self, content: str) -> List[Dict[str, Any]]:
        """Auto-detect format and process data"""
        try:
            # Try JSON first
            try:
                json.loads(content)
                return await self._process_json_data(content)
            except json.JSONDecodeError:
                pass

            # Try CSV
            try:
                lines = content.strip().split('\n')
                if len(lines) > 1 and (',' in lines[0] or ';' in lines[0] or '\t' in lines[0]):
                    return await self._process_csv_data(content)
            except Exception:
                pass

            # Fall back to text processing
            return await self._process_text_data(content)

        except Exception as e:
            logger.error(f"Error in auto-detection: {e}")
            raise

    async def _process_slg_v2_data(self, content: str) -> List[Dict[str, Any]]:
        """Process SA4CPS .slg_v2 format files"""
        try:
            lines = content.strip().split('\n')

            if not lines:
                logger.warning("SLG_V2 file is empty")
                return []

            logger.info(f"Processing SLG_V2 file with {len(lines)} lines")

            processed_data = []
            header = None
            metadata = {}

            for line_idx, line in enumerate(lines):
                line = line.strip()

                # Skip empty lines
                if not line:
                    continue

                # Handle comment lines and metadata
                if line.startswith('#') or line.startswith('//'):
                    # Extract metadata from comment lines
                    comment = line[1:].strip() if line.startswith('#') else line[2:].strip()
                    if ':' in comment:
                        key, value = comment.split(':', 1)
                        metadata[key.strip()] = value.strip()
                    continue

                # Handle header lines (if present)
                if line_idx == 0 or (header is None and await self._is_slg_v2_header(line)):
                    header = await self._parse_slg_v2_header(line)
                    continue

                # Process data lines
                try:
                    processed_row = await self._process_slg_v2_line(line, header, metadata, line_idx)
                    if processed_row:
                        processed_data.append(processed_row)
                except Exception as e:
                    logger.warning(f"Error processing SLG_V2 line {line_idx}: {e}")
                    continue

            logger.info(f"Successfully processed {len(processed_data)} SLG_V2 records")
            return processed_data

        except Exception as e:
            logger.error(f"Error processing SLG_V2 data: {e}")
            raise

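    # Illustrative .slg_v2 input for the routine above (the exact SA4CPS layout
    # is assumed here, not confirmed by a spec):
    #
    #     # Site: community_A
    #     # Logger: SL-02
    #     timestamp;meter_id;energy;voltage
    #     2024-01-01 00:15:00;m1;12.5;230.1
    #     2024-01-01 00:30:00;m1;12.9;229.8
    #
    # Comment lines become file-level metadata ({'Site': 'community_A', ...}),
    # the keyword line is detected as the header, and each remaining line
    # yields one record via _process_slg_v2_line().
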
    async def _is_slg_v2_header(self, line: str) -> bool:
        """Check if a line appears to be a SLG_V2 header"""
        # Common SLG_V2 header patterns
        header_keywords = ['timestamp', 'time', 'date', 'sensor', 'id', 'value', 'reading',
                           'energy', 'power', 'voltage', 'current', 'temperature']

        line_lower = line.lower()
        # Check if line contains header-like words and few or no numbers
        has_keywords = any(keyword in line_lower for keyword in header_keywords)

        # Try to parse as numbers - if most parts fail, likely a header
        parts = line.replace(',', ' ').replace(';', ' ').replace('\t', ' ').split()
        numeric_parts = 0
        for part in parts:
            try:
                float(part.strip())
                numeric_parts += 1
            except ValueError:
                continue

        # If fewer than half the parts are numeric and keywords are present, likely a header
        return has_keywords and (numeric_parts < len(parts) / 2)

    async def _parse_slg_v2_header(self, line: str) -> List[str]:
        """Parse SLG_V2 header line"""
        # Try different delimiters
        for delimiter in [',', ';', '\t', ' ']:
            if delimiter in line:
                parts = [part.strip() for part in line.split(delimiter) if part.strip()]
                if len(parts) > 1:
                    return parts

        # Default to splitting by whitespace
        return [part.strip() for part in line.split() if part.strip()]

    async def _process_slg_v2_line(self, line: str, header: Optional[List[str]],
                                   metadata: Dict[str, Any], line_idx: int) -> Optional[Dict[str, Any]]:
        """Process a single SLG_V2 data line"""
        try:
            # Try different delimiters to parse the line
            parts = None
            for delimiter in [',', ';', '\t', ' ']:
                if delimiter in line:
                    test_parts = [part.strip() for part in line.split(delimiter) if part.strip()]
                    if len(test_parts) > 1:
                        parts = test_parts
                        break

            if not parts:
                # Split by whitespace as fallback
                parts = [part.strip() for part in line.split() if part.strip()]

            if not parts:
                return None

            # Create row dictionary
            if header and len(parts) >= len(header):
                row_dict = dict(zip(header, parts[:len(header)]))
                # Add extra columns if any
                for i, extra_part in enumerate(parts[len(header):]):
                    row_dict[f"extra_col_{i}"] = extra_part
            else:
                # Create generic column names
                row_dict = {f"col_{i}": part for i, part in enumerate(parts)}

            # Process the row similar to generic processing but with SLG_V2 specifics
            processed_row = {}

            # Extract timestamp
            timestamp = None
            timestamp_value = None
            for key, val in row_dict.items():
                key_lower = key.lower()
                if any(ts_word in key_lower for ts_word in ['time', 'date', 'timestamp', 'ts']):
                    timestamp = await self._parse_timestamp(val)
                    timestamp_value = val
                    if timestamp:
                        break

            if timestamp:
                processed_row['timestamp'] = int(timestamp.timestamp())
                processed_row['datetime'] = timestamp.isoformat()
            else:
                # Use current time with line offset for uniqueness
                now = datetime.utcnow()
                processed_row['timestamp'] = int(now.timestamp()) + line_idx
                processed_row['datetime'] = (now + timedelta(seconds=line_idx)).isoformat()

            # Extract sensor ID
            sensor_id = None
            for key, val in row_dict.items():
                key_lower = key.lower()
                if any(id_word in key_lower for id_word in ['sensor', 'device', 'meter', 'id']):
                    sensor_id = str(val).strip()
                    break

            processed_row['sensor_id'] = sensor_id or f"slg_v2_sensor_{line_idx}"

            # Extract numeric values
            values_found = []
            for key, val in row_dict.items():
                key_lower = key.lower()
                # Skip timestamp and ID fields
                if (any(skip_word in key_lower for skip_word in ['time', 'date', 'timestamp', 'ts', 'id', 'sensor', 'device', 'meter']) and
                        val == timestamp_value) or key_lower.endswith('_id'):
                    continue

                try:
                    numeric_val = await self._parse_numeric_value(val)
                    if numeric_val is not None:
                        values_found.append({
                            'key': key,
                            'value': numeric_val,
                            'unit': await self._infer_slg_v2_unit(key, numeric_val)
                        })
                except Exception:
                    continue

            # Handle multiple values
            if len(values_found) == 1:
                # Single value case
                processed_row['value'] = values_found[0]['value']
                processed_row['unit'] = values_found[0]['unit']
                processed_row['value_type'] = values_found[0]['key']
            elif len(values_found) > 1:
                # Multiple values case - create main value and store others in metadata
                main_value = values_found[0]  # Use first numeric value as main
                processed_row['value'] = main_value['value']
                processed_row['unit'] = main_value['unit']
                processed_row['value_type'] = main_value['key']

                # Store additional values in metadata
                additional_values = {}
                for val_info in values_found[1:]:
                    additional_values[val_info['key']] = {
                        'value': val_info['value'],
                        'unit': val_info['unit']
                    }
                processed_row['additional_values'] = additional_values

            # Add all data as metadata
            row_metadata = dict(row_dict)
            row_metadata.update(metadata)  # Include file-level metadata
            row_metadata['line_number'] = line_idx
            row_metadata['raw_line'] = line
            processed_row['metadata'] = row_metadata

            # Add processing info
            processed_row['processed_at'] = datetime.utcnow().isoformat()
            processed_row['data_source'] = 'slg_v2'
            processed_row['file_format'] = 'SA4CPS_SLG_V2'

            return processed_row

        except Exception as e:
            logger.error(f"Error processing SLG_V2 line {line_idx}: {e}")
            return None

    async def _infer_slg_v2_unit(self, column_name: str, value: float) -> str:
        """Infer unit based on SLG_V2 column name and value"""
        try:
            col_lower = column_name.lower()

            def matches(words: List[str]) -> bool:
                # Single-letter tokens ('w', 'v', 'a') must equal the whole
                # column name; substring matching on them would fire on almost
                # any column (e.g. 'value' contains 'v').
                return any(w == col_lower if len(w) == 1 else w in col_lower
                           for w in words)

            # Common SA4CPS/energy monitoring units
            if matches(['energy', 'wh', 'consumption']):
                if value < 1:
                    return "Wh"
                elif value < 1000:
                    return "kWh"
                elif value < 1000000:
                    return "MWh"
                else:
                    return "GWh"
            elif matches(['power', 'watt', 'kw', 'mw', 'w']):
                if value < 1000:
                    return "W"
                elif value < 1000000:
                    return "kW"
                else:
                    return "MW"
            elif matches(['voltage', 'volt', 'v']):
                return "V"
            elif matches(['current', 'amp', 'a']):
                return "A"
            elif matches(['temp', 'temperature']):
                return "°C"
            elif matches(['freq', 'frequency']):
                return "Hz"
            elif matches(['percent', '%']):
                return "%"
            else:
                # Default energy unit inference
                return await self._infer_unit(value)

        except Exception:
            return "unknown"

    async def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""
        try:
            # This could be enhanced to return actual processing metrics
            return {
                "supported_formats": self.supported_formats,
                "time_formats_supported": len(self.time_formats),
                "slg_v2_support": True,
                "last_updated": datetime.utcnow().isoformat()
            }
        except Exception as e:
            logger.error(f"Error getting processing stats: {e}")
            return {}
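

# Minimal smoke test (illustrative): exercises the CSV path without a database
# or Redis connection. The sample data is invented.
if __name__ == "__main__":
    async def _demo():
        processor = DataProcessor(db=None, redis_client=None)
        sample = b"timestamp,sensor_id,energy_kwh\n2024-01-01 00:00:00,meter_1,12.5\n"
        records = await processor.process_time_series_data(sample, "csv")
        print(json.dumps(records, indent=2, default=str))

    asyncio.run(_demo())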