"""
ChalkML BOW Engine
==================
Blueprint-based data standardization with compliance guarantees.

BOW = Blueprint Operations Workflow

Core Concepts:
- Format Standardization: Date, currency, percent, text transformations
- Compliance Standards: HIPAA, GAAP, GDPR, ISO, DICOM
- Layout Templates: Industry-specific column ordering
- Validation Rules: Regex, range, type checking
- Audit Trail: Every transformation logged

BOW ensures data conforms to standards BEFORE analysis.
"""

import json
import yaml
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any, Callable
from datetime import datetime
import pandas as pd
import numpy as np


class FormatTransformer:
    """
    Apply standardized formatting transformations.

    Every public method is a static, element-wise transformation: it takes a
    ``pd.Series`` and returns a new Series. Missing values (as judged by
    ``pd.isna`` / ``pd.notna``) are passed through untouched, and values that
    cannot be converted are returned unchanged instead of raising.
    """

    # Default phone layout; each "X" stands for one digit.
    _DEFAULT_PHONE_PATTERN = "XXX-XXX-XXXX"

    # Human-readable date tokens and the strftime directive each one maps to.
    _DATE_TOKENS = {
        "YYYY": "%Y",
        "YY": "%y",
        "MMMM": "%B",
        "MMM": "%b",
        "MM": "%m",
        "DD": "%d",
    }
    # Longer tokens are listed first so "MMMM" is not consumed as "MMM" + "M".
    _DATE_TOKEN_RE = re.compile(r"YYYY|MMMM|MMM|MM|DD|YY")

    @staticmethod
    def _map_text(series: pd.Series, transform: Callable[[str], str]) -> pd.Series:
        """Apply ``transform`` to ``str(value)`` for every non-missing value."""
        return series.apply(lambda x: transform(str(x)) if pd.notna(x) else x)

    @staticmethod
    def _to_strftime(pattern: str) -> str:
        """
        Translate a token pattern (``DD/MM/YYYY``) into a strftime format.

        Patterns that already contain ``%`` are treated as strftime formats
        and returned unchanged.
        """
        if "%" in pattern:
            return pattern
        return FormatTransformer._DATE_TOKEN_RE.sub(
            lambda match: FormatTransformer._DATE_TOKENS[match.group(0)],
            pattern,
        )

    @staticmethod
    def to_sentence_case(series: pd.Series) -> pd.Series:
        """First letter uppercase, rest lowercase."""
        return FormatTransformer._map_text(series, str.capitalize)

    @staticmethod
    def to_title_case(series: pd.Series) -> pd.Series:
        """Each Word Capitalized."""
        return FormatTransformer._map_text(series, str.title)

    @staticmethod
    def to_upper_case(series: pd.Series) -> pd.Series:
        """ALL UPPERCASE."""
        return FormatTransformer._map_text(series, str.upper)

    @staticmethod
    def to_lower_case(series: pd.Series) -> pd.Series:
        """all lowercase."""
        return FormatTransformer._map_text(series, str.lower)

    @staticmethod
    def format_date(series: pd.Series, pattern: str = "YYYY-MM-DD") -> pd.Series:
        """
        Standardize date format.

        Patterns:
            YYYY-MM-DD: 2024-10-26
            MM/DD/YYYY: 10/26/2024
            DD-MMM-YYYY: 26-Oct-2024

        Any combination of the tokens YYYY, YY, MMMM, MMM, MM and DD is
        accepted (e.g. ``DD/MM/YYYY``). A pattern containing ``%`` is used
        as a raw strftime format.

        Values are parsed with ``pd.to_datetime(..., errors='coerce')``, so
        unparseable entries are coerced to missing values rather than raising.
        If the conversion as a whole fails, the input series is returned
        unchanged.
        """
        try:
            dates = pd.to_datetime(series, errors='coerce')
            return dates.dt.strftime(FormatTransformer._to_strftime(pattern))
        except Exception:
            # Deliberate best-effort: formatting must never abort the
            # pipeline, so the caller gets the original data back.
            return series

    @staticmethod
    def format_currency(
        series: pd.Series,
        symbol: str = "$",
        precision: int = 2,
        thousands: str = ","
    ) -> pd.Series:
        """
        Format as currency.

        Example: 1234.56 → $1,234.56

        Args:
            series: Values convertible with ``float()``.
            symbol: Prefix placed directly before the number.
            precision: Number of decimal places.
            thousands: Thousands separator substituted for ``,``.

        Values that cannot be converted or formatted are returned unchanged.
        """
        def format_value(val):
            if pd.isna(val):
                return val
            try:
                num = float(val)
                formatted = f"{num:,.{precision}f}"
            except (TypeError, ValueError, OverflowError):
                # Non-numeric cell or unusable precision: leave the cell as is.
                return val
            if thousands != ",":
                formatted = formatted.replace(",", thousands)
            return f"{symbol}{formatted}"

        return series.apply(format_value)

    @staticmethod
    def format_percent(series: pd.Series, precision: int = 1) -> pd.Series:
        """
        Format as percentage.

        Example: 0.156 → 15.6%

        The value is multiplied by 100 before formatting. Values that cannot
        be converted or formatted are returned unchanged.
        """
        def format_value(val):
            if pd.isna(val):
                return val
            try:
                return f"{float(val) * 100:.{precision}f}%"
            except (TypeError, ValueError, OverflowError):
                return val

        return series.apply(format_value)

    @staticmethod
    def format_phone(series: pd.Series, pattern: str = "XXX-XXX-XXXX") -> pd.Series:
        """
        Format phone numbers.

        Pattern: XXX-XXX-XXXX → 555-123-4567

        Each ``X`` in ``pattern`` is replaced, left to right, by one digit of
        the value; every other character is copied literally, so
        ``(XXX) XXX-XXXX`` yields ``(555) 123-4567``. A value is reformatted
        only when it contains exactly as many digits as the pattern has
        placeholders; otherwise it is returned unchanged. A pattern without
        any ``X`` falls back to the default layout.
        """
        layout = pattern if "X" in pattern else FormatTransformer._DEFAULT_PHONE_PATTERN
        slots = layout.count("X")

        def format_value(val):
            if pd.isna(val):
                return val

            # Extract digits only
            digits = re.sub(r'\D', '', str(val))
            if len(digits) != slots:
                return val

            remaining = iter(digits)
            return "".join(next(remaining) if ch == "X" else ch for ch in layout)

        return series.apply(format_value)

    @staticmethod
    def format_ssn(series: pd.Series) -> pd.Series:
        """
        Format SSN: XXX-XX-XXXX

        Only values containing exactly nine digits are reformatted; anything
        else is returned unchanged.
        """
        def format_value(val):
            if pd.isna(val):
                return val

            digits = re.sub(r'\D', '', str(val))

            if len(digits) == 9:
                return f"{digits[:3]}-{digits[3:5]}-{digits[5:]}"
            return val

        return series.apply(format_value)


class ComplianceStandards:
    """
    Apply industry-specific compliance standards.

    All methods are static, operate on a copy of the input DataFrame and
    never mutate the frame they are given. Column detection is heuristic:
    it is driven by dtype and by keywords found in the column name.
    """

    @staticmethod
    def _format_money(val: Any, symbol: str, parenthesize_negative: bool) -> Any:
        """
        Format one cell as ``<symbol>1,234.56``.

        Missing values and values that ``float()`` rejects are returned
        unchanged. When ``parenthesize_negative`` is set, negative amounts
        are rendered as ``(<symbol>1,234.56)``.
        """
        if pd.isna(val):
            return val
        try:
            num = float(val)
        except (TypeError, ValueError, OverflowError):
            return val
        if parenthesize_negative and num < 0:
            return f"({symbol}{abs(num):,.2f})"
        return f"{symbol}{num:,.2f}"

    @staticmethod
    def hipaa_safe_harbor(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
        """
        Apply HIPAA Safe Harbor de-identification.

        Removes/generalizes 18 identifiers per 45 CFR 164.514(b).

        What this implementation actually does:
        - every datetime-typed column (naive or tz-aware) is reduced to its
          year;
        - every column whose name contains one of the keywords ``name``,
          ``ssn``, ``email``, ``phone``, ``address`` or ``zip`` is overwritten
          with ``"***REDACTED***"``.

        NOTE(review): dates stored as plain strings are not detected, and
        identifiers in columns without a matching keyword are left as is.
        Confirm the result before treating it as de-identified.

        Returns:
            ``(de-identified copy, {column: transformation applied})``.
        """
        transformations = {}
        df_compliant = df.copy()

        # Detect and generalize dates (keep year only). 'datetimetz' is listed
        # explicitly because 'datetime64' alone skips tz-aware columns.
        date_cols = df.select_dtypes(include=['datetime64', 'datetimetz']).columns
        for col in date_cols:
            df_compliant[col] = df[col].dt.year
            transformations[col] = "date_to_year"

        # Detect and suppress potential PII columns
        pii_keywords = ['name', 'ssn', 'email', 'phone', 'address', 'zip']
        for col in df.columns:
            # str() keeps non-string labels (e.g. integer columns) from crashing.
            col_lower = str(col).lower()
            for keyword in pii_keywords:
                if keyword in col_lower:
                    df_compliant[col] = "***REDACTED***"
                    transformations[col] = f"suppressed_{keyword}"
                    break

        return df_compliant, transformations

    @staticmethod
    def gaap_formatting(df: pd.DataFrame, currency_cols: List[str]) -> pd.DataFrame:
        """
        Apply GAAP (Generally Accepted Accounting Principles) formatting.

        - Currency in USD format
        - Negative values in parentheses
        - Consistent precision (2 decimal places)

        Columns named in ``currency_cols`` that are absent from ``df`` are
        skipped; non-numeric cells are left unchanged.
        """
        df_gaap = df.copy()

        for col in currency_cols:
            if col in df.columns:
                df_gaap[col] = df[col].apply(
                    lambda val: ComplianceStandards._format_money(val, "$", True)
                )

        return df_gaap

    @staticmethod
    def iso_4217_currency(df: pd.DataFrame, col: str, currency_code: str = "USD") -> pd.DataFrame:
        """
        Apply ISO 4217 currency formatting.

        The currency code is mapped to a symbol for USD, EUR, GBP, JPY and
        CNY; any other code is used itself as a prefix followed by a space.
        Amounts are rendered with two decimal places. If ``col`` is not a
        column of ``df`` an unmodified copy is returned. Non-numeric cells are
        left unchanged, matching ``gaap_formatting``.
        """
        df_iso = df.copy()

        symbols = {
            "USD": "$",
            "EUR": "€",
            "GBP": "£",
            "JPY": "¥",
            "CNY": "¥"
        }

        symbol = symbols.get(currency_code, currency_code + " ")

        if col in df.columns:
            df_iso[col] = df[col].apply(
                lambda val: ComplianceStandards._format_money(val, symbol, False)
            )

        return df_iso

    @staticmethod
    def gdpr_compliant(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, str]]:
        """
        Apply GDPR compliance transformations.

        - Data minimization
        - Pseudonymization of identifiers

        Every column whose name contains ``id``, ``email``, ``user``,
        ``customer`` or ``patient`` has its non-missing values replaced by
        the first 16 hex characters of their SHA-256 digest.

        NOTE(review): matching is by substring, so ``id`` also matches names
        such as ``paid`` or ``width``. The hash is unsalted. Confirm both are
        acceptable for the intended use.

        Returns:
            ``(pseudonymized copy, {column: "pseudonymized"})``.
        """
        import hashlib

        def pseudonymize(val):
            if pd.isna(val):
                return val
            return hashlib.sha256(str(val).encode()).hexdigest()[:16]

        df_gdpr = df.copy()
        transformations = {}

        # Hash potential identifiers
        id_keywords = ['id', 'email', 'user', 'customer', 'patient']
        for col in df.columns:
            # str() keeps non-string labels (e.g. integer columns) from crashing.
            col_lower = str(col).lower()
            if any(keyword in col_lower for keyword in id_keywords):
                df_gdpr[col] = df[col].apply(pseudonymize)
                transformations[col] = "pseudonymized"

        return df_gdpr, transformations


class LayoutTemplates:
    """
    Apply industry-specific column layouts.

    Each layout only reorders columns: recognised columns are moved to the
    front in template order and every other column follows in its original
    relative order. No column is added, dropped or renamed.
    """

    @staticmethod
    def _reorder(
        df: pd.DataFrame,
        standard_order: List[str],
        is_match: Callable[[str, str], bool]
    ) -> pd.DataFrame:
        """
        Move the columns selected by ``standard_order`` to the front.

        For each template name, in order, the first not-yet-placed column for
        which ``is_match(template_name_lower, column_name_lower)`` is true is
        appended to the result. Column labels are converted with ``str()``
        before comparison so non-string labels are tolerated.
        """
        lowered = [(col, str(col).lower()) for col in df.columns]
        ordered_cols = []
        remaining_cols = list(df.columns)

        for standard_col in standard_order:
            target = standard_col.lower()
            for col, col_lower in lowered:
                if col in remaining_cols and is_match(target, col_lower):
                    ordered_cols.append(col)
                    remaining_cols.remove(col)
                    break

        # Add remaining columns
        ordered_cols.extend(remaining_cols)

        return df[ordered_cols]

    @staticmethod
    def radiology_dicom_layout(df: pd.DataFrame) -> pd.DataFrame:
        """
        Reorder columns to DICOM standard layout.

        Standard order:
        1. PatientID
        2. StudyDate
        3. Modality
        4. BodyPart
        5. [Additional fields]

        Column names are matched case-insensitively against the exact
        aliases listed below.
        """
        standard_order = [
            'PatientID', 'patient_id', 'id',
            'StudyDate', 'study_date', 'date',
            'Modality', 'modality', 'type',
            'BodyPart', 'body_part', 'part'
        ]

        return LayoutTemplates._reorder(
            df, standard_order, lambda target, name: name == target
        )

    @staticmethod
    def financial_ledger_layout(df: pd.DataFrame) -> pd.DataFrame:
        """
        Standard accounting ledger layout.

        Order:
        1. Date
        2. Account
        3. Description
        4. Debit
        5. Credit
        6. Balance

        A column matches when its lowercased name contains the alias as a
        substring, so ``txn_date`` matches ``date``.
        """
        standard_order = [
            'date', 'transaction_date',
            'account', 'account_id', 'account_number',
            'description', 'memo', 'note',
            'debit', 'debit_amount',
            'credit', 'credit_amount',
            'balance', 'running_balance'
        ]

        return LayoutTemplates._reorder(
            df, standard_order, lambda target, name: target in name
        )


class BowEngine:
    """
    High-level engine for blueprint-based standardization.

    Wraps the formatter, compliance and layout helpers behind string-keyed
    entry points and, for file-level runs, writes a JSON report under
    ``<workspace>/.chalkml/bow_reports``.
    """

    # format_type values that map one-to-one onto a formatter case method.
    # Method names are resolved lazily so validation errors do not depend on
    # the formatter being available.
    _CASE_FORMATTERS = {
        'sentence': 'to_sentence_case',
        'title': 'to_title_case',
        'upper': 'to_upper_case',
        'lower': 'to_lower_case',
    }

    def __init__(self, workspace_path: Optional[str] = None):
        """
        Create the engine and make sure its report directory exists.

        Args:
            workspace_path: Workspace root; defaults to the current working
                directory. ``.chalkml/bow_reports`` is created beneath it.
        """
        if workspace_path is None:
            workspace_path = Path.cwd()
        self.workspace_path = Path(workspace_path)
        self.chalkml_dir = self.workspace_path / ".chalkml"
        self.bow_dir = self.chalkml_dir / "bow_reports"
        self.bow_dir.mkdir(parents=True, exist_ok=True)

        self.formatter = FormatTransformer()
        self.compliance = ComplianceStandards()
        self.layouts = LayoutTemplates()

    def apply_format(
        self,
        df: pd.DataFrame,
        column: str,
        format_type: str,
        **kwargs
    ) -> pd.DataFrame:
        """
        Apply formatting to specific column.

        format_type: 'date', 'currency', 'percent', 'phone', 'ssn',
                     'sentence', 'title', 'upper', 'lower'

        Keyword options read per format type:
            date: ``pattern``
            currency: ``symbol``, ``precision``, ``thousands``
            percent: ``precision``
            phone: ``pattern`` (forwarded only when supplied)

        Returns:
            A copy of ``df`` with ``column`` replaced by the formatted values.

        Raises:
            ValueError: If ``column`` is not in ``df`` or ``format_type`` is
                not one of the values above.
        """
        if column not in df.columns:
            raise ValueError(f"Column '{column}' not found")

        source = df[column]

        if format_type == 'date':
            result = self.formatter.format_date(
                source, kwargs.get('pattern', 'YYYY-MM-DD')
            )
        elif format_type == 'currency':
            result = self.formatter.format_currency(
                source,
                kwargs.get('symbol', '$'),
                kwargs.get('precision', 2),
                kwargs.get('thousands', ','),
            )
        elif format_type == 'percent':
            result = self.formatter.format_percent(source, kwargs.get('precision', 1))
        elif format_type == 'phone':
            if 'pattern' in kwargs:
                result = self.formatter.format_phone(source, kwargs['pattern'])
            else:
                result = self.formatter.format_phone(source)
        elif format_type == 'ssn':
            result = self.formatter.format_ssn(source)
        elif format_type in self._CASE_FORMATTERS:
            transform = getattr(self.formatter, self._CASE_FORMATTERS[format_type])
            result = transform(source)
        else:
            raise ValueError(f"Unknown format type: {format_type}")

        df_formatted = df.copy()
        df_formatted[column] = result
        return df_formatted

    def apply_standard(
        self,
        df: pd.DataFrame,
        standard: str,
        **kwargs
    ) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """
        Apply compliance standard.

        standard: 'HIPAA', 'GAAP', 'GDPR', 'ISO-4217' (case-insensitive)

        Keyword options read per standard:
            GAAP: ``currency_cols`` (list of column names, default ``[]``)
            ISO-4217: ``column`` and ``currency`` (default ``'USD'``)

        Returns:
            ``(transformed DataFrame, metadata dict describing what ran)``.

        Raises:
            ValueError: If ``standard`` is not one of the values above,
                including when it is ``None``.
        """
        # str() lets a missing standard fall through to the ValueError below
        # instead of failing on None.upper().
        key = str(standard).upper()

        if key == 'HIPAA':
            df_compliant, transformations = self.compliance.hipaa_safe_harbor(df)
            metadata = {"standard": "HIPAA", "transformations": transformations}

        elif key == 'GAAP':
            currency_cols = kwargs.get('currency_cols', [])
            df_compliant = self.compliance.gaap_formatting(df, currency_cols)
            metadata = {"standard": "GAAP", "currency_columns": currency_cols}

        elif key == 'GDPR':
            df_compliant, transformations = self.compliance.gdpr_compliant(df)
            metadata = {"standard": "GDPR", "transformations": transformations}

        elif key == 'ISO-4217':
            col = kwargs.get('column')
            currency = kwargs.get('currency', 'USD')
            df_compliant = self.compliance.iso_4217_currency(df, col, currency)
            metadata = {"standard": "ISO-4217", "currency": currency, "column": col}

        else:
            raise ValueError(f"Unknown standard: {standard}")

        return df_compliant, metadata

    def apply_layout(
        self,
        df: pd.DataFrame,
        layout: str
    ) -> pd.DataFrame:
        """
        Apply column layout template.

        layout: 'radiology', 'financial'

        Raises:
            ValueError: For any other layout name.
        """
        if layout == 'radiology':
            return self.layouts.radiology_dicom_layout(df)

        if layout == 'financial':
            return self.layouts.financial_ledger_layout(df)

        raise ValueError(f"Unknown layout: {layout}")

    def bow_file(
        self,
        input_file: str,
        output_file: str,
        operation: str,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Apply BOW operation to file.

        operation: 'format', 'standard', 'layout'

        Reads ``input_file`` as CSV, applies the operation, writes the result
        to ``output_file`` (without the index) and saves a JSON report in the
        engine's report directory.

        Keyword arguments:
            format: ``column``, ``format_type`` plus the options of
                ``apply_format``.
            standard: ``standard`` plus the options of ``apply_standard``.
            layout: ``layout``.

        Returns:
            Summary dict with ``success``, ``input_file``, ``output_file``,
            ``rows``, ``operation`` and ``report_path``.

        Raises:
            ValueError: For an unknown operation, or as raised by the
                delegated ``apply_*`` method.
        """
        df = pd.read_csv(input_file)

        # Work on a copy so the selector keys can be removed before the rest
        # is forwarded. Forwarding them as well would pass the same argument
        # twice (positionally and by keyword) and raise TypeError.
        options = dict(kwargs)

        if operation == 'format':
            column = options.pop('column', None)
            format_type = options.pop('format_type', None)
            df_result = self.apply_format(df, column, format_type, **options)
            metadata = {"operation": "format", "column": column, "type": format_type}

        elif operation == 'standard':
            standard = options.pop('standard', None)
            df_result, metadata = self.apply_standard(df, standard, **options)

        elif operation == 'layout':
            layout = options.get('layout')
            df_result = self.apply_layout(df, layout)
            metadata = {"operation": "layout", "template": layout}

        else:
            raise ValueError(f"Unknown operation: {operation}")

        # Save result
        df_result.to_csv(output_file, index=False)

        # Save report. Microseconds in the file name keep two runs within the
        # same second from overwriting each other's report.
        now = datetime.now()
        report_path = self.bow_dir / f"bow_report_{now.strftime('%Y%m%d_%H%M%S_%f')}.json"
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump({
                # str() so path-like arguments remain JSON-serializable.
                "input_file": str(input_file),
                "output_file": str(output_file),
                "operation": operation,
                "metadata": metadata,
                "timestamp": now.isoformat()
            }, f, indent=2)

        return {
            "success": True,
            "input_file": input_file,
            "output_file": output_file,
            "rows": len(df_result),
            "operation": operation,
            "report_path": str(report_path)
        }


def get_bow_engine(workspace_path: Optional[str] = None) -> BowEngine:
    """
    Build and return a new BowEngine.

    Args:
        workspace_path: Passed straight through to ``BowEngine``; ``None``
            lets the engine choose its own default.
    """
    engine = BowEngine(workspace_path=workspace_path)
    return engine
