"""
Clerk Agent: Ingestion and normalization of financial data.

The Clerk is responsible for:
- Reading data from various sources (CSV, Excel, APIs)
- Normalizing column names and formats
- Inferring semantic types
- Creating EmberFrame objects
"""

from __future__ import annotations

import os
from typing import Any, Dict, Optional

import pandas as pd

from emberquant.agents.base import AgentConfig, BaseAgent
from emberquant.core.emberframe import EmberFrame


class ClerkConfig(AgentConfig):
    """
    Configuration for the Clerk agent.

    Extends AgentConfig with the switches that ClerkAgent consults while
    ingesting and cleaning data.
    """

    # Run the cleaning pass on the loaded data before the EmberFrame is built.
    auto_clean: bool = True
    # Forwarded as ``infer_types`` to EmberFrame.from_dataframe.
    infer_types: bool = True
    # During cleaning, drop rows in which every value is missing.
    drop_empty_rows: bool = True
    # During cleaning, drop columns in which every value is missing.
    drop_empty_columns: bool = True
    # Format string handed to pd.to_datetime for columns whose normalized name
    # contains "date"; None (the default) skips date parsing entirely.
    date_format: Optional[str] = None


class ClerkAgent(BaseAgent):
    """
    The Clerk Agent handles data ingestion and normalization.

    Transforms messy input data (Excel, CSV, API responses) into clean,
    semantically-tagged EmberFrame objects.
    """

    def __init__(self, config: Optional[ClerkConfig] = None) -> None:
        """
        Set up the Clerk agent.

        Args:
            config: Settings for ingestion and cleaning. When omitted (or
                falsy), a ClerkConfig with default values is used.
        """
        effective_config = config or ClerkConfig()
        super().__init__(effective_config)
        # Runtime no-op: re-declares the attribute so type checkers treat it
        # as ClerkConfig (presumably the base class assigns self.config).
        self.config: ClerkConfig = self.config  # type: ignore

    @property
    def name(self) -> str:
        """Return the agent name."""
        return "Clerk"

    def execute(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute the Clerk's ingestion and normalization tasks.

        Expected inputs:
            - data: pd.DataFrame or path to file
            - source: Source identifier
            - source_type: Type of source (csv, excel, quickbooks, etc.)

        Returns:
            Dictionary with 'emberframe' key containing the created EmberFrame
        """
        self._log("Starting data ingestion...")

        data = inputs.get("data")
        source = inputs.get("source", "unknown")
        source_type = inputs.get("source_type", "unknown")

        if data is None:
            raise ValueError("No data provided to Clerk agent")

        # Handle different input types
        if isinstance(data, str):
            # Assume it's a file path
            df = self._load_from_file(data)
            source = data
        elif isinstance(data, pd.DataFrame):
            df = data.copy()
        else:
            raise ValueError(f"Unsupported data type: {type(data)}")

        # Clean the data
        if self.config.auto_clean:
            df = self._clean_dataframe(df)

        self._log(f"Loaded {len(df)} rows and {len(df.columns)} columns")

        # Create EmberFrame
        emberframe = EmberFrame.from_dataframe(
            df=df,
            source=source,
            source_type=source_type,
            infer_types=self.config.infer_types,
        )

        self._log("EmberFrame created successfully")

        return {"emberframe": emberframe, "status": "success"}

    def _load_from_file(self, file_path: str) -> pd.DataFrame:
        """
        Load data from a file.

        Args:
            file_path: Path to the file

        Returns:
            DataFrame with loaded data
        """
        if file_path.endswith(".csv"):
            return pd.read_csv(file_path)
        elif file_path.endswith((".xlsx", ".xls")):
            return pd.read_excel(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

    def _clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Clean a DataFrame by removing empty rows/columns and standardizing formats.

        Args:
            df: Input DataFrame

        Returns:
            Cleaned DataFrame
        """
        df = df.copy()

        # Drop completely empty rows
        if self.config.drop_empty_rows:
            df = df.dropna(how="all")

        # Drop completely empty columns
        if self.config.drop_empty_columns:
            df = df.dropna(axis=1, how="all")

        # Strip whitespace from string columns
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].astype(str).str.strip()
                # Replace "nan" strings with actual NaN
                df[col] = df[col].replace("nan", pd.NA)

        # Normalize column names
        df.columns = [self._normalize_column_name(col) for col in df.columns]

        # Handle dates if format specified
        if self.config.date_format:
            for col in df.columns:
                if "date" in col.lower():
                    try:
                        df[col] = pd.to_datetime(df[col], format=self.config.date_format)
                    except Exception:
                        pass  # Skip if conversion fails

        return df

    @staticmethod
    def _normalize_column_name(name: str) -> str:
        """
        Normalize a column name to a consistent format.

        Args:
            name: Original column name

        Returns:
            Normalized column name
        """
        # Convert to lowercase, replace spaces and special chars with underscores
        normalized = str(name).lower()
        normalized = normalized.replace(" ", "_")
        normalized = normalized.replace("-", "_")
        normalized = normalized.replace(".", "_")
        # Remove multiple consecutive underscores
        while "__" in normalized:
            normalized = normalized.replace("__", "_")
        # Remove leading/trailing underscores
        normalized = normalized.strip("_")
        return normalized
