"""
PDF adapter for extracting financial data from bank statements and invoices.
"""

from __future__ import annotations

from pathlib import Path
from typing import Optional, List, Dict, Any
import re
from datetime import datetime

import pandas as pd

try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

from emberquant.adapters.base import BaseAdapter, ConnectionConfig


class PDFAdapter(BaseAdapter):
    """
    Adapter for PDF files containing financial data.

    Supports extracting transactions from:
    - Bank statements
    - Credit card statements
    - Invoices
    - Receipts
    """

    def __init__(self, config: ConnectionConfig) -> None:
        """
        Initialize PDF adapter.

        Args:
            config: Configuration with file_path and pdf_type in options
        """
        super().__init__(config)
        self.file_path = Path(config.options.get("file_path", ""))
        self.pdf_type = config.options.get("pdf_type", "auto")  # auto, bank_statement, invoice
        self.parser_rules = config.options.get("parser_rules", None)

        if not self.file_path:
            raise ValueError("file_path must be specified in config.options")

        if PyPDF2 is None:
            raise ImportError(
                "PyPDF2 is required for PDF adapter. Install with: pip install PyPDF2"
            )

    def connect(self) -> None:
        """Validate that the PDF file exists."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"PDF file not found: {self.file_path}")

        if self.file_path.suffix.lower() != ".pdf":
            raise ValueError(f"File must be a PDF: {self.file_path}")

        self._connected = True

    def disconnect(self) -> None:
        """Mark the adapter as disconnected by clearing the ``_connected`` flag."""
        # Nothing else is done here; this method only resets the flag.
        self._connected = False

    def fetch_ledger(
        self, start_date: Optional[str] = None, end_date: Optional[str] = None
    ) -> pd.DataFrame:
        """
        Extract ledger data from PDF.

        Args:
            start_date: Optional start date filter
            end_date: Optional end date filter

        Returns:
            DataFrame with extracted transaction data
        """
        text = self._extract_text_from_pdf()

        if self.pdf_type == "auto":
            self.pdf_type = self._detect_pdf_type(text)

        if self.pdf_type == "bank_statement":
            df = self._parse_bank_statement(text)
        elif self.pdf_type == "invoice":
            df = self._parse_invoice(text)
        else:
            df = self._generic_parse(text)

        # Apply date filtering
        if start_date or end_date:
            if "date" in df.columns:
                df["date"] = pd.to_datetime(df["date"], errors="coerce")
                if start_date:
                    df = df[df["date"] >= start_date]
                if end_date:
                    df = df[df["date"] <= end_date]

        return df

    def fetch_transactions(
        self, start_date: Optional[str] = None, end_date: Optional[str] = None
    ) -> pd.DataFrame:
        """Alias for fetch_ledger."""
        return self.fetch_ledger(start_date, end_date)

    def _extract_text_from_pdf(self) -> str:
        """Extract all text from the PDF file."""
        with open(self.file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
        return text

    def _detect_pdf_type(self, text: str) -> str:
        """
        Auto-detect the type of PDF document.

        Args:
            text: Extracted text from PDF

        Returns:
            Detected type: bank_statement, invoice, or generic
        """
        text_lower = text.lower()

        # Bank statement indicators
        bank_keywords = ["bank statement", "checking account", "savings account",
                        "opening balance", "closing balance", "transaction history"]
        if any(keyword in text_lower for keyword in bank_keywords):
            return "bank_statement"

        # Invoice indicators
        invoice_keywords = ["invoice", "bill to", "due date", "invoice number",
                           "payment terms", "total due"]
        if any(keyword in text_lower for keyword in invoice_keywords):
            return "invoice"

        return "generic"

    def _parse_bank_statement(self, text: str) -> pd.DataFrame:
        """
        Parse a bank statement PDF.

        Args:
            text: Extracted text from PDF

        Returns:
            DataFrame with transaction data
        """
        transactions = []

        # Common bank statement patterns
        # Pattern: MM/DD/YYYY Description Amount
        date_pattern = r"(\d{1,2}/\d{1,2}/\d{2,4})"
        amount_pattern = r"[\$\-]?([\d,]+\.\d{2})"

        lines = text.split("\n")
        for line in lines:
            # Try to find date
            date_match = re.search(date_pattern, line)
            if date_match:
                date_str = date_match.group(1)

                # Find all amounts in the line
                amounts = re.findall(amount_pattern, line)

                # Extract description (text between date and first amount)
                date_end = date_match.end()
                first_amount_pos = line.find("$") if "$" in line else line.find(amounts[0]) if amounts else len(line)
                description = line[date_end:first_amount_pos].strip()

                if amounts:
                    # Determine if debit or credit
                    amount_str = amounts[0].replace(",", "")
                    amount = float(amount_str)

                    # Negative amounts or amounts after description are typically debits
                    is_debit = "-" in line or len(amounts) == 1

                    transactions.append({
                        "date": date_str,
                        "description": description,
                        "debit": amount if is_debit else None,
                        "credit": None if is_debit else amount,
                        "amount": amount if is_debit else -amount,
                    })

        if not transactions:
            return pd.DataFrame(columns=["date", "description", "debit", "credit", "amount"])

        df = pd.DataFrame(transactions)
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        return df

    def _parse_invoice(self, text: str) -> pd.DataFrame:
        """
        Parse an invoice PDF.

        Args:
            text: Extracted text from PDF

        Returns:
            DataFrame with invoice line items
        """
        # Extract invoice metadata
        invoice_number = self._extract_field(text, r"invoice\s+#?(\S+)", "INV-UNKNOWN")
        invoice_date = self._extract_field(text, r"date[:\s]+(\d{1,2}/\d{1,2}/\d{2,4})", None)
        due_date = self._extract_field(text, r"due\s+date[:\s]+(\d{1,2}/\d{1,2}/\d{2,4})", None)
        total = self._extract_field(text, r"total[:\s]+\$?([\d,]+\.\d{2})", "0.00")

        # Create a single transaction for the invoice
        transaction = {
            "date": invoice_date or datetime.now().strftime("%m/%d/%Y"),
            "description": f"Invoice {invoice_number}",
            "invoice_number": invoice_number,
            "due_date": due_date,
            "amount": float(total.replace(",", "")),
            "type": "invoice",
        }

        return pd.DataFrame([transaction])

    def _generic_parse(self, text: str) -> pd.DataFrame:
        """
        Generic parsing for unknown PDF types.

        Extracts any date and amount patterns found.

        Args:
            text: Extracted text from PDF

        Returns:
            DataFrame with extracted data
        """
        transactions = []

        # Find all dates and amounts
        date_pattern = r"(\d{1,2}/\d{1,2}/\d{2,4})"
        amount_pattern = r"\$?([\d,]+\.\d{2})"

        dates = re.findall(date_pattern, text)
        amounts = re.findall(amount_pattern, text)

        # Try to pair dates with amounts
        for i, date in enumerate(dates):
            if i < len(amounts):
                amount_str = amounts[i].replace(",", "")
                transactions.append({
                    "date": date,
                    "description": f"Transaction {i+1}",
                    "amount": float(amount_str),
                })

        if not transactions:
            return pd.DataFrame(columns=["date", "description", "amount"])

        df = pd.DataFrame(transactions)
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        return df

    def _extract_field(self, text: str, pattern: str, default: Optional[str] = None) -> Optional[str]:
        """
        Extract a field from text using regex pattern.

        Args:
            text: Text to search
            pattern: Regex pattern with one capture group
            default: Default value if not found

        Returns:
            Extracted value or default
        """
        match = re.search(pattern, text, re.IGNORECASE)
        return match.group(1) if match else default
