"""Process transaction data from PDF files.

This module handles the complete pipeline of processing multiple PDF files:
converting them to text, extracting transactions, and consolidating into CSV.
"""

import csv
import os
from datetime import datetime
from multiprocessing import Pool
from pathlib import Path

from .common.pdf_convert_txt import pdf_to_text
from .common.txt_convert_csv import txt_to_csv
from .common.utils import configure_logging, get_logger

__all__ = ["process_transaction"]

# Configure logging as an import-time side effect, then bind the module-level
# logger used by every function below (event name plus keyword context).
# NOTE(review): this runs on every import of the module, which presumably
# includes multiprocessing workers when the start method re-imports it —
# confirm configure_logging() is safe to call repeatedly.
configure_logging()
log = get_logger(__name__)


def process_single_csv(csv_path: str) -> tuple[str, list[list[str]]]:
    """Read one intermediate CSV file and return its report date and rows.

    The report date is the part of the file name before the first ``-``
    (for example ``20240131`` in ``20240131-card.csv``). The first row of
    the file is treated as a header and is not returned.

    Args:
        csv_path: Path to the CSV file to process

    Returns:
        Tuple of (report_date, transaction_data) where report_date is extracted
        from the filename and transaction_data is a list of rows

    Raises:
        FileNotFoundError: If CSV file does not exist
        ValueError: If the filename has no ``-`` separator or nothing
            precedes it
    """
    if not os.path.exists(csv_path):
        msg = f"CSV file not found: {csv_path}"
        log.error("csv_file_not_found", csv_path=csv_path)
        raise FileNotFoundError(msg)

    # str.split never returns an empty list, so the separator has to be
    # checked for explicitly. Validate the name before reading so a badly
    # named file is rejected without parsing its contents.
    filename = os.path.basename(csv_path)
    report_date, separator, _ = filename.partition("-")
    if not separator or not report_date:
        msg = f"Invalid CSV filename format: {filename}"
        log.error("invalid_csv_filename_format", filename=filename)
        raise ValueError(msg)

    # newline="" lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(csv_path, encoding="utf-8", newline="") as f:
        csv_reader = csv.reader(f)
        next(csv_reader, None)  # Skip header (no-op on an empty file)
        content: list[list[str]] = list(csv_reader)

    log.debug("csv_file_processed", filename=filename, row_count=len(content))
    return (report_date, content)


def process_single_report_data(
    report_date: str, data: list[list[str]]
) -> list[list[str]]:
    """Append a year to the posting and transaction dates of each row.

    The year is taken from the report date. On a January report, any date
    whose month is December is assigned to the previous year. Rows whose
    dates cannot be parsed, or that have fewer than four columns, are logged
    and skipped.

    Args:
        report_date: Report date in YYYYMMDD format
        data: List of transaction rows
            [posting_date, transaction_date, description, amount]; the month
            is read from the second "/"-separated field of each date

    Returns:
        List of processed transaction rows with full dates including year

    Raises:
        ValueError: If report date format is invalid
    """
    try:
        parsed_report_date = datetime.strptime(report_date, "%Y%m%d")
    except ValueError as e:
        msg = f"Invalid report date format: {report_date}"
        log.error("invalid_report_date_format", report_date=report_date, error=str(e))
        raise ValueError(msg) from e

    report_year = parsed_report_date.year
    january_report = parsed_report_date.month == 1

    def year_for(month: int) -> int:
        """Return the year a date in ``month`` belongs to for this report."""
        if january_report and month == 12:
            return report_year - 1
        return report_year

    dated_rows: list[list[str]] = []
    skipped = 0

    for row in data:
        try:
            # Posting date is validated (and its month parsed) first so the
            # warning emitted for a bad row matches the first failing field.
            posting_fields = row[0].split("/")
            if len(posting_fields) < 2:
                log.warning("invalid_posting_date_format", posting_date=row[0])
                skipped += 1
                continue
            posting_month = int(posting_fields[1])

            transaction_fields = row[1].split("/")
            if len(transaction_fields) < 2:
                log.warning("invalid_transaction_date_format", transaction_date=row[1])
                skipped += 1
                continue
            transaction_month = int(transaction_fields[1])

            posting_date = f"{row[0]}/{year_for(posting_month)}"
            transaction_date = f"{row[1]}/{year_for(transaction_month)}"
            description, amount = row[2], row[3]
            dated_rows.append([posting_date, transaction_date, description, amount])
        except (ValueError, IndexError) as e:
            log.warning("error_processing_row", row=row, error=str(e))
            skipped += 1

    # The summary is only emitted when at least one row was dropped.
    if skipped:
        log.info(
            "report_processing_complete",
            report_date=report_date,
            processed_rows=len(dated_rows),
            skipped_rows=skipped,
        )

    return dated_rows


def process_transaction(
    input_folder: str, output_file: str, password: str, dataset_folder: str
) -> None:
    """Process transaction data from PDF files.

    Complete pipeline that:
    1. Converts PDF files to text
    2. Extracts transactions to CSV
    3. Consolidates and sorts all transactions
    4. Writes final CSV output

    Intermediate files are written to ``<dataset_folder>/txt`` and
    ``<dataset_folder>/csv``. Each intermediate CSV is named
    ``<last>-<first>.csv`` where ``<first>`` and ``<last>`` are the first and
    last ``_``-separated segments of the PDF file name (without extension);
    the leading segment of that CSV name is then used as the report date.

    Every statement is kept as its own entry, so several statements that
    share a report date all contribute their transactions to the output.

    Args:
        input_folder: Folder containing PDF files
        output_file: Path to output CSV file
        password: Password for encrypted PDFs
        dataset_folder: Folder for intermediate files (txt and csv)

    Raises:
        FileNotFoundError: If input folder does not exist
        ValueError: If no PDF files found or processing fails
        OSError: If unable to create output directories or files
    """
    log.info(
        "transaction_processing_started",
        input_folder=input_folder,
        output_file=output_file,
        dataset_folder=dataset_folder,
    )

    if not os.path.exists(input_folder):
        msg = f"Input folder not found: {input_folder}"
        log.error("input_folder_not_found", input_folder=input_folder)
        raise FileNotFoundError(msg)

    # Create dataset folders
    txt_folder = os.path.join(dataset_folder, "txt")
    csv_folder = os.path.join(dataset_folder, "csv")

    Path(txt_folder).mkdir(parents=True, exist_ok=True)
    Path(csv_folder).mkdir(parents=True, exist_ok=True)
    log.debug("dataset_folders_created", txt_folder=txt_folder, csv_folder=csv_folder)

    # os.listdir returns entries in arbitrary order; sorting makes the run
    # (and the relative order of same-day transactions) reproducible.
    pdf_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.lower().endswith(".pdf")
    )

    if not pdf_files:
        msg = f"No PDF files found in {input_folder}"
        log.error("no_pdf_files_found", input_folder=input_folder)
        raise ValueError(msg)

    log.info("pdf_files_discovered", pdf_count=len(pdf_files))

    # Derive the intermediate txt and csv paths for every PDF
    txt_files: list[str] = []
    csv_files: list[str] = []
    for pdf_path in pdf_files:
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        segments = stem.split("_")
        txt_files.append(os.path.join(txt_folder, f"{stem}.txt"))
        csv_files.append(
            os.path.join(csv_folder, f"{segments[-1]}-{segments[0]}.csv")
        )

    # A single worker pool is reused for every parallel stage
    with Pool() as pool:
        log.info("converting_pdfs_to_text", pdf_count=len(pdf_files))
        pool.starmap(
            pdf_to_text,
            [
                (pdf_path, txt_path, password)
                for pdf_path, txt_path in zip(pdf_files, txt_files, strict=True)
            ],
        )
        log.info("pdfs_converted_to_text")

        log.info("converting_text_to_csv", file_count=len(txt_files))
        pool.starmap(txt_to_csv, list(zip(txt_files, csv_files, strict=True)))
        log.info("text_files_converted_to_csv")

        # Keep the results as (report_date, rows) pairs. Collecting them in
        # a dict keyed by report date would silently drop every statement
        # but one whenever two statements share a date.
        log.info("processing_csv_files", csv_count=len(csv_files))
        reports: list[tuple[str, list[list[str]]]] = pool.map(
            process_single_csv, csv_files
        )
        log.info(
            "csv_files_processed", pdf_count=len(pdf_files), report_count=len(reports)
        )

        # Process report data to add year information
        log.info("processing_report_data", report_count=len(reports))
        processed_lists: list[list[list[str]]] = pool.starmap(
            process_single_report_data, reports
        )

    # Consolidate all transactions
    final_csv: list[list[str]] = [row for rows in processed_lists for row in rows]
    log.info("transactions_consolidated", transaction_count=len(final_csv))

    # Sort by transaction date. Best effort: if any date fails to parse the
    # rows are written in their consolidated order instead of aborting.
    log.debug("sorting_transactions_by_date")
    try:
        final_csv = sorted(final_csv, key=lambda x: datetime.strptime(x[1], "%d/%m/%Y"))
        log.debug("transactions_sorted_successfully")
    except ValueError as e:
        log.warning("error_sorting_transactions", error=str(e))

    # Add header row (it is counted in the logged row_count)
    final_csv.insert(0, ["Posting Date", "Transaction Date", "Description", "Amount"])

    # Write final CSV
    log.debug("writing_output_file", output_file=output_file, row_count=len(final_csv))
    try:
        # Create the destination directory so a new output location does not
        # fail only after all the processing work has been done.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, "w", newline="", encoding="utf-8") as f:
            csv_writer = csv.writer(f)
            csv_writer.writerows(final_csv)
    except OSError as e:
        log.error(
            "failed_to_write_output",
            output_file=output_file,
            error=str(e),
            exc_info=True,
        )
        raise
    else:
        log.info(
            "processing_complete",
            output_file=output_file,
            row_count=len(final_csv),
            pdf_count=len(pdf_files),
        )
