"""Content fetching helpers for web pages and PDFs."""
from __future__ import annotations

import asyncio
import io
import itertools
import logging
from dataclasses import dataclass
from typing import Any, Coroutine, List, Optional, TypeVar
from urllib.parse import urlsplit

import requests
from PIL import Image
from playwright.async_api import TimeoutError, async_playwright

try:
    import fitz  # type: ignore
except ImportError as exc:  # pragma: no cover - surfaced during runtime
    raise ImportError("PyMuPDF is required to extract PDF content.") from exc


@dataclass
class WebContent:
    """Normalized representation of extracted content.

    Both the web-page fetcher and the PDF fetcher in this module return
    their results as an instance of this class.
    """

    # Extracted text after whitespace collapsing and length capping.
    text: str
    # PNG-encoded images: one screenshot for a web page, one render per PDF page.
    images: List[bytes]
    # Origin of the content; this module sets it to "web" or "pdf".
    source_type: str


# Browser-like User-Agent header value sent with the ``requests`` HEAD and GET
# calls made by this module.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
)


# Result type of the coroutine handed to ``_run_async``; ties its return
# annotation to the coroutine's own result type.
T = TypeVar("T")


def _run_async(coro: Coroutine[Any, Any, T]) -> T:
    """Execute a coroutine regardless of the surrounding event loop state."""

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        return asyncio.run(coro)

    try:
        import nest_asyncio
    except ImportError as exc:  # pragma: no cover - depends on runtime
        raise RuntimeError(
            "An asyncio event loop is already running. Install nest_asyncio to allow nested execution."
        ) from exc

    nest_asyncio.apply(loop)
    return loop.run_until_complete(coro)


def _resolve_logger(logger: Optional[logging.Logger]) -> logging.Logger:
    if logger is not None:
        return logger
    default_logger = logging.getLogger("aiknowledge")
    if not default_logger.handlers:
        default_logger.addHandler(logging.NullHandler())
    return default_logger


def _head_request(url: str, timeout: int, logger: logging.Logger) -> Optional[requests.Response]:
    """Send a HEAD request to ``url`` and return the response, or ``None`` on failure.

    Redirects are followed, and a response for which ``raise_for_status``
    raises counts as a failure. Any ``requests.RequestException`` is logged
    as a warning instead of being propagated.
    """
    request_headers = {"User-Agent": USER_AGENT}
    try:
        head = requests.head(url, allow_redirects=True, timeout=timeout, headers=request_headers)
        head.raise_for_status()
    except requests.RequestException as error:
        logger.warning("HEAD request failed for %s: %s", url, error)
        return None
    else:
        return head


def _is_pdf(url: str, head_response: Optional[requests.Response]) -> bool:
    if head_response and "pdf" in head_response.headers.get("Content-Type", "").lower():
        return True
    return url.lower().endswith(".pdf")


def _truncate_text(text: str, max_chars: int) -> str:
    text = " ".join(text.split())
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + " ... [truncated]"


def _ensure_image_size(image: Image.Image, max_bytes: int, logger: logging.Logger) -> bytes:
    buffer = io.BytesIO()
    image.save(buffer, format="PNG", optimize=True)
    data = buffer.getvalue()

    while len(data) > max_bytes and image.height > 200:
        new_height = max(200, int(image.height * 0.9))
        image = image.crop((0, 0, image.width, new_height))
        buffer = io.BytesIO()
        image.save(buffer, format="PNG", optimize=True)
        data = buffer.getvalue()
        logger.debug("Cropped screenshot to %spx height (%.2f MB)", new_height, len(data) / (1024 * 1024))

    return data


async def _fetch_web_page_async(
    url: str,
    *,
    logger: logging.Logger,
    max_chars: int,
    max_image_mb: float,
    request_timeout: int,
) -> WebContent:
    """Render ``url`` in headless Chromium and capture its text and a screenshot.

    Args:
        url: Address of the page to load.
        logger: Receives error and debug messages.
        max_chars: Character budget passed to ``_truncate_text`` for the body text.
        max_image_mb: Screenshot size budget; multiplied by 1024 * 1024 to get bytes.
        request_timeout: Page timeout; multiplied by 1000 before being set as
            the Playwright default timeout.

    Returns:
        A ``WebContent`` with ``source_type="web"`` holding the body text and
        the full-page screenshot as PNG bytes.

    Raises:
        RuntimeError: If loading times out, if any other error occurs while
            capturing the page, or if neither text nor a screenshot was obtained.
    """
    max_bytes = int(max_image_mb * 1024 * 1024)
    text_content = ""
    screenshot_bytes: List[bytes] = []

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = None
            try:
                page = await browser.new_page()
                page.set_default_timeout(request_timeout * 1000)
                await page.goto(url, wait_until="networkidle")
                raw_text = await page.inner_text("body")
                text_content = _truncate_text(raw_text, max_chars)
                screenshot = await page.screenshot(full_page=True)
                image = Image.open(io.BytesIO(screenshot))
                try:
                    screenshot_bytes.append(_ensure_image_size(image, max_bytes, logger))
                finally:
                    image.close()
            finally:
                # Close the page and browser while the Playwright driver is
                # still running. Once the ``async with`` block exits, the
                # driver has been stopped and close calls can no longer
                # reach the browser.
                if page is not None and not page.is_closed():
                    try:
                        await page.close()
                    except Exception:  # pragma: no cover - cleanup best effort
                        logger.debug("Failed to close page for %s during cleanup", url)
                try:
                    await browser.close()
                except Exception:  # pragma: no cover - cleanup best effort
                    logger.debug("Failed to close browser for %s during cleanup", url)
    except TimeoutError as exc:
        logger.error("Timed out while loading %s: %s", url, exc)
        raise RuntimeError(f"Timed out while loading {url}.") from exc
    except Exception as exc:  # pragma: no cover - Playwright errors are runtime specific
        logger.error("Failed to capture web page %s: %s", url, exc)
        raise RuntimeError(f"Failed to capture web page {url}.") from exc

    if not text_content and not screenshot_bytes:
        raise RuntimeError(f"No content could be extracted from {url}.")

    return WebContent(text=text_content, images=screenshot_bytes, source_type="web")


def _fetch_web_page(
    url: str,
    *,
    logger: logging.Logger,
    max_chars: int,
    max_image_mb: float,
    request_timeout: int,
) -> WebContent:
    """Synchronous entry point for ``_fetch_web_page_async``.

    Builds the coroutine with the given arguments unchanged and drives it to
    completion through ``_run_async``, returning its ``WebContent`` result.
    """
    pending = _fetch_web_page_async(
        url,
        logger=logger,
        max_chars=max_chars,
        max_image_mb=max_image_mb,
        request_timeout=request_timeout,
    )
    return _run_async(pending)


def _fetch_pdf(
    url: str,
    *,
    logger: logging.Logger,
    max_chars: int,
    max_pages: int,
    request_timeout: int,
) -> WebContent:
    """Download the PDF at ``url`` and extract text and page images from it.

    Args:
        url: Address of the PDF document.
        logger: Receives error messages.
        max_chars: Character budget passed to ``_truncate_text`` for the joined page text.
        max_pages: Maximum number of leading pages to process; a value of
            zero or less processes no pages.
        request_timeout: Timeout passed to ``requests.get``.

    Returns:
        A ``WebContent`` with ``source_type="pdf"`` holding the text of the
        processed pages and one PNG render per processed page.

    Raises:
        RuntimeError: If the download fails or the downloaded data cannot be
            opened or rendered as a PDF.
    """
    try:
        response = requests.get(url, timeout=request_timeout, headers={"User-Agent": USER_AGENT})
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.error("Failed to download PDF %s: %s", url, exc)
        raise RuntimeError(f"Failed to download PDF {url}.") from exc

    texts: List[str] = []
    images: List[bytes] = []

    # Parsing problems are logged and wrapped the same way as download
    # failures, so callers see one consistent error type with context.
    try:
        doc = fitz.open(stream=response.content, filetype="pdf")
        try:
            # islice stops after ``max_pages`` pages without pulling the next
            # one from the document. It rejects a negative stop, hence max().
            for page in itertools.islice(doc, max(max_pages, 0)):
                texts.append(page.get_text("text"))
                images.append(page.get_pixmap().tobytes("png"))
        finally:
            doc.close()
    except Exception as exc:  # pragma: no cover - PyMuPDF errors are runtime specific
        logger.error("Failed to parse PDF %s: %s", url, exc)
        raise RuntimeError(f"Failed to parse PDF {url}.") from exc

    aggregated = _truncate_text("\n\n".join(texts), max_chars)
    return WebContent(text=aggregated, images=images, source_type="pdf")


def fetch_web_or_pdf(
    url: str,
    *,
    logger: Optional[logging.Logger] = None,
    max_chars: int,
    max_image_mb: float,
    max_pdf_pages: int,
    request_timeout: int,
) -> WebContent:
    """Fetch content from a web page or PDF URL.

    A HEAD request is issued first; its result together with the URL decides
    (via ``_is_pdf``) whether the PDF fetcher or the web-page fetcher handles
    the URL.

    Args:
        url: Address to fetch.
        logger: Logger to use; when ``None`` one is chosen by ``_resolve_logger``.
        max_chars: Character budget for the extracted text.
        max_image_mb: Screenshot size budget, used on the web-page path only.
        max_pdf_pages: Page limit, used on the PDF path only.
        request_timeout: Timeout forwarded to the HEAD request and to the
            selected fetcher.

    Returns:
        The ``WebContent`` produced by the selected fetcher.
    """

    log = _resolve_logger(logger)
    head = _head_request(url, request_timeout, log)

    if not _is_pdf(url, head):
        log.debug("Detected web page content at %s", url)
        return _fetch_web_page(
            url,
            logger=log,
            max_chars=max_chars,
            max_image_mb=max_image_mb,
            request_timeout=request_timeout,
        )

    log.debug("Detected PDF content at %s", url)
    return _fetch_pdf(
        url,
        logger=log,
        max_chars=max_chars,
        max_pages=max_pdf_pages,
        request_timeout=request_timeout,
    )
