import json
import logging
import re
from typing import Any, Dict, List, TYPE_CHECKING
from urllib.parse import urlparse

import httpx
from pydantic import (
    BaseModel,
    Field,
    field_validator,
    model_validator,
)
from pydantic_settings import BaseSettings, SettingsConfigDict

from fraudcrawler.settings import (
    GOOGLE_LANGUAGES_FILENAME,
    GOOGLE_LOCATIONS_FILENAME,
)
from fraudcrawler.settings import (
    DEFAULT_HTTPX_TIMEOUT,
    DEFAULT_HTTPX_LIMITS,
    DEFAULT_HTTPX_REDIRECTS,
)

if TYPE_CHECKING:
    from fraudcrawler.scraping.zyte import ZyteAPI

logger = logging.getLogger(__name__)

# Load google locations and languages.
# Both files are JSON; they are opened with an explicit UTF-8 encoding so that
# decoding does not depend on the platform's default locale encoding.
# _LOCATION_CODES maps a location name to its lower-cased country code.
with open(GOOGLE_LOCATIONS_FILENAME, "r", encoding="utf-8") as gfile:
    _locs = json.load(gfile)
_LOCATION_CODES = {loc["name"]: loc["country_code"].lower() for loc in _locs}
# _LANGUAGE_CODES maps a language name to its language code (case as stored in the file).
with open(GOOGLE_LANGUAGES_FILENAME, "r", encoding="utf-8") as gfile:
    _langs = json.load(gfile)
_LANGUAGE_CODES = {lang["language_name"]: lang["language_code"] for lang in _langs}


# Base classes
class Setup(BaseSettings):
    """Class for loading environment variables.

    All fields are required (none has a default), so instantiating this class
    fails validation if any of them cannot be resolved from the environment
    or from the ``.env`` file.
    """

    # Settings source configuration: additionally read a UTF-8 encoded ".env"
    # file. Declared via `model_config`, which is the pydantic v2 replacement
    # for the deprecated nested `class Config`.
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8")

    # Crawler ENV variables
    serpapi_key: str
    dataforseo_user: str
    dataforseo_pwd: str
    zyteapi_key: str
    openaiapi_key: str


class Host(BaseModel):
    """Model for host details (e.g. `Host(name="Galaxus", domains="galaxus.ch, digitec.ch")`).

    `domains` may be given as a comma-separated string or as a list of strings.
    Before field validation it is converted into a list of normalized domains.
    """

    name: str
    domains: str | List[str]

    @staticmethod
    def _normalize_domain(domain: str) -> str:
        """Make it lowercase and strip 'www.' and 'https?://' prefixes from the domain."""
        domain = domain.strip().lower()
        return re.sub(r"^(https?://)?(www\.)?", "", domain)

    @field_validator("domains", mode="before")
    @classmethod
    def normalize_domains(cls, val):
        """Split a comma-separated string and normalize every domain.

        Args:
            val: Raw input, either a comma-separated string or an iterable of strings.

        Returns:
            List of normalized, non-empty domains in their original order.
        """
        if isinstance(val, str):
            val = val.split(",")
        normalized = (cls._normalize_domain(dom) for dom in val)
        # Drop empty entries, which arise from trailing or doubled commas
        # (e.g. "galaxus.ch, digitec.ch,") and are never valid domains.
        return [dom for dom in normalized if dom]


class ClassificationResult(BaseModel):
    """Model for classification results.

    Holds one integer result together with an input and an output token count.
    """

    # Integer classification outcome.
    # NOTE(review): presumably one of a prompt's `allowed_classes` — confirm against the classifier.
    result: int
    # Token counts. NOTE(review): presumably the usage reported for the model
    # call that produced `result` — confirm against the caller.
    input_tokens: int
    output_tokens: int


class Location(BaseModel):
    """Model for location details (e.g. `Location(name="Switzerland", code="ch")`).

    If `code` is omitted or empty, it is looked up by `name` in the
    module-level `_LOCATION_CODES` mapping. The code is stored in lower case.
    """

    name: str
    code: str = ""

    @model_validator(mode="before")
    @classmethod
    def set_code(cls, values: Any) -> Any:
        """Set the location code if not provided and make it lower case.

        Args:
            values: Raw input passed to the model.

        Returns:
            A dict with the keys "name" and "code" when `values` is a dict;
            otherwise `values` unchanged.

        Raises:
            ValueError: If no code was given and `name` is not a known location name.
        """
        # A "before" validator receives the raw input, which is not necessarily
        # a dict (e.g. an already-built instance). Leave such input untouched
        # for pydantic to handle instead of failing on `.get`.
        if not isinstance(values, dict):
            return values

        name = values.get("name")
        code = values.get("code")
        if not code:
            code = _LOCATION_CODES.get(name)
            if code is None:
                raise ValueError(f'Location code not found for location name="{name}"')
        # Only strings can be lower-cased; any other type is passed through so
        # that field validation reports it as a regular validation error.
        if isinstance(code, str):
            code = code.lower()
        return {"name": name, "code": code}


class Language(BaseModel):
    """Model for language details (e.g. `Language(name="German", code="de")`).

    If `code` is omitted or empty, it is looked up by `name` in the
    module-level `_LANGUAGE_CODES` mapping. The code is stored in lower case.
    """

    name: str
    code: str = ""

    @model_validator(mode="before")
    @classmethod
    def set_code(cls, values: Any) -> Any:
        """Set the language code if not provided and make it lower case.

        Args:
            values: Raw input passed to the model.

        Returns:
            A dict with the keys "name" and "code" when `values` is a dict;
            otherwise `values` unchanged.

        Raises:
            ValueError: If no code was given and `name` is not a known language name.
        """
        # A "before" validator receives the raw input, which is not necessarily
        # a dict (e.g. an already-built instance). Leave such input untouched
        # for pydantic to handle instead of failing on `.get`.
        if not isinstance(values, dict):
            return values

        name = values.get("name")
        code = values.get("code")
        if not code:
            code = _LANGUAGE_CODES.get(name)
            if code is None:
                raise ValueError(f'Language code not found for language name="{name}"')
        # Only strings can be lower-cased; any other type is passed through so
        # that field validation reports it as a regular validation error.
        if isinstance(code, str):
            code = code.lower()
        return {"name": name, "code": code}


class Enrichment(BaseModel):
    """Model for enriching initial search_term with alternative ones."""

    # NOTE(review): presumably the number of alternative search terms to add
    # to the initial one — confirm against the enrichment code.
    additional_terms: int
    # NOTE(review): presumably the number of URLs to collect for each
    # additional term — confirm against the enrichment code.
    additional_urls_per_term: int


class Deepness(BaseModel):
    """Model for search depth."""

    # NOTE(review): presumably the number of search results to request for
    # the search term — confirm against the search code.
    num_results: int
    # Optional enrichment settings; None (the default) means no Enrichment is configured.
    enrichment: Enrichment | None = None


class ProductItem(BaseModel):
    """Model representing a product item.

    The first group of fields is required. All remaining fields have defaults
    (None, an empty dict, or False).
    NOTE(review): they are presumably filled in by later pipeline stages — confirm.
    """

    # Serp/Enrich parameters
    search_term: str
    search_term_type: str
    url: str
    url_resolved: str
    search_engine_name: str
    domain: str

    # Zyte parameters
    product_name: str | None = None
    product_price: str | None = None
    product_description: str | None = None
    product_images: List[str] | None = None
    probability: float | None = None
    html: str | None = None
    html_clean: str | None = None

    # Processor parameters: kept in a str -> int dict, so new entries can be
    # added without declaring a field for each one.
    # NOTE(review): an earlier comment here spoke of allowing extra fields, but
    # no `extra` configuration is set on this model — confirm the intent.
    classifications: Dict[str, int] = Field(default_factory=dict)

    # Usage parameters: nested str -> (str -> int) mapping.
    # NOTE(review): presumably token counts per classification — confirm.
    usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)

    # Filtering parameters
    filtered: bool = False
    filtered_at_stage: str | None = None


class Prompt(BaseModel):
    """Model for prompts.

    `product_item_fields` must name fields of `ProductItem`, and
    `allowed_classes` must contain only non-negative integers.
    """

    name: str
    system_prompt: str
    product_item_fields: List[str]
    allowed_classes: List[int]

    @field_validator("allowed_classes", mode="before")
    @classmethod
    def check_for_positive_value(cls, val):
        """Check that all values are non-negative integers (zero is allowed).

        Raises:
            ValueError: If any value is not an int or is negative.
        """
        if not all(isinstance(i, int) and i >= 0 for i in val):
            # The message says "non-negative" because the check above accepts 0.
            raise ValueError(
                "all values in allowed_classes must be non-negative integers."
            )
        return val

    @field_validator("product_item_fields", mode="before")
    @classmethod
    def validate_product_item_fields(cls, val):
        """Ensure all product_item_fields are valid ProductItem attributes.

        Raises:
            ValueError: On the first entry that is not a field of `ProductItem`.
        """
        valid_fields = set(ProductItem.model_fields.keys())
        for field in val:
            if field not in valid_fields:
                raise ValueError(
                    f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
                )
        return val


class HttpxAsyncClient(httpx.AsyncClient):
    """Httpx async client that can be used to retain the default settings.

    `timeout` and `limits` may be passed either as ready-made httpx objects or
    as dicts of keyword arguments for `httpx.Timeout` / `httpx.Limits`.
    """

    def __init__(
        self,
        timeout: httpx.Timeout | Dict[str, Any] = DEFAULT_HTTPX_TIMEOUT,
        limits: httpx.Limits | Dict[str, Any] = DEFAULT_HTTPX_LIMITS,
        follow_redirects: bool = DEFAULT_HTTPX_REDIRECTS,
        **kwargs: Any,
    ) -> None:
        """Create the client, building httpx config objects from dicts where needed.

        Args:
            timeout: Timeout configuration, as `httpx.Timeout` or a dict of its kwargs.
            limits: Connection limits, as `httpx.Limits` or a dict of its kwargs.
            follow_redirects: Whether the client follows redirects.
            **kwargs: Forwarded unchanged to `httpx.AsyncClient`.
        """
        resolved_timeout = (
            httpx.Timeout(**timeout) if isinstance(timeout, dict) else timeout
        )
        resolved_limits = (
            httpx.Limits(**limits) if isinstance(limits, dict) else limits
        )

        # The three settings are explicit parameters, so they can never also
        # appear in **kwargs; passing them by keyword is unambiguous.
        super().__init__(
            timeout=resolved_timeout,
            limits=resolved_limits,
            follow_redirects=follow_redirects,
            **kwargs,
        )


class DomainUtils:
    """Utility class for domain extraction and normalization.

    Handles domain parsing from URLs, removes common prefixes (www, http/https),
    and provides consistent domain formatting for search and scraping operations.
    """

    _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

    def _get_domain(self, url: str) -> str:
        """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).

        Args:
            url: The URL to be processed.
        """
        # Add scheme; urlparse requires it
        if not url.startswith(("http://", "https://")):
            url = "http://" + url

        # Get the hostname
        hostname = urlparse(url).hostname
        if hostname is None and (match := re.search(self._hostname_pattern, url)):
            hostname = match.group(1)
        if hostname is None:
            logger.warning(
                f'Failed to extract domain from url="{url}"; full url is returned'
            )
            return url.lower()

        # Remove www. prefix
        if hostname and hostname.startswith("www."):
            hostname = hostname[4:]
        return hostname.lower()

    async def _unblock_url(self, url: str, zyte_api: "ZyteAPI") -> bytes | None:
        """Attempts to unblock a URL using Zyte proxy mode when direct access fails.

        This method is specifically designed to handle 403 Forbidden errors for domains
        that may be blocking requests from certain IP ranges (like cloud providers).

        Args:
            url: The URL to fetch using Zyte proxy mode.
            zyte_api: An instance of ZyteAPI to use for the request.

        Returns:
            The HTML content as bytes if successful, None if failed.
        """
        try:
            logger.info(f"Attempting to unblock URL using Zyte proxy: {url}")
            details = await zyte_api.details(url)

            if details and "httpResponseBody" in details:
                # Decode the base64 content
                import base64

                html_content = base64.b64decode(details["httpResponseBody"])
                logger.info(f"Successfully unblocked URL using Zyte proxy: {url}")
                return html_content
            else:
                logger.warning(f"Zyte proxy request failed for URL: {url}")
                return None

        except Exception as e:
            logger.error(f"Error unblocking URL with Zyte proxy: {url}, error: {e}")
            return None
