import logging
import random
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


class DetailedSitemapParser:
    """Read a sitemap and enrich every listed URL with on-page metadata.

    ``parse()`` downloads the sitemap, builds a DataFrame with one row per
    ``<url>`` entry (``url``, ``category``, ``slug``) and then fetches each
    page on a thread pool to add ``meta_title``, ``meta_description`` and
    ``h1`` columns.
    """

    # Prefix map for the sitemaps.org 0.9 schema used in the XPath lookups.
    SITEMAP_NS = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}

    # Columns added by enrich_metadata; each defaults to "" when unavailable.
    METADATA_COLUMNS = ("meta_title", "meta_description", "h1")

    _logger = logging.getLogger(__name__)

    def __init__(self, sitemap_url: str, max_workers: int = 10):
        """Store the configuration and create the shared HTTP session.

        Args:
            sitemap_url: Address of the sitemap XML document.
            max_workers: Size of the thread pool used by ``enrich_metadata``.
        """
        self.sitemap_url = sitemap_url
        self.max_workers = max_workers

        # Shared session for consistent cookies & fewer redirects
        self.session = requests.Session()

        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/115.0 Safari/537.36"
            ),
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/"
        })

    # ======================================================
    # Fetch sitemap XML
    # ======================================================
    def fetch_sitemap(self) -> str:
        """Download the sitemap and return its body as text.

        Raises:
            requests.RequestException: On connection problems, timeouts
                (20 s) or a non-success HTTP status.
        """
        response = self.session.get(self.sitemap_url, timeout=20)
        response.raise_for_status()
        return response.text

    # ======================================================
    # Parse <loc> URLs and determine category + slug
    # ======================================================
    def parse_urls(self):
        """Return a DataFrame with ``url``, ``category`` and ``slug`` columns.

        ``category`` is the first path segment of each URL and ``slug`` is
        the rest of the path; both are empty for the site root.
        """
        rows = self._rows_from_xml(self.fetch_sitemap())
        return pd.DataFrame(rows, columns=["url", "category", "slug"])

    @classmethod
    def _rows_from_xml(cls, xml_data: str) -> list:
        """Turn sitemap XML text into ``[url, category, slug]`` rows.

        Entries whose ``<loc>`` is missing, empty or whitespace-only are
        skipped instead of raising on ``None`` text.
        """
        root = ET.fromstring(xml_data)
        rows = []

        for url_tag in root.findall("ns:url", cls.SITEMAP_NS):
            # findtext yields "" both for a missing <loc> and for an empty one.
            full_url = url_tag.findtext("ns:loc", "", cls.SITEMAP_NS).strip()
            if not full_url:
                continue

            path = urlparse(full_url).path.strip("/")
            # partition gives ("", "", "") for the root and (segment, "", "")
            # for single-segment paths, so no special-casing is needed.
            category, _, slug = path.partition("/")
            rows.append([full_url, category, slug])

        return rows

    # ======================================================
    # Detect and follow meta-refresh redirect pages
    # ======================================================
    @staticmethod
    def _is_refresh_directive(value) -> bool:
        """Tell whether an ``http-equiv`` attribute value means "refresh"."""
        return bool(value) and value.lower() == "refresh"

    @staticmethod
    def _parse_refresh_target(content: str):
        """Extract the target from a meta-refresh ``content`` value.

        Accepts forms such as ``"0;url=/target"``, ``"0; URL='/a;b'"`` and
        ``"0; /target?x=1"``.

        Returns:
            The target string (possibly empty), or ``None`` when the value
            has no ``;`` separating the delay from a target.
        """
        # Split only on the first ";" so targets containing ";" stay intact.
        _, separator, target = content.partition(";")
        if not separator:
            return None

        target = target.strip()
        # Drop an optional "url=" prefix; a bare target that merely starts
        # with "url" (or contains "=" in its query) is left untouched.
        if target[:3].lower() == "url":
            after_keyword = target[3:].lstrip()
            if after_keyword.startswith("="):
                target = after_keyword[1:]

        return target.strip().strip("'\"")

    def follow_meta_refresh(self, html, base_url):
        """Follow a ``<meta http-equiv="refresh">`` redirect found in *html*.

        Args:
            html: Markup of the page that may contain the refresh tag.
            base_url: URL the markup was loaded from; relative targets are
                resolved against it.

        Returns:
            The response for the redirect target, or ``None`` when the page
            has no usable refresh tag or the follow-up request fails.
        """
        soup = BeautifulSoup(html, "lxml")
        meta = soup.find("meta", attrs={"http-equiv": self._is_refresh_directive})
        if meta is None:
            return None

        target = self._parse_refresh_target(meta.get("content") or "")
        if target is None:
            return None

        target_url = urljoin(base_url, target)

        try:
            redirected = self.session.get(target_url, timeout=10)
            redirected.raise_for_status()
        except requests.RequestException as exc:
            # Best effort: the caller falls back to the page it already has.
            self._logger.debug("Meta-refresh to %s failed: %s", target_url, exc)
            return None
        return redirected

    # ======================================================
    # Fetch HTML with meta-refresh handling & retries
    # ======================================================
    def fetch_rendered_page(self, url, retries=2):
        """Fetch *url*, retrying on failure and following one meta-refresh.

        Args:
            url: Page to download.
            retries: Number of additional attempts after the first failure.

        Returns:
            ``(html, final_url)``. When every attempt fails the result is
            ``("", url)``.
        """
        for _ in range(max(retries, 0) + 1):
            # Small random pause before each attempt to spread out requests.
            time.sleep(random.uniform(0.05, 0.15))
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                html = response.text
                break
            except requests.RequestException as exc:
                self._logger.debug("Fetching %s failed: %s", url, exc)
        else:
            self._logger.warning("Giving up on %s after repeated failures", url)
            return "", url

        # follow_meta_refresh returns None unless the page carries a refresh
        # tag, so interstitial "Redirecting..." pages are covered without
        # parsing the markup a second time here.
        redirected = self.follow_meta_refresh(html, response.url)
        if redirected is not None:
            return redirected.text, redirected.url

        return html, response.url

    # ======================================================
    # Parse metadata from HTML
    # ======================================================
    def extract_metadata(self, html):
        """Return ``(meta_title, meta_description, h1)`` parsed from *html*.

        Any element that is absent yields an empty string; empty *html*
        yields three empty strings.
        """
        if not html:
            return "", "", ""

        soup = BeautifulSoup(html, "lxml")

        # Meta Title
        title_tag = soup.title
        meta_title = title_tag.string.strip() if title_tag and title_tag.string else ""

        # Meta Description: the tag may exist without a content attribute.
        meta_desc_tag = soup.find("meta", attrs={"name": "description"})
        meta_description = ""
        if meta_desc_tag is not None:
            meta_description = (meta_desc_tag.get("content") or "").strip()

        # H1
        h1_tag = soup.find("h1")
        h1 = h1_tag.get_text(strip=True) if h1_tag else ""

        return meta_title, meta_description, h1

    # ======================================================
    # Worker for threaded execution
    # ======================================================
    def fetch_page_data(self, idx, url):
        """Fetch one page and return ``(idx, metadata_dict)``.

        *idx* is passed through unchanged so the caller can put the result
        back in the right row.
        """
        html, _ = self.fetch_rendered_page(url)
        meta_title, meta_description, h1 = self.extract_metadata(html)

        return idx, {
            "meta_title": meta_title,
            "meta_description": meta_description,
            "h1": h1,
        }

    # ======================================================
    # Thread-safe metadata enrichment
    # ======================================================
    def enrich_metadata(self, df):
        """Add ``meta_title``, ``meta_description`` and ``h1`` columns to *df*.

        Pages are fetched concurrently on ``max_workers`` threads. A page
        whose worker raises keeps empty strings in all three columns; the
        error is logged and the remaining pages are still processed.

        Args:
            df: DataFrame with a ``url`` column. It is modified in place.

        Returns:
            The same DataFrame, for chaining.
        """
        urls = df["url"].tolist()
        results = [dict.fromkeys(self.METADATA_COLUMNS, "") for _ in urls]

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = {
                executor.submit(self.fetch_page_data, idx, url): url
                for idx, url in enumerate(urls)
            }

            for future in as_completed(futures):
                try:
                    idx, data = future.result()
                except Exception:
                    # One bad page must not discard the whole crawl; its row
                    # keeps the pre-filled empty defaults.
                    self._logger.exception(
                        "Metadata extraction failed for %s", futures[future]
                    )
                    continue
                results[idx] = data

        # Rebuild aligned columns
        for column in self.METADATA_COLUMNS:
            df[column] = [row[column] for row in results]

        return df

    # ======================================================
    # MAIN PROCESS
    # ======================================================
    def parse(self):
        """Run the full pipeline: parse the sitemap, then enrich every URL."""
        df = self.parse_urls()
        df = self.enrich_metadata(df)
        return df

