#!/usr/bin/env python3
"""
Fast Page Summary Generator (Env-Aware + Auto-Detect)
-----------------------------------------------------
Summarizes each PDF page text (~150 words, SEO-optimized),
compares new summaries with existing ones via DeepZero,
and updates manifest.json.

It auto-detects a working base directory based on:
    - Environment variables
    - Host-specific defaults
    - Fallback to CWD

Usage
-----
    from abstract_hugpy.hugpy_console.metadata_console.page_summary_generator_fast import run_page_summary_generator_fast
    run_page_summary_generator_fast()              # auto-detects base_dir
    run_page_summary_generator_fast("/custom/path")
"""


from ..imports import *
from .metadata_utils import scan_matadata_from_pdf_dirs
from .summary_judge import SummaryJudge

# ------------------------------------------------------------------
# Environment keys (override in ~/.bashrc, .env, or systemd)
# ------------------------------------------------------------------
# Names of the environment variables consulted (in this order of preference)
# when looking for a base directory.
BASE_DIR_KEY_PROD   = "HUGPY_BASE_DIR_SUMMARY_GENERATOR_FAST_PROD"
BASE_DIR_KEY_SERVER = "HUGPY_BASE_DIR_SUMMARY_GENERATOR_FAST_SERVER"
BASE_DIR_KEY_LOCAL  = "HUGPY_BASE_DIR_SUMMARY_GENERATOR_FAST_LOCAL"

# ------------------------------------------------------------------
# Defaults
# ------------------------------------------------------------------
SUMMARY_WORDS  = 150              # word budget passed to the summarizer / description cap
CHARS_LIMIT    = 2000             # default character cap used by truncate_text()
SUMMARY_DIR    = "summaries"      # directory name reserved for summary output
MANIFEST_NAME  = "manifest.json"  # manifest file name inside each PDF directory
# os.cpu_count() returns None when the CPU count cannot be determined;
# treat that as a single CPU instead of failing with a TypeError at import.
N_PROCESSES    = max(1, (os.cpu_count() or 1) // 2)
# Module-level judge instance used by process_page() to compare summaries.
judge          = SummaryJudge()

# ------------------------------------------------------------------
# Environment detection helpers
# ------------------------------------------------------------------
def get_env_value_or_none(key: str, path: str | None = None) -> str | None:
    """Pull a variable from env file or system environment."""
    try:
        val = get_env_value(key=key, path=path)
        return val if val and os.path.exists(val) else None
    except Exception:
        return None

def get_env_basedirs(env_path: str | None = None) -> list[str]:
    """Return the configured base-directory candidates in order of preference.

    The PROD, SERVER and LOCAL keys are looked up (in that order) with
    ``get_env_value_or_none``; entries that resolve to ``None`` are dropped.

    Args:
        env_path: Optional location forwarded to ``get_env_value_or_none``.

    Returns:
        A list of path *strings* (not ``Path`` objects); callers such as
        ``detect_base_dir`` wrap them in ``Path`` themselves.
    """
    ordered_keys = (BASE_DIR_KEY_PROD, BASE_DIR_KEY_SERVER, BASE_DIR_KEY_LOCAL)
    resolved = [get_env_value_or_none(key, env_path) for key in ordered_keys]
    return [value for value in resolved if value]

def detect_base_dir(env_path: str | None = None) -> Path:
    """Pick the first existing base directory from env settings, then defaults.

    Args:
        env_path: Optional location forwarded to ``get_env_basedirs``.

    Returns:
        The first existing path among the environment candidates; failing
        that, the first existing hard-coded default; failing that, the
        current working directory.
    """
    env_paths = map(Path, get_env_basedirs(env_path))
    env_hit = next((p for p in env_paths if p.exists()), None)
    if env_hit is not None:
        return env_hit

    # Host-specific defaults, tried only when no environment candidate exists.
    defaults = (
        Path("/mnt/24T/media/thedailydialectics/pdfs"),
        Path("/var/www/media/thedailydialectics/pdfs"),
        Path.home() / "Documents/pythonTools/data/pdfs",
        Path.cwd(),
    )
    default_hit = next((d for d in defaults if d.exists()), None)
    if default_hit is not None:
        return default_hit
    return Path.cwd()

# ------------------------------------------------------------------
# Core utilities
# ------------------------------------------------------------------
def truncate_text(text: str, max_chars: int = CHARS_LIMIT) -> str:
    """Return the leading ``max_chars`` characters of *text* (plain slice)."""
    head = slice(None, max_chars)
    return text[head]

def summarize_text(text: str) -> str:
    """Summarize *text* with ``get_summarizer_summary`` and return the result.

    The summarizer is called with ``summary_mode="medium"``,
    ``max_chunk_tokens=200`` and ``summary_words=SUMMARY_WORDS``; its output
    is stripped of surrounding whitespace.

    Returns:
        The summary; when it has fewer than 30 words, ``" (short)"`` is
        appended. If anything raises, the string
        ``"[Summarizer error: <exception>]"`` is returned instead of raising.
    """
    try:
        raw = get_summarizer_summary(
            text=text,
            summary_mode="medium",
            max_chunk_tokens=200,
            summary_words=SUMMARY_WORDS,
        )
        result = raw.strip()
        word_count = len(result.split())
        return result if word_count >= 30 else result + " (short)"
    except Exception as exc:
        # Errors are reported in-band as a marker string rather than raised.
        return f"[Summarizer error: {exc}]"

def build_seo_json(page_id: str, summary: str,pdf_dir:str) -> dict:
    """Assemble the per-page metadata dictionary for a summary.

    Args:
        page_id: Identifier of the page, embedded in the title strings.
        summary: Summary text for the page.
        pdf_dir: Directory handed to ``get_pdf_title`` to obtain the title.

    Returns:
        A dict with the keys ``page_id``, ``title``, ``description`` (the
        first ``SUMMARY_WORDS`` whitespace-separated words of the summary),
        ``alt``, ``summary`` and ``length_words`` (word count of the summary).
    """
    words = summary.split()
    doc_title = get_pdf_title(pdf_dir)
    short_label = f"{doc_title} {page_id}"
    long_label = f"{doc_title} Page {page_id}"
    return {
        "page_id": page_id,
        "title": f"{short_label} | {long_label} Summary",
        "description": " ".join(words[:SUMMARY_WORDS]),
        "alt":  f"{long_label} abstract",
        "summary": summary,
        "length_words": len(words),
    }

# ------------------------------------------------------------------
# Page processing
# ------------------------------------------------------------------
def process_page(txt_path: Path):
    """Summarize one page text file and store the result if it is the best one.

    The page text is read, truncated, cleaned and summarized. The summary is
    written as ``<stem>.json`` / ``<stem>.txt`` inside the ``SUMMARY_DIR``
    sub-directory of the PDF directory, so the output can never overwrite the
    source ``<stem>.txt`` page file. When a JSON summary already exists, the
    module-level ``judge`` decides whether the new summary replaces it.

    Args:
        txt_path: Path of the page text file.

    Returns:
        ``None`` when the cleaned text is shorter than 40 characters;
        ``{"id", "action", "best_score", "other_score"}`` on success, where
        ``action`` is ``"new"``, ``"replaced"`` or ``"kept_old"``;
        ``{"id", "error"}`` when anything fails (exceptions are not raised so
        one bad page does not abort a multiprocessing run).
    """
    txt_path = Path(str(txt_path))
    try:
        page_id = txt_path.stem
        # get_pdf_dir() may return None; fall back to the text file's folder.
        pdf_dir = Path(get_pdf_dir(txt_path) or txt_path.parent)
        summary_dir = pdf_dir / SUMMARY_DIR
        out_json = summary_dir / f"{page_id}.json"
        out_txt  = summary_dir / f"{page_id}.txt"

        text = clean_text(truncate_text(txt_path.read_text(encoding="utf-8", errors="ignore")))
        if len(text) < 40:
            return None

        new_summary = summarize_text(text)
        # summarize_text() reports failures in-band; never persist such a
        # marker as if it were a real summary.
        if new_summary.startswith("[Summarizer error:"):
            return {"id": txt_path.name, "error": new_summary}
        seo_json = build_seo_json(page_id, new_summary, str(pdf_dir))
        summary_dir.mkdir(parents=True, exist_ok=True)

        # Compare or create
        if out_json.exists():
            existing = safe_load_json(out_json) or {}
            old_summary = existing.get("summary") or existing.get("text") or ""
            best_summary, best_score, other_score = judge.compare(text, new_summary, old_summary)
            if best_summary == new_summary:
                safe_write_json(out_json, seo_json)
                out_txt.write_text(new_summary, encoding="utf-8")
                action = "replaced"
            else:
                action = "kept_old"
        else:
            safe_write_json(out_json, seo_json)
            out_txt.write_text(new_summary, encoding="utf-8")
            best_score, other_score, action = 1.0, 0.0, "new"

        return {"id": page_id, "action": action,
                "best_score": best_score, "other_score": other_score}

    except Exception as e:
        return {"id": txt_path.name, "error": str(e)}
def get_pdf_path(pdf_path):
    """Resolve *pdf_path* to the path of a PDF file.

    Args:
        pdf_path: Either a ``.pdf`` file or a directory containing one.

    Returns:
        The path (as ``str``) of the PDF file, or ``None`` when none is found.
        For a directory, the alphabetically first ``.pdf`` entry whose name
        does not contain ``_page_`` is chosen.
    """
    pdf_path = str(pdf_path)
    if os.path.isdir(pdf_path):
        # Sorted so the choice does not depend on os.listdir() ordering.
        candidates = sorted(
            name for name in os.listdir(pdf_path)
            if name.endswith('.pdf') and '_page_' not in name
        )
        if candidates:
            # listdir() yields bare names; join with the directory so the
            # isfile() check below does not resolve against the CWD.
            pdf_path = os.path.join(pdf_path, candidates[0])
    if os.path.isfile(pdf_path) and pdf_path.endswith('.pdf'):
        return pdf_path
    return None
def get_pdf_dir(pdf_dir):
    """Resolve *pdf_dir* to the directory that belongs to a PDF.

    Args:
        pdf_dir: A directory, a ``.pdf`` file, or a ``.txt`` / ``.png`` file.

    Returns:
        * the directory itself when it contains a ``.pdf`` entry whose name
          does not contain ``_page_``;
        * the containing directory for a ``.pdf`` file;
        * ``get_file_parts(...)['parent_dirname']`` for a ``.txt`` / ``.png``
          file (NOTE(review): which ancestor that key denotes is defined by
          ``get_file_parts`` -- confirm it matches the page-file layout);
        * ``None`` otherwise.
    """
    pdf_dir = str(pdf_dir)
    if os.path.isdir(pdf_dir):
        has_pdf = any(
            name.endswith('.pdf') and '_page_' not in name
            for name in os.listdir(pdf_dir)
        )
        if has_pdf:
            return pdf_dir
    if os.path.isfile(pdf_dir):
        if pdf_dir.endswith('.pdf'):
            return os.path.dirname(pdf_dir)
        if pdf_dir.endswith(('.txt', '.png')):
            file_parts = get_file_parts(pdf_dir)
            return file_parts.get('parent_dirname')
    return None
def get_manifest_path(pdf_dir):
    """Return the path of the manifest file for *pdf_dir*.

    The directory is resolved with ``get_pdf_dir``. When that yields nothing,
    the given location is used instead (its containing directory if it is a
    file) so the result never points into a literal ``"None"`` directory.

    Args:
        pdf_dir: Directory or file belonging to a PDF.

    Returns:
        ``<resolved directory>/<MANIFEST_NAME>`` as ``str``.
    """
    pdf_dir = str(pdf_dir)
    resolved = get_pdf_dir(pdf_dir)
    if not resolved:
        resolved = os.path.dirname(pdf_dir) if os.path.isfile(pdf_dir) else pdf_dir
    return os.path.join(str(resolved), MANIFEST_NAME)
def load_manifest(pdf_dir=None,manifest_path=None):
    """Load the manifest dictionary, creating an empty manifest file if absent.

    Args:
        pdf_dir: Directory used to derive the manifest path when
            ``manifest_path`` is not given.
        manifest_path: Explicit manifest file path; takes precedence.

    Returns:
        The manifest as a ``dict``. ``{}`` is returned when both arguments are
        ``None`` or when the loaded content is not a dictionary, so callers
        can always use ``.get`` on the result.
    """
    if pdf_dir is None and manifest_path is None:
        return {}
    pdf_dir = str(pdf_dir) if pdf_dir else pdf_dir
    manifest_path = str(manifest_path) if manifest_path else manifest_path
    manifest_path = manifest_path or get_manifest_path(pdf_dir)
    if not os.path.isfile(manifest_path):
        # First access: materialize an empty manifest on disk.
        safe_dump_to_json(data={},file_path=manifest_path)
    manifest = safe_load_json(manifest_path)
    return manifest if isinstance(manifest, dict) else {}
def save_manifest_data(data=None,pdf_dir=None,override=False):
    """Write *data* to the manifest file of *pdf_dir*.

    Supplied, non-empty data is written as given (previously it was replaced
    by ``{}``, which wiped the manifest on every save).

    Args:
        data: Manifest content to store.
        pdf_dir: Directory used to locate the manifest.
        override: Only consulted when *data* is ``None`` or ``{}``: if true,
            the current manifest is reloaded and written back; otherwise an
            empty manifest is written.
            NOTE(review): the flag name suggests the opposite meaning; the
            original behavior for empty data is kept as-is -- confirm intent.
    """
    pdf_dir = str(pdf_dir) if pdf_dir else pdf_dir
    manifest_path = get_manifest_path(pdf_dir)
    if data is None or data == {}:
        if override:
            data = load_manifest(pdf_dir,manifest_path=manifest_path)
        else:
            data = {}
    safe_dump_to_json(data=data,file_path=manifest_path)

def get_title_retrun_manifest(pdf_dir):
    """Return the manifest of *pdf_dir*, filling in its ``title`` if missing.

    When the manifest has no title, the PDF is located with ``get_pdf_path``,
    its title is obtained from ``detect_pdf_title`` and the updated manifest
    is saved. If no PDF can be located the manifest is returned unchanged.

    Args:
        pdf_dir: Directory belonging to a PDF.

    Returns:
        The manifest dictionary (with ``title`` set when it could be detected).
    """
    pdf_dir = str(pdf_dir)
    manifest = load_manifest(pdf_dir)
    if not manifest.get('title'):
        pdf_path = get_pdf_path(pdf_dir)
        if pdf_path:
            manifest['title'] = detect_pdf_title(pdf_path)
            save_manifest_data(data=manifest,pdf_dir=pdf_dir)
    return manifest
def get_pdf_title(pdf_dir):
    """Return the ``title`` entry of the manifest for *pdf_dir* (or ``None``).

    Delegates to ``get_title_retrun_manifest`` for loading the manifest.
    """
    directory = str(pdf_dir)
    return get_title_retrun_manifest(directory).get('title')
# ------------------------------------------------------------------
# Manifest update
# ------------------------------------------------------------------
def update_manifest(pdf_dir: Path, new_entries: list):
    """Record per-page results under ``manifest["pages"]`` and save the manifest.

    Args:
        pdf_dir: Directory whose manifest is updated.
        new_entries: Result dictionaries keyed by ``"id"``. Falsy entries and
            entries containing ``"error"`` are skipped.

    For each entry the ``"data"`` value is stored when present; otherwise the
    entry's remaining fields (everything except ``"id"``) are stored, so
    results that carry no ``"data"`` key are not reduced to ``{}``.
    """
    manifest = get_title_retrun_manifest(pdf_dir)
    pages = manifest.setdefault("pages", {})
    for entry in new_entries:
        if not entry or "error" in entry:
            continue
        payload = entry.get("data")
        if not payload:
            payload = {key: value for key, value in entry.items() if key != "id"}
        pages[entry["id"]] = payload
    save_manifest_data(data=manifest,pdf_dir=pdf_dir)

# ------------------------------------------------------------------
# Directory processing
# ------------------------------------------------------------------
def process_pdf_dir(pdf_dir: Path):
    """Summarize every ``*.txt`` page file in *pdf_dir* and update its manifest.

    If the directory has no ``*.txt`` files, ``scan_matadata_from_pdf_dirs`` is
    run on it first; if there are still none, the function returns without
    doing anything. Pages are processed by ``process_page`` in a pool of
    ``N_PROCESSES`` worker processes.

    Args:
        pdf_dir: Directory belonging to a single PDF.
    """
    pdf_dir = Path(pdf_dir)
    txt_files = list(pdf_dir.glob("*.txt"))
    if not txt_files:
        scan_matadata_from_pdf_dirs([pdf_dir],output_dir=pdf_dir)
        txt_files = list(pdf_dir.glob("*.txt"))
        if not txt_files:
            return
    print(f"\n📄 Processing {pdf_dir.name} ({len(txt_files)} pages)...")

    # Called for its side effect: a missing title is detected and saved to the
    # manifest here, before the workers (which also look the title up) start.
    get_pdf_title(pdf_dir)

    results = []
    with mproc.Pool(N_PROCESSES) as pool:
        for res in tqdm(pool.imap_unordered(process_page, txt_files),
                        total=len(txt_files), desc=pdf_dir.name):
            if res:
                results.append(res)
    update_manifest(pdf_dir, results)
    print(f"✅ Updated manifest for {pdf_dir.name}")

# ------------------------------------------------------------------
# Entrypoint
# ------------------------------------------------------------------
def run_page_summary_generator_fast(base_dir: str | Path = None, env_path: str | None = None):
    base_dir = Path(base_dir) if base_dir else detect_base_dir(env_path)
    if not base_dir.exists():
        raise FileNotFoundError(f"Base directory not found: {base_dir}")

    pdf_dirs = [p for p in get_files_and_dirs(str(base_dir),allowed_exts=['.pdf'])[-1] if '_page_' not in p]
    if not pdf_dirs:
        print(f"⚠️ No subdirectories found in {base_dir}")
        return

    print(f"🏗 Using base directory: {base_dir}")
    for item in pdf_dirs:
        pdf_dir = item
        if os.path.isfile(item):
            pdf_dir = os.path.dirname(item)
        process_pdf_dir(Path(pdf_dir))
    print("🏁 All summaries complete.")


