# NOTE: This version has been edited in an effort to provide more thorough debugging logs.
# plink_utils

import os
import subprocess
import logging
from typing import Optional


def run_plink2(command: str):
    """
    Execute a PLINK2 command line through the shell and log what happened.

    The command string is handed to ``subprocess.run`` with ``shell=True``;
    stdout and stderr are captured as text rather than streamed.

    Args:
        command: Full shell command line to execute.

    Returns:
        None. On success the captured stdout is logged at DEBUG level.

    Raises:
        RuntimeError: If the process exits with a non-zero status. The message
            contains the exit code, the command, and the captured
            stderr/stdout, so callers fail immediately instead of tripping
            over missing .fam/.bed files later.
    """
    logging.info("[plink_utils:run_plink2] %s", command)

    completed = subprocess.run(
        command, shell=True, capture_output=True, text=True, check=False
    )
    captured_out = (completed.stdout or "").strip()

    # Success path first: nothing to report beyond the captured output.
    if completed.returncode == 0:
        logging.debug("[plink_utils:run_plink2] Output:\n%s", captured_out)
        return

    captured_err = (completed.stderr or "").strip()
    exit_code = completed.returncode

    logging.error(
        "[plink_utils:run_plink2] PLINK2 failed (code %d).\nSTDERR:\n%s\nSTDOUT:\n%s",
        exit_code,
        captured_err,
        captured_out,
    )

    failure_report = "\n".join(
        (
            f"PLINK2 failed (exit {exit_code}).",
            f"Command: {command}",
            "STDERR:",
            f"{captured_err}",
            "STDOUT:",
            f"{captured_out}",
        )
    )
    raise RuntimeError(failure_report)


def generate_bed_bim_fam(
    plink2_path: str,
    ped_file: str,
    map_file: str,
    output_prefix: str,
    relax_mind_threshold: bool = False,
    maf_threshold: Optional[float] = None,
    sample_keep_path: Optional[str] = None,
    autosomes_only: bool = False,
):
    """
    Generate BED/BIM/FAM from PED/MAP using PLINK2, matching Hao's R:

        plink2 --pedmap <prefix> --make-bed --geno 0.1 --mind 0.1 --out <prefix>

    Args:
        plink2_path: PLINK2 executable, inserted verbatim at the start of the
            command line.
        ped_file: Ignored. PLINK reads <output_prefix>.ped via --pedmap.
        map_file: Ignored. PLINK reads <output_prefix>.map via --pedmap.
        output_prefix: Prefix used both as the --pedmap input and as --out.
            <output_prefix>.ped and <output_prefix>.map must already exist.
        relax_mind_threshold: When True, the ``--mind 0.1`` filter is omitted.
        maf_threshold: When not None, adds ``--maf <value>`` (default: no --maf).
        sample_keep_path: When truthy, adds ``--keep <path>`` (default: no --keep).
        autosomes_only: When True, adds ``--chr 1-19`` (default: all
            chromosomes). NOTE(review): the 1-19 range is hard-coded;
            presumably the autosomes of the organism in use - confirm.

    Raises:
        FileNotFoundError: If the .ped or .map input is missing before the
            run, or if any of the .bed/.bim/.fam outputs is missing after it.
        RuntimeError: Propagated from run_plink2 when PLINK2 exits non-zero.
    """
    # Fail before launching PLINK2 if the --pedmap inputs are not in place.
    for label, extension in (("PED", "ped"), ("MAP", "map")):
        expected_input = f"{output_prefix}.{extension}"
        if not os.path.exists(expected_input):
            raise FileNotFoundError(f"Missing {label} for --pedmap: {expected_input}")

    mind = "" if relax_mind_threshold else "--mind 0.1"
    maf = f"--maf {maf_threshold}" if maf_threshold is not None else ""   # usually None
    keep = f"--keep {sample_keep_path}" if sample_keep_path else ""       # usually none
    chrflag = "--chr 1-19" if autosomes_only else ""                      # usually off

    logging.info(
        "[plink_utils:generate_bed_bim_fam] --pedmap %s -> BED with %s %s %s %s",
        output_prefix, mind or "no --mind", maf or "no --maf",
        keep or "no --keep", chrflag or "all chr",
    )

    # Join only the active pieces so disabled flags leave no stray spaces in
    # the command line (and in the logs that echo it).
    command_parts = [
        f"{plink2_path} --pedmap {output_prefix}",
        "--make-bed --geno 0.1",
        mind,
        maf,
        keep,
        chrflag,
        f"--out {output_prefix}",
    ]
    cmd = " ".join(part for part in command_parts if part)
    run_plink2(cmd)

    # Sanity-check that PLINK2 actually produced all three expected outputs,
    # not just the .fam, so a partial fileset is reported here rather than by
    # a later step.
    missing_outputs = [
        path
        for path in (f"{output_prefix}.{ext}" for ext in ("bed", "bim", "fam"))
        if not os.path.exists(path)
    ]
    if missing_outputs:
        raise FileNotFoundError(
            "PLINK2 finished but expected output file(s) not found: "
            f"{', '.join(missing_outputs)}. "
            "Check PLINK2 STDERR/STDOUT for details."
        )


def calculate_kinship_matrix(
    plink2_path: str,
    input_prefix: str,
    output_prefix: str,
    sample_keep_path: Optional[str] = None,
):
    """
    Build a square relationship (kinship) matrix from a BED/BIM/FAM fileset.

    Runs ``<plink2_path> --bfile <input_prefix> --make-rel square
    --out <output_prefix>`` through run_plink2.

    Args:
        plink2_path: PLINK2 executable, inserted verbatim into the command.
        input_prefix: Prefix of the binary fileset passed to --bfile.
        output_prefix: Prefix passed to --out.
        sample_keep_path: When truthy, adds ``--keep <path>`` to the command.

    NOTE(review): PLINK 2 documents --make-rel output as .rel/.rel.id files;
    confirm how callers locate the resulting kinship file.
    """
    keep_clause = ""
    if sample_keep_path:
        keep_clause = f"--keep {sample_keep_path}"

    plink_command = "{} --bfile {} {} --make-rel square --out {}".format(
        plink2_path, input_prefix, keep_clause, output_prefix
    )
    run_plink2(plink_command)


def calculate_kinship_from_pedmap(
    plink2_path: str,
    pedmap_prefix: str,
    kin_prefix: str,
):
    """
    Build a square relationship (kinship) matrix straight from PED/MAP input.

    Runs ``<plink2_path> --pedmap <pedmap_prefix> --make-rel square
    --out <kin_prefix>`` through run_plink2. No existence check is done here;
    <pedmap_prefix>.ped and <pedmap_prefix>.map are assumed to be present.

    Args:
        plink2_path: PLINK2 executable, inserted verbatim into the command.
        pedmap_prefix: Prefix passed to --pedmap.
        kin_prefix: Prefix passed to --out.
    """
    plink_command = "{} --pedmap {} --make-rel square --out {}".format(
        plink2_path, pedmap_prefix, kin_prefix
    )
    run_plink2(plink_command)


def rewrite_pheno_ids_from_fam(pheno_path: str, fam_path: str, out_path: str) -> None:
    """
    Make PHENO rows match FAM in both order AND IID values (no de-duplication).
      - PHENO: FID IID zscore value   (we REPLACE IID with FAM's IID in FAM order)
      - FAM:   FID IID PID MID SEX PHE

    Within each FID, the k-th PHENO row is paired with the k-th FAM row, and
    the result is written in FAM row order. Counts must already match per FID
    if PED and PHENO were written together. FIDs that appear only in PHENO are
    not written to the output.

    All fields are handled as text, so IDs such as ``001`` and tokens such as
    ``NA`` are written back exactly as they were read.

    Args:
        pheno_path: Whitespace-delimited, header-less PHENO file (4 columns).
        fam_path: Whitespace-delimited, header-less FAM file (6 columns).
        out_path: Destination; written space-delimited without header or index.

    Raises:
        ValueError: If the FAM has no rows, if a FAM FID is absent from PHENO,
            or if the per-FID row counts differ.
    """
    import pandas as pd

    # dtype=str + keep_default_na=False: without these, pandas would infer
    # numeric IDs (dropping leading zeros) and turn "NA" into NaN, which
    # to_csv then writes as an empty field.
    read_options = dict(
        sep=r"\s+",
        header=None,
        engine="python",
        dtype=str,
        keep_default_na=False,
    )
    fam = pd.read_csv(
        fam_path,
        names=["FID", "IID", "PID", "MID", "SEX", "PHE"],
        **read_options,
    )
    phe = pd.read_csv(
        pheno_path,
        names=["FID", "IID", "zscore", "value"],
        **read_options,
    )

    out_chunks = []
    phe_groups = {k: g for k, g in phe.groupby("FID", sort=False)}
    for fid, fam_grp in fam.groupby("FID", sort=False):
        if fid not in phe_groups:
            raise ValueError(f"FID present in FAM but missing in PHENO: {fid}")

        phe_grp = phe_groups[fid].copy()

        # Strict 1:1 check (DO NOT drop duplicates; we want exact replicate counts)
        if len(phe_grp) != len(fam_grp):
            raise ValueError(
                f"PHENO vs FAM row-count mismatch for FID={fid}: "
                f"pheno={len(phe_grp)} fam={len(fam_grp)}"
            )

        # Copy FAM IIDs (with suffixes) positionally, then adopt the FAM row
        # labels so the original FAM order can be restored after concatenation.
        phe_grp["IID"] = fam_grp["IID"].to_numpy()
        phe_grp.index = fam_grp.index
        out_chunks.append(phe_grp[["FID", "IID", "zscore", "value"]])

    if not out_chunks:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"FAM file contains no samples: {fam_path}")

    # Grouping collects all rows of a FID together; sorting by the FAM row
    # labels puts the output back into true FAM order even when a FID's rows
    # are not contiguous in the FAM.
    out = pd.concat(out_chunks, axis=0).sort_index()

    # na_rep guards the column count: a short PHENO row yields NaN, and the
    # default empty-string representation would drop a space-delimited field.
    out.to_csv(out_path, sep=" ", header=False, index=False, na_rep="NA")



