import polars as pl
import polars.testing as pl_testing

import polars_bio as pb


def test_vcf_ensembl_1_parsing():
    """Compare ``pb.read_vcf`` output for ``ensembl.vcf`` with hand-written columns.

    Nine INFO fields are requested; the expected frame names those columns
    with the lower-cased field names. Only the columns listed in the
    expected frame are compared, one at a time, with dtype checking enabled.
    """
    requested_info = [
        "dbSNP_156",
        "TSA",
        "E_Freq",
        "E_Phenotype_or_Disease",
        "E_ExAC",
        "E_TOPMed",
        "E_gnomAD",
        "CLIN_uncertain_significance",
        "AA",
    ]
    observed = pb.read_vcf(
        "tests/data/io/vcf/ensembl.vcf", info_fields=requested_info
    )

    # One (column name, dtype, values) row per expected column; two records each.
    column_specs = [
        ("chrom", pl.Utf8, ["21", "21"]),
        ("start", pl.UInt32, [33248751, 5025532]),
        ("end", pl.UInt32, [33248751, 5025532]),
        ("id", pl.Utf8, ["rs549962048", "rs1879593094"]),
        ("ref", pl.Utf8, ["A", "G"]),
        ("alt", pl.Utf8, ["C|G", "C"]),
        ("qual", pl.Float64, [None, None]),
        ("filter", pl.Utf8, ["", ""]),
        ("dbsnp_156", pl.Boolean, [True, True]),
        ("tsa", pl.Utf8, ["SNV", "SNV"]),
        ("e_freq", pl.Boolean, [True, True]),
        ("e_phenotype_or_disease", pl.Boolean, [True, False]),
        ("e_exac", pl.Boolean, [True, False]),
        ("e_topmed", pl.Boolean, [True, False]),
        ("e_gnomad", pl.Boolean, [True, False]),
        ("clin_uncertain_significance", pl.Boolean, [False, False]),
        ("aa", pl.Utf8, ["A", "G"]),
    ]
    expected = pl.DataFrame(
        {name: values for name, _, values in column_specs},
        schema={name: dtype for name, dtype, _ in column_specs},
    )

    # Column-by-column comparison: extra columns in `observed` are not inspected.
    for name in expected.columns:
        pl_testing.assert_series_equal(
            observed[name], expected[name], check_dtypes=True
        )


def test_vcf_ensembl_2_parsing():
    """Compare ``pb.read_vcf`` output for ``ensembl-2.vcf`` with hand-written columns.

    Thirty-two INFO fields are requested; the expected frame names those
    columns with the lower-cased field names and holds a single record.
    Only the columns listed in the expected frame are compared, one at a
    time, with dtype checking enabled.
    """
    requested_info = [
        "COSMIC_100",
        "dbSNP_156",
        "HGMD-PUBLIC_20204",
        "ClinVar_202409",
        "TSA",
        "E_Cited",
        "E_Multiple_observations",
        "E_Freq",
        "E_TOPMed",
        "E_Hapmap",
        "E_Phenotype_or_Disease",
        "E_ESP",
        "E_gnomAD",
        "E_1000G",
        "E_ExAC",
        "CLIN_risk_factor",
        "CLIN_protective",
        "CLIN_confers_sensitivity",
        "CLIN_other",
        "CLIN_drug_response",
        "CLIN_uncertain_significance",
        "CLIN_benign",
        "CLIN_likely_pathogenic",
        "CLIN_pathogenic",
        "CLIN_likely_benign",
        "CLIN_histocompatibility",
        "CLIN_not_provided",
        "CLIN_association",
        "MA",
        "MAF",
        "MAC",
        "AA",
    ]
    observed = pb.read_vcf(
        "tests/data/io/vcf/ensembl-2.vcf", info_fields=requested_info
    )

    # One (column name, dtype, values) row per expected column; one record each.
    column_specs = [
        ("chrom", pl.Utf8, ["1"]),
        ("start", pl.UInt32, [2491309]),
        ("end", pl.UInt32, [2491309]),
        ("id", pl.Utf8, ["rs368445617"]),
        ("ref", pl.Utf8, ["T"]),
        ("alt", pl.Utf8, ["A|C"]),
        ("qual", pl.Float64, [None]),
        ("filter", pl.Utf8, [""]),
        ("cosmic_100", pl.Boolean, [False]),
        ("dbsnp_156", pl.Boolean, [True]),
        ("hgmd-public_20204", pl.Boolean, [False]),
        ("clinvar_202409", pl.Boolean, [False]),
        ("tsa", pl.Utf8, ["SNV"]),
        ("e_cited", pl.Boolean, [False]),
        ("e_multiple_observations", pl.Boolean, [False]),
        ("e_freq", pl.Boolean, [True]),
        ("e_topmed", pl.Boolean, [True]),
        ("e_hapmap", pl.Boolean, [False]),
        ("e_phenotype_or_disease", pl.Boolean, [True]),
        ("e_esp", pl.Boolean, [True]),
        ("e_gnomad", pl.Boolean, [True]),
        ("e_1000g", pl.Boolean, [False]),
        ("e_exac", pl.Boolean, [True]),
        ("clin_risk_factor", pl.Boolean, [False]),
        ("clin_protective", pl.Boolean, [False]),
        ("clin_confers_sensitivity", pl.Boolean, [False]),
        ("clin_other", pl.Boolean, [False]),
        ("clin_drug_response", pl.Boolean, [False]),
        ("clin_uncertain_significance", pl.Boolean, [True]),
        ("clin_benign", pl.Boolean, [False]),
        ("clin_likely_pathogenic", pl.Boolean, [False]),
        ("clin_pathogenic", pl.Boolean, [False]),
        ("clin_likely_benign", pl.Boolean, [False]),
        ("clin_histocompatibility", pl.Boolean, [False]),
        ("clin_not_provided", pl.Boolean, [False]),
        ("clin_association", pl.Boolean, [False]),
        ("ma", pl.Utf8, [None]),
        ("maf", pl.Float32, [None]),
        ("mac", pl.Int32, [None]),
        ("aa", pl.Utf8, ["T"]),
    ]
    expected = pl.DataFrame(
        {name: values for name, _, values in column_specs},
        schema={name: dtype for name, dtype, _ in column_specs},
    )

    # Column-by-column comparison: extra columns in `observed` are not inspected.
    for name in expected.columns:
        pl_testing.assert_series_equal(
            observed[name], expected[name], check_dtypes=True
        )
