from pathlib import Path

import pandas as pd
import concurrent.futures

from examples.run_defects4j import get_defects4j_one_line_results
from examples.run_dypy import get_dypy_results
from examples.run_human_eval import get_human_eval_results
from examples.run_mbpp import get_mbpp_results
from lmwrapper.openai_wrapper import OpenAiModelNames
# Results are kept in a "results" folder next to this script. The folder is
# created at import time if it does not exist yet (the parent directory is the
# one containing this file, so only the leaf may need creating).
cur_path = Path(__file__).parent.absolute()
output_dir = cur_path.joinpath("results")
output_dir.mkdir(exist_ok=True)


def run_everything():
    """Gather results for every benchmark and return them concatenated.

    The benchmarks are processed in a fixed order (HumanEval, MBPP,
    Defects4J one-line, DyPyBench), each being handed ``output_dir`` as its
    output directory. Only the HumanEval gatherer is called with
    ``read_file_if_exists=True``; the others use their own defaults.
    Progress messages are printed before each step.

    Returns:
        The ``pd.concat`` of the four per-benchmark results, in the order
        listed above.
    """
    gathered = []

    print("Gather human eval results...")
    human_eval_frame = get_human_eval_results(
        output_directory=output_dir,
        read_file_if_exists=True,
    )
    # Show which columns the results carry.
    print(human_eval_frame.columns)
    gathered.append(human_eval_frame)

    print("Gather mbpp results...")
    gathered.append(get_mbpp_results(output_directory=output_dir))

    print("Gather defects4j results...")
    gathered.append(get_defects4j_one_line_results(output_directory=output_dir))

    print("Gather dypybench results...")
    gathered.append(get_dypy_results(output_directory=output_dir))

    print("Done")
    return pd.concat(gathered)


def read_all_parquets():
    """Load every ``*.parquet`` file directly inside ``output_dir``.

    The files are read concurrently on a thread pool and the resulting
    frames are concatenated in the order the glob listed them.

    Returns:
        A single DataFrame with the rows of all files, or an empty
        DataFrame when the directory holds no parquet files.
    """
    parquet_paths = list(output_dir.glob("*.parquet"))

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Executor.map yields results in input order, so the frames line up
        # with parquet_paths.
        frames = list(pool.map(pd.read_parquet, parquet_paths))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def make_table(df):
    """Render a LaTeX ``tabular`` of per-model success rates on each task.

    One body row is produced per configured model. For every task column the
    cell is the mean of ``main_metric__is_success`` over the rows of ``df``
    whose ``solver_key`` and ``task_name`` both match, shown as a percentage
    with one decimal place, or ``-`` when no rows match.

    Args:
        df: DataFrame providing the ``solver_key``, ``task_name`` and
            ``main_metric__is_success`` columns.

    Returns:
        The LaTeX source as one newline-joined string.
    """
    # Order here must line up with the hard-coded header row below.
    task_names = ["human_eval", "mbpp_func", "defects4j_one_line", "dypybench"]

    # (display name, solver_key value to match in df, solver type label)
    model_rows = [
        ("Codegen-350M", "Salesforce/codegen-350M-multi", "Completion"),
        ("Mistral-7B", "mistralai/Mistral-7B-v0.1", "Completion"),
        ("QwenCoder 2.5 0.5B", "Qwen/Qwen2.5-Coder-0.5B", "Completion"),
        # Disabled entries, kept for reference:
        # ("GPT-3.5-Turbo-Instruct", "gpt_3_5_turbo_instruct", "Completion"),
        # ("Qwen3-4B", "Qwen/Qwen3-4B", "Dialogue"),
        (
            "GPT-3.5-Turbo-Instruct",
            str(OpenAiModelNames.gpt_3_5_turbo_instruct),
            "Completion",
        ),
        ("GPT-3.5-Turbo", str(OpenAiModelNames.gpt_3_5_turbo), "Dialogue"),
        # ("GPT-4-Turbo", "gpt_4_turbo", "Dialogue"),
        ("GPT-4.1-Nano", str(OpenAiModelNames.gpt_4_1_nano), "Dialogue"),
        ("GPT-4.1", str(OpenAiModelNames.gpt_4_1), "Dialogue"),
    ]

    def format_cell(solver_key, task_name):
        """Return the table cell text for one (model, task) pair."""
        matching = df[(df.solver_key == solver_key) & (df.task_name == task_name)]
        if matching.empty:
            # No results recorded for this combination.
            return "-"
        success_rate = matching.main_metric__is_success.mean()
        return f"{success_rate * 100:.1f}\\%"

    # Two left-aligned label columns, then one right-aligned column per task.
    column_spec = "ll|" + "r" * len(task_names)
    table_lines = [
        "\\begin{tabular}{" + column_spec + "}",
        "LM Name & Solver & HumanEval & MBPP-f & Defects4J-1L & DyPyBench-LC \\\\ \\hline",
    ]
    for display_name, solver_key, solver_label in model_rows:
        cells = [display_name, solver_label]
        cells.extend(format_cell(solver_key, task_name) for task_name in task_names)
        table_lines.append(" & ".join(cells) + "\\\\")
    table_lines.append("\\end{tabular}")
    return "\n".join(table_lines)


def main():
    """Run all benchmarks, then print the stored results and a LaTeX table.

    The printed DataFrame and table are built from every parquet file found
    in ``output_dir`` (via ``read_all_parquets``), not from the value
    returned by ``run_everything``.
    """
    # The return value is deliberately not bound: it was previously assigned
    # to ``df`` and immediately overwritten, so it never influenced the output.
    # NOTE(review): presumably the gatherers persist their results under
    # output_dir so that read_all_parquets() picks them up — confirm.
    run_everything()
    df = read_all_parquets()
    print(df)
    print(make_table(df))


# Run the full pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
