import hashlib
from pathlib import Path
import pandas as pd


def calculate_md5(file_path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and fed to the hash in fixed-size
    chunks, so memory use is bounded by ``chunk_size`` rather than by the
    size of the file.

    Args:
        file_path: Path of the file to hash (anything ``open`` accepts).
        chunk_size: Number of bytes to read per iteration; must be >= 1.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        # read() returns b"" at end of file, which is the sentinel that stops iter().
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


# Record the checksum of scores.csv up front so that a later comparison can
# tell whether the file changed; None means there is nothing to compare against.
file_md5 = None
if not Path("scores.csv").exists():
    print("Warning: scores.csv does not exist. MD5 check will be skipped.")
else:
    file_md5 = calculate_md5("scores.csv")
    
"""
find . | grep -i sample | grep -i submission | grep -v sample_submission.csv | grep -v zip_files  | grep -v 'sample/'
./denoising-dirty-documents/sampleSubmission.csv
./the-icml-2013-whale-challenge-right-whale-redux/sampleSubmission.csv
./text-normalization-challenge-russian-language/ru_sample_submission_2.csv.zip
./text-normalization-challenge-russian-language/ru_sample_submission_2.csv
./random-acts-of-pizza/sampleSubmission.csv
./text-normalization-challenge-english-language/en_sample_submission_2.csv.zip
./text-normalization-challenge-english-language/en_sample_submission_2.csv
./detecting-insults-in-social-commentary/sample_submission_null.csv
"""

# Find sample submission file dynamically.
# The input directory is substituted in by the template engine. Keep the raw
# text in a single place so the Path below and the error message always agree,
# and so the path is never spliced into an f-string literal (where a brace in
# the rendered path would be parsed as a replacement field).
INPUT_PATH = '{% include "scenarios.data_science.share:scen.input_path" %}'
input_dir = Path(INPUT_PATH)

# Both snake_case and camelCase spellings are accepted; snake_case matches keep
# priority. Each group is sorted because glob() yields entries in
# filesystem-dependent order, which would otherwise make the chosen file
# vary between runs when several files match.
sample_submission_files = sorted(input_dir.glob("*sample_submission*.csv")) + sorted(
    input_dir.glob("*sampleSubmission*.csv")
)

if not sample_submission_files:
    print(f"Error: No sample submission file found in {INPUT_PATH}")
    sample_submission_name = None
    SAMPLE_SUBMISSION_PATH = None
else:
    # The first match is the one every later check compares against.
    sample_submission_name = sample_submission_files[0].name
    SAMPLE_SUBMISSION_PATH = str(sample_submission_files[0])
    print(f"Using sample submission file: {sample_submission_name}")

# Report each missing input on its own line so the output names exactly
# which file is absent.
if SAMPLE_SUBMISSION_PATH is not None and not Path(SAMPLE_SUBMISSION_PATH).exists():
    print(f"Error: {sample_submission_name} not found at {SAMPLE_SUBMISSION_PATH}")

if not Path("submission.csv").exists():
    print("Error: submission.csv not found")

def print_first_rows(file_path, file_name, num_rows=5):
    """Print the first ``num_rows`` raw lines of a text file.

    Args:
        file_path: Path of the file to preview.
        file_name: Label used for the file in the printed header and error text.
        num_rows: Maximum number of lines to print.

    A missing file is reported with a printed error instead of an exception.
    """
    print(f"\nFirst {num_rows} rows of {file_name}:")
    try:
        # Decode explicitly and tolerantly: the preview is informational, so it
        # should neither depend on the locale's default encoding nor abort on
        # an undecodable byte.
        with open(file_path, "r", encoding="utf-8", errors="replace") as file:
            for i, line in enumerate(file):
                if i >= num_rows:
                    break
                print(line.strip())
    except FileNotFoundError:
        print(f"Error: {file_name} not found.")


if SAMPLE_SUBMISSION_PATH is not None and Path(SAMPLE_SUBMISSION_PATH).exists() and Path("submission.csv").exists():
    sample_submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
    our_submission = pd.read_csv("submission.csv")

    print(f"Columns in {sample_submission_name}:", sample_submission.columns)
    print("Columns in our_submission.csv:", our_submission.columns)

    # Only the presence of every sample column is checked; column order and
    # extra columns in submission.csv are not considered.
    missing_columns = [col for col in sample_submission.columns if col not in our_submission.columns]
    for col in missing_columns:
        print(f"Column {col} not found in submission.csv")
    success = not missing_columns

    if success:
        print(f"submission.csv's columns aligns with {sample_submission_name} .")
    else:
        print(f"submission.csv's columns does not align with {sample_submission_name} .")

    print_first_rows(SAMPLE_SUBMISSION_PATH, sample_submission_name)
    print_first_rows("submission.csv", "submission.csv")

    # Re-hash scores.csv and compare with the checksum recorded earlier.
    if file_md5 is not None:
        if not Path("scores.csv").exists():
            # The file existed when the checksum was recorded; hashing it now
            # would raise, so report the removal instead.
            print("Warning: scores.csv has been deleted in the test script!")
        elif calculate_md5("scores.csv") != file_md5:
            print("Warning: scores.csv has been rewritten in the test script!")
else:
    print("Skipping comparison and preview due to missing files.")

# Closing reminder, printed regardless of which checks above ran.
print(
    "\nPlease Checked the content of the submission file("
    f"submission.csv should has the same format with {sample_submission_name} "
    f"but might not the same index with {sample_submission_name}). "
)
