from mteb.abstasks.retrieval import AbsTaskRetrieval
from mteb.abstasks.task_metadata import TaskMetadata

# Name of the only dataset split this task evaluates on.
_EVAL_SPLIT = "test"

# Mapping from subset code to its list of "<language>-<Script>" tags.
# Each tuple below lists the tags of one subset; the subset code is taken
# from the language part of the first tag, so it never has to be spelled
# twice. Tuples are kept in alphabetical order of subset code.
_LANGUAGES = {
    tags[0].partition("-")[0]: list(tags)
    for tags in (
        ("ara-Arab",),
        ("aze-Latn",),
        ("ben-Beng",),
        ("bul-Cyrl",),
        ("cat-Latn",),
        ("ces-Latn",),
        ("dan-Latn",),
        ("deu-Latn",),
        ("ell-Grek",),
        ("eng-Latn",),
        ("est-Latn",),
        ("fas-Arab",),
        ("fin-Latn",),
        ("fra-Latn",),
        ("heb-Hebr",),
        ("hin-Deva",),
        ("hrv-Latn",),
        ("hun-Latn",),
        ("ind-Latn",),
        ("isl-Latn",),
        ("ita-Latn",),
        ("jpn-Jpan",),
        ("kat-Geor",),
        ("kaz-Cyrl",),
        ("kor-Kore",),
        ("lav-Latn",),
        ("lit-Latn",),
        ("mar-Deva",),
        ("msa-Latn",),
        ("nld-Latn",),
        # The "nor" subset is the only one tagged with several languages.
        ("nor-Latn", "nob-Latn", "nno-Latn"),
        ("pol-Latn",),
        ("por-Latn",),
        ("ron-Latn",),
        ("rus-Cyrl",),
        ("slk-Latn",),
        ("slv-Latn",),
        ("spa-Latn",),
        ("sqi-Latn",),
        ("srp-Cyrl",),
        ("swe-Latn",),
        ("tgl-Latn",),
        ("tha-Thai",),
        ("tur-Latn",),
        ("ukr-Cyrl",),
        ("urd-Arab",),
        ("uzb-Latn",),
        ("vie-Latn",),
        ("zho-Hans",),
    )
}


class WebFAQRetrieval(AbsTaskRetrieval):
    """Retrieval task declaration for the WebFAQ question-answer corpus.

    The class body only declares ``metadata``; all other behaviour is
    inherited from ``AbsTaskRetrieval``.
    """

    metadata = TaskMetadata(
        name="WebFAQRetrieval",
        # NOTE(review): the description mentions 75 languages while
        # `_LANGUAGES` lists 49 subset codes — presumably only a subset of
        # the corpus is evaluated here; confirm against the dataset card.
        description="WebFAQ is a broad-coverage corpus of natural question-answer pairs in 75 languages, gathered from FAQ pages on the web.",
        reference="https://huggingface.co/PaDaS-Lab",
        dataset={
            "path": "mteb/WebFAQRetrieval",
            # Fixed revision identifier of the dataset.
            "revision": "f64f483ad0f31d2e78209d524c14a4a867965959",
        },
        type="Retrieval",
        category="t2t",
        modalities=["text"],
        eval_splits=[_EVAL_SPLIT],
        eval_langs=_LANGUAGES,
        main_score="ndcg_at_10",
        date=("2022-09-01", "2024-10-01"),
        domains=["Web", "Written"],
        task_subtypes=["Question answering"],
        license="cc-by-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        # Raw string so that BibTeX escapes such as `\&` reach the citation
        # unchanged.
        bibtex_citation=r"""
@misc{dinzinger2025webfaq,
  archiveprefix = {arXiv},
  author = {Michael Dinzinger and Laura Caspari and Kanishka Ghosh Dastidar and Jelena Mitrović and Michael Granitzer},
  eprint = {2502.20936},
  primaryclass = {cs.CL},
  title = {WebFAQ: A Multilingual Collection of Natural Q\&A Datasets for Dense Retrieval},
  url = {https://arxiv.org/abs/2502.20936},
  year = {2025},
}
""",
    )
