# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

import warnings
from typing import Dict, Tuple

import plotly.express as px
import plotly.graph_objects as go
from datasets import Dataset

from validmind import RawData, tags, tasks
from validmind.errors import MissingDependencyError
from validmind.vm_models import VMDataset

from .utils import get_ragas_config, get_renamed_columns

# `ragas` is an optional dependency installed through the `llm` extra, so a
# missing install is surfaced as an actionable MissingDependencyError that
# names this test rather than as a bare ImportError.
try:
    from ragas import evaluate
    from ragas.metrics import SemanticSimilarity as semantic_similarity
except ImportError as e:
    if "ragas" in str(e):
        raise MissingDependencyError(
            "Missing required package `ragas` for SemanticSimilarity. "
            "Please run `pip install validmind[llm]` to use LLM tests",
            required_dependencies=["ragas"],
            extra="llm",
        ) from e

    # The failure does not mention ragas: re-raise the original error untouched.
    raise


@tags("ragas", "llm")
@tasks("text_qa", "text_generation", "text_summarization")
def SemanticSimilarity(
    dataset: VMDataset,
    response_column: str = "response",
    reference_column: str = "reference",
    judge_llm=None,
    judge_embeddings=None,
) -> Tuple[Dict[str, list], go.Figure, go.Figure, RawData]:
    """
    Calculates the semantic similarity between generated responses and ground truths

    Answer Semantic Similarity assesses how closely the meaning of a generated answer
    matches the ground truth. The score is computed from the `reference` and the
    `response` and falls within the range of 0 to 1. A higher score signifies a better
    alignment between the generated response and the ground truth.

    Measuring the semantic similarity between answers can offer valuable insights into
    the quality of the generated response. This evaluation utilizes a cross-encoder
    model to calculate the semantic similarity score.

    See this paper for more details: https://arxiv.org/pdf/2108.06130.pdf

    The following steps are involved in computing the answer similarity score:
    1. Vectorize the ground truth answer using the specified embedding model.
    2. Vectorize the generated answer using the same embedding model.
    3. Compute the cosine similarity between the two vectors.

    ### Configuring Columns

    This metric requires the following columns in your dataset:

    - `response` (str): The text response generated by the model.
    - `reference` (str): The ground truth answer that the generated answer is compared
    against.

    If the above data is not in the appropriate column, you can specify different column
    names for these fields using the parameters `response_column` and `reference_column`.

    For example, if your dataset has this data stored in different columns, you can
    pass the following parameters:
    ```python
    {
        "response_column": "llm_output_col",
        "reference_column": "my_ground_truth_col",
    }
    ```

    If the response is stored as a dictionary in another column, specify the column and
    key like this:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "response_column": f"{pred_col}.generated_answer",
        "reference_column": "my_ground_truth_col",
    }
    ```

    For more complex situations, you can use a function to extract the data:
    ```python
    pred_col = dataset.prediction_column(model)
    params = {
        "response_column": lambda row: "\\n\\n".join(row[pred_col]["messages"]),
        "reference_column": "my_ground_truth_col",
    }
    ```

    ### Output

    The test returns a one-row "Aggregate Scores" table (mean, median, max, min,
    standard deviation and count of the scores), a histogram and a box plot of the
    scores, and the raw evaluation results.
    """
    # Silence one specific FutureWarning message so it does not clutter the
    # test output.
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="promote has been superseded by promote_options='default'.",
    )

    # Map the canonical column names used here onto the caller-supplied columns.
    column_mapping = {"response": response_column, "reference": reference_column}
    renamed_df = get_renamed_columns(dataset._df, column_mapping)

    # Build the evaluate() arguments in the same order the call consumes them.
    ragas_dataset = Dataset.from_pandas(renamed_df)
    metric = semantic_similarity()
    ragas_config = get_ragas_config(judge_llm, judge_embeddings)

    evaluation = evaluate(ragas_dataset, metrics=[metric], **ragas_config)
    result_df = evaluation.to_pandas()

    scores = result_df["semantic_similarity"]
    score_values = scores.to_list()

    plot_title = "Semantic Similarity"
    fig_histogram = px.histogram(x=score_values, nbins=10, title=plot_title)
    fig_box = px.box(x=score_values, title=plot_title)

    # Per-row scores are deliberately left out of the summary tables; the full
    # evaluation frame is returned through RawData instead.
    aggregate_row = {
        "Mean Score": scores.mean(),
        "Median Score": scores.median(),
        "Max Score": scores.max(),
        "Min Score": scores.min(),
        "Standard Deviation": scores.std(),
        "Count": len(result_df),
    }
    tables = {"Aggregate Scores": [aggregate_row]}
    raw_data = RawData(evaluation_results=result_df, dataset=dataset.input_id)

    return tables, fig_histogram, fig_box, raw_data
