# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
# See the LICENSE file in the root of this repository for details.
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

from typing import Any, Dict

import numpy as np
import pandas as pd
from numpy.linalg import LinAlgError

from validmind import tags, tasks
from validmind.errors import MissingDependencyError, SkipTestError
from validmind.logging import get_logger
from validmind.vm_models import VMDataset

# `PhillipsPerron` comes from the `arch` package, which the error message below
# associates with the `stats` extra of validmind. When the import fails, the
# failure is translated into a MissingDependencyError carrying an install hint.
try:
    from arch.unitroot import PhillipsPerron
except ImportError as e:
    # Substring match on the error text: only an ImportError that mentions
    # "arch" is reported as the missing optional dependency.
    if "arch" in str(e):
        raise MissingDependencyError(
            "Missing required package `arch` for PhillipsPerronArch. "
            "Please run `pip install validmind[stats]` to use statistical tests",
            required_dependencies=["arch"],
            extra="stats",
        ) from e
    # Any other ImportError is propagated unchanged.
    raise e

logger = get_logger(__name__)


@tags("time_series_data", "forecasting", "statistical_test", "unit_root_test")
@tasks("regression")
def PhillipsPerronArch(dataset: VMDataset) -> Dict[str, Any]:
    """
    Runs the Phillips-Perron unit root test on every numeric feature of a time series dataset.

    ### Purpose

    The Phillips-Perron (PP) test checks each feature of the dataset for stationarity, a property that matters for
    forecasting tasks. Its null hypothesis is that the series is unit-root non-stationary. Knowing whether that
    hypothesis can be rejected helps characterize the stochastic behavior of the data and supports the validity of
    predictions produced by regression models.

    ### Test Mechanism

    - The dataset's data frame is taken and every row that contains a missing value is dropped.
    - The index of the resulting frame must be a `DatetimeIndex` or a `PeriodIndex`; otherwise a `ValueError` is
    raised.
    - Feature columns that are not numeric are skipped with a warning. If no feature column is numeric, the test is
    skipped.
    - For each remaining feature, the Phillips-Perron test is run on the column values and the test statistic,
    p-value, lags used, and number of observations are recorded.
    - A feature whose computation raises an error is logged and left out of the results. If no feature produces a
    result, the test is skipped.

    ### Signs of High Risk

    - A high p-value, indicating that the series has a unit root and is non-stationary.
    - Test statistic values exceeding critical values, suggesting non-stationarity.
    - High 'usedlag' value, pointing towards autocorrelation issues that may degrade model performance.

    ### Strengths

    - Resilience against heteroskedasticity in the error term.
    - Effective for long time series data.
    - Helps decide whether a series is stationary, which informs the choice of a suitable forecasting model.

    ### Limitations

    - Applicable only within a univariate time series framework.
    - Relies on asymptotic theory, which may reduce the test's power for small sample sizes.
    - Non-stationary series must be made stationary through differencing, which can discard important data points.
    """
    # Rows with a missing value in any column are removed before anything else.
    clean_df = dataset.df.dropna()

    index_is_temporal = isinstance(
        clean_df.index, (pd.DatetimeIndex, pd.PeriodIndex)
    )
    if not index_is_temporal:
        raise ValueError(
            "Dataset index must be a datetime or period index for time series analysis."
        )

    # Only numeric columns can be tested; bail out early when no feature qualifies.
    numeric_cols = clean_df.select_dtypes(include=np.number).columns
    if all(col not in numeric_cols for col in dataset.feature_columns):
        raise SkipTestError("No numeric columns found for Phillips-Perron test.")

    results = []

    for col in dataset.feature_columns:
        if col not in numeric_cols:
            logger.warning(f"Skipping non-numeric column: {col}")
            continue

        try:
            observations = clean_df[col].dropna()
            if not len(observations):
                logger.warning(
                    f"Skipping column '{col}': No valid data after dropping NaN values"
                )
                continue

            unit_root_test = PhillipsPerron(observations.values)
            # Attribute access stays inside the `try` so that any error raised
            # while reading the results is handled per column.
            row = {
                "Variable": col,
                "stat": unit_root_test.stat,
                "pvalue": unit_root_test.pvalue,
                "usedlag": unit_root_test.lags,
                "nobs": unit_root_test.nobs,
            }
        except LinAlgError as e:
            logger.error(f"Error processing column '{col}': {e}")
        except Exception as e:
            # Best-effort: one failing feature must not abort the whole test.
            logger.error(f"Unexpected error processing column '{col}': {e}")
        else:
            results.append(row)

    if not results:
        raise SkipTestError("No valid columns found for Phillips-Perron test.")

    return {"Phillips-Perron Test Results": results}
