# src/arize/spans/client.py
# type: ignore[pb2]
import base64
import json
import logging
import os
import re
import tempfile
from datetime import datetime, timezone
from functools import partial
from typing import Any, Dict

import numpy as np
import pandas as pd
import pyarrow as pa
import requests

from arize._flight.client import ArizeFlightClient, WriteSpanResponse
from arize._flight.types import FlightRequestType
from arize._generated.protocol.flight.ingest_pb2 import (
    WriteSpanAnnotationResponse,
    WriteSpanEvaluationResponse,
)
from arize.config import SDKConfiguration
from arize.constants.spans import DEFAULT_DATETIME_FMT
from arize.exceptions.base import ValidationFailure
from arize.exceptions.models import MissingProjectNameError
from arize.exceptions.spaces import MissingSpaceIDError
from arize.logging import CtxAdapter, get_arize_project_url
from arize.utils.arrow import append_to_pyarrow_metadata, write_arrow_file
from arize.utils.pandas import remove_extraneous_columns, reset_dataframe_index
from arize.utils.proto import get_pb_schema_tracing, message_to_dict

# Module-level logger. Each public method of SpansClient wraps it in a
# CtxAdapter so that log records carry per-call context (space, project, ...).
logger = logging.getLogger(__name__)


# Logged as the prefix of the error message (followed by the pyarrow error
# text) when converting a dataframe to an Arrow table raises pa.ArrowInvalid.
INVALID_ARROW_CONVERSION_MSG = (
    "The dataframe needs to convert to pyarrow but has failed to do so. "
    "There may be unrecognized data types in the dataframe. "
    "Another reason may be that a column in the dataframe has a mix of strings and "
    "numbers, in which case you may want to convert the strings in that column to NaN. "
)


class SpansClient:
    def __init__(self, sdk_config: SDKConfiguration):
        """
        Create a spans client bound to the given SDK configuration.

        Args:
            sdk_config (SDKConfiguration): Configuration object the client reads
                its headers, API key and Flight connection settings from.
        """
        # Request headers start out empty; every public method replaces them
        # with a fresh value taken from ``sdk_config.headers`` before sending.
        self._headers = {}
        self.sdk_config = sdk_config

    def log(
        self,
        space_id: str,
        project_name: str,
        dataframe: pd.DataFrame,
        evals_dataframe: pd.DataFrame | None = None,
        datetime_format: str = DEFAULT_DATETIME_FMT,
        validate: bool = True,
        timeout: float | None = None,
        tmp_dir: str = "",
    ) -> requests.Response:
        """
        Logs a pandas dataframe containing LLM tracing data to Arize via a POST request. Returns a
        :class:`Response` object from the Requests HTTP library to ensure successful delivery of
        records.

        Both input dataframes are copied before any processing, so the caller's objects are
        never modified. The copies are validated, trimmed to the recognized columns, joined
        (when evaluations are given), converted to a pyarrow table and handed to
        ``self._log_arrow``.

        Args:
            space_id (str): The ID of the Arize space to log to. Required.
            project_name (str): A unique name to identify your project in the Arize platform.
                Required.
            dataframe (pd.DataFrame): The dataframe containing the LLM traces.
            evals_dataframe (pd.DataFrame, optional): A dataframe containing LLM evaluations data.
                The evaluations are joined to their corresponding spans via a left outer join, i.e.,
                using only `context.span_id` from the spans dataframe. Defaults to None.
            datetime_format (str): format for the timestamp captured in the LLM traces.
                Defaults to "%Y-%m-%dT%H:%M:%S.%f+00:00".
            validate (bool, optional): When set to True, dataframe form and value validation
                is run before sending data. Argument type validation always runs.
                Defaults to True.
            timeout (float, optional): You can stop waiting for a response after a given number
                of seconds with the timeout parameter. Defaults to None.
            tmp_dir (str, optional): Temporary directory/file to store the serialized data in binary
                before sending to Arize.

        Returns:
            `Response` object

        Raises:
            MissingSpaceIDError: If `space_id` is empty.
            MissingProjectNameError: If `project_name` is empty.
            ValidationFailure: If any validation step reports errors.
            pa.ArrowInvalid: If the data cannot be converted to Arrow format.

        """
        from arize.spans.columns import (
            EVAL_COLUMN_PATTERN,
            ROOT_LEVEL_SPAN_KIND_COL,
            SPAN_KIND_COL,
            SPAN_OPENINFERENCE_COLUMNS,
            SPAN_SPAN_ID_COL,
        )
        from arize.spans.conversion import (
            convert_timestamps,
            jsonify_dictionaries,
        )
        from arize.spans.validation.evals import evals_validation
        from arize.spans.validation.spans import spans_validation

        # This method requires a space_id and project_name
        if not space_id:
            raise MissingSpaceIDError()
        if not project_name:
            raise MissingProjectNameError()

        # We need our own copy since we will manipulate the underlying data and
        # do not want side effects
        spans_df = dataframe.copy()
        evals_df = (
            evals_dataframe.copy() if evals_dataframe is not None else None
        )

        # Create headers copy for the spans client.
        # NOTE(review): per the original author's note, sdk_config.headers hands
        # back a deep copy, so mutating it here should be safe — confirm.
        self._headers = self.sdk_config.headers
        # Send the number of rows in the dataframe as a header
        # This helps the Arize server to return appropriate feedback, specially for async logging
        self._headers.update(
            {
                "number-of-rows": str(len(spans_df)),
                "arize-space-id": space_id,
            }
        )

        # Bind common context for this operation
        log = CtxAdapter(
            logger,
            {
                "resource": "spans",
                "operation": "log",
                "space_id": space_id,
                "project": project_name,
                "validate": validate,
                "spans_df_rows": len(spans_df),
                "evals_df_rows": len(evals_df) if evals_df is not None else 0,
            },
        )

        # We expect the index to be 0,1,2,3..., len(df)-1. Phoenix, for example, will give us a dataframe
        # with context_id as the index
        reset_dataframe_index(dataframe=spans_df)
        # NOTE: always test the optional dataframe with `is not None`; using a
        # DataFrame in a boolean context raises
        # "ValueError: The truth value of a DataFrame is ambiguous".
        if evals_df is not None:
            reset_dataframe_index(dataframe=evals_df)

        log.debug("Performing direct input type validation")
        errors = spans_validation.validate_argument_types(
            spans_dataframe=spans_df,
            project_name=project_name,
            dt_fmt=datetime_format,
        )
        if evals_df is not None:
            eval_errors = evals_validation.validate_argument_types(
                evals_dataframe=evals_df,
                project_name=project_name,
            )
            errors += eval_errors
        for e in errors:
            log.error(e)
        if errors:
            raise ValidationFailure(errors)

        if validate:
            log.debug("Performing dataframe form validation")
            errors = spans_validation.validate_dataframe_form(
                spans_dataframe=spans_df
            )
            if evals_df is not None:
                eval_errors = evals_validation.validate_dataframe_form(
                    evals_dataframe=evals_df
                )
                errors += eval_errors
            for e in errors:
                log.error(e)
            if errors:
                raise ValidationFailure(errors)

        log.debug("Removing unnecessary columns")
        spans_df = remove_extraneous_columns(
            df=spans_df,
            column_list=[col.name for col in SPAN_OPENINFERENCE_COLUMNS],
        )
        if evals_df is not None:
            # Keep only the span id (join key) and columns matching the eval pattern
            evals_df = remove_extraneous_columns(
                df=evals_df,
                column_list=[SPAN_SPAN_ID_COL.name],
                regex=EVAL_COLUMN_PATTERN,
            )

        log.debug("Converting timestamps")
        spans_df = convert_timestamps(df=spans_df, fmt=datetime_format)

        if validate:
            log.debug("Performing values validation")
            errors = spans_validation.validate_values(
                spans_dataframe=spans_df,
                project_name=project_name,
            )
            if evals_df is not None:
                eval_errors = evals_validation.validate_values(
                    evals_dataframe=evals_df,
                    project_name=project_name,
                )
                errors += eval_errors
            for e in errors:
                log.error(e)
            if errors:
                raise ValidationFailure(errors)

        log.debug("Converting dictionaries to JSON objects")
        spans_df = jsonify_dictionaries(spans_df)
        # Only rename the root-level span kind column when the attribute-level
        # column is absent, so an existing attribute column is never clobbered.
        if (
            ROOT_LEVEL_SPAN_KIND_COL.name in spans_df.columns
            and SPAN_KIND_COL.name not in spans_df.columns
        ):
            log.debug("Moving span kind to atributes")
            spans_df.rename(
                columns={ROOT_LEVEL_SPAN_KIND_COL.name: SPAN_KIND_COL.name},
                inplace=True,
            )

        # Left join: every span is kept, evaluations are attached where the
        # span id matches.
        if evals_df is not None:
            df = pd.merge(
                spans_df, evals_df, on=SPAN_SPAN_ID_COL.name, how="left"
            )
        else:
            df = spans_df

        # Convert to Arrow table
        try:
            log.debug("Converting data to Arrow format")
            pa_table = pa.Table.from_pandas(df)
        except pa.ArrowInvalid as e:
            log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
            raise pa.ArrowInvalid(
                f"Error converting to Arrow format: {str(e)}"
            ) from e
        except Exception as e:
            log.error(f"Unexpected error creating Arrow table: {str(e)}")
            raise

        return self._log_arrow(
            project_name=project_name,
            pa_table=pa_table,
            timeout=timeout,
            tmp_dir=tmp_dir,
        )

    def update_evaluations(
        self,
        space_id: str,
        project_name: str,
        dataframe: pd.DataFrame,
        validate: bool = True,
        force_http: bool = False,
        timeout: float | None = None,
        tmp_dir: str = "",
    ) -> WriteSpanEvaluationResponse:
        """
        Logs a pandas dataframe containing LLM evaluations data to Arize.

        By default the data is sent with a Flight gRPC request; with ``force_http=True`` it is
        handed to ``self._log_arrow`` instead. The dataframe must contain a column
        `context.span_id` such that Arize can assign each evaluation to its respective span.
        The input dataframe is copied, so the caller's object is not modified.

        Args:
            space_id (str): The ID of the Arize space the spans belong to. Required.
            project_name (str): A unique name to identify your project in the Arize platform.
                Required.
            dataframe (pd.DataFrame): A dataframe containing LLM evaluations data.
            validate (bool, optional): When set to True, dataframe form and value validation
                is run before sending data. Argument type validation always runs.
                Defaults to True.
            force_http (bool, optional): When True, skip the Flight client and return the
                result of ``self._log_arrow``. Defaults to False.
            timeout (float, optional): Forwarded to ``self._log_arrow``; only used when
                ``force_http`` is True. Defaults to None.
            tmp_dir (str, optional): Forwarded to ``self._log_arrow``; only used when
                ``force_http`` is True.

        Returns:
            With ``force_http=True``, whatever ``self._log_arrow`` returns. Otherwise the
            Flight response converted with ``message_to_dict``.

        Raises:
            MissingSpaceIDError: If `space_id` is empty.
            MissingProjectNameError: If `project_name` is empty.
            ValidationFailure: If any validation step reports errors.
            pa.ArrowInvalid: If the data cannot be converted to Arrow format.
            RuntimeError: If the Flight request fails or yields no response.
        """
        from arize.spans.columns import EVAL_COLUMN_PATTERN, SPAN_SPAN_ID_COL
        from arize.spans.validation.evals import evals_validation

        # Both identifiers are mandatory for this call
        if not space_id:
            raise MissingSpaceIDError()
        if not project_name:
            raise MissingProjectNameError()

        # Start from the SDK-level headers and add the per-request ones; the
        # row count lets the server tailor its feedback.
        self._headers = self.sdk_config.headers
        request_headers = {
            "number-of-rows": str(len(dataframe)),
            "arize-space-id": space_id,
        }
        self._headers.update(request_headers)

        # Contextual logger shared by every message emitted below
        log_context = {
            "resource": "spans",
            "operation": "log",
            "space_id": space_id,
            "project": project_name,
            "validate": validate,
            "evals_df_rows": len(dataframe),
        }
        log = CtxAdapter(logger, log_context)

        def _raise_if_any(problems):
            # Report every validation problem, then abort if there was at least one
            for problem in problems:
                log.error(problem)
            if problems:
                raise ValidationFailure(problems)

        # Work on a private copy with a fresh 0..n-1 index; the incoming index
        # (e.g. a context id coming from Phoenix) carries no meaning here.
        eval_frame = dataframe.copy()
        reset_dataframe_index(dataframe=eval_frame)

        log.debug("Performing direct input type validation")
        _raise_if_any(
            evals_validation.validate_argument_types(
                evals_dataframe=eval_frame,
                project_name=project_name,
            )
        )

        if validate:
            log.debug("Performing dataframe form validation")
            _raise_if_any(
                evals_validation.validate_dataframe_form(
                    evals_dataframe=eval_frame
                )
            )

        log.debug("Removing unnecessary columns")
        eval_frame = remove_extraneous_columns(
            df=eval_frame,
            column_list=[SPAN_SPAN_ID_COL.name],
            regex=EVAL_COLUMN_PATTERN,
        )

        if validate:
            log.debug("Performing values validation")
            _raise_if_any(
                evals_validation.validate_values(
                    evals_dataframe=eval_frame,
                    project_name=project_name,
                )
            )

        # Build the Arrow table that is sent over the wire
        try:
            log.debug("Converting data to Arrow format")
            arrow_table = pa.Table.from_pandas(eval_frame)
        except pa.ArrowInvalid as exc:
            log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {exc!s}")
            raise pa.ArrowInvalid(
                f"Error converting to Arrow format: {exc!s}"
            ) from exc
        except Exception as exc:
            log.error(f"Unexpected error creating Arrow table: {exc!s}")
            raise

        # Caller explicitly opted out of Flight
        if force_http:
            return self._log_arrow(
                project_name=project_name,
                pa_table=arrow_table,
                timeout=timeout,
                tmp_dir=tmp_dir,
            )

        request_type = FlightRequestType.EVALUATION
        flight_response = None
        flight_settings = {
            "api_key": self.sdk_config.api_key,
            "host": self.sdk_config.flight_server_host,
            "port": self.sdk_config.flight_server_port,
            "scheme": self.sdk_config.flight_scheme,
        }
        with ArizeFlightClient(**flight_settings) as flight_client:
            try:
                flight_response = flight_client.log_arrow_table(
                    space_id=space_id,
                    project_name=project_name,
                    pa_table=arrow_table,
                    request_type=request_type,
                )
            except Exception as exc:
                failure = f"Error during update request: {exc!s}"
                log.error(failure)
                raise RuntimeError(failure) from exc

        # Defensive: a well-behaved Flight client always yields a response
        if flight_response is None:
            failure = "No response received from flight server during update"
            log.error(failure)
            raise RuntimeError(failure)

        _log_flight_update_summary(
            project_name=project_name,
            total_spans=len(arrow_table),
            request_type=request_type,
            response=flight_response,
        )

        # Hand back a plain-dict view of the protobuf response for easier access
        return message_to_dict(flight_response)

    def update_annotations(
        self,
        space_id: str,
        project_name: str,
        dataframe: pd.DataFrame,
        validate: bool = True,
    ) -> WriteSpanAnnotationResponse:
        """
        Logs a pandas dataframe containing LLM span annotations to Arize via a Flight gRPC request.

        The dataframe must contain a column `context.span_id` such that Arize can assign each
        annotation to its respective span. Annotation columns should follow the pattern
        `annotation.<name>.<suffix>` where suffix is either `label` or `score`. An optional
        `annotation.notes` column can be included for free-form text notes.

        The input dataframe is copied, so the caller's object is not modified. For every
        annotation name that has a label or score column, missing "updated by" / "updated at"
        columns are filled in automatically with ``"SDK"`` and the current UTC time in
        milliseconds respectively.

        Args:
            space_id (str): The ID of the Arize space the spans belong to. Required.
            project_name (str): A unique name to identify your project in the Arize platform.
                Required.
            dataframe (pd.DataFrame): A dataframe containing LLM annotation data.
            validate (bool, optional): When set to True, dataframe form and value validation
                is run before sending data. Argument type validation always runs.
                Defaults to True.

        Returns:
            The Flight response converted with ``message_to_dict``.

        Raises:
            MissingSpaceIDError: If `space_id` is empty.
            MissingProjectNameError: If `project_name` is empty.
            ValidationFailure: If any validation step reports errors.
            pa.ArrowInvalid: If the data cannot be converted to Arrow format.
            RuntimeError: If the Flight request fails or yields no response.
        """
        from arize.spans.columns import (
            ANNOTATION_COLUMN_PATTERN,
            ANNOTATION_LABEL_SUFFIX,
            ANNOTATION_NOTES_COLUMN_NAME,
            ANNOTATION_SCORE_SUFFIX,
            ANNOTATION_UPDATED_AT_SUFFIX,
            ANNOTATION_UPDATED_BY_SUFFIX,
            SPAN_SPAN_ID_COL,
        )
        from arize.spans.validation.annotations import annotations_validation

        # Both identifiers are mandatory for this call
        if not space_id:
            raise MissingSpaceIDError()
        if not project_name:
            raise MissingProjectNameError()

        # Start from the SDK-level headers and add the per-request ones; the
        # row count lets the server tailor its feedback.
        self._headers = self.sdk_config.headers
        request_headers = {
            "number-of-rows": str(len(dataframe)),
            "arize-space-id": space_id,
        }
        self._headers.update(request_headers)

        # Contextual logger shared by every message emitted below
        log_context = {
            "resource": "spans",
            "operation": "log",
            "space_id": space_id,
            "project": project_name,
            "validate": validate,
            "evals_df_rows": len(dataframe),
        }
        log = CtxAdapter(logger, log_context)

        def _raise_if_any(problems):
            # Report every validation problem, then abort if there was at least one
            for problem in problems:
                log.error(problem)
            if problems:
                raise ValidationFailure(problems)

        # Work on a private copy with a fresh 0..n-1 index; the incoming index
        # (e.g. a context id coming from Phoenix) carries no meaning here.
        anno_df = dataframe.copy()
        reset_dataframe_index(dataframe=anno_df)

        log.debug(
            "Checking for and autogenerating missing updated_by/updated_at annotation columns"
        )
        # Collect the distinct annotation names, e.g. "quality" from
        # "annotation.quality.label", looking only at columns that match the
        # annotation column pattern.
        name_pattern = re.compile(r"^annotation\.([a-zA-Z0-9_\s]+?)(\..+)$")
        annotation_names = {
            found.group(1)
            for col in anno_df.columns
            if re.match(ANNOTATION_COLUMN_PATTERN, col)
            and (found := name_pattern.match(col))
        }

        log.debug(f"Found annotation names: {annotation_names}")

        # One timestamp (UTC, milliseconds) shared by all autogenerated values
        current_time_ms = int(datetime.now(timezone.utc).timestamp() * 1000)

        for name in annotation_names:
            prefix = f"annotation.{name}"
            updated_by_col = f"{prefix}{ANNOTATION_UPDATED_BY_SUFFIX}"
            updated_at_col = f"{prefix}{ANNOTATION_UPDATED_AT_SUFFIX}"
            label_col = f"{prefix}{ANNOTATION_LABEL_SUFFIX}"
            score_col = f"{prefix}{ANNOTATION_SCORE_SUFFIX}"

            # Metadata is only generated for annotations that actually carry
            # a label or a score.
            has_value_column = (
                label_col in anno_df.columns or score_col in anno_df.columns
            )
            if not has_value_column:
                log.debug(
                    f"Skipping metadata generation for '{name}' as no label or score column found."
                )
                continue

            if updated_by_col not in anno_df.columns:
                log.debug(f"Autogenerating column: {updated_by_col}")
                anno_df[updated_by_col] = "SDK"
            if updated_at_col not in anno_df.columns:
                log.debug(f"Autogenerating column: {updated_at_col}")
                anno_df[updated_at_col] = current_time_ms

        if ANNOTATION_NOTES_COLUMN_NAME in anno_df.columns:
            log.debug(
                f"Formatting {ANNOTATION_NOTES_COLUMN_NAME} column to JSON strings within lists."
            )
            format_note = partial(
                _format_note_for_storage,
                current_time_ms=current_time_ms,
            )
            raw_notes = anno_df[ANNOTATION_NOTES_COLUMN_NAME]
            anno_df[ANNOTATION_NOTES_COLUMN_NAME] = raw_notes.apply(format_note)

        log.debug("Performing direct input type validation for annotations")
        _raise_if_any(
            annotations_validation.validate_argument_types(
                annotations_dataframe=anno_df,
                project_name=project_name,
            )
        )

        if validate:
            log.debug("Performing dataframe form validation for annotations")
            _raise_if_any(
                annotations_validation.validate_dataframe_form(
                    annotations_dataframe=anno_df
                )
            )

        log.debug("Removing unnecessary annotation columns")
        # Keep the span id, the notes column (when present) and everything
        # matching the annotation column pattern.
        columns_to_keep = [SPAN_SPAN_ID_COL.name] + (
            [ANNOTATION_NOTES_COLUMN_NAME]
            if ANNOTATION_NOTES_COLUMN_NAME in anno_df.columns
            else []
        )
        anno_df = remove_extraneous_columns(
            df=anno_df,
            column_list=columns_to_keep,
            regex=ANNOTATION_COLUMN_PATTERN,
        )

        if validate:
            log.debug("Performing annotation values validation")
            _raise_if_any(
                annotations_validation.validate_values(
                    annotations_dataframe=anno_df,
                    project_name=project_name,
                )
            )

        # Build the Arrow table that is sent over the wire
        try:
            log.debug("Converting data to Arrow format")
            arrow_table = pa.Table.from_pandas(anno_df)
        except pa.ArrowInvalid as exc:
            log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {exc!s}")
            raise pa.ArrowInvalid(
                f"Error converting to Arrow format: {exc!s}"
            ) from exc
        except Exception as exc:
            log.error(f"Unexpected error creating Arrow table: {exc!s}")
            raise

        # Sanity check only: warn (do not fail) when the notes column was not
        # inferred as list<string>.
        if ANNOTATION_NOTES_COLUMN_NAME in anno_df.columns:
            notes_field = arrow_table.schema.field(ANNOTATION_NOTES_COLUMN_NAME)
            is_string_list = (
                isinstance(notes_field.type, pa.ListType)
                and notes_field.type.value_type == pa.string()
            )
            if not is_string_list:
                log.warning(
                    f"Warning: Inferred type for {ANNOTATION_NOTES_COLUMN_NAME} is "
                    f"{notes_field.type}, expected list<string>."
                )

        request_type = FlightRequestType.ANNOTATION
        flight_response = None
        flight_settings = {
            "api_key": self.sdk_config.api_key,
            "host": self.sdk_config.flight_server_host,
            "port": self.sdk_config.flight_server_port,
            "scheme": self.sdk_config.flight_scheme,
        }
        with ArizeFlightClient(**flight_settings) as flight_client:
            try:
                flight_response = flight_client.log_arrow_table(
                    space_id=space_id,
                    project_name=project_name,
                    pa_table=arrow_table,
                    request_type=request_type,
                )
            except Exception as exc:
                failure = f"Error during update request: {exc!s}"
                log.error(failure)
                raise RuntimeError(failure) from exc

        # Defensive: a well-behaved Flight client always yields a response
        if flight_response is None:
            failure = "No response received from flight server during update"
            log.error(failure)
            raise RuntimeError(failure)

        _log_flight_update_summary(
            project_name=project_name,
            total_spans=len(arrow_table),
            request_type=request_type,
            response=flight_response,
        )

        # Hand back a plain-dict view of the protobuf response for easier access
        return message_to_dict(flight_response)

    def update_metadata(
        self,
        space_id: str,
        project_name: str,
        dataframe: pd.DataFrame,
        patch_document_column_name: str = "patch_document",
        validate: bool = True,
    ) -> Dict[str, Any]:
        """
        Log metadata updates using JSON Merge Patch format. This method is only supported for LLM model types.

        The dataframe must contain a column `context.span_id` to identify spans and either:
        1. A column with JSON patch documents (specified by patch_document_column_name), or
        2. One or more columns with prefix `attributes.metadata.` that will be automatically
           converted to a patch document (e.g., `attributes.metadata.tag` → `{"tag": value}`).

        If both methods are used, the explicit patch document is applied after the individual field updates.
        The patches will be applied to the `attributes.metadata` field of each span.

        **Type Handling:**
        - The client primarily supports string, integer, and float data types.
        - Boolean values are converted to string representations.
        - Nested JSON objects and arrays are serialized to JSON strings during transmission.
        - Setting a field to `None` or `null` will set the field to JSON null in the metadata.
          Note: This differs from standard JSON Merge Patch where null values remove fields.

        Args:
            dataframe: DataFrame with span_ids and either patch documents or metadata field columns.
            project_name: A unique name to identify your project in the Arize platform.
            patch_document_column_name: Name of the column containing JSON patch documents.
                Defaults to "patch_document".
            validate: When set to True, validation is run before sending data.

        Returns:
            Dictionary containing update results with the following keys:
                - spans_processed: Total number of spans in the input dataframe
                - spans_updated: Count of successfully updated span metadata records
                - spans_failed: Count of spans that failed to update
                - errors: List of dictionaries with 'span_id' and 'error_message' keys for each failed span

            Error types from the server include:
                - parse_failure: Failed to parse JSON metadata
                - patch_failure: Failed to apply JSON patch
                - type_conflict: Type conflict in metadata
                - connection_failure: Connection issues
                - segment_not_found: No matching segment found
                - druid_rejection: Backend rejected the update

        Raises:
            AuthError: When API key or space ID is missing
            ValidationFailure: When validation of the dataframe or values fails
            ImportError: When required tracing dependencies are missing
            ArrowInvalid: When the dataframe cannot be converted to Arrow format
            RuntimeError: If the request fails or no response is received

        Example:
            ```python
            # Method 1: Using a patch document
            df = pd.DataFrame(
                {
                    "context.span_id": ["span1", "span2"],
                    "patch_document": [
                        {"tag": "important"},
                        {"priority": "high"},
                    ],
                }
            )

            # Method 2: Using direct field columns
            df = pd.DataFrame(
                {
                    "context.span_id": ["span1", "span2"],
                    "attributes.metadata.tag": ["important", "standard"],
                    "attributes.metadata.priority": ["high", "medium"],
                }
            )

            # Method 3: Combining both approaches
            df = pd.DataFrame(
                {
                    "context.span_id": ["span1"],
                    "attributes.metadata.tag": ["important"],
                    "patch_document": [
                        {"priority": "high"}
                    ],  # This will override any conflicting fields
                }
            )

            # Method 4: Setting fields to null
            df = pd.DataFrame(
                {
                    "context.span_id": ["span1"],
                    "attributes.metadata.old_field": [
                        None
                    ],  # Sets field to JSON null
                    "patch_document": [
                        {"other_field": None}
                    ],  # Also sets field to JSON null
                }
            )
            ```
        """
        # Import validation modules
        from arize.spans.columns import SPAN_SPAN_ID_COL
        from arize.spans.validation.metadata.argument_validation import (
            validate_argument_types,
        )
        from arize.spans.validation.metadata.dataframe_form_validation import (
            validate_dataframe_form,
        )
        from arize.spans.validation.metadata.value_validation import (
            validate_values,
        )

        # This method requires a space_id and project_name
        if not space_id:
            raise MissingSpaceIDError()
        if not project_name:
            raise MissingProjectNameError()

        # Create headers copy for the spans client
        # Safe to mutate, returns a deep copy
        self._headers = self.sdk_config.headers
        # Send the number of rows in the dataframe as a header
        # This helps the Arize server to return appropriate feedback, specially for async logging
        self._headers.update(
            {
                "number-of-rows": str(len(dataframe)),
                "arize-space-id": space_id,
            }
        )

        # Bind common context for this operation
        log = CtxAdapter(
            logger,
            {
                "resource": "spans",
                "operation": "log",
                "space_id": space_id,
                "project": project_name,
                "validate": validate,
                "evals_df_rows": len(dataframe),
            },
        )

        # We need our own copy since we will manipulate the underlying data and
        # do not want side effects
        metadata_df = dataframe.copy()

        # We expect the index to be 0,1,2,3..., len(df)-1. Phoenix, for example, will give us a dataframe
        # with context_id as the index; the old index is not meaningful in our copy of the original dataframe
        # so we can drop it.
        reset_dataframe_index(dataframe=metadata_df)

        # Check if we have any attributes.metadata.* columns to build a patch document
        metadata_prefix = "attributes.metadata."
        metadata_fields = [
            col
            for col in metadata_df.columns
            if col.startswith(metadata_prefix)
        ]
        has_metadata_fields = len(metadata_fields) > 0
        has_patch_document = patch_document_column_name in metadata_df.columns

        if not has_metadata_fields and not has_patch_document:
            error_msg = (
                f"No metadata fields found. Either provide columns with prefix '{metadata_prefix}' "
                f"or a '{patch_document_column_name}' column with JSON patch documents."
            )
            log.error(error_msg)
            raise ValueError(error_msg)

        if has_metadata_fields:
            log.debug(
                f"Found {len(metadata_fields)} metadata field columns with prefix '{metadata_prefix}'"
            )

        # Create a new column for patch documents if we're going to use it
        if has_metadata_fields or has_patch_document:
            # Use 'patch_document' as the standardized column name for downstream processing
            final_patch_column = "patch_document"
            if final_patch_column not in metadata_df.columns:
                metadata_df[final_patch_column] = None

        # Process metadata field columns if they exist
        if has_metadata_fields:
            # Create patch documents from metadata fields
            field_patches = metadata_df.apply(_build_patch_document, axis=1)

            # If there's an existing patch document column, we'll handle merging
            if has_patch_document:
                # Apply the processing function to each row
                merged_patches = [
                    _process_patch_document(
                        metadata_df,
                        patch_document_column_name,
                        field_patches,
                        idx,
                    )
                    for idx in range(len(metadata_df))
                ]
                metadata_df[final_patch_column] = merged_patches
            else:
                # Just use the field patches directly
                metadata_df[final_patch_column] = field_patches
        elif (
            has_patch_document
            and patch_document_column_name != final_patch_column
        ):
            # If there are only patch documents (no metadata fields) and the column
            # isn't already named patch_document, rename it
            metadata_df[final_patch_column] = metadata_df[
                patch_document_column_name
            ]

        # Now process any patch documents that need to be parsed from strings to dicts
        if final_patch_column in metadata_df.columns:
            validation_errors = []

            # Process each row
            processed_patches = []
            for idx in range(len(metadata_df)):
                patch, errors = _ensure_dict_patch(
                    metadata_df,
                    final_patch_column,
                    idx,
                )
                if patch:
                    processed_patches.append(patch)
                if errors:
                    validation_errors.append(errors)

            # If validation is enabled and errors found, raise ValidationFailure
            if validate and validation_errors:
                for e in validation_errors:
                    log.error(e)
                raise ValidationFailure(validation_errors)

            metadata_df[final_patch_column] = processed_patches

        # Run validations on the processed dataframe
        if validate:
            log.debug("Validating metadata update input")

            # Type validation
            errors = validate_argument_types(
                metadata_dataframe=metadata_df, project_name=project_name
            )
            for e in errors:
                log.error(e)
            if errors:
                raise ValidationFailure(errors)

            # Dataframe form validation
            log.debug("Validating metadata update dataframe form")
            errors = validate_dataframe_form(
                metadata_dataframe=metadata_df,
                patch_document_column_name=final_patch_column,
            )
            for e in errors:
                log.error(e)
            if errors:
                raise ValidationFailure(errors)

            # Value validation
            log.debug("Validating metadata update values")
            errors = validate_values(
                metadata_dataframe=metadata_df,
                patch_document_column_name=final_patch_column,
            )
            for e in errors:
                log.error(e)
            if errors:
                raise ValidationFailure(errors)

        # Keep only the required columns
        metadata_df = remove_extraneous_columns(
            df=metadata_df,
            column_list=[SPAN_SPAN_ID_COL.name, final_patch_column],
        )

        log.debug("Using column names: context.span_id and patch_document")
        # Ensure all patches are JSON strings for sending
        if final_patch_column in metadata_df.columns:
            metadata_df[final_patch_column] = metadata_df[
                final_patch_column
            ].apply(
                lambda p: json.dumps(p)
                if not isinstance(p, float) or not np.isnan(p)
                else json.dumps({})
            )

        # Convert to Arrow table
        try:
            log.debug("Converting data to Arrow format")
            pa_table = pa.Table.from_pandas(metadata_df)
        except pa.ArrowInvalid as e:
            log.error(f"{INVALID_ARROW_CONVERSION_MSG}: {str(e)}")
            raise pa.ArrowInvalid(
                f"Error converting to Arrow format: {str(e)}"
            ) from e
        except Exception as e:
            log.error(f"Unexpected error creating Arrow table: {str(e)}")
            raise

        request_type = FlightRequestType.METADATA
        response = None
        with ArizeFlightClient(
            api_key=self.sdk_config.api_key,
            host=self.sdk_config.flight_server_host,
            port=self.sdk_config.flight_server_port,
            scheme=self.sdk_config.flight_scheme,
        ) as flight_client:
            try:
                response = flight_client.log_arrow_table(
                    space_id=space_id,
                    project_name=project_name,
                    pa_table=pa_table,
                    request_type=request_type,
                )
            except Exception as e:
                msg = f"Error during update request: {str(e)}"
                log.error(msg)
                raise RuntimeError(msg) from e

        if response is None:
            # This should not happen with proper Flight client implementation,
            # but we handle it defensively
            msg = "No response received from flight server during update"
            log.error(msg)
            raise RuntimeError(msg)

        _log_flight_update_summary(
            project_name=project_name,
            total_spans=len(pa_table),
            request_type=request_type,
            response=response,
        )

        # Convert Protocol Buffer SpanError objects to dictionaries for easier access
        return message_to_dict(response)

    def _log_arrow(
        self,
        project_name: str,
        pa_table: pa.Table,
        timeout: float | None = None,
        tmp_dir: str = "",
    ) -> requests.Response:
        """
        Write an Arrow table to a file and upload that file to Arize.

        The tracing proto schema for ``project_name`` is serialized, base64
        encoded, and attached to the Arrow schema metadata under the
        ``arize-schema`` key before the table is written.

        Args:
            project_name: Project name used to build the tracing proto schema.
            pa_table: The Arrow table to write and upload.
            timeout: Timeout forwarded to the upload request.
            tmp_dir: Controls where the intermediate file is written:
                - ``""``: a temporary directory is created and removed afterwards.
                - an existing directory: a temp file is created inside it and
                  only that file is removed afterwards.
                - anything else: treated as an explicit file path owned by the
                  caller; it is written to and never removed here.

        Returns:
            The response returned by the file upload request.
        """
        logger.debug(
            "Preparing to log Arrow table via file upload",
            extra={"rows": pa_table.num_rows, "cols": pa_table.num_columns},
        )
        proto_schema = get_pb_schema_tracing(project_name=project_name)

        logger.debug("Serializing schema")
        base64_schema = base64.b64encode(proto_schema.SerializeToString())
        pa_schema = append_to_pyarrow_metadata(
            pa_table.schema, {"arize-schema": base64_schema}
        )

        tdir = None  # Stays None when the caller owns the directory/file
        cleanup_file = False
        outfile = tmp_dir
        try:
            # Decide the output path inside the try so that a failure while
            # creating the temp file still runs the cleanup below.
            if not tmp_dir:
                # we own the directory. Best effort cleanup on Windows:
                # https://www.scivision.dev/python-tempfile-permission-error-windows/
                tdir = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
                outfile = _mktemp_in(tdir.name)
            elif os.path.isdir(tmp_dir):
                outfile = _mktemp_in(tmp_dir)
                cleanup_file = True  # we own the file, not the directory
            # else: explicit file path owned by the caller (no cleanup)

            # Write arrow file
            logger.debug(f"Writing table to temporary file: {outfile}")
            write_arrow_file(outfile, pa_table, pa_schema)

            # Send to Arize
            logger.debug(
                "Uploading file to Arize",
                extra={"path": outfile, "size_bytes": _filesize(outfile)},
            )
            resp = self._post_file(path=outfile, timeout=timeout)
            _maybe_log_project_url(resp)
            return resp
        finally:
            if tdir is not None:
                try:
                    # Removes the whole directory, so the file inside it does
                    # not need separate removal (best-effort on Windows).
                    tdir.cleanup()
                except Exception as e:
                    logger.warning(
                        f"Failed to remove temporary directory {tdir.name}: {str(e)}"
                    )
            elif cleanup_file:
                try:
                    os.remove(outfile)
                except Exception as e:
                    logger.warning(
                        f"Failed to remove temporary file {outfile}: {str(e)}"
                    )

    def _post_file(
        self,
        path: str,
        timeout: float | None = None,
    ) -> requests.Response:
        """
        POST the file at ``path`` to the configured files URL.

        The file is opened in binary mode and its handle is passed as the
        request body, together with the headers stored on this client and the
        ``request_verify`` setting from the SDK configuration.

        Args:
            path: Path of the file to upload.
            timeout: Timeout forwarded to ``requests.post``.

        Returns:
            The response object returned by ``requests.post``.
        """
        with open(path, "rb") as payload:
            # The request is issued while the handle is still open.
            upload_response = requests.post(
                self.sdk_config.files_url,
                timeout=timeout,
                data=payload,
                headers=self._headers,
                verify=self.sdk_config.request_verify,
            )
            return upload_response


def _build_patch_document(row):
    """
    Build a patch dictionary from the ``attributes.metadata.*`` entries of a row.

    Args:
        row: A row-like object exposing ``.index`` (the labels) and label-based
            item access, e.g. a ``pandas.Series`` produced by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        A dict mapping each metadata field name (the label with the leading
        ``attributes.metadata.`` prefix removed) to the row's value for it.
        Values are copied as-is, including ``None``/null values, so that a
        field can be explicitly set to null.
    """
    prefix = "attributes.metadata."
    patch = {}
    for key in row.index:
        # Non-string labels cannot carry the metadata prefix.
        if not isinstance(key, str) or not key.startswith(prefix):
            continue
        # Strip only the leading prefix; a field name that happens to contain
        # the prefix text again must be preserved verbatim.
        field_name = key[len(prefix):]
        patch[field_name] = row[key]
    return patch


def _process_patch_document(
    metadata_df, patch_document_column_name, field_patches, row_idx
):
    """
    Merge a row's field-derived patch with its explicit patch document.

    The explicit patch is read from ``metadata_df`` at
    ``(row_idx, patch_document_column_name)`` and normalized to a dict:
    ``None`` and NaN become an empty dict, a dict is used as-is, and a string
    is parsed as JSON. Anything that cannot be turned into a dict is replaced
    by an empty dict and a warning is logged.

    Args:
        metadata_df: Dataframe containing the explicit patch document column.
        patch_document_column_name: Name of that column.
        field_patches: Positionally indexed collection of per-row patches
            built from the metadata field columns.
        row_idx: Row to process (used as a position for ``field_patches`` and
            as a label for ``metadata_df``).

    Returns:
        A new dict with the field patch entries overlaid by the explicit patch
        entries (explicit values win on key collisions).
    """
    base_patch = field_patches.iloc[row_idx]
    raw_doc = metadata_df.loc[row_idx, patch_document_column_name]

    overrides = {}
    if raw_doc is None:
        # None (as opposed to NaN) is a valid value but yields an empty patch
        pass
    elif isinstance(raw_doc, float) and np.isnan(raw_doc):
        # NaN is treated as an empty patch
        pass
    elif isinstance(raw_doc, dict):
        overrides = raw_doc
    elif isinstance(raw_doc, str):
        try:
            decoded = json.loads(raw_doc)
        except json.JSONDecodeError as e:
            logger.warning(
                f"Row {row_idx}: Failed to parse patch document: {e}. "
                f"Using empty dictionary instead."
            )
        else:
            if isinstance(decoded, dict):
                overrides = decoded
            else:
                logger.warning(
                    f"Row {row_idx}: Parsed patch document is not a dictionary. "
                    f"Using empty dictionary instead."
                )
    else:
        logger.warning(
            f"Row {row_idx}: Unsupported patch document type: {type(raw_doc)}. "
            f"Using empty dictionary instead."
        )

    # Explicit patch entries take precedence over field-derived ones
    combined = dict(base_patch)
    combined.update(overrides)
    return combined


def _ensure_dict_patch(
    metadata_df: pd.DataFrame,
    final_patch_column: str,
    row_idx: int,
) -> tuple[dict, list[str]]:
    """
    Normalize the patch document stored at one row into a dictionary.

    Args:
        metadata_df: Dataframe holding the patch documents; the cell is looked
            up by label with ``.loc``.
        final_patch_column: Name of the column containing the patch documents.
        row_idx: Label of the row to read.

    Returns:
        A ``(patch, errors)`` tuple on every path:
        - ``None`` or NaN -> ``({}, [])``
        - a dict -> ``(that dict, [])``
        - a string containing a JSON object -> ``(parsed dict, [])``
        - a string with invalid JSON, a string whose JSON is not an object,
          or any other type -> ``({}, [message])``; the message is also
          logged as a warning.
    """
    patch = metadata_df.loc[row_idx, final_patch_column]
    validation_errors = []

    # None and NaN both mean "no patch": return an empty dict without errors
    if patch is None or (isinstance(patch, float) and np.isnan(patch)):
        return {}, validation_errors

    # If already a dict, return as is
    if isinstance(patch, dict):
        return patch, validation_errors

    if isinstance(patch, str):
        try:
            parsed = json.loads(patch)
        except json.JSONDecodeError as e:
            error_msg = f"Row {row_idx}: Invalid JSON in patch document: {e}"
        else:
            if isinstance(parsed, dict):
                # Must be a tuple like every other path: the caller unpacks
                # two values from the result.
                return parsed, validation_errors
            error_msg = (
                f"Row {row_idx}: JSON must be an object/dictionary, "
                f"got {type(parsed).__name__}"
            )
    else:
        error_msg = (
            f"Row {row_idx}: Unsupported patch type: {type(patch).__name__}"
        )

    # All failure modes share the same reporting: warn, record, empty patch
    logger.warning(error_msg)
    validation_errors.append(error_msg)
    return {}, validation_errors


def _format_note_for_storage(
    note_text: str,
    current_time_ms: int,
):
    """
    Wrap a note into the JSON-encoded list form used for storage.

    Args:
        note_text: The note content; it is converted with ``str()``.
        current_time_ms: Value stored as the note's ``updated_at`` field.

    Returns:
        ``None`` when ``note_text`` is missing according to ``pd.isna``;
        otherwise a single-element list holding the JSON string of an object
        with ``text``, ``updated_by`` (always ``"SDK"``) and ``updated_at``.
    """
    if pd.isna(note_text):
        return None
    serialized_note = json.dumps(
        {
            "text": str(note_text),
            "updated_by": "SDK",
            "updated_at": current_time_ms,
        }
    )
    return [serialized_note]


def _log_flight_update_summary(
    project_name: str,
    total_spans: int,
    request_type: FlightRequestType,
    response: WriteSpanResponse,
) -> None:
    """
    Log a structured summary of a flight update response.

    Counts are read from the response with ``getattr`` so that a response
    lacking an attribute yields ``None`` instead of raising. One summary
    record is logged, followed by one error record per entry in the
    response's ``errors``.

    Args:
        project_name: Project name included in every log record.
        total_spans: Number of spans that were sent.
        request_type: The request type; its ``name`` is logged when available,
            otherwise its ``str()`` form.
        response: The response object to summarize.
    """
    updated = getattr(response, "spans_updated", None)
    if updated is None:
        # Fallback for older response types
        updated = getattr(response, "records_updated", None)
    processed = getattr(response, "spans_processed", None)

    response_errors = getattr(response, "errors", None)
    error_records = []
    if response_errors:
        error_records = [
            {"span_id": item.span_id, "error_message": item.error_message}
            for item in response_errors
        ]

    # Readable label for the request type
    type_label = getattr(request_type, "name", None)
    if not type_label:
        type_label = str(request_type)

    # Derived metrics are only computed for a non-zero numeric processed count
    pct_success = None
    failed_count = None
    if isinstance(processed, (int, float)) and processed:
        n_updated = int(updated or 0)
        n_processed = int(processed)
        pct_success = round(100.0 * n_updated / n_processed, 2)
        failed_count = max(n_processed - n_updated, 0)

    summary = {
        "project": project_name,
        "request_type": type_label,
        "total_spans": int(total_spans),
        "spans_processed": processed,
        "spans_updated": updated,
        "spans_failed": failed_count,
        "success_rate": pct_success,
        "error_count": len(error_records),
    }

    # Single summary record
    if processed is None or updated is None:
        logger.warning("Flight update response missing counts", extra=summary)
    elif int(processed) == int(total_spans):
        logger.info("✅ All spans processed", extra=summary)
    else:
        logger.info("Partial processing", extra=summary)

    # One structured record per span error
    shared_context = {"project": project_name, "request_type": type_label}
    for record in error_records:
        logger.error("Span update error", extra={**shared_context, **record})


def _maybe_log_project_url(response: requests.Response) -> None:
    """
    Log the project URL derived from ``response``, if one is available.

    Any exception raised while obtaining or logging the URL is caught and
    reported as a warning instead of propagating.
    """
    try:
        project_url = get_arize_project_url(response)
        if not project_url:
            return
        logger.info("✅ Success! Check out your data at %s", project_url)
    except Exception as exc:
        logger.warning("Failed to get project URL: %s", exc)


def _mktemp_in(directory: str) -> str:
    """
    Reserve a uniquely named ``arize-*.arrow`` file inside `directory`.

    The file is created on disk and its handle is closed before returning, so
    no open file descriptor is left behind (Windows-safe). The caller can
    reopen the returned path and write to it later.
    """
    handle = tempfile.NamedTemporaryFile(
        dir=directory,
        prefix="arize-",
        suffix=".arrow",
        delete=False,  # keep the file on disk after the handle is closed
    )
    try:
        return handle.name
    finally:
        handle.close()


def _filesize(path: str) -> int:
    """
    Return the size of `path` in bytes, or ``-1`` if it cannot be determined.

    Any exception raised by the size lookup is swallowed on purpose: the
    caller shown in this file only uses the value for a debug log record.
    """
    try:
        size_bytes = os.path.getsize(path)
    except Exception:
        size_bytes = -1
    return size_bytes
