"""
This module provides an extended turn class for audio dialogue processing.

The AudioTurn class extends the base Turn class with comprehensive audio-specific
functionality, including audio data storage, timing information, and metadata
for room acoustics simulation and audio processing workflows.
"""

# SPDX-FileCopyrightText: Copyright © 2025 Idiap Research Institute <contact@idiap.ch>
# SPDX-FileContributor: Yanis Labrak <yanis.labrak@univ-avignon.fr>
# SPDX-License-Identifier: MIT
from typing import Optional

import numpy as np

from sdialog import Turn


class AudioTurn(Turn):
    """
    Extended turn class with comprehensive audio processing capabilities.

    AudioTurn extends the base Turn class to support audio generation, storage,
    and processing. It maintains all the functionality of the base Turn while
    adding audio-specific features for individual utterance processing and
    room acoustics simulation.

    Key Features:

      - Audio data storage (in-memory waveform and/or path to a stored file)
      - Timing information for audio synchronization
      - Voice and speaker position metadata
      - Audio quality metrics (SNR)
      - Integration with room acoustics simulation
      - Support for external audio processing tools (dscaper)

    Audio Processing Attributes:
      - _audio: Raw audio data as numpy array (``None`` until set)
      - sampling_rate: Audio sampling rate in Hz
      - audio_path: File path to stored audio data
      - audio_duration: Duration of the audio in seconds
      - audio_start_time: Start time within the dialogue timeline

    Metadata Attributes:
      - voice: Voice identifier used for TTS generation
      - position: Spatial position of the speaker
      - microphone_position: Microphone position for recording
      - snr: Signal-to-noise ratio of the audio
      - is_stored_in_dscaper: Flag for external tool integration

    Unset values use sentinel defaults rather than ``None``: ``-1`` / ``-1.0``
    for numeric attributes and the empty string for text attributes. Only
    ``_audio`` defaults to ``None``.

    :ivar _audio: Raw audio data as numpy array, or ``None`` if no audio has been set.
    :vartype _audio: Optional[np.ndarray]
    :ivar sampling_rate: Audio sampling rate in Hz (``-1`` when unset).
    :vartype sampling_rate: int
    :ivar audio_path: File path to the stored audio data (``""`` when unset).
    :vartype audio_path: str
    :ivar audio_duration: Duration of the audio in seconds (``-1.0`` when unset).
    :vartype audio_duration: float
    :ivar audio_start_time: Start time within the dialogue timeline in seconds
                            (``-1.0`` when unset).
    :vartype audio_start_time: float
    :ivar snr: Signal-to-noise ratio of the audio (``-1.0`` when unset).
    :vartype snr: float
    :ivar voice: Voice identifier used for TTS generation (``""`` when unset).
    :vartype voice: str
    :ivar position: Spatial position of the speaker in the room (``""`` when unset).
    :vartype position: str
    :ivar microphone_position: Microphone position for recording (``""`` when unset).
    :vartype microphone_position: str
    :ivar is_stored_in_dscaper: Flag indicating integration with dscaper tool.
    :vartype is_stored_in_dscaper: bool
    """

    # The waveform is absent until set_audio() is called, hence Optional.
    _audio: Optional[np.ndarray] = None
    sampling_rate: int = -1
    audio_path: str = ""
    audio_duration: float = -1.0
    audio_start_time: float = -1.0
    snr: float = -1.0
    voice: str = ""
    position: str = ""
    microphone_position: str = ""
    is_stored_in_dscaper: bool = False

    def get_audio(self) -> Optional[np.ndarray]:
        """
        Retrieves the raw audio data for this turn.

        This method returns the audio data stored in the turn. The audio data
        is typically generated by a TTS engine and stored as a numpy array
        representing the audio waveform.

        No exception is raised when the audio is missing; callers should check
        the result against ``None`` before using it.

        :return: Numpy array containing the audio waveform data, or ``None``
                 if no audio has been set for this turn.
        :rtype: Optional[np.ndarray]
        """
        return self._audio

    def set_audio(self, audio: np.ndarray, sampling_rate: int):
        """
        Sets the audio data and sampling rate for this turn.

        This method stores the generated audio data along with its sampling rate.
        The audio data is typically generated by a TTS engine and represents
        the waveform of the spoken utterance. The array is stored by reference
        (no copy is made) and no validation is performed on either argument.

        :param audio: Numpy array containing the audio waveform data.
        :type audio: np.ndarray
        :param sampling_rate: Audio sampling rate in Hz (e.g., 24000, 44100).
        :type sampling_rate: int
        """
        self._audio = audio
        self.sampling_rate = sampling_rate

    @classmethod
    def from_turn(cls, turn: Turn) -> "AudioTurn":
        """
        Creates an AudioTurn object from a base Turn object.

        This alternate constructor converts a regular Turn object into an
        AudioTurn by copying the base turn data (text, speaker). All
        audio-specific attributes keep their defaults.

        Implemented as a classmethod so that subclasses calling
        ``SubClass.from_turn(turn)`` get an instance of ``SubClass`` rather
        than a plain ``AudioTurn``.

        :param turn: The base turn whose ``text`` and ``speaker`` are copied.
        :type turn: Turn
        :return: A new AudioTurn object with audio-specific functionality.
        :rtype: AudioTurn
        """
        # Only text and speaker are carried over from the source turn.
        return cls(text=turn.text, speaker=turn.speaker)
