# multi_agent_evaluator.py
import time
import random

from auditora import sentinel, session, monitor, report
from auditora.adats.session import DefaultSession
from auditora.adats.monitor import DefaultMonitor
from auditora.adats.report import DefaultReport


class CustomSession(DefaultSession):
    """Session subclass that also counts and records LLM calls."""

    def __init__(self, session_id: str = None):
        super().__init__(session_id)
        # Running count of calls recorded through record_llm_call().
        self.llm_calls = 0
        # Starts empty; nothing inside this class appends to it.
        self.evaluation_results = []

    def record_llm_call(self, model: str, tokens: int):
        """Count one LLM call and store its details under ``llm_call_<n>``.

        ``<n>`` is the 1-based position of the call within this session.
        """
        self.llm_calls += 1
        call_key = f'llm_call_{self.llm_calls}'
        call_details = {'model': model, 'tokens': tokens}
        self.set(call_key, call_details)


@sentinel(
    session=CustomSession("multi_agent_eval_001"),
    monitor=DefaultMonitor(),
    report=DefaultReport(log_level="DEBUG")
)
def evaluate_multi_agent_system(user_query: str, agent_types: list):
    """
    Evaluate a multi-agent LLM system with comprehensive monitoring.

    Every agent run is simulated: the processing delay, completion token
    count, LLM response time and coherence score are all drawn from
    ``random``. Each run is timed through ``monitor``, logged through
    ``report`` and recorded in ``session``.

    Args:
        user_query: The query handed to every agent. Its whitespace-separated
            word count is used as the prompt token count.
        agent_types: Agent names to run, in order. May be empty. Results are
            keyed by name, so a repeated name overwrites the earlier entry.

    Returns:
        A dict with two keys:
            ``'results'``: maps each agent name to its ``'score'``,
            ``'response_time'`` and ``'tokens'`` (completion tokens).
            ``'summary'``: holds ``'average_score'`` (``0.0`` when no agent
            was evaluated), ``'total_tokens'`` and ``'session_id'``.
    """
    report.info("Starting multi-agent system evaluation", query=user_query)
    session.add_tag("multi_agent")
    session.set('user_query', user_query)
    session.set('agent_types', agent_types)

    # Both values depend only on the query, so compute them once.
    prompt_tokens = len(user_query.split())
    query_length = len(user_query)

    agent_results = {}

    # Simulate agent orchestration
    for agent_type in agent_types:
        report.debug(f"Activating agent: {agent_type}")
        model_name = f"{agent_type}-llm-v1"
        timer_name = f"{agent_type}_processing"

        # Simulate agent processing time
        processing_time = random.uniform(0.1, 0.5)
        start_timer = monitor.start_timer(timer_name)
        time.sleep(processing_time)  # Simulate work

        # Simulate LLM call (the response time is generated, not slept)
        completion_tokens = random.randint(20, 100)
        llm_response_time = random.uniform(0.2, 1.0)

        monitor.stop_timer(
            timer_name,
            start_timer,
            agent_type=agent_type,
            processing_time=processing_time
        )

        # Log LLM call with structured metadata
        report.log_llm_call(
            model=model_name,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            response_time=llm_response_time,
            agent_type=agent_type,
            query_length=query_length
        )

        # Record in custom session (prompt + completion tokens)
        session.record_llm_call(model_name, prompt_tokens + completion_tokens)

        # Simulate evaluation result
        score = random.uniform(0.6, 0.95)
        monitor.increment_metric(f"{agent_type}_score", score)
        report.log_evaluation_results(
            metric_name=f"{agent_type}_coherence",
            value=score,
            threshold=0.7,
            agent_type=agent_type
        )

        agent_result = {
            'score': score,
            'response_time': llm_response_time,
            'tokens': completion_tokens
        }
        agent_results[agent_type] = agent_result
        session.set(f"{agent_type}_result", agent_result)

    # Final coordination and analysis.
    # Guard the division: an empty agent list would otherwise raise
    # ZeroDivisionError after all the setup above had already been recorded.
    if agent_results:
        avg_score = (
            sum(r['score'] for r in agent_results.values()) / len(agent_results)
        )
    else:
        avg_score = 0.0

    # NOTE(review): this total counts completion tokens only, whereas
    # session.record_llm_call() above stores prompt + completion tokens.
    # Confirm which figure 'total_tokens_processed' is meant to report.
    total_tokens = sum(r['tokens'] for r in agent_results.values())

    report.info(
        "Multi-agent evaluation completed",
        average_score=f"{avg_score:.3f}",
        total_tokens=total_tokens,
        agents_evaluated=len(agent_results)
    )

    session.set('final_average_score', avg_score)
    session.set('total_tokens_processed', total_tokens)

    return {
        'results': agent_results,
        'summary': {
            'average_score': avg_score,
            'total_tokens': total_tokens,
            'session_id': session.session_id
        }
    }


def main():
    """Run one sample evaluation, then print its results and context objects.

    The session, monitor and report are read back from attributes on the
    decorated ``evaluate_multi_agent_system`` function. The last section
    exercises ``bifrost_sync`` with separately constructed context objects.
    """
    rule = "=" * 60

    def print_heading(title, spaced=True):
        # Every section after the first is preceded by a blank line.
        print(f"\n{rule}" if spaced else rule)
        print(title)
        print(rule)

    # Run the evaluation on a fixed sample query and agent roster.
    sample_query = "Analyze the impact of AI on healthcare diagnostics"
    agent_roster = ["researcher", "analyst", "validator", "summarizer"]
    outcome = evaluate_multi_agent_system(sample_query, agent_roster)
    summary = outcome['summary']

    print_heading("EVALUATION RESULTS", spaced=False)
    print(f"Final Average Score: {summary['average_score']:.3f}")
    print(f"Total Tokens Processed: {summary['total_tokens']}")
    print(f"Session ID: {summary['session_id']}")

    # Inspect the context objects from outside the decorated function.
    print_heading("CONTEXT OBJECT INSPECTION")

    # Session
    attached_session = evaluate_multi_agent_system._session
    print(f"Session: {attached_session}")
    print(f"Session Tags: {attached_session.get_tags()}")
    print(f"LLM Calls Made: {attached_session.llm_calls}")

    # Monitor
    attached_monitor = evaluate_multi_agent_system._monitor
    print(f"\nMonitor Summary: {attached_monitor.get_summary()}")

    # Count the tracked events whose 'event' field mentions a completed LLM call.
    tracked_events = attached_monitor.get_events()
    llm_event_count = sum(
        1
        for event in tracked_events
        if 'LLM call completed' in str(event.get('event', ''))
    )
    print(f"\nLLM Events Tracked: {llm_event_count}")

    # Report
    attached_report = evaluate_multi_agent_system._report
    print(f"\nReport Configuration: {attached_report}")

    # Manual context usage through Bifrost.
    print_heading("MANUAL CONTEXT USAGE (BIFROST)")

    from auditora.core.bifrost import bifrost_sync

    # Stand-alone context objects, independent of the decorated function.
    standalone_session = DefaultSession("manual_test")
    standalone_monitor = DefaultMonitor()
    standalone_report = DefaultReport()

    with bifrost_sync(standalone_session, standalone_monitor, standalone_report):
        # The stand-alone objects are driven directly inside the block.
        standalone_report.info("Manual context test")
        standalone_session.set('test_key', 'test_value')
        standalone_monitor.track('manual_test_event', test_param="success")

        snapshot = standalone_session.get_state_snapshot()
        print(f"Manual session state: {snapshot}")
        event_total = len(standalone_monitor.get_events())
        print(f"Manual monitor events: {event_total}")


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()