.pre-commit-config.yaml
CHANGELOG.md
CONTRIBUTING.md
LICENSE
MANIFEST.in
README.md
ROADMAP.md
pyproject.toml
requirements-dev.txt
requirements.txt
.github/workflows/ci.yml
.github/workflows/docs.yml
.github/workflows/release.yml
.github/workflows/tests.yml
examples/azure_evaluation.py
examples/enhanced_agent_scoring_example.py
examples/example_aggregators.py
examples/ijson_streaming_example.py
examples/multi_tool_agent_scoring_example.py
examples/openai_inference_example.py
examples/panel_evaluation.py
examples/run_agent_evaluation.py
examples/MMLU_REPORT/clean_gpt_oss_data.py
examples/MMLU_REPORT/mmlu_analysis_script.py
examples/MMLU_REPORT/ollama_mmlu_evaluation.py
examples/MMLU_REPORT/ollama_mmlu_rescoring.py
examples/MMLU_REPORT/openai_mmlu_evaluation.py
examples/MMLU_REPORT/gpt_4o_mini_openai_run_20250810_091625/mode_unspecified/results.json
examples/MMLU_REPORT/gpt_5_openai_run_20250810_145132/mode_unspecified/results.json
examples/MMLU_REPORT/gpt_oss_cleaned_ollama_run_20250812_180842/mode_high/results.json
examples/MMLU_REPORT/gpt_oss_cleaned_ollama_run_20250812_180842/mode_low/results.json
examples/MMLU_REPORT/gpt_oss_cleaned_ollama_run_20250812_180842/mode_medium/results.json
examples/MMLU_REPORT/gpt_oss_cleaned_ollama_run_20250812_180842/mode_unspecified/results.json
examples/MMLU_REPORT/gpt_oss_ollama_run_20250809_144406/mode_high/results.json
examples/MMLU_REPORT/gpt_oss_ollama_run_20250809_144406/mode_low/results.json
examples/MMLU_REPORT/gpt_oss_ollama_run_20250809_144406/mode_medium/results.json
examples/MMLU_REPORT/gpt_oss_ollama_run_20250809_144406/mode_unspecified/results.json
examples/MMLU_REPORT/multi_pattern_gpt_4o_mini_openai_run_20250810_091625/mode_unspecified/results.json
examples/MMLU_REPORT/multi_pattern_gpt_5_openai_run_20250810_145132/mode_unspecified/results.json
examples/MMLU_REPORT/multi_pattern_gpt_oss_cleaned_ollama_run_20250812_180842/mode_high/results.json
examples/MMLU_REPORT/multi_pattern_gpt_oss_cleaned_ollama_run_20250812_180842/mode_low/results.json
examples/MMLU_REPORT/multi_pattern_gpt_oss_cleaned_ollama_run_20250812_180842/mode_medium/results.json
examples/MMLU_REPORT/multi_pattern_gpt_oss_cleaned_ollama_run_20250812_180842/mode_unspecified/results.json
examples/MMLU_REPORT/multi_pattern_o3_concatenated_results/mode_unspecified/results.json
examples/MMLU_REPORT/o3_concatenated_results/mode_unspecified/results.json
examples/agent/agent_evaluator_example.py
examples/agent/agent_evaluator_with_aggregation.py
examples/agent/agent_scoring_example.py
examples/ci_cd_configs/basic_evaluation.yaml
examples/ci_cd_configs/github_actions_workflow.yml
examples/ci_cd_configs/noveum_improvements_config.yaml
examples/ci_cd_configs/panel_evaluation.yaml
examples/ci_cd_configs/panel_judge_evaluation.yaml
examples/ci_cd_configs/sample_config.yaml
examples/evaluations/basic_evaluation.py
examples/guides/ai-agent-eval-guidelines.md
novaeval.egg-info/PKG-INFO
novaeval.egg-info/SOURCES.txt
novaeval.egg-info/dependency_links.txt
novaeval.egg-info/entry_points.txt
novaeval.egg-info/requires.txt
novaeval.egg-info/top_level.txt
src/novaeval/__init__.py
src/novaeval/cli.py
src/novaeval/py.typed
src/novaeval/agents/__init__.py
src/novaeval/agents/agent_data.py
src/novaeval/config/__init__.py
src/novaeval/config/job_config.py
src/novaeval/config/schema.py
src/novaeval/datasets/__init__.py
src/novaeval/datasets/agent_dataset.py
src/novaeval/datasets/base.py
src/novaeval/datasets/custom.py
src/novaeval/datasets/huggingface.py
src/novaeval/datasets/mmlu.py
src/novaeval/datasets/noveum_spans_dataset.py
src/novaeval/datasets/swe_agent_trajectories_dataset.py
src/novaeval/evaluators/__init__.py
src/novaeval/evaluators/agent_evaluator.py
src/novaeval/evaluators/aggregators.py
src/novaeval/evaluators/base.py
src/novaeval/evaluators/standard.py
src/novaeval/integrations/__init__.py
src/novaeval/integrations/noveum.py
src/novaeval/models/__init__.py
src/novaeval/models/anthropic.py
src/novaeval/models/azure_openai.py
src/novaeval/models/base.py
src/novaeval/models/gemini.py
src/novaeval/models/ollama.py
src/novaeval/models/openai.py
src/novaeval/noveum_platform/__init__.py
src/novaeval/noveum_platform/client.py
src/novaeval/noveum_platform/exceptions.py
src/novaeval/noveum_platform/models.py
src/novaeval/noveum_platform/noveum_datasets_api.py
src/novaeval/noveum_platform/noveum_scorer_results_api.py
src/novaeval/noveum_platform/noveum_traces_api.py
src/novaeval/noveum_platform/utils.py
src/novaeval/reporting/__init__.py
src/novaeval/reporting/metrics.py
src/novaeval/scorers/__init__.py
src/novaeval/scorers/accuracy.py
src/novaeval/scorers/advanced_generation_scorers.py
src/novaeval/scorers/agent_scorers.py
src/novaeval/scorers/agent_scorers_system_prompts.py
src/novaeval/scorers/base.py
src/novaeval/scorers/basic_rag_scorers.py
src/novaeval/scorers/conversational.py
src/novaeval/scorers/g_eval.py
src/novaeval/scorers/panel_judge.py
src/novaeval/scorers/rag.py
src/novaeval/scorers/rag_pipeline_evaluator.py
src/novaeval/scorers/rag_prompts.py
src/novaeval/utils/__init__.py
src/novaeval/utils/config.py
src/novaeval/utils/llm.py
src/novaeval/utils/logging.py
src/novaeval/utils/parsing.py
tests/__init__.py
tests/conftest.py
tests/integration/__init__.py
tests/integration/conftest.py
tests/integration/test_cli_integration.py
tests/integration/test_evaluation_workflow.py
tests/integration/test_models_azure_openai_integration.py
tests/integration/test_models_gemini_integration.py
tests/integration/test_noveum_platform_api_integration.py
tests/integration/test_noveum_platform_scorer_results_integration.py
tests/integration/test_utils.py
tests/unit/conftest.py
tests/unit/test_advanced_generation_coverage.py
tests/unit/test_advanced_generation_scorers_extended.py
tests/unit/test_advanced_rag_scorers.py
tests/unit/test_agent_data.py
tests/unit/test_agent_dataset.py
tests/unit/test_agent_evaluator.py
tests/unit/test_agent_scorers.py
tests/unit/test_agent_scorers_system_prompts.py
tests/unit/test_agents_init.py
tests/unit/test_aggregation_bug.py
tests/unit/test_aggregators.py
tests/unit/test_basic_rag_scorers.py
tests/unit/test_cli.py
tests/unit/test_config.py
tests/unit/test_conversational_async_helper.py
tests/unit/test_conversational_coverage.py
tests/unit/test_datasets_base.py
tests/unit/test_datasets_custom.py
tests/unit/test_datasets_huggingface.py
tests/unit/test_datasets_mmlu.py
tests/unit/test_evaluators_base.py
tests/unit/test_evaluators_standard.py
tests/unit/test_job_config.py
tests/unit/test_models_anthropic.py
tests/unit/test_models_azure_openai.py
tests/unit/test_models_base.py
tests/unit/test_models_gemini.py
tests/unit/test_models_ollama.py
tests/unit/test_models_openai.py
tests/unit/test_models_retry_logic.py
tests/unit/test_noveum_platform_client_core.py
tests/unit/test_noveum_platform_client_flow.py
tests/unit/test_noveum_platform_client_resources.py
tests/unit/test_noveum_platform_exceptions.py
tests/unit/test_noveum_platform_init.py
tests/unit/test_noveum_platform_models.py
tests/unit/test_noveum_spans_dataset.py
tests/unit/test_panel_judge_coverage.py
tests/unit/test_rag_pipeline_evaluator.py
tests/unit/test_rag_prompts.py
tests/unit/test_rag_scorers.py
tests/unit/test_reporting_metrics.py
tests/unit/test_scorers_accuracy.py
tests/unit/test_scorers_base.py
tests/unit/test_scorers_conversational.py
tests/unit/test_scorers_g_eval.py
tests/unit/test_scorers_panel_judge.py
tests/unit/test_scorers_panel_judge_extended.py
tests/unit/test_scorers_rag.py
tests/unit/test_swe_agent_trajectories.py
tests/unit/test_utils.py
tests/unit/test_utils_llm.py
tests/unit/test_utils_logging.py
tests/unit/test_utils_parsing.py