LICENSE.md
MANIFEST.in
README.md
pyproject.toml
src/openbench/__init__.py
src/openbench/_registry.py
src/openbench/config.py
src/openbench/eval_config.py
src/openbench/provider_config.py
src/openbench/py.typed
src/openbench.egg-info/PKG-INFO
src/openbench.egg-info/SOURCES.txt
src/openbench.egg-info/dependency_links.txt
src/openbench.egg-info/entry_points.txt
src/openbench.egg-info/requires.txt
src/openbench.egg-info/top_level.txt
src/openbench/_cli/__init__.py
src/openbench/_cli/cache_command.py
src/openbench/_cli/describe_command.py
src/openbench/_cli/eval_command.py
src/openbench/_cli/eval_retry_command.py
src/openbench/_cli/export.py
src/openbench/_cli/list_command.py
src/openbench/_cli/utils.py
src/openbench/_cli/view_command.py
src/openbench/agents/__init__.py
src/openbench/agents/aider.py
src/openbench/agents/base.py
src/openbench/agents/claude.py
src/openbench/agents/docker_manager.py
src/openbench/agents/manager.py
src/openbench/agents/opencode.py
src/openbench/agents/roo.py
src/openbench/datasets/__init__.py
src/openbench/datasets/arc_agi.py
src/openbench/datasets/boolq.py
src/openbench/datasets/browsecomp.py
src/openbench/datasets/clockbench.py
src/openbench/datasets/detailbench.py
src/openbench/datasets/drop.py
src/openbench/datasets/exercism.py
src/openbench/datasets/graphwalks.py
src/openbench/datasets/healthbench.py
src/openbench/datasets/hle.py
src/openbench/datasets/humaneval.py
src/openbench/datasets/ifeval.py
src/openbench/datasets/livemcpbench.py
src/openbench/datasets/math.py
src/openbench/datasets/mbpp.py
src/openbench/datasets/mgsm.py
src/openbench/datasets/mmmu.py
src/openbench/datasets/mmstar.py
src/openbench/datasets/mockaime.py
src/openbench/datasets/mrcr.py
src/openbench/datasets/multichallenge.py
src/openbench/datasets/rootly_terraform.py
src/openbench/datasets/scicode.py
src/openbench/datasets/simpleqa.py
src/openbench/datasets/tumlu.py
src/openbench/datasets/jsonschemabench/__init__.py
src/openbench/datasets/jsonschemabench/jsonschemabench.py
src/openbench/datasets/jsonschemabench/openai_compatible_ids.txt
src/openbench/evals/__init__.py
src/openbench/evals/agieval.py
src/openbench/evals/anli.py
src/openbench/evals/arabic_exams.py
src/openbench/evals/arc.py
src/openbench/evals/arc_agi.py
src/openbench/evals/bigbench.py
src/openbench/evals/bigbench_hard.py
src/openbench/evals/blimp.py
src/openbench/evals/boolq.py
src/openbench/evals/browsecomp.py
src/openbench/evals/clockbench.py
src/openbench/evals/detailbench.py
src/openbench/evals/drop.py
src/openbench/evals/ethics.py
src/openbench/evals/global_mmlu.py
src/openbench/evals/glue.py
src/openbench/evals/glue_standard.py
src/openbench/evals/gpqa.py
src/openbench/evals/gpqa_diamond.py
src/openbench/evals/graphwalks.py
src/openbench/evals/headqa.py
src/openbench/evals/healthbench.py
src/openbench/evals/hellaswag.py
src/openbench/evals/hle.py
src/openbench/evals/humaneval.py
src/openbench/evals/ifeval.py
src/openbench/evals/jsonschemabench.py
src/openbench/evals/legalsupport.py
src/openbench/evals/livemcpbench.py
src/openbench/evals/logiqa.py
src/openbench/evals/math.py
src/openbench/evals/mathqa.py
src/openbench/evals/mbpp.py
src/openbench/evals/medmcqa.py
src/openbench/evals/medqa.py
src/openbench/evals/mgsm.py
src/openbench/evals/mmlu.py
src/openbench/evals/mmlu_pro.py
src/openbench/evals/mmmlu.py
src/openbench/evals/mmmu.py
src/openbench/evals/mmmu_pro.py
src/openbench/evals/mmstar.py
src/openbench/evals/mockaime.py
src/openbench/evals/mrcr.py
src/openbench/evals/multichallenge.py
src/openbench/evals/musr.py
src/openbench/evals/openbookqa.py
src/openbench/evals/piqa.py
src/openbench/evals/prost.py
src/openbench/evals/pubmedqa.py
src/openbench/evals/qa4mre.py
src/openbench/evals/qasper.py
src/openbench/evals/race.py
src/openbench/evals/rootly_gmcq.py
src/openbench/evals/rootly_terraform.py
src/openbench/evals/scicode.py
src/openbench/evals/sciq.py
src/openbench/evals/simpleqa.py
src/openbench/evals/social_iqa.py
src/openbench/evals/supergpqa.py
src/openbench/evals/swag.py
src/openbench/evals/toxigen.py
src/openbench/evals/truthfulqa.py
src/openbench/evals/tumlu.py
src/openbench/evals/winogrande.py
src/openbench/evals/wsc273.py
src/openbench/evals/xcopa.py
src/openbench/evals/xstorycloze.py
src/openbench/evals/xwinograd.py
src/openbench/evals/exercism/Dockerfile
src/openbench/evals/exercism/compose.yaml
src/openbench/evals/exercism/exercism.py
src/openbench/evals/matharena/__init__.py
src/openbench/evals/matharena/matharena.py
src/openbench/evals/matharena/aime_2023_I/__init__.py
src/openbench/evals/matharena/aime_2023_I/aime_2023_I.py
src/openbench/evals/matharena/aime_2023_II/__init__.py
src/openbench/evals/matharena/aime_2023_II/aime_2023_II.py
src/openbench/evals/matharena/aime_2024/__init__.py
src/openbench/evals/matharena/aime_2024/aime_2024.py
src/openbench/evals/matharena/aime_2024_I/__init__.py
src/openbench/evals/matharena/aime_2024_I/aime_2024_I.py
src/openbench/evals/matharena/aime_2024_II/__init__.py
src/openbench/evals/matharena/aime_2024_II/aime_2024_II.py
src/openbench/evals/matharena/aime_2025/__init__.py
src/openbench/evals/matharena/aime_2025/aime_2025.py
src/openbench/evals/matharena/aime_2025_II/__init__.py
src/openbench/evals/matharena/aime_2025_II/aime_2025_II.py
src/openbench/evals/matharena/brumo_2025/__init__.py
src/openbench/evals/matharena/brumo_2025/brumo_2025.py
src/openbench/evals/matharena/hmmt_feb_2023/__init__.py
src/openbench/evals/matharena/hmmt_feb_2023/hmmt_feb_2023.py
src/openbench/evals/matharena/hmmt_feb_2024/__init__.py
src/openbench/evals/matharena/hmmt_feb_2024/hmmt_feb_2024.py
src/openbench/evals/matharena/hmmt_feb_2025/__init__.py
src/openbench/evals/matharena/hmmt_feb_2025/hmmt_feb_2025.py
src/openbench/metrics/__init__.py
src/openbench/metrics/clockbench.py
src/openbench/metrics/drop.py
src/openbench/metrics/graphwalks.py
src/openbench/metrics/grouped.py
src/openbench/metrics/healthbench.py
src/openbench/metrics/hle.py
src/openbench/metrics/ifeval.py
src/openbench/metrics/json_schema.py
src/openbench/metrics/mgsm.py
src/openbench/metrics/mmlu.py
src/openbench/metrics/mmlu_pro.py
src/openbench/metrics/mmstar.py
src/openbench/metrics/mrcr.py
src/openbench/metrics/multichallenge.py
src/openbench/metrics/scicode.py
src/openbench/metrics/simpleqa.py
src/openbench/model/__init__.py
src/openbench/model/_providers/__init__.py
src/openbench/model/_providers/ai21.py
src/openbench/model/_providers/baseten.py
src/openbench/model/_providers/cerebras.py
src/openbench/model/_providers/cohere.py
src/openbench/model/_providers/crusoe.py
src/openbench/model/_providers/deepinfra.py
src/openbench/model/_providers/friendli.py
src/openbench/model/_providers/groq.py
src/openbench/model/_providers/huggingface.py
src/openbench/model/_providers/hyperbolic.py
src/openbench/model/_providers/lambda_ai.py
src/openbench/model/_providers/minimax.py
src/openbench/model/_providers/moonshot.py
src/openbench/model/_providers/nebius.py
src/openbench/model/_providers/nous.py
src/openbench/model/_providers/novita.py
src/openbench/model/_providers/openrouter.py
src/openbench/model/_providers/parasail.py
src/openbench/model/_providers/reka.py
src/openbench/model/_providers/sambanova.py
src/openbench/model/_providers/vercel.py
src/openbench/monkeypatch/__init__.py
src/openbench/monkeypatch/display_results_patch.py
src/openbench/monkeypatch/file_recorder_logfile_patch.py
src/openbench/scorers/__init__.py
src/openbench/scorers/arc_agi.py
src/openbench/scorers/browsecomp.py
src/openbench/scorers/clockbench.py
src/openbench/scorers/detailbench.py
src/openbench/scorers/drop.py
src/openbench/scorers/exercism.py
src/openbench/scorers/fallback_scorer.py
src/openbench/scorers/graphwalks.py
src/openbench/scorers/healthbench.py
src/openbench/scorers/hle.py
src/openbench/scorers/humaneval.py
src/openbench/scorers/ifeval.py
src/openbench/scorers/json_schema.py
src/openbench/scorers/livemcpbench.py
src/openbench/scorers/math.py
src/openbench/scorers/mbpp.py
src/openbench/scorers/mcq.py
src/openbench/scorers/mgsm.py
src/openbench/scorers/mmmu.py
src/openbench/scorers/mmstar.py
src/openbench/scorers/mockaime.py
src/openbench/scorers/mrcr.py
src/openbench/scorers/multichallenge.py
src/openbench/scorers/open_answer.py
src/openbench/scorers/robust_boxed.py
src/openbench/scorers/scicode.py
src/openbench/scorers/score_boxed.py
src/openbench/scorers/score_last_number.py
src/openbench/scorers/simpleqa.py
src/openbench/scorers/tumlu.py
src/openbench/solvers/__init__.py
src/openbench/solvers/clockbench.py
src/openbench/solvers/exercism_solver.py
src/openbench/solvers/jsonschemabench.py
src/openbench/solvers/mmstar.py
src/openbench/solvers/scicode.py
src/openbench/tools/livemcpbench/copilot/__init__.py
src/openbench/tools/livemcpbench/copilot/arg_generation.py
src/openbench/tools/livemcpbench/copilot/matcher.py
src/openbench/tools/livemcpbench/copilot/mcp_connection.py
src/openbench/tools/livemcpbench/copilot/prepare.py
src/openbench/tools/livemcpbench/copilot/router.py
src/openbench/tools/livemcpbench/copilot/schemas.py
src/openbench/tools/livemcpbench/copilot/server.py
src/openbench/tools/livemcpbench/copilot/toolsource.py
src/openbench/tools/livemcpbench/copilot/upstream_cache.py
src/openbench/utils/__init__.py
src/openbench/utils/arc_parsing.py
src/openbench/utils/cache.py
src/openbench/utils/cli_commands.py
src/openbench/utils/docker.py
src/openbench/utils/image.py
src/openbench/utils/imports.py
src/openbench/utils/mcq.py
src/openbench/utils/metadata.py
src/openbench/utils/text.py
tests/test_cache.py
tests/test_image_utils.py
tests/test_json_schema_scorer.py
tests/test_open_answer_scorer.py
tests/test_registry.py
tests/test_robust_scorers.py
tests/test_text_utils.py