.gitignore
.readthedocs.yaml
CMakeLists.txt
CODE_OF_CONDUCT.md
DCO
Dockerfile
Dockerfile.openEuler
LICENSE
README.md
README.zh.md
collect_env.py
format.sh
mypy.ini
packages.txt
pyproject.toml
pytest.ini
requirements-dev.txt
requirements-lint.txt
requirements.txt
setup.py
.github/Dockerfile.buildwheel
.github/PULL_REQUEST_TEMPLATE.md
.github/actionlint.yaml
.github/dependabot.yml
.github/labeler.yml
.github/ISSUE_TEMPLATE/100-documentation.yml
.github/ISSUE_TEMPLATE/110-user-story.yml
.github/ISSUE_TEMPLATE/200-installation.yml
.github/ISSUE_TEMPLATE/300-usage.yml
.github/ISSUE_TEMPLATE/400-bug-report.yml
.github/ISSUE_TEMPLATE/500-feature-request.yml
.github/ISSUE_TEMPLATE/600-new-model.yml
.github/ISSUE_TEMPLATE/700-performance-discussion.yml
.github/ISSUE_TEMPLATE/750-RFC.yml
.github/ISSUE_TEMPLATE/800-others.yml
.github/ISSUE_TEMPLATE/config.yml
.github/workflows/accuracy_report.yaml
.github/workflows/accuracy_test.yaml
.github/workflows/image_openeuler.yml
.github/workflows/image_ubuntu.yml
.github/workflows/label_merge_conflict.yml
.github/workflows/labeler.yml
.github/workflows/nightly_benchmarks.yaml
.github/workflows/release_code.yml
.github/workflows/release_whl.yml
.github/workflows/shellcheck.yml
.github/workflows/vllm_ascend_doctest.yaml
.github/workflows/vllm_ascend_test.yaml
.github/workflows/vllm_ascend_test_long_term.yaml
.github/workflows/vllm_ascend_test_pd.yaml
.github/workflows/matchers/actionlint.json
.github/workflows/matchers/mypy.json
.github/workflows/matchers/ruff.json
benchmarks/README.md
benchmarks/requirements-bench.txt
benchmarks/ops/ben_vocabparallelembedding.py
benchmarks/scripts/convert_json_to_markdown.py
benchmarks/scripts/patch_benchmark_dataset.py
benchmarks/scripts/perf_result_template.md
benchmarks/scripts/run-performance-benchmarks.sh
benchmarks/scripts/run_accuracy.py
benchmarks/tests/latency-tests.json
benchmarks/tests/serving-tests.json
benchmarks/tests/throughput-tests.json
cmake/utils.cmake
csrc/camem_allocator.cpp
csrc/ops.h
csrc/torch_binding.cpp
csrc/utils.h
csrc/kernels/advance_step.cpp
csrc/kernels/get_masked_input_and_mask_kernel.cpp
csrc/kernels/pos_encoding_kernels.cpp
csrc/kernels/types.h
csrc/kernels/utils.h
docs/Makefile
docs/README.md
docs/requirements-docs.txt
docs/requirements-test.txt
docs/source/conf.py
docs/source/faqs.md
docs/source/index.md
docs/source/installation.md
docs/source/quick_start.md
docs/source/_templates/sections/header.html
docs/source/assets/multi_node_dp.png
docs/source/community/contributors.md
docs/source/community/governance.md
docs/source/community/versioning_policy.md
docs/source/community/user_stories/index.md
docs/source/community/user_stories/llamafactory.md
docs/source/developer_guide/contribution/index.md
docs/source/developer_guide/contribution/testing.md
docs/source/developer_guide/evaluation/index.md
docs/source/developer_guide/evaluation/using_evalscope.md
docs/source/developer_guide/evaluation/using_lm_eval.md
docs/source/developer_guide/evaluation/using_opencompass.md
docs/source/developer_guide/evaluation/accuracy_report/index.md
docs/source/developer_guide/feature_guide/index.md
docs/source/developer_guide/feature_guide/patch.md
docs/source/developer_guide/modeling/adding_a_new_model.md
docs/source/developer_guide/modeling/adding_a_new_multimodal_model.md
docs/source/developer_guide/modeling/index.md
docs/source/developer_guide/performance/index.md
docs/source/developer_guide/performance/optimization_and_tuning.md
docs/source/developer_guide/performance/performance_benchmark.md
docs/source/developer_guide/performance/profile_execute_duration.md
docs/source/logos/vllm-ascend-logo-text-dark.png
docs/source/logos/vllm-ascend-logo-text-light.png
docs/source/tutorials/index.md
docs/source/tutorials/multi_node.md
docs/source/tutorials/multi_npu.md
docs/source/tutorials/multi_npu_quantization.md
docs/source/tutorials/multi_npu_qwen3_moe.md
docs/source/tutorials/single_npu.md
docs/source/tutorials/single_npu_multimodal.md
docs/source/user_guide/release_notes.md
docs/source/user_guide/configuration/additional_config.md
docs/source/user_guide/configuration/env_vars.md
docs/source/user_guide/configuration/index.md
docs/source/user_guide/feature_guide/graph_mode.md
docs/source/user_guide/feature_guide/index.md
docs/source/user_guide/feature_guide/lora.md
docs/source/user_guide/feature_guide/quantization.md
docs/source/user_guide/feature_guide/sleep_mode.md
docs/source/user_guide/feature_guide/structured_output.md
docs/source/user_guide/feature_guide/images/structured_output_1.png
docs/source/user_guide/support_matrix/index.md
docs/source/user_guide/support_matrix/supported_features.md
docs/source/user_guide/support_matrix/supported_models.md
examples/eplb_generate_map.py
examples/offline_disaggregated_prefill_npu.py
examples/offline_distributed_inference_npu.py
examples/offline_dualbatch_overlap_npu.py
examples/offline_inference_audio_language.py
examples/offline_inference_npu.py
examples/offline_inference_npu_v1.py
examples/offline_multi_step_custom_ops.py
examples/prompt_embedding_inference.py
examples/run_dp_attention_etp16_benmark.sh
examples/run_dp_server.sh
examples/run_dp_with_cached_graph_etp16.sh
examples/disaggregate_prefill_v1/README.md
examples/disaggregate_prefill_v1/gen_ranktable.py
examples/disaggregate_prefill_v1/gen_ranktable.sh
examples/disaggregate_prefill_v1/load_balance_proxy_server_example.py
examples/disaggregate_prefill_v1/run_server.sh
examples/disaggregated_prefill/disaggregated_prefill_offline.py
examples/disaggregated_prefill/dp_proxy.py
examples/disaggregated_prefill/find_device_ips.py
examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py
examples/disaggregated_prefill/run_decode_server.sh
examples/disaggregated_prefill/run_prefill_server.sh
examples/dp_offline/data_parallel.py
examples/dp_offline/run_dp.sh
examples/external_online_dp/README.md
examples/external_online_dp/launch_dp_program.py
examples/external_online_dp/run_dp_template.sh
tests/__init__.py
tests/conftest.py
tests/model_utils.py
tests/utils.py
tests/e2e/common.sh
tests/e2e/run_disagg_pd.sh
tests/e2e/run_doctests.sh
tests/e2e/doctests/001-quickstart-test.sh
tests/e2e/pd_disaggreate/run_edge_case_test.sh
tests/e2e/pd_disaggreate/setup_pd.sh
tests/e2e/pd_disaggreate/test_edge_cases.py
tests/e2e/pd_disaggreate/test_pd_e2e.py
tests/long_term/test_accuracy.py
tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py
tests/long_term/spec_decode_v0/__init__.py
tests/long_term/spec_decode_v0/conftest.py
tests/long_term/spec_decode_v0/test_dynamic_spec_decode.py
tests/long_term/spec_decode_v0/test_multi_step_worker.py
tests/long_term/spec_decode_v0/test_ngram_worker.py
tests/long_term/spec_decode_v0/test_spec_decode_worker.py
tests/long_term/spec_decode_v0/test_utils.py
tests/long_term/spec_decode_v0/utils.py
tests/long_term/spec_decode_v0/e2e/__init__.py
tests/long_term/spec_decode_v0/e2e/conftest.py
tests/long_term/spec_decode_v0/e2e/test_eagle_correctness.py
tests/long_term/spec_decode_v0/e2e/test_medusa_correctness.py
tests/long_term/spec_decode_v0/e2e/test_mlp_correctness.py
tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
tests/long_term/spec_decode_v0/e2e/test_ngram_correctness.py
tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
tests/long_term/spec_decode_v1/test_v1_spec_decode.py
tests/multicard/test_data_parallel.py
tests/multicard/test_dynamic_npugraph_batchsize.py
tests/multicard/test_ilama_lora_tp2.py
tests/multicard/test_model_qwen3_w4a8.py
tests/multicard/test_multimodal_context_parallel.py
tests/multicard/test_offline_inference_distributed.py
tests/multicard/test_pyhccl_distributed.py
tests/multicard/test_torchair_graph_mode.py
tests/multicard/test_w4a8_deepseek.py
tests/ops/test_vocabparallelembedding.py
tests/singlecard/__init__.py
tests/singlecard/test_aclgraph.py
tests/singlecard/test_ascend_config.py
tests/singlecard/test_camem.py
tests/singlecard/test_chunked.py
tests/singlecard/test_guided_decoding.py
tests/singlecard/test_ilama_lora.py
tests/singlecard/test_offline_inference.py
tests/singlecard/test_profile_execute_duration.py
tests/singlecard/test_prompt_embedding.py
tests/singlecard/test_pyhccl.py
tests/singlecard/test_sampler.py
tests/singlecard/test_scheduler.py
tests/singlecard/compile/__init__.py
tests/singlecard/compile/test_simple.py
tests/singlecard/core/__init__.py
tests/singlecard/core/test_ascend_scheduler.py
tests/singlecard/core/test_ascend_scheduler_e2e.py
tests/singlecard/ops/__init__.py
tests/singlecard/ops/test_fused_moe.py
tests/singlecard/ops/test_multi_step.py
tests/singlecard/ops/test_rotary_embedding.py
tests/singlecard/sample/__init__.py
tests/singlecard/sample/test_rejection_sampler.py
tests/ut/test_distributed_tensor_parallel.py
tests/ut/test_token_dispatcher.py
tests/ut/kv_connector/test_llmdatadist_connector.py
tests/ut/kv_connector/test_remote_decode_lifecycle.py
tests/ut/kv_connector/test_remote_prefill_lifecycle.py
tests/ut/kv_connector/utils.py
tests/ut/ops/test_expert_load_balancer.py
tests/ut/patch/worker/patch_common/test_patch_sampler.py
tools/actionlint.sh
tools/check_repo.sh
tools/mypy.sh
tools/png-lint.sh
tools/shellcheck.sh
tools/sphinx-lint.sh
vllm_ascend/__init__.py
vllm_ascend/_version.py
vllm_ascend/ascend_config.py
vllm_ascend/ascend_forward_context.py
vllm_ascend/cpu_binding.py
vllm_ascend/envs.py
vllm_ascend/platform.py
vllm_ascend/soc_info.py
vllm_ascend/utils.py
vllm_ascend.egg-info/PKG-INFO
vllm_ascend.egg-info/SOURCES.txt
vllm_ascend.egg-info/dependency_links.txt
vllm_ascend.egg-info/entry_points.txt
vllm_ascend.egg-info/requires.txt
vllm_ascend.egg-info/top_level.txt
vllm_ascend/attention/__init__.py
vllm_ascend/attention/attention.py
vllm_ascend/attention/attention_v1.py
vllm_ascend/attention/mla_v1.py
vllm_ascend/attention/utils.py
vllm_ascend/compilation/__init__.py
vllm_ascend/compilation/piecewise_backend.py
vllm_ascend/core/__init__.py
vllm_ascend/core/schedule_config.py
vllm_ascend/core/scheduler.py
vllm_ascend/device_allocator/__init__.py
vllm_ascend/device_allocator/camem.py
vllm_ascend/distributed/__init__.py
vllm_ascend/distributed/communicator.py
vllm_ascend/distributed/context_parallel_utils.py
vllm_ascend/distributed/llmdatadist_c_mgr_connector.py
vllm_ascend/distributed/llmdatadist_connector.py
vllm_ascend/distributed/parallel_state.py
vllm_ascend/distributed/tensor_parallel.py
vllm_ascend/distributed/device_communicators/__init__.py
vllm_ascend/distributed/device_communicators/pyhccl.py
vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py
vllm_ascend/distributed/kv_transfer/__init__.py
vllm_ascend/distributed/kv_transfer/simple_buffer.py
vllm_ascend/distributed/kv_transfer/simple_connector.py
vllm_ascend/distributed/kv_transfer/simple_pipe.py
vllm_ascend/distributed/kv_transfer/utils.py
vllm_ascend/eplb/__init__.py
vllm_ascend/eplb/eplb_updator.py
vllm_ascend/eplb/adaptor/__init__.py
vllm_ascend/eplb/adaptor/abstract_adaptor.py
vllm_ascend/eplb/adaptor/vllm_adaptor.py
vllm_ascend/eplb/core/__init__.py
vllm_ascend/eplb/core/eplb_device_transfer_loader.py
vllm_ascend/eplb/core/eplb_utils.py
vllm_ascend/eplb/core/eplb_worker.py
vllm_ascend/eplb/core/policy/__init__.py
vllm_ascend/eplb/core/policy/policy_abstract.py
vllm_ascend/eplb/core/policy/policy_dynamic_ep.py
vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py
vllm_ascend/eplb/core/policy/policy_factory.py
vllm_ascend/eplb/core/policy/policy_random.py
vllm_ascend/lora/__init__.py
vllm_ascend/lora/punica_wrapper/__init__.py
vllm_ascend/lora/punica_wrapper/punica_npu.py
vllm_ascend/models/__init__.py
vllm_ascend/models/deepseek_dbo.py
vllm_ascend/models/deepseek_mtp.py
vllm_ascend/models/deepseek_v2.py
vllm_ascend/models/qwen2.py
vllm_ascend/models/qwen2_5_vl.py
vllm_ascend/models/qwen2_5_vl_without_padding.py
vllm_ascend/models/qwen2_vl.py
vllm_ascend/models/qwen3.py
vllm_ascend/models/qwen3_dbo.py
vllm_ascend/models/qwen3_moe.py
vllm_ascend/multistream/__init__.py
vllm_ascend/multistream/base.py
vllm_ascend/multistream/context.py
vllm_ascend/multistream/decorator.py
vllm_ascend/multistream/layers.py
vllm_ascend/multistream/metadata.py
vllm_ascend/multistream/ms_split.py
vllm_ascend/ops/__init__.py
vllm_ascend/ops/activation.py
vllm_ascend/ops/attention.py
vllm_ascend/ops/cache.py
vllm_ascend/ops/comm_utils.py
vllm_ascend/ops/common_fused_moe.py
vllm_ascend/ops/expert_load_balancer.py
vllm_ascend/ops/fused_moe.py
vllm_ascend/ops/layernorm.py
vllm_ascend/ops/rotary_embedding.py
vllm_ascend/ops/sequence_parallel.py
vllm_ascend/ops/vocab_parallel_embedding.py
vllm_ascend/ops/moe_dispatcher/__init__.py
vllm_ascend/ops/moe_dispatcher/token_dispatcher.py
vllm_ascend/patch/__init__.py
vllm_ascend/patch/platform/__init__.py
vllm_ascend/patch/platform/patch_0_9_1/__init__.py
vllm_ascend/patch/platform/patch_0_9_1/patch_cache_manager.py
vllm_ascend/patch/platform/patch_0_9_1/patch_configs.py
vllm_ascend/patch/platform/patch_0_9_1/patch_core.py
vllm_ascend/patch/platform/patch_0_9_1/patch_core_client.py
vllm_ascend/patch/platform/patch_0_9_1/patch_decorator.py
vllm_ascend/patch/platform/patch_common/__init__.py
vllm_ascend/patch/platform/patch_common/patch_distributed.py
vllm_ascend/patch/platform/patch_main/__init__.py
vllm_ascend/patch/worker/__init__.py
vllm_ascend/patch/worker/patch_0_9_1/__init__.py
vllm_ascend/patch/worker/patch_common/__init__.py
vllm_ascend/patch/worker/patch_common/patch_distributed.py
vllm_ascend/patch/worker/patch_common/patch_eagle.py
vllm_ascend/patch/worker/patch_common/patch_minicpm.py
vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py
vllm_ascend/patch/worker/patch_common/patch_sampler.py
vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py
vllm_ascend/patch/worker/patch_common/patch_utils.py
vllm_ascend/patch/worker/patch_main/__init__.py
vllm_ascend/quantization/__init__.py
vllm_ascend/quantization/func_wrapper.py
vllm_ascend/quantization/quant_config.py
vllm_ascend/quantization/quantizer.py
vllm_ascend/quantization/w4a8_dynamic.py
vllm_ascend/quantization/w8a8.py
vllm_ascend/quantization/w8a8_dynamic.py
vllm_ascend/sample/__init__.py
vllm_ascend/sample/rejection_sampler.py
vllm_ascend/worker/__init__.py
vllm_ascend/worker/cache_engine.py
vllm_ascend/worker/draft_model_runner.py
vllm_ascend/worker/model_runner.py
vllm_ascend/worker/model_runner_v1.py
vllm_ascend/worker/mtp_proposer_v1.py
vllm_ascend/worker/multi_step_runner.py
vllm_ascend/worker/multi_step_worker.py
vllm_ascend/worker/pooling_model_runner.py
vllm_ascend/worker/worker.py
vllm_ascend/worker/worker_v1.py