Skip to content
25 changes: 22 additions & 3 deletions buildkite/test-template-ci.j2
Original file line number Diff line number Diff line change
Expand Up @@ -453,10 +453,10 @@ steps:
limit: 1
agents:
queue: amd-cpu

{% for step in steps %}
{% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
- label: "AMD MI300: {{ step.label }}"
{% if step.grade == "Blocking" %}
- label: "AMD MI325 blocking: {{ step.label }}"
depends_on: amd-build
agents:
{% if step.label and step.label=="Benchmarks" or step.label=="Kernels Attention Test %N" or step.label=="LoRA Test %N" or step.label=="Kernels Quantization Test %N" %}
Expand All @@ -473,7 +473,26 @@ steps:
DOCKER_BUILDKIT: "1"
priority: 100
soft_fail: false
{% endif %}
{% else %}
# Non-blocking AMD MI325 mirror of the step: the agent queue is picked from the
# step label (8-, 4-, 2- or 1-GPU pool) and a failure is reported as a soft fail.
- label: "AMD MI325 softfail: {{ step.label }}"
  depends_on: amd-build
  agents:
{% if step.label and step.label in [
    "Benchmarks",
    "Kernels Attention Test %N",
    "LoRA Test %N",
    "Kernels Quantization Test %N"
] %}
    queue: amd_mi325_8
{% elif step.label in [
    "Distributed Tests (4 GPUs)",
    "2 Node Tests (4 GPUs in total)",
    "Multi-step Tests (4 GPUs)",
    "Pipeline Parallelism Test",
    "LoRA TP Test (Distributed)"
] %}
    queue: amd_mi325_4
{% elif step.label in [
    "Distributed Comm Ops Test",
    "Distributed Tests (2 GPUs)",
    "Plugin Tests (2 GPUs)",
    "Weight Loading Multiple GPU Test",
    "Weight Loading Multiple GPU Test - Large Models"
] %}
    queue: amd_mi325_2
{% else %}
    queue: amd_mi325_1
{% endif %}
  command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}"
  env:
    DOCKER_BUILDKIT: "1"
  priority: 100
  soft_fail: true
{% endif %}
{% endif %}
{% endfor %}
{% for step in steps %}
# removed because of lack of HW resources: step.label and step.label=="Benchmarks" or step.label=="Pipeline Parallelism Test" or
Expand Down