Skip to content

Commit ef9eb94

Browse files
authored
GuideLLM v0.4.0 Enablement (#479)
* Bump guidellm in container to devel build
* Update guidellm harness to use scenarios and fix convert script
* Add a few basic workload examples
* Bump GuideLLM
* GuideLLM now converts single dataset to a list automatically
* Install CPU version of torch to cut down on install size
1 parent 0f65966 commit ef9eb94

File tree

8 files changed

+78
-12
lines changed

8 files changed

+78
-12
lines changed

build/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,12 @@ RUN cd vllm; \
6363

6464
ARG GUIDELLM_REPO=https://github.com/vllm-project/guidellm.git
6565
ARG GUIDELLM_BRANCH=main
66-
ARG GUIDELLM_COMMIT=72374efdf7d4432173fafec3924dc94ac3b11449
66+
ARG GUIDELLM_COMMIT=ba51acf5b0ba377c5edc35109a78cd3ebb402922
6767
RUN git clone --branch ${GUIDELLM_BRANCH} ${GUIDELLM_REPO}
6868
RUN cd guidellm; \
69+
pip install torch --index-url https://download.pytorch.org/whl/cpu; \
6970
git checkout ${GUIDELLM_COMMIT}; \
70-
pip install .
71+
pip install .[recommended]
7172

7273
RUN echo "fmperf: ${FM_PERF_REPO} ${FM_PERF_BRANCH}" > /workspace/repos.txt; \
7374
echo "inference-perf: ${INFERENCE_PERF_REPO} ${INFERENCE_PERF_BRANCH}" >> /workspace/repos.txt; \

workload/harnesses/guidellm-llm-d-benchmark.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
#!/usr/bin/env bash
2+
3+
echo Using experiment result dir: "$LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR"
24
mkdir -p "$LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR"
3-
cd ${LLMDBENCH_RUN_WORKSPACE_DIR}/guidellm/
4-
cp -f ${LLMDBENCH_RUN_WORKSPACE_DIR}/profiles/guidellm/${LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME} $LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR/${LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME}
5-
guidellm benchmark --$(cat ${LLMDBENCH_RUN_WORKSPACE_DIR}/profiles/guidellm/${LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME} | yq -r 'to_entries | map("\(.key)=\(.value)") | join(" --")' | sed -e 's^=none ^^g' -e 's^=none$^^g') --output-path=$LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR/results.json > >(tee -a $LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR/stdout.log) 2> >(tee -a $LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR/stderr.log >&2)
5+
pushd "$LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR"
6+
guidellm benchmark --scenario "${LLMDBENCH_RUN_WORKSPACE_DIR}/profiles/guidellm/${LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME}" --output-path "${LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR}/results.json" --disable-progress > >(tee -a $LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR/stdout.log) 2> >(tee -a $LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR/stderr.log >&2)
67
export LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC=$?
78

89
# If benchmark harness returned with an error, exit here
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
target: REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
2+
model: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL
3+
request_type: text_completions
4+
profile: constant
5+
rate: [1,2,4,8]
6+
max_seconds: 120
7+
data:
8+
prompt_tokens_min: 10
9+
prompt_tokens_max: 8192
10+
prompt_tokens: 4096
11+
prompt_tokens_stdev: 2048
12+
output_tokens_min: 10
13+
output_tokens_max: 2048
14+
output_tokens: 1024
15+
output_tokens_stdev: 512
16+
samples: 1000
Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
target: REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
2-
rate-type: concurrent
2+
model: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL
3+
profile: concurrent
4+
request_type: text_completions
35
rate: 2
4-
max-seconds: 30
5-
data: prompt_tokens=256,output_tokens=128
6+
max_seconds: 30
7+
data:
8+
prompt_tokens: 256
9+
output_tokens: 128
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
target: REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
2+
model: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL
3+
request_type: text_completions
4+
profile: constant
5+
rate: 1
6+
max_seconds: 30
7+
data:
8+
prompt_tokens: 50
9+
prompt_tokens_stdev: 10
10+
prompt_tokens_min: 10
11+
prompt_tokens_max: 100
12+
output_tokens: 50
13+
output_tokens_stdev: 10
14+
output_tokens_min: 10
15+
output_tokens_max: 100
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
target: REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
2+
model: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL
3+
request_type: text_completions
4+
profile: constant
5+
rate: [2,5,8,10,12,15,20]
6+
max_seconds: 50
7+
data:
8+
prefix_tokens: 2048
9+
prefix_count: 32
10+
prompt_tokens: 256
11+
output_tokens: 256
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
target: REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
2+
model: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL
3+
request_type: text_completions
4+
profile: constant
5+
rate: [1,2,4,8]
6+
max_seconds: 120
7+
data:
8+
prompt_tokens_min: 10
9+
prompt_tokens_max: 4096
10+
prompt_tokens: 2048
11+
prompt_tokens_stdev: 1024
12+
output_tokens_min: 10
13+
output_tokens_max: 256
14+
output_tokens: 128
15+
output_tokens_stdev: 64
16+
samples: 1000

workload/report/convert.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -419,19 +419,21 @@ def import_guidellm(results_file: str) -> BenchmarkReport:
419419
"""
420420
check_file(results_file)
421421

422-
# Everything falls under ['benchmarks'][0], so just grab that part
423-
results = import_yaml(results_file)['benchmarks'][0]
422+
data = import_yaml(results_file)
423+
424+
# TODO: Read each benchmark in file
425+
results = data["benchmarks"][0]
424426

425427
# Get environment variables from llm-d-benchmark run as a dict following the
426428
# schema of BenchmarkReport
427429
br_dict = _get_llmd_benchmark_envars()
428430
# Append to that dict the data from GuideLLM
429431
update_dict(br_dict, {
430432
"scenario": {
431-
"model": {"name": results['worker']['backend_model']},
433+
"model": {"name": data["args"].get("model", "unknown")},
432434
"load": {
433435
"name": WorkloadGenerator.GUIDELLM,
434-
"args": results['args'],
436+
"args": data['args'],
435437
},
436438
},
437439
"metrics": {

0 commit comments

Comments (0)