37 changes: 37 additions & 0 deletions experiments/inference-scheduling-guidellm.yaml
@@ -0,0 +1,37 @@
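# Assumed semantics (sketch, not spelled out in this file): each setup treatment deploys
# the stack with one GAIE plugins config file, and each run treatment is a
# "prompt_tokens,output_tokens" pair, presumably yielding a 4 x 9 grid of benchmark runs.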
setup:
  factors:
    - LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE
  levels:
    LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE: "inf-sche-none.yaml, inf-sche-prefix.yaml, inf-sche-kv.yaml, inf-sche-queue.yaml"
    # LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE: "inf-sche-prefix.yaml, inf-sche-kv.yaml, inf-sche-queue.yaml"
  treatments:
    - "inf-sche-none.yaml"
    - "inf-sche-prefix.yaml"
    - "inf-sche-kv.yaml"
    - "inf-sche-queue.yaml"
run:
  factors:
    - prompt_tokens
    - output_tokens
  treatments:
    - "100,100"
    - "100,300"
    - "100,1000"
    - "300,100"
    - "300,300"
    - "300,1000"
    - "1000,100"
    - "1000,300"
    - "1000,1000"
  # levels:
  #   data: "prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=100REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048,prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=300REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=300REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048, prompt_tokens=300REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=100REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048, prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048"
  # treatments:
  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048"
  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048"
  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=1000REPLACE_COMMAprefix_tokens=2048"
  #   - "prompt_tokens=300REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048"
  #   - "prompt_tokens=300REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048"
  #   - "prompt_tokens=300REPLACE_COMMAoutput_tokens=1000REPLACE_COMMAprefix_tokens=2048"
  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048"
  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048"
  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=1000REPLACE_COMMAprefix_tokens=2048"
40 changes: 40 additions & 0 deletions experiments/pd-disaggregation-guidellm.yaml
@@ -0,0 +1,40 @@
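# Assumed semantics (sketch, not spelled out in this file): the comma-separated values in
# each setup treatment map positionally to the factors listed below (deploy method, common
# replicas, common TP, prefill replicas, prefill TP, decode replicas, decode TP), with "NA"
# marking factors that do not apply to that deploy method; each run treatment pairs a
# request rate with a data_samples count of 10x that rate.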
setup:
  factors:
    - LLMDBENCH_DEPLOY_METHODS
    - LLMDBENCH_VLLM_COMMON_REPLICAS
    - LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM
    - LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS
    - LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM
    - LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS
    - LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM
  levels:
    LLMDBENCH_VLLM_COMMON_REPLICAS: "2,4"
    LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM: "8"
    LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS: "2,4,6,8"
    LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM: "1,2"
    LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS: "1,2,4"
    LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM: "2,4,8"
  treatments:
    - "modelservice,NA,NA,1,4,3,4"
    - "modelservice,NA,NA,1,8,1,8"
    - "modelservice,NA,NA,2,4,1,8"
    - "modelservice,NA,NA,2,4,2,4"
    - "modelservice,NA,NA,3,4,1,4"
    - "standalone,1,2,NA,NA,NA,NA"
    - "standalone,1,4,NA,NA,NA,NA"
    - "standalone,1,8,NA,NA,NA,NA"
run:
  factors:
    - rate
    - data_samples
  levels:
    rate: "1,8,32,64,128,256"
    data_samples: "10,80,320,640,1280,2560"
  treatments:
    - "1,10"
    - "8,80"
    - "32,320"
    - "64,640"
    - "128,1280"
    - "256,2560"

125 changes: 125 additions & 0 deletions scenarios/guides/inference-scheduling-guidellm.sh
@@ -0,0 +1,125 @@
# INFERENCE SCHEDULING WELL-LIT PATH
# Based on https://github.com/llm-d/llm-d/tree/main/guides/inference-scheduling
# Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG
# Removed extra volumes metrics-volume and torch-compile-volume; they are not needed for this model and tested hardware.
# Use LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS and LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES to add them if needed.

# IMPORTANT NOTE
# All parameters not defined here or exported externally will take the default values found in setup/env.sh
# Many commonly defined values were left blank (i.e., at their defaults) so that this scenario is applicable to as many environments as possible.

# Model parameters
export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"

# PVC parameters
# Storage class (leave these commented out to automatically detect the "default" storage class)
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=standard-rwx
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=shared-vast
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti

# Routing configuration (via gaie)
#export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="default-plugins.yaml" # already the default

# Routing configuration (via modelservice)
export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true # (default is "false")

# Affinity to select a node with the appropriate accelerator (leave all of these commented out to automatically detect the GPU; works for OpenShift, Kubernetes and GKE)
export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
# export LLMDBENCH_VLLM_COMMON_AFFINITY=kubernetes.io/hostname:pokstg-b64r39s1 # OpenShift
#export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-tesla-a100 # GKE
#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb # GKE
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S # OpenShift
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB # OpenShift
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu # ANY GPU (useful for Minikube)

# Uncomment to request specific network devices
#####export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/roce_gdr
#######export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/ib
#export LLMDBENCH_VLLM_COMMON_NETWORK_NR=4

# Common parameters across standalone and llm-d (prefill and decode) pods
export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=16000
export LLMDBENCH_VLLM_COMMON_BLOCK_SIZE=64

export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=$(mktemp)
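# The heredoc below injects extra environment variables into the vLLM pods (UCX transport
# selection plus the NIXL side channel used for KV transfer). The REPLACE_ENV_LLMDBENCH_*
# tokens are presumably substituted with the corresponding exported values when the
# manifests are rendered.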
cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
- name: UCX_TLS
  value: "rc,sm,cuda_ipc,cuda_copy,tcp"
- name: UCX_SOCKADDR_TLS_PRIORITY
  value: "tcp"
###- name: UCX_NET_DEVICES
###  value: mlx5_1:1
###- name: NCCL_IB_HCA
###  value: mlx5_1
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
  value: "REPLACE_ENV_LLMDBENCH_VLLM_COMMON_NIXL_SIDE_CHANNEL_PORT"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
  valueFrom:
    fieldRef:
      fieldPath: status.podIP
- name: VLLM_LOGGING_LEVEL
  value: DEBUG
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
  value: "1"
EOF

export LLMDBENCH_VLLM_COMMON_EXTRA_CONTAINER_CONFIG=$(mktemp)
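# Extra container ports: the NIXL side channel and the vLLM metrics port (again assuming
# the REPLACE_ENV_* tokens are expanded from the matching LLMDBENCH_* variables).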
cat << EOF > ${LLMDBENCH_VLLM_COMMON_EXTRA_CONTAINER_CONFIG}
ports:
  - containerPort: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_NIXL_SIDE_CHANNEL_PORT
    protocol: TCP
  - containerPort: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_METRICS_PORT
    name: metrics
    protocol: TCP
EOF

# Prefill parameters: 0 prefill pods
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=0
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR=0

# Decode parameters: 2 decode pods
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
# Uncomment (###) the following line to enable multi-nic
###export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$LLMDBENCH_CONTROL_USERNAME,modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
# Uncomment (#####) the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
#####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
#####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=1
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=2
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=vllmServe
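# Extra vLLM arguments for the decode pods. The "____" sequences presumably stand in for
# argument separators and are expanded, together with the REPLACE_ENV_* tokens, when the
# harness renders the final vllm serve command line.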
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS="[\
--enforce-eager____\
--block-size____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_BLOCK_SIZE____\
--kv-transfer-config____'{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'____\
--disable-log-requests____\
--disable-uvicorn-access-log____\
--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN\
]"
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=2

# Workload parameters
export LLMDBENCH_HARNESS_NAME=guidellm
export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=shared_prefix_synthetic.yaml

# Local directory to copy benchmark runtime files and results
export LLMDBENCH_CONTROL_WORK_DIR=~/data/inference-scheduling/guidellm

# Personal overrides: custom benchmark image and namespaces
export LLMDBENCH_IMAGE_REGISTRY=quay.io
export LLMDBENCH_IMAGE_REPO=jgchen
export LLMDBENCH_IMAGE_NAME=llm-d-benchmark
export LLMDBENCH_IMAGE_TAG=0.0.25

# export LLMDBENCH_IMAGE_REGISTRY="quay.io"
# export LLMDBENCH_IMAGE_REPO="rh_ee_smonson"
# export LLMDBENCH_IMAGE_NAME="llm-d-benchmark"
# export LLMDBENCH_IMAGE_TAG="0.3.0-amd64"

export LLMDBENCH_VLLM_COMMON_NAMESPACE=jchen
export LLMDBENCH_HARNESS_NAMESPACE=jchen
15 changes: 12 additions & 3 deletions scenarios/guides/inference-scheduling.sh
@@ -9,9 +9,9 @@
# Many commonly defined values were left blank (i.e., at their defaults) so that this scenario is applicable to as many environments as possible.

# Model parameters
#export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"

# PVC parameters
@@ -28,7 +28,7 @@ export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti
export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true # (default is "false")

# Affinity to select a node with the appropriate accelerator (leave all of these commented out to automatically detect the GPU; works for OpenShift, Kubernetes and GKE)
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
#export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-tesla-a100 # GKE
#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb # GKE
@@ -108,3 +108,12 @@ export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=shared_prefix_synthetic.yaml

# Local directory to copy benchmark runtime files and results
export LLMDBENCH_CONTROL_WORK_DIR=~/data/inference-scheduling

# Personal overrides: custom benchmark image and namespaces
export LLMDBENCH_IMAGE_REGISTRY=quay.io
export LLMDBENCH_IMAGE_REPO=jgchen
export LLMDBENCH_IMAGE_NAME=llm-d-benchmark
export LLMDBENCH_IMAGE_TAG=0.0.25

export LLMDBENCH_VLLM_COMMON_NAMESPACE=jchen
export LLMDBENCH_HARNESS_NAMESPACE=jchen
150 changes: 150 additions & 0 deletions scenarios/guides/pd-disaggregation-guidellm.sh
@@ -0,0 +1,150 @@
# P/D DISAGGREGATION WELL-LIT PATH
# Based on https://github.com/llm-d/llm-d/tree/main/guides/pd-disaggregation
# Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG
# Removed extra volumes metrics-volume and torch-compile-volume; they are not needed for this model and tested hardware.
# Use LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS and LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES to add them if needed.

# IMPORTANT NOTE
# All parameters not defined here or exported externally will take the default values found in setup/env.sh
# Many commonly defined values were left blank (i.e., at their defaults) so that this scenario is applicable to as many environments as possible.

# Model parameters
#export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"

# PVC parameters
# Storage class (leave these commented out to automatically detect the "default" storage class)
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=standard-rwx
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=shared-vast
#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti

# Routing configuration (via gaie)
#export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="default-plugins.yaml" # already the default

# Routing configuration (via modelservice)
export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true # (default is "false")
# export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default

# Affinity to select a node with the appropriate accelerator (leave all of these commented out to automatically detect the GPU; works for OpenShift, Kubernetes and GKE)
export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
#export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-tesla-a100 # GKE
#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb # GKE
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S # OpenShift
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB # OpenShift
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu # ANY GPU (useful for Minikube)

# Uncomment to request specific network devices
#####export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/roce_gdr
#######export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/ib
#export LLMDBENCH_VLLM_COMMON_NETWORK_NR=4

# Uncomment to use hostNetwork (only ONE POD PER NODE)
#export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG=$(mktemp)
#cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG}
# hostNetwork: true
# dnsPolicy: ClusterFirstWithHostNet
#EOF

# Common parameters across standalone and llm-d (prefill and decode) pods
export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=16000
export LLMDBENCH_VLLM_COMMON_BLOCK_SIZE=128

# Uncomment (###) to select additional network devices (e.g., when multi-nic is enabled)
export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=$(mktemp)
cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
- name: UCX_TLS
  value: "rc,sm,cuda_ipc,cuda_copy,tcp"
- name: UCX_SOCKADDR_TLS_PRIORITY
  value: "tcp"
###- name: UCX_NET_DEVICES
###  value: mlx5_1:1
###- name: NCCL_IB_HCA
###  value: mlx5_1
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
  value: "REPLACE_ENV_LLMDBENCH_VLLM_COMMON_NIXL_SIDE_CHANNEL_PORT"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
  valueFrom:
    fieldRef:
      fieldPath: status.podIP
- name: VLLM_LOGGING_LEVEL
  value: DEBUG
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
  value: "1"
EOF

# Prefill parameters
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM=1
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=2
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_NR=32
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_MEM=128Gi
# Uncomment (###) the following line to enable multi-nic
###export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PODANNOTATIONS=deployed-by:$LLMDBENCH_CONTROL_USERNAME,modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
# Uncomment (#####) the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
#####export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_RESOURCE=rdma/roce_gdr
#####export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_NR=1
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=vllmServe
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS="[\
--block-size____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_BLOCK_SIZE____\
--kv-transfer-config____'{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'____\
--disable-log-requests____\
--disable-uvicorn-access-log____\
--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN\
]"

# Decode parameters
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=2
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=32
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=128Gi
# Uncomment (###) the following line to enable multi-nic
###export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$LLMDBENCH_CONTROL_USERNAME,modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
# Uncomment (#####) the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
#####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
#####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=1
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=vllmServe
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS="[\
--block-size____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_BLOCK_SIZE____\
--kv-transfer-config____'{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'____\
--disable-log-requests____\
--disable-uvicorn-access-log____\
--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN\
]"

export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUME_MOUNTS=$(mktemp)
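# Mount a memory-backed /dev/shm into the decode pods, sized by
# LLMDBENCH_VLLM_COMMON_SHM_MEM; presumably needed to give vLLM/PyTorch enough shared memory.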
cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUME_MOUNTS}
- name: dshm
  mountPath: /dev/shm
EOF

export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUMES=$(mktemp)
cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUMES}
- name: dshm
  emptyDir:
    medium: Memory
    sizeLimit: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_SHM_MEM
EOF

# Timeout for benchmark operations
export LLMDBENCH_CONTROL_WAIT_TIMEOUT=900000
export LLMDBENCH_HARNESS_WAIT_TIMEOUT=900000

# Workload parameters
export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=random_concurrent.yaml
export LLMDBENCH_HARNESS_NAME=guidellm

# Local directory to copy benchmark runtime files and results
export LLMDBENCH_CONTROL_WORK_DIR=~/data/pd-disaggregation/guidellm


# Personal overrides: custom benchmark image and namespaces
export LLMDBENCH_IMAGE_REGISTRY=quay.io
export LLMDBENCH_IMAGE_REPO=jgchen
export LLMDBENCH_IMAGE_NAME=llm-d-benchmark
export LLMDBENCH_IMAGE_TAG=0.0.24

export LLMDBENCH_VLLM_COMMON_NAMESPACE=jchen2
export LLMDBENCH_HARNESS_NAMESPACE=jchen2
2 changes: 1 addition & 1 deletion setup/functions.sh
@@ -1197,4 +1197,4 @@ function user_has_hf_model_access {

case "$http_code" in 200) return 0 ;; 401|403) return 1 ;; *) return 2 ;; esac
}
export -f user_has_hf_model_access
export -f user_has_hf_model_access