Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ Specifically,
and
[here](https://github.com/vllm-project/vllm/blob/ee8217e5bee5860469204ee57077a91138c9af02/vllm/engine/arg_utils.py#L201).

When using local model files, specify the path to the model in the `model` field.
By default, relative paths are resolved against the working directory of the Triton server process.
To resolve the path relative to the `model.json` file instead, set the `resolve_model_relative_to_config_file` field to `true`.
In that mode the resolved path must stay inside the model directory; otherwise the backend raises an error.

For multi-GPU support, EngineArgs like tensor_parallel_size can be specified in
[model.json](samples/model_repository/vllm_model/1/model.json).

Expand Down
14 changes: 14 additions & 0 deletions ci/L0_backend_vllm/vllm_backend/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ function assert_curl_success {
}

rm -rf models && mkdir -p models

# operational vllm model
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
# `vllm_opt` model will be loaded on server start and stay loaded throughout
# unittesting. To test vllm model load/unload we use a dedicated
Expand All @@ -58,10 +60,22 @@ cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json
cp -r models/vllm_opt models/vllm_load_test

# python model
mkdir -p models/add_sub/1/
wget -P models/add_sub/1/ https://raw.githubusercontent.com/triton-inference-server/python_backend/main/examples/add_sub/model.py
wget -P models/add_sub https://raw.githubusercontent.com/triton-inference-server/python_backend/main/examples/add_sub/config.pbtxt

# local vllm model
# Copy the sample model, point its "model" field at a directory that lives next
# to model.json, and enable resolution relative to the config file.
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_local
# Use '|' as the sed delimiter: the replacement "./local_model" contains a '/',
# which would otherwise terminate the s/// expression early and make sed fail.
sed -i 's|"facebook/opt-125m"|"./local_model"|' models/vllm_local/1/model.json
sed -i '/"model": /a "resolve_model_relative_to_config_file": true,' models/vllm_local/1/model.json
# Download the files that make up the local copy of facebook/opt-125m.
for model_file in config.json merges.txt pytorch_model.bin special_tokens_map.json tokenizer_config.json vocab.json; do
    wget -P models/vllm_local/1/local_model "https://huggingface.co/facebook/opt-125m/resolve/main/${model_file}"
done

# Invalid model attribute
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_invalid_1/
sed -i 's/"disable_log_requests"/"invalid_attribute"/' models/vllm_invalid_1/1/model.json
Expand Down
26 changes: 26 additions & 0 deletions ci/L0_backend_vllm/vllm_backend/vllm_backend_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def setUp(self):
self.python_model_name = "add_sub"
self.ensemble_model_name = "ensemble_model"
self.vllm_load_test = "vllm_load_test"
self.local_vllm_model_name = "vllm_local"

def test_vllm_triton_backend(self):
# Load both vllm and add_sub models
Expand Down Expand Up @@ -93,6 +94,31 @@ def test_vllm_triton_backend(self):
)
self.triton_client.unload_model(self.vllm_load_test)
self.assertFalse(self.triton_client.is_model_ready(self.vllm_load_test))

def test_local_vllm_model(self):
    """Load the local vLLM model, run inference against it, then unload it."""
    model = self.local_vllm_model_name
    client = self.triton_client

    # The model must report ready after an explicit load.
    client.load_model(model)
    self.assertTrue(client.is_model_ready(model))

    # Run the same non-streaming check twice: first with
    # send_parameters_as_tensor=True, then with it set to False.
    for as_tensor in (True, False):
        self._test_vllm_model(
            prompts=PROMPTS,
            sampling_parameters=SAMPLING_PARAMETERS,
            stream=False,
            send_parameters_as_tensor=as_tensor,
            model_name=model,
        )

    # The model must no longer report ready after an explicit unload.
    client.unload_model(model)
    self.assertFalse(client.is_model_ready(model))

def test_model_with_invalid_attributes(self):
model_name = "vllm_invalid_1"
Expand Down
14 changes: 14 additions & 0 deletions src/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,20 @@ def init_engine(self):
# Check for LoRA config and set it up if enabled
self.setup_lora()

# Resolve the model path relative to the config file.
# The flag is popped so it is not forwarded to AsyncEngineArgs with the rest
# of the engine config.
if self.vllm_engine_config.pop("resolve_model_relative_to_config_file", False):
    # Normalize the model directory once so the containment check below
    # compares like with like (no trailing separators, no "..").
    model_dir = os.path.abspath(pb_utils.get_model_dir())
    new_path = os.path.abspath(
        os.path.join(model_dir, self.vllm_engine_config["model"])
    )
    # Check if the resolved path is inside the model directory. Compare whole
    # path components rather than a raw string prefix: a plain startswith()
    # would accept a sibling such as "<model_dir>_other" and would reject
    # valid paths when the model directory is reported with a trailing slash.
    if os.path.commonpath([model_dir, new_path]) != model_dir:
        raise ValueError(
            f"Resolved model path '{new_path}' is not a subdirectory of the model directory '{pb_utils.get_model_dir()}'"
        )
    self.vllm_engine_config["model"] = new_path

# Create an AsyncLLMEngine from the config from JSON
aync_engine_args = AsyncEngineArgs(**self.vllm_engine_config)
self.llm_engine = AsyncLLMEngine.from_engine_args(aync_engine_args)
Expand Down