Commit ada5799

Update TensorRT-LLM backend (triton-inference-server#512)
1 parent 62cd00f commit ada5799

File tree

9 files changed: +235 -17 lines changed

README.md

Lines changed: 4 additions & 1 deletion
@@ -67,8 +67,11 @@ The below commands will build the same Triton TRT-LLM container as the one on th
 ```bash
 # Prepare the TRT-LLM base image using the dockerfile from tensorrtllm_backend.
 cd tensorrtllm_backend
+git lfs install
+git submodule update --init --recursive
+
 # Specify the build args for the dockerfile.
-BASE_IMAGE=nvcr.io/nvidia/pytorch:24.03-py3
+BASE_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3
 TRT_VERSION=10.0.1.6
 TRT_URL_x86=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
 TRT_URL_ARM=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.ubuntu-22.04.aarch64-gnu.cuda-12.4.tar.gz

all_models/inflight_batcher_llm/tensorrt_llm/1/model.py

Lines changed: 193 additions & 1 deletion
@@ -311,6 +311,11 @@ def convert_decoding_mode(decoding_mode: str):
         f"decoding_mode value of '{decoding_mode}' is not supported.")
 
 
+def convert_timestamp_to_seconds(timestamp: str):
+    return int(
+        datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp())
+
+
 class TritonPythonModel:
     """Your Python model must use the same class name. Every Python model
     that is created must have "TritonPythonModel" as the class name.
@@ -422,6 +427,155 @@ def get_executor_config(self, model_config):
         kwargs = {k: v for k, v in kwargs.items() if v is not None}
         return trtllm.ExecutorConfig(**kwargs)
 
+    def create_metrics(self, model: str, version: str, is_v1_model: bool):
+        self.request_metric_family = pb_utils.MetricFamily(
+            name="nv_trt_llm_request_metrics",
+            description="TRT LLM request metrics",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        self.runtime_memory_metric_family = pb_utils.MetricFamily(
+            name="nv_trt_llm_runtime_memory_metrics",
+            description="TRT LLM runtime memory metrics",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        self.kv_cache_metric_family = pb_utils.MetricFamily(
+            name="nv_trt_llm_kv_cache_block_metrics",
+            description="TRT LLM KV cache block metrics",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        model_type = "v1" if is_v1_model else "inflight_batcher"
+        self.model_type_metric_family = pb_utils.MetricFamily(
+            name=f"nv_trt_llm_{model_type}_metrics",
+            description=f"TRT LLM {model_type}-specific metrics",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        self.general_metric_family = pb_utils.MetricFamily(
+            name="nv_trt_llm_general_metrics",
+            description="General TRT LLM metrics",
+            kind=pb_utils.MetricFamily.GAUGE,
+        )
+        common_labels = {"model": model, "version": version}
+        self.all_metrics = {
+            # Request metrics
+            "num_active_requests":
+            self.request_metric_family.Metric(labels={
+                "request_type": "active",
+                **common_labels
+            }),
+            "max_num_active_requests":
+            self.request_metric_family.Metric(labels={
+                "request_type": "max",
+                **common_labels
+            }),
+            "num_scheduled_requests":
+            self.request_metric_family.Metric(labels={
+                "request_type": "scheduled",
+                **common_labels
+            }),
+            "num_context_requests":
+            self.request_metric_family.Metric(labels={
+                "request_type": "context",
+                **common_labels
+            }),
+            # Runtime metrics
+            "cpu_mem_usage":
+            self.runtime_memory_metric_family.Metric(labels={
+                "memory_type": "cpu",
+                **common_labels
+            }),
+            "gpu_mem_usage":
+            self.runtime_memory_metric_family.Metric(labels={
+                "memory_type": "gpu",
+                **common_labels
+            }),
+            "pinned_mem_usage":
+            self.runtime_memory_metric_family.Metric(labels={
+                "memory_type": "pinned",
+                **common_labels
+            }),
+            # KV cache metrics
+            "max_num_blocks":
+            self.kv_cache_metric_family.Metric(labels={
+                "kv_cache_block_type": "max",
+                **common_labels
+            }),
+            "free_num_blocks":
+            self.kv_cache_metric_family.Metric(labels={
+                "kv_cache_block_type": "free",
+                **common_labels
+            }),
+            "used_num_blocks":
+            self.kv_cache_metric_family.Metric(labels={
+                "kv_cache_block_type": "used",
+                **common_labels
+            }),
+            "tokens_per_block":
+            self.kv_cache_metric_family.Metric(labels={
+                "kv_cache_block_type": "tokens_per",
+                **common_labels
+            }),
+            # General metrics
+            "timestamp":
+            self.general_metric_family.Metric(labels={
+                "general_type": "timestamp",
+                **common_labels
+            }),
+            "iter":
+            self.general_metric_family.Metric(labels={
+                "general_type": "iteration_counter",
+                **common_labels
+            }),
+        }
+        if is_v1_model:
+            self.all_metrics.update({
+                "num_ctx_tokens":
+                self.model_type_metric_family.Metric(labels={
+                    "v1_specific_metric": "total_context_tokens",
+                    **common_labels
+                }),
+                "num_gen_tokens":
+                self.model_type_metric_family.Metric(
+                    labels={
+                        "v1_specific_metric": "total_generation_tokens",
+                        **common_labels
+                    }),
+                "empty_gen_slots":
+                self.model_type_metric_family.Metric(
+                    labels={
+                        "v1_specific_metric": "empty_generation_slots",
+                        **common_labels
+                    }),
+            })
+        else:
+            self.all_metrics.update({
+                "num_ctx_tokens":
+                self.model_type_metric_family.Metric(
+                    labels={
+                        "inflight_batcher_specific_metric":
+                        "total_context_tokens",
+                        **common_labels
+                    }),
+                "num_gen_requests":
+                self.model_type_metric_family.Metric(
+                    labels={
+                        "inflight_batcher_specific_metric":
+                        "generation_requests",
+                        **common_labels
+                    }),
+                "micro_batch_id":
+                self.model_type_metric_family.Metric(
+                    labels={
+                        "inflight_batcher_specific_metric": "micro_batch_id",
+                        **common_labels
+                    }),
+                "num_paused_requests":
+                self.model_type_metric_family.Metric(
+                    labels={
+                        "inflight_batcher_specific_metric": "paused_requests",
+                        **common_labels
+                    }),
+            })
+
     def initialize(self, args):
         """`initialize` is called only once when the model is being loaded.
         Implementing `initialize` function is optional. This function allows
@@ -453,22 +607,30 @@ def initialize(self, args):
                 model_config)
         self.cancellation_check_period_ms = get_parameter(
             model_config, "cancellation_check_period_ms", int) or 100
+        self.stats_check_period_ms = get_parameter(
+            model_config, "stats_check_period_ms", int) or 100
 
         if not self.decoupled:
             raise pb_utils.TritonModelException(
                 "Please enable decoupled transaction policy in the model configuration to serve this model"
             )
 
+        self.create_metrics(args["model_name"],
+                            args["model_version"],
+                            is_v1_model=executor_config.batching_type ==
+                            trtllm.BatchingType.STATIC)
         self.triton_id_to_req_id = {}
         self.req_id_to_response_sender = {}
         self.lock = Lock()
         self.running = False
         self.awaiter_thread = Thread(target=self.awaiter_loop)
         self.cancellation_thread = Thread(target=self.cancellation_loop)
+        self.metrics_thread = Thread(target=self.metrics_loop)
         if self.executor.can_enqueue_requests():
             self.running = True
             self.awaiter_thread.start()
             self.cancellation_thread.start()
+            self.metrics_thread.start()
         else:
             # In leader mode, worker ranks will wait here until leader is done.
             self.executor.shutdown()
@@ -564,7 +726,6 @@ def awaiter_loop(self):
                 del self.req_id_to_response_sender[req_id]
             # Remove local reference so response_sender can be cleaned properly.
             del response_sender
-            # TODO: Read stats: https://jirasw.nvidia.com/browse/TRTLLM-563
 
     def cancellation_loop(self):
         """Checks if any pending requests have been cancelled."""
@@ -578,6 +739,36 @@ def cancellation_loop(self):
                 # Remove local reference so response_sender can be cleaned properly.
                 del response_sender
 
+    def metrics_loop(self):
+        """Updates triton metrics using stats from the executor."""
+        while self.running:
+            time.sleep(self.stats_check_period_ms / 1000.0)
+            for stat in self.executor.get_latest_iteration_stats():
+                try:
+                    for key, metric in self.all_metrics.items():
+                        value = None
+                        if hasattr(stat, key):
+                            value = getattr(stat, key)
+                        elif stat.kv_cache_stats is not None and hasattr(
+                                stat.kv_cache_stats, key):
+                            value = getattr(stat.kv_cache_stats, key)
+                        elif stat.static_batching_stats is not None and hasattr(
+                                stat.static_batching_stats, key):
+                            value = getattr(stat.static_batching_stats, key)
+                        elif stat.inflight_batching_stats is not None and hasattr(
+                                stat.inflight_batching_stats, key):
+                            value = getattr(stat.inflight_batching_stats, key)
+                        if value is not None:
+                            if key == "timestamp":
+                                value = convert_timestamp_to_seconds(value)
+                            metric.set(value)
+                        else:
+                            pb_utils.Logger.log_warn(
+                                f"Metric \"{key}\" not found.")
+                except Exception as e:
+                    pb_utils.Logger.log_warn(
+                        f"Error while processing metrics: {e}")
+
     def finalize(self):
         """`finalize` is called only once when the model is being unloaded.
         Implementing `finalize` function is optional. This function allows
@@ -587,4 +778,5 @@ def finalize(self):
         self.running = False
         self.awaiter_thread.join()
         self.cancellation_thread.join()
+        self.metrics_thread.join()
         self.executor.shutdown()
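
The gauge families registered in `create_metrics` are published through Triton's standard Prometheus metrics endpoint, which the CI script later in this commit scrapes with `curl localhost:8002/metrics`. As a rough illustrative check, not part of this commit, a script along these lines could confirm the families are exposed once a server is running; the port follows the CI script, and the family names come from `create_metrics` above.

```python
# Illustrative sketch only (not part of this commit): fetch Triton's metrics
# endpoint and confirm the TRT-LLM gauge families registered in create_metrics
# are present. Assumes the metrics port 8002 used in ci/L0_backend_trtllm/test.sh.
import urllib.request

EXPECTED_FAMILIES = [
    "nv_trt_llm_request_metrics",
    "nv_trt_llm_runtime_memory_metrics",
    "nv_trt_llm_kv_cache_block_metrics",
    "nv_trt_llm_general_metrics",
]


def check_metric_families(url: str = "http://localhost:8002/metrics") -> None:
    # Read the Prometheus text exposition and look for each expected family name.
    body = urllib.request.urlopen(url).read().decode("utf-8")
    missing = [name for name in EXPECTED_FAMILIES if name not in body]
    if missing:
        raise RuntimeError(f"metric families not found: {missing}")


if __name__ == "__main__":
    check_metric_families()
```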

all_models/tests/test_python_backend.py

Lines changed: 5 additions & 0 deletions
@@ -572,3 +572,8 @@ def test_get_executor_config_minimal():
     assert config.iter_stats_max_iterations == 1000
     assert config.request_stats_max_iterations == 0
     assert config.logits_post_processor_map is None
+
+
+def test_convert_timestamp_to_seconds():
+    assert convert_timestamp_to_seconds("01-01-1970 00:00:00") == 0
+    assert convert_timestamp_to_seconds("05-17-2024 23:28:39") == 1715988519
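
A note on the expected values above: `strptime` returns a naive datetime and `.timestamp()` interprets it in the local timezone, so these assertions hold when the test process runs in UTC. That is an assumption about the test environment, not something the diff enforces; a minimal sketch of the dependency, for illustration only:

```python
# Minimal sketch of the timezone assumption behind the expected values; this
# mirrors convert_timestamp_to_seconds from model.py and is illustrative only.
import datetime
import os
import time


def convert_timestamp_to_seconds(timestamp: str) -> int:
    return int(
        datetime.datetime.strptime(timestamp, "%m-%d-%Y %H:%M:%S").timestamp())


if __name__ == "__main__":
    os.environ["TZ"] = "UTC"  # the expected values assume UTC local time
    time.tzset()
    assert convert_timestamp_to_seconds("01-01-1970 00:00:00") == 0
    assert convert_timestamp_to_seconds("05-17-2024 23:28:39") == 1715988519
```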

ci/L0_backend_trtllm/custom_metrics_verification_tests.py

Lines changed: 12 additions & 2 deletions
@@ -64,8 +64,18 @@ def _parse_log_file(self, filename):
         with open(filename) as log_file:
             for line in reversed(list(log_file)):
                 if "Active Request Count" in line:
-                    json_format = re.sub(r"^.*?{", "{", line)
-                    return json.loads(json_format)
+                    match = re.search(r'({.*})', line)
+                    if match:
+                        json_string = match.group(1)
+                        try:
+                            json_string = json_string.replace('\\"', '"')
+                        except json.JSONDecodeError as e:
+                            raise Exception("Error parsing the JSON string: ",
+                                            e)
+                    else:
+                        raise Exception("No JSON found in the log file")
+
+        return json.loads(json_string)
 
     def _parse_triton_metrics(self, filename, is_v1):
         curl_counts = {}
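
For context, the parser above looks for a statistics line in the server log that embeds a JSON object and extracts it with `re.search(r'({.*})', line)`. The log line in the sketch below is hypothetical and only illustrates the shape that search expects; the real format comes from the backend's stats logging.

```python
# Hypothetical log line, for illustration only; the actual stats log format is
# produced by the TRT-LLM backend and is what this test parses.
import json
import re

line = 'I0517 12:00:00 stats: {"Active Request Count": 2}'
match = re.search(r'({.*})', line)
if match:
    # Unescape any quoted JSON before decoding, as the test does.
    json_string = match.group(1).replace('\\"', '"')
    stats = json.loads(json_string)
    print(stats["Active Request Count"])  # -> 2
else:
    raise Exception("No JSON found in the log line")
```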

ci/L0_backend_trtllm/test.sh

Lines changed: 13 additions & 0 deletions
@@ -38,6 +38,7 @@ BASE_METRICS_VERIFICATION_LOG="base_metrics_verification.log"
 CUSTOM_METRICS_VERIFICATION_TEST=custom_metrics_verification_tests.py
 CUSTOM_METRICS_VERIFICATION_LOG="custom_metrics_verification.log"
 SERVER_PID=0
+SLEEP_DURATION=3
 
 # Force environment to use python version 3
 apt update -q=2 \
@@ -237,7 +238,10 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
     fi
     set +e
 
+    # Make sure the metrics is retrieved after the server has updated the metrics internally
+    sleep ${SLEEP_DURATION}
     curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out
+
     kill_server
     wait_for_server_terminated ${SERVER_PID[@]}
 
@@ -285,7 +289,10 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
     fi
     set +e
 
+    # Make sure the metrics is retrieved after the server has updated the metrics internally
+    sleep ${SLEEP_DURATION}
     curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_no_stream_metrics.out
+
     kill_server
     wait_for_server_terminated ${SERVER_PID[@]}
 
@@ -342,7 +349,10 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
     fi
     set +e
 
+    # Make sure the metrics is retrieved after the server has updated the metrics internally
+    sleep ${SLEEP_DURATION}
     curl localhost:8002/metrics -o ${NUM_GPU}gpu_multi_model_metrics.out
+
     kill_server
     wait_for_server_terminated ${SERVER_PID[@]}
     fi
@@ -375,7 +385,10 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
     fi
     set +e
 
+    # Make sure the metrics is retrieved after the server has updated the metrics internally
+    sleep ${SLEEP_DURATION}
     curl localhost:8002/metrics -o ${NUM_GPU}gpu_IFB_stream_metrics.out
+
     kill_server
     wait_for_server_terminated ${SERVER_PID[@]}

dockerfile/Dockerfile.triton.trt_llm_backend

Lines changed: 5 additions & 10 deletions
@@ -57,24 +57,19 @@ ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:${LD_LIBRARY_PATH}
 ENV TRT_ROOT=/usr/local/tensorrt
 
 FROM install_dependencies as tensorrt_llm_build
-
-ARG TENSORRT_LLM_REPO=https://github.com/NVIDIA/TensorRT-LLM.git
-ARG TENSORRT_LLM_REPO_TAG=main
-
 RUN pip3 install --no-cache-dir \
     cmake \
     polygraphy==0.49.0 \
     mpi4py==3.1.5
 
-WORKDIR /workspace/
-RUN git clone --recurse-submodules --branch ${TENSORRT_LLM_REPO_TAG} ${TENSORRT_LLM_REPO} tenosrrt_llm
-
-WORKDIR /workspace/tenosrrt_llm
-RUN python3 scripts/build_wheel.py --trt_root /usr/local/tensorrt
+WORKDIR /workspace
+COPY scripts scripts
+COPY tensorrt_llm tensorrt_llm
+RUN cd tensorrt_llm && python3 scripts/build_wheel.py --trt_root="${TRT_ROOT}" --clean --job_count 18 && cd ..
 
 FROM install_dependencies as base
 
 WORKDIR /tmp
-COPY --from=tensorrt_llm_build /workspace/tenosrrt_llm/build/tensorrt_llm*whl .
+COPY --from=tensorrt_llm_build /workspace/tensorrt_llm/build/tensorrt_llm*whl .
 
 RUN pip3 install --no-cache-dir --extra-index-url https://pypi.nvidia.com tensorrt_llm*.whl

dockerfile/Dockerfile.trt_llm_backend

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_TAG=24.04-py3
+ARG BASE_TAG=24.05-py3
 
 FROM ${BASE_IMAGE}:${BASE_TAG} as base

tensorrt_llm

Submodule tensorrt_llm updated 94 files

tools/version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-bb75970fe2f21b2cb9a7d231010540397f6dfd79
+73b896d12a81662027fa6746ab3ed99450150e18
