
Commit e8ae70c

Update TensorRT-LLM backend (triton-inference-server#161)
* Update TensorRT-LLM backend
1 parent 37ed967 commit e8ae70c

30 files changed: +1278 -248 lines changed

README.md

Lines changed: 77 additions & 0 deletions
@@ -377,6 +377,83 @@ You might have to contact your cluster's administrator to help you customize the
pkill tritonserver
```

## Triton Metrics
Starting with the 23.11 release of Triton, users can obtain TRT LLM Batch Manager [statistics](https://github.com/NVIDIA/TensorRT-LLM/blob/ffd5af342a817a2689d38e4af2cc59ded877e339/docs/source/batch_manager.md#statistics) by querying the Triton metrics endpoint. To do so, launch a Triton server in any of the ways described above (ensuring the build code / container is 23.11 or later) and send a request to the server's generate endpoint. Upon receiving a successful response, you can query the metrics endpoint by entering the following:
```bash
curl localhost:8002/metrics
```
Batch manager statistics are reported by the metrics endpoint in fields that are prefixed with `nv_trt_llm_`. Your output for these fields should look similar to the following (assuming your model is an inflight batcher model):
```bash
# HELP nv_trt_llm_request_statistics TRT LLM request metrics
# TYPE nv_trt_llm_request_statistics gauge
nv_trt_llm_request_statistics{model="tensorrt_llm",request_type="context",version="1"} 1
nv_trt_llm_request_statistics{model="tensorrt_llm",request_type="scheduled",version="1"} 1
nv_trt_llm_request_statistics{model="tensorrt_llm",request_type="max",version="1"} 512
nv_trt_llm_request_statistics{model="tensorrt_llm",request_type="active",version="1"} 0
# HELP nv_trt_llm_runtime_memory_statistics TRT LLM runtime memory metrics
# TYPE nv_trt_llm_runtime_memory_statistics gauge
nv_trt_llm_runtime_memory_statistics{memory_type="pinned",model="tensorrt_llm",version="1"} 0
nv_trt_llm_runtime_memory_statistics{memory_type="gpu",model="tensorrt_llm",version="1"} 1610236
nv_trt_llm_runtime_memory_statistics{memory_type="cpu",model="tensorrt_llm",version="1"} 0
# HELP nv_trt_llm_kv_cache_block_statistics TRT LLM KV cache block metrics
# TYPE nv_trt_llm_kv_cache_block_statistics gauge
nv_trt_llm_kv_cache_block_statistics{kv_cache_block_type="tokens_per",model="tensorrt_llm",version="1"} 64
nv_trt_llm_kv_cache_block_statistics{kv_cache_block_type="used",model="tensorrt_llm",version="1"} 1
nv_trt_llm_kv_cache_block_statistics{kv_cache_block_type="free",model="tensorrt_llm",version="1"} 6239
nv_trt_llm_kv_cache_block_statistics{kv_cache_block_type="max",model="tensorrt_llm",version="1"} 6239
# HELP nv_trt_llm_inflight_batcher_statistics TRT LLM inflight_batcher-specific metrics
# TYPE nv_trt_llm_inflight_batcher_statistics gauge
nv_trt_llm_inflight_batcher_statistics{inflight_batcher_specific_metric="micro_batch_id",model="tensorrt_llm",version="1"} 0
nv_trt_llm_inflight_batcher_statistics{inflight_batcher_specific_metric="generation_requests",model="tensorrt_llm",version="1"} 0
nv_trt_llm_inflight_batcher_statistics{inflight_batcher_specific_metric="total_context_tokens",model="tensorrt_llm",version="1"} 0
# HELP nv_trt_llm_general_statistics General TRT LLM statistics
# TYPE nv_trt_llm_general_statistics gauge
nv_trt_llm_general_statistics{general_type="iteration_counter",model="tensorrt_llm",version="1"} 0
nv_trt_llm_general_statistics{general_type="timestamp",model="tensorrt_llm",version="1"} 1700074049
```
If, instead, you launched a V1 model, your output will look similar to the output above, except that the inflight batcher related fields will be replaced with something similar to the following:
```bash
# HELP nv_trt_llm_v1_statistics TRT LLM v1-specific metrics
# TYPE nv_trt_llm_v1_statistics gauge
nv_trt_llm_v1_statistics{model="tensorrt_llm",v1_specific_metric="total_generation_tokens",version="1"} 20
nv_trt_llm_v1_statistics{model="tensorrt_llm",v1_specific_metric="empty_generation_slots",version="1"} 0
nv_trt_llm_v1_statistics{model="tensorrt_llm",v1_specific_metric="total_context_tokens",version="1"} 5
```
Please note that, as of the 23.11 Triton release, a link between the base Triton metrics (such as inference request count and latency) and the TRT LLM statistics is being actively developed but is not yet supported. As such, the following fields will report 0:
```bash
# HELP nv_inference_request_success Number of successful inference requests, all batch sizes
# TYPE nv_inference_request_success counter
nv_inference_request_success{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_request_failure Number of failed inference requests, all batch sizes
# TYPE nv_inference_request_failure counter
nv_inference_request_failure{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_count Number of inferences performed (does not include cached requests)
# TYPE nv_inference_count counter
nv_inference_count{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_exec_count Number of model executions performed (does not include cached requests)
# TYPE nv_inference_exec_count counter
nv_inference_exec_count{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_request_duration_us Cumulative inference request duration in microseconds (includes cached requests)
# TYPE nv_inference_request_duration_us counter
nv_inference_request_duration_us{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_queue_duration_us Cumulative inference queuing duration in microseconds (includes cached requests)
# TYPE nv_inference_queue_duration_us counter
nv_inference_queue_duration_us{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_compute_input_duration_us Cumulative compute input duration in microseconds (does not include cached requests)
# TYPE nv_inference_compute_input_duration_us counter
nv_inference_compute_input_duration_us{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_compute_infer_duration_us Cumulative compute inference duration in microseconds (does not include cached requests)
# TYPE nv_inference_compute_infer_duration_us counter
nv_inference_compute_infer_duration_us{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_compute_output_duration_us Cumulative inference compute output duration in microseconds (does not include cached requests)
# TYPE nv_inference_compute_output_duration_us counter
nv_inference_compute_output_duration_us{model="tensorrt_llm",version="1"} 0
# HELP nv_inference_pending_request_count Instantaneous number of pending requests awaiting execution per-model.
# TYPE nv_inference_pending_request_count gauge
nv_inference_pending_request_count{model="tensorrt_llm",version="1"} 0
```
## Testing the TensorRT-LLM Backend
Please follow the guide in [`ci/README.md`](ci/README.md) to see how to run
the testing for TensorRT-LLM backend.
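
As a quick complement to the metrics documentation added in the README above, the `nv_trt_llm_` fields can also be scraped programmatically. The following is a minimal sketch using only the Python standard library; it assumes the default metrics port 8002 shown in the curl example, so adjust the URL for your deployment:

```python
# Sketch: scrape the Triton metrics endpoint and print the TRT LLM
# batch manager statistics. Assumes Triton serves metrics on
# localhost:8002 as in the curl example above.
from urllib.request import urlopen

METRICS_URL = "http://localhost:8002/metrics"  # default Triton metrics port


def trt_llm_metrics(url: str = METRICS_URL) -> dict:
    """Return a {metric_with_labels: value} dict for all nv_trt_llm_* gauges."""
    text = urlopen(url).read().decode("utf-8")
    metrics = {}
    for line in text.splitlines():
        # Skip HELP/TYPE comments and metrics that are not TRT LLM specific.
        if line.startswith("#") or not line.startswith("nv_trt_llm_"):
            continue
        name, _, value = line.rpartition(" ")
        metrics[name] = float(value)
    return metrics


if __name__ == "__main__":
    for name, value in trt_llm_metrics().items():
        print(f"{name} = {value}")
```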

all_models/gpt/ensemble/config.pbtxt

Lines changed: 6 additions & 6 deletions
@@ -9,7 +9,7 @@ input [
   },
   {
     name: "max_tokens"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   },
   {
@@ -24,19 +24,19 @@ input [
   },
   {
     name: "end_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
   {
     name: "pad_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
   {
     name: "top_k"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -66,7 +66,7 @@ input [
   },
   {
     name: "min_length"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -84,7 +84,7 @@ input [
   },
   {
     name: "beam_width"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
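
Since these ensemble inputs are now `TYPE_INT32`, client-side tensors need a signed 32-bit dtype. The sketch below shows the idea with `tritonclient`; the model name `ensemble`, the `text_input`/`text_output` tensor names, the HTTP port 8000, and sending only these two inputs are assumptions for illustration, so adapt them to your actual config:

```python
# Sketch: send ensemble inputs with the new INT32 dtypes.
# Assumes tritonclient[http] is installed and the ensemble is named "ensemble".
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# TYPE_STRING tensors are sent as BYTES / object arrays.
text = np.array([["What is machine learning?"]], dtype=object)
# Integer controls must now be int32 rather than uint32.
max_tokens = np.array([[64]], dtype=np.int32)

inputs = [
    httpclient.InferInput("text_input", list(text.shape), "BYTES"),
    httpclient.InferInput("max_tokens", list(max_tokens.shape), "INT32"),
]
inputs[0].set_data_from_numpy(text)
inputs[1].set_data_from_numpy(max_tokens)

result = client.infer(model_name="ensemble", inputs=inputs)
print(result.as_numpy("text_output"))
```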

all_models/gpt/preprocessing/config.pbtxt

Lines changed: 3 additions & 3 deletions
@@ -19,7 +19,7 @@ input [
   },
   {
     name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   }
 ]
@@ -46,12 +46,12 @@ output [
   },
   {
     name: "REQUEST_OUTPUT_LEN"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   },
   {
     name: "PROMPT_LEARNING_TASK_NAME_IDS"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
   }
 ]

all_models/gpt/tensorrt_llm/config.pbtxt

Lines changed: 6 additions & 6 deletions
@@ -21,24 +21,24 @@ input [
   },
   {
     name: "request_output_len"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   },
   {
     name: "end_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
   },
   {
     name: "pad_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
   },
   {
     name: "beam_width"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -52,7 +52,7 @@ input [
   },
   {
     name: "runtime_top_k"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
@@ -80,7 +80,7 @@ input [
   },
   {
     name: "min_length"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true

all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 51 additions & 7 deletions
@@ -35,7 +35,7 @@ input [
   },
   {
     name: "max_tokens"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ -1 ]
   },
   {
@@ -52,19 +52,19 @@ input [
   },
   {
     name: "end_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
   {
     name: "pad_id"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
   {
     name: "top_k"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -94,7 +94,7 @@ input [
   },
   {
     name: "min_length"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -110,9 +110,15 @@ input [
     dims: [ 1 ]
     optional: true
   },
+  {
+    name: "return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "beam_width"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -130,7 +136,7 @@ input [
   },
   {
     name: "prompt_vocab_size"
-    data_type: TYPE_UINT32
+    data_type: TYPE_INT32
     dims: [ 1 ]
     optional: true
   },
@@ -152,6 +158,16 @@ output [
     name: "text_output"
     data_type: TYPE_STRING
     dims: [ -1 ]
+  },
+  {
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
   }
 ]
 ensemble_scheduling {
@@ -267,6 +283,10 @@ ensemble_scheduling {
         key: "random_seed"
         value: "random_seed"
       }
+      input_map {
+        key: "return_log_probs"
+        value: "return_log_probs"
+      }
       input_map {
         key: "beam_width"
         value: "beam_width"
@@ -298,6 +318,14 @@ ensemble_scheduling {
       output_map {
         key: "sequence_length"
         value: "_SEQUENCE_LENGTH"
+      },
+      output_map {
+        key: "cum_log_probs"
+        value: "_CUM_LOG_PROBS"
+      }
+      output_map {
+        key: "output_log_probs"
+        value: "_OUTPUT_LOG_PROBS"
       }
     },
     {
@@ -307,6 +335,14 @@ ensemble_scheduling {
         key: "TOKENS_BATCH"
         value: "_TOKENS_BATCH"
       }
+      input_map {
+        key: "CUM_LOG_PROBS"
+        value: "_CUM_LOG_PROBS"
+      }
+      input_map {
+        key: "OUTPUT_LOG_PROBS"
+        value: "_OUTPUT_LOG_PROBS"
+      }
       input_map {
         key: "SEQUENCE_LENGTH"
         value: "_SEQUENCE_LENGTH"
@@ -315,6 +351,14 @@ ensemble_scheduling {
         key: "OUTPUT"
         value: "text_output"
       }
+      output_map {
+        key: "OUT_OUTPUT_LOG_PROBS"
+        value: "output_log_probs"
+      }
+      output_map {
+        key: "OUT_CUM_LOG_PROBS"
+        value: "cum_log_probs"
+      }
     }
   ]
 }
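
With `return_log_probs`, `cum_log_probs`, and `output_log_probs` wired through the ensemble above, log probabilities can be requested end to end. A minimal sketch via the HTTP generate endpoint follows; the model name `ensemble`, the default port 8000, and the prompt/token count are illustrative assumptions:

```python
# Sketch: request log probabilities through the generate endpoint.
# Assumes the ensemble above is served as model "ensemble" on port 8000.
import json
from urllib.request import Request, urlopen

payload = {
    "text_input": "What is machine learning?",
    "max_tokens": 20,           # now TYPE_INT32
    "return_log_probs": True,   # new TYPE_BOOL input added above
}
req = Request(
    "http://localhost:8000/v2/models/ensemble/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
result = json.loads(urlopen(req).read())

# The new ensemble outputs arrive alongside text_output.
print(result["text_output"])
print(result.get("cum_log_probs"))
print(result.get("output_log_probs"))
```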

all_models/inflight_batcher_llm/postprocessing/1/model.py

Lines changed: 17 additions & 2 deletions
@@ -113,6 +113,14 @@ def execute(self, requests):
             sequence_lengths = pb_utils.get_input_tensor_by_name(
                 request, 'SEQUENCE_LENGTH').as_numpy()
 
+            # Get cum log probs
+            cum_log_probs = pb_utils.get_input_tensor_by_name(
+                request, 'CUM_LOG_PROBS').as_numpy()
+
+            # Get output log probs
+            output_log_probs = pb_utils.get_input_tensor_by_name(
+                request, 'OUTPUT_LOG_PROBS').as_numpy()
+
             # Reshape Input
             # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
             # tokens_batch = tokens_batch.T
@@ -126,15 +134,22 @@ def execute(self, requests):
                 'OUTPUT',
                 np.array(outputs).astype(self.output_dtype))
 
+            out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
+                                                cum_log_probs)
+
+            out_output_log_probs = pb_utils.Tensor('OUT_OUTPUT_LOG_PROBS',
+                                                   output_log_probs)
+
             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
             # Below is an example of how you can set errors in inference
             # response:
             #
             # pb_utils.InferenceResponse(
             #     output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(
-                output_tensors=[output_tensor])
+            inference_response = pb_utils.InferenceResponse(output_tensors=[
+                output_tensor, out_cum_log_probs, out_output_log_probs
+            ])
             responses.append(inference_response)
 
             # You should return a list of pb_utils.InferenceResponse. Length
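
Once the log-prob tensors reach the client (for example via the generate request sketched earlier), they can be turned into scores with plain numpy. The snippet below is illustrative only; the values and shapes assume one cumulative log probability per beam and one log probability per generated token:

```python
# Sketch: derive simple scores from the returned log probabilities.
# The example values and shapes are illustrative, not taken from a real run.
import numpy as np

cum_log_probs = np.array([-12.7])                  # assumed: one value per beam
output_log_probs = np.array([[-1.9, -0.4, -2.1]])  # assumed: per-token log probs

token_probs = np.exp(output_log_probs)                   # per-token probabilities
normalized = cum_log_probs / output_log_probs.shape[-1]  # length-normalized log prob

print("token probabilities:", token_probs)
print("length-normalized log prob per beam:", normalized)
```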
