Description
System info
- x64
- 23.10-trtllm-python-py3 Docker image
- TensorRT-LLM 0.11.0.dev2024061800
- L40S
- TensorRT 10.0.1.6
- GPU memory usage: 38 GB * 2
- running in a container (--cap-add=SYS_PTRACE --cap-add=SYS_ADMIN --security-opt seccomp=unconfined --ipc=host --network=host --gpus=all --shm-size=16g --privileged --ulimit memlock=-1)
- host memory: 200 GB
Issue
Hi, I am using the InternVL2-20B model, whose decoder is InternLM2. I have successfully converted the decoder to a TRT engine, and the model runs without any issues when I build it with the following command:
trtllm-build --checkpoint_dir /code/models/internlm2-chat-26b_vlm/trt_engines/fp16/2-gpu/ \
--output_dir /code/models/trt_models/internlm2_26b/batch16 \
--gemm_plugin float16 \
--gpt_attention_plugin float16 \
--paged_kv_cache enable \
--remove_input_padding enable \
--max_batch_size 16 \
--max_input_len 512 \
--max_seq_len 2048 \
--max_multimodal_len 4096 # 16 (max_batch_size) * 256 (num_visual_features)
However, when I change max_batch_size from 16 to 32, 64, or 128:
trtllm-build --checkpoint_dir /code/models/internlm2-chat-26b_vlm/trt_engines/fp16/2-gpu/ \
--output_dir /code/models/trt_models/internlm2_26b/batch32 \
--gemm_plugin float16 \
--gpt_attention_plugin float16 \
--paged_kv_cache enable \
--remove_input_padding enable \
--max_batch_size 32 \
--max_input_len 384 \
--max_seq_len 2560 \
--max_multimodal_len 8192 # 32 (max_batch_size) * 256 (num_visual_features)
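As the comments note, max_multimodal_len is sized as max_batch_size * num_visual_features. A minimal sketch of that arithmetic (the per-image visual token count of 256 is the value assumed in the comments above; adjust it if your vision encoder emits a different number):

# Sketch of the sizing arithmetic used in the build commands above.
NUM_VISUAL_FEATURES = 256  # visual tokens per image assumed in the comments

def max_multimodal_len(max_batch_size: int,
                       num_visual_features: int = NUM_VISUAL_FEATURES) -> int:
    """Each request may carry num_visual_features visual tokens, so the
    engine reserves max_batch_size * num_visual_features slots."""
    return max_batch_size * num_visual_features

assert max_multimodal_len(16) == 4096  # batch16 engine
assert max_multimodal_len(32) == 8192  # batch32 engine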
I use the Triton Inference Server to serve the model:
mpirun -n 1 --allow-run-as-root /code/triton_2405/tritonserver/bin/tritonserver --http-port=8223 --metrics-port=8224 \
--model-repository=/code/triton_models/internlm_triton_v1_20B --disable-auto-complete-config \
--backend-config=python,shm-region-prefix-name=prefix0 --model-control-mode=explicit --load-model=preprocessing \
--load-model=tensorrt_llm --load-model=postprocessing --load-model=tensorrt_llm_bls
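To send requests I call Triton's HTTP generate endpoint on the BLS model; a minimal sketch (the field names text_input and max_tokens follow the default tensorrt_llm_bls config and may differ in this customized, image-aware pipeline):

import requests

# Minimal request sketch, assuming the default tensorrt_llm_bls input
# names (text_input, max_tokens); the customized preprocessing model in
# this repository may expect additional fields (e.g., an image payload).
URL = "http://localhost:8223/v2/models/tensorrt_llm_bls/generate"
payload = {
    "text_input": "Please describe the figure in English:<img>",
    "max_tokens": 256,
}
print(requests.post(URL, json=payload).json())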
When the batch size is greater than 16, I encounter the following error:
query[0].decode() Please describe the figure in English:
image.shape (452, 419, 3)
query input: Please describe the figure in English:<img>
pre_input_ids: tensor([[ 5658, 7654, 410, 7219, 435, 6519, 334, 92544]])
post_input_ids: tensor([[92545]])
[debug] input_id shape torch.Size([1, 265])
[debug] prompt_table shape torch.Size([256, 6144])
[debug] input_len tensor([264], dtype=torch.int32)
[debug] prompt_table_data shape torch.Size([1, 256, 6144])
[debug] tasks shape torch.Size([1])
[debug] task_vocab_size tensor([[256]], dtype=torch.int32)
[TensorRT-LLM][ERROR] tensorrt_llm::common::TllmException: [TensorRT-LLM][ERROR] CUDA runtime error in ::cudaFreeHost(ptr): unspecified launch failure (/code/lab/projects/TensorRT-LLM/cpp/tensorrt_llm/runtime/tllmBuffers.h:168)
1 0x7f16727e92a8 /opt/tritonserver/backends/tensorrtllm/libtensorrt_llm.so(+0x71d2a8) [0x7f16727e92a8]
2 0x7f167448ae3c virtual thunk to tensorrt_llm::runtime::GenericTensor<tensorrt_llm::runtime::PinnedAllocator>::~GenericTensor() + 140
3 0x7f167479b701 tensorrt_llm::batch_manager::PromptTuningBuffers::fill(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, tensorrt_llm::runtime::BufferManager const&, bool) + 5793
4 0x7f167479ee3d tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 9549
5 0x7f16747a1d02 tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 194
6 0x7f16747c1242 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 162
7 0x7f16747c13f6 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 246
8 0x7f16747c1d79 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2409
9 0x7f16747e21eb tensorrt_llm::executor::Executor::Impl::forwardAsync(std::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 411
10 0x7f16747e57fd tensorrt_llm::executor::Executor::Impl::executionLoop() + 301
11 0x7f1670a72080 /opt/tritonserver/backends/tensorrtllm/libtensorrt_llm_nvrtc_wrapper.so(+0x31e3080) [0x7f1670a72080]
12 0x7f166d386ac3 /lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7f166d386ac3]
13 0x7f166d418a40 /lib/x86_64-linux-gnu/libc.so.6(+0x126a40) [0x7f166d418a40]
[TensorRT-LLM][ERROR] Encountered an error in forwardAsync function: [TensorRT-LLM][ERROR] CUDA runtime error in cudaMemcpyAsync(dst.data(), src, dst.getSizeInBytes(), cudaMemcpyDefault, mStream->get()): unspecified launch failure (/code/lab/projects/TensorRT-LLM/cpp/tensorrt_llm/runtime/bufferManager.cpp:131)
1 0x55a3ebc92015 void tensorrt_llm::common::check<cudaError>(cudaError, char const*, char const*, int) + 149
2 0x7f167479e706 tensorrt_llm::batch_manager::RuntimeBuffers::setFromInputs(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 7702
3 0x7f16747a1d02 tensorrt_llm::batch_manager::RuntimeBuffers::prepareStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int, int, tensorrt_llm::batch_manager::DecoderBuffers&, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::kv_cache_manager::KVCacheManager*, tensorrt_llm::batch_manager::rnn_state_manager::RnnStateManager*, std::map<unsigned long, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::shared_ptr<std::vector<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig, std::allocator<tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig> > > > > > const&, tensorrt_llm::runtime::TllmRuntime const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&) + 194
4 0x7f16747c1242 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeStep(std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, std::vector<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&, int) + 162
5 0x7f16747c13f6 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::executeBatch(tensorrt_llm::batch_manager::ScheduledRequests const&) + 246
6 0x7f16747c1d79 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::forwardAsync(std::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > > const&) + 2409
7 0x7f16747e21eb tensorrt_llm::executor::Executor::Impl::forwardAsync(std::list<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest>, std::allocator<std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> > >&) + 411
8 0x7f16747e57fd tensorrt_llm::executor::Executor::Impl::executionLoop() + 301
9 0x7f1670a72080 /opt/tritonserver/backends/tensorrtllm/libtensorrt_llm_nvrtc_wrapper.so(+0x31e3080) [0x7f1670a72080]
10 0x7f166d386ac3 /lib/x86_64-linux-gnu/libc.so.6(+0x94ac3) [0x7f166d386ac3]
11 0x7f166d418a40 /lib/x86_64-linux-gnu/libc.so.6(+0x126a40) [0x7f166d418a40]
^CSignal (15) received.
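For what it's worth, the debug shapes printed before the crash look internally consistent, so the request itself appears well-formed; a quick sanity check based only on the values in the log above:

# Sanity check of the debug shapes printed before the crash; a sketch
# based only on the logged values, not on library internals.
pre_tokens = 8       # pre_input_ids carries 8 token IDs
visual_tokens = 256  # prompt_table is [256, 6144]; task_vocab_size is 256
post_tokens = 1      # post_input_ids is [[92545]]

# Matches the reported input_id shape torch.Size([1, 265]).
assert pre_tokens + visual_tokens + post_tokens == 265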