System Info
GPU: NVIDIA A100
Driver Version: 545.23.08
CUDA: 12.3
versions:
- https://github.com/NVIDIA/TensorRT-LLM.git (71d8d4d)
- https://github.com/triton-inference-server/tensorrtllm_backend.git (bf5e900)
Model: zephyr-7b-beta
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
step 1:
- python3 ./tensorrtllm_backend/tensorrt_llm/examples/llama/convert_checkpoint.py --model_dir zephyr-7b-beta --output_dir zephyr-7b-beta-converted --dtype float16
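Before building the engine, it can be worth sanity-checking the converted checkpoint (a minimal sketch, assuming the standard TensorRT-LLM checkpoint layout where convert_checkpoint.py writes a top-level config.json plus per-rank .safetensors shards):

```python
import json
from pathlib import Path

ckpt_dir = Path("zephyr-7b-beta-converted")

# The converted checkpoint carries a top-level config.json describing the model.
config = json.loads((ckpt_dir / "config.json").read_text())
print("dtype:", config.get("dtype"))                # expected: float16
print("architecture:", config.get("architecture"))  # e.g. LlamaForCausalLM

# With --workers 1 / world size 1, a single rank0 weight shard is expected.
shards = sorted(ckpt_dir.glob("*.safetensors"))
print("weight shards:", [s.name for s in shards])
assert shards, "no weight shards found; conversion likely failed"
```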
step 2:
- trtllm-build --checkpoint_dir zephyr-7b-beta-converted \
--output_dir zephyr-7b-beta-trt-engine \
--workers 1 \
--remove_input_padding enable \
--context_fmha enable \
--gpt_attention_plugin float16 \
--gemm_plugin float16 \
--paged_kv_cache enable \
--max_num_tokens 65536 \
--max_batch_size 32 \
--max_input_len 16384 \
--multi_block_mode enable \
--strongly_typed
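Note that with --remove_input_padding enabled, --max_num_tokens bounds the total tokens packed into a single engine step, so it interacts with the batch and input-length limits above. A rough capacity check (my own arithmetic, not taken from the build output):

```python
# Build-flag capacity check (assumption: with remove_input_padding enabled,
# max_num_tokens caps the tokens scheduled into one forward pass).
max_batch_size = 32
max_input_len = 16384
max_num_tokens = 65536

worst_case = max_batch_size * max_input_len          # 524288 tokens
full_len_per_step = max_num_tokens // max_input_len  # 4 full-length contexts

print(f"worst-case batch: {worst_case} tokens vs budget {max_num_tokens}")
print(f"full {max_input_len}-token contexts per step: {full_len_per_step}")
```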
step 3 (tensorrtllm_backend parameters):
- MODEL_PATH=zephyr-7b-beta
- MODEL_PIPELINE_NAME=triton_model_repo
- MAX_BATCH_SIZE=32
- ENGINE_PATH=zephyr-7b-beta-trt-engine
- MAX_ATTENTION_WINDOW_SIZE=4096
- KV_CACHE_FREE_GPU_MEM_FRACTION=0.5
- batch_scheduler_policy=guaranteed_no_evict
- python3 tools/fill_template.py -i ${MODEL_PIPELINE_NAME}/preprocessing/config.pbtxt tokenizer_dir:${MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},preprocessing_instance_count:1
- python3 tools/fill_template.py -i ${MODEL_PIPELINE_NAME}/postprocessing/config.pbtxt tokenizer_dir:${MODEL_PATH},triton_max_batch_size:${MAX_BATCH_SIZE},postprocessing_instance_count:1
- python3 tools/fill_template.py -i ${MODEL_PIPELINE_NAME}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,bls_instance_count:1,accumulate_tokens:False
- python3 tools/fill_template.py -i ${MODEL_PIPELINE_NAME}/ensemble/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE}
- python3 tools/fill_template.py -i ${MODEL_PIPELINE_NAME}/tensorrt_llm/config.pbtxt triton_max_batch_size:${MAX_BATCH_SIZE},decoupled_mode:False,max_beam_width:1,engine_dir:${ENGINE_PATH},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:0,batch_scheduler_policy:${batch_scheduler_policy}
- python3 scripts/launch_triton_server.py --world_size=1 --model_repo=/code/tensorrtllm_backend/${MODEL_PIPELINE_NAME} --http_port=8081 --log --log-file ${MODEL_PIPELINE_NAME}_triton_log.txt
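After launching, Triton's standard health endpoints can confirm that the server and the ensemble loaded cleanly (a minimal sketch; port 8081 matches --http_port above):

```python
import requests

BASE = "http://localhost:8081"

# Server-level readiness (standard KServe/Triton health endpoint).
print("server ready:", requests.get(f"{BASE}/v2/health/ready").status_code == 200)

# Per-model readiness for the ensemble used in step 4.
print("ensemble ready:", requests.get(f"{BASE}/v2/models/ensemble/ready").status_code == 200)
```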
step 4:
- run inference on a long input text of ~7000 tokens (a request sketch follows below)
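For reference, the step 4 requests were issued along these lines (a minimal sketch against Triton's generate endpoint for the ensemble model; the field names follow the tensorrtllm_backend examples, long_input is a placeholder for the actual ~7000-token text, and max_tokens is an assumed value):

```python
import requests

long_input = "..."  # placeholder for the ~7000-token input text

# Generate-endpoint payload; field names follow the tensorrtllm_backend examples.
payload = {
    "text_input": long_input,
    "max_tokens": 512,  # assumed; long enough to expose the repeated-token output
    "bad_words": "",
    "stop_words": "",
}

r = requests.post(
    "http://localhost:8081/v2/models/ensemble/generate",  # port from step 3
    json=payload,
    timeout=300,
)
print(r.status_code)
print(r.json().get("text_output"))
```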
Expected behavior
"response": {
"context_logits": 0.0,
"cum_log_probs": 0.0,
"generation_logits": 0.0,
"model_name": "ensemble",
"model_version": "1",
"output_log_probs": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"sequence_end": false,
"sequence_id": 0,
"sequence_start": false,
"text_output": "1. <NAME> will set up an account and a control tower for the POC.\n2. <NAME> will check if the \"raise hand\" feature is available with the Realwear device.\n3. <NAME> will start the POC next week."
},
"status_code": 200,
"request_time": 2.1334147453308105
}
Actual behavior
"response": {
"context_logits": 0.0,
"cum_log_probs": 0.0,
"generation_logits": 0.0,
"model_name": "ensemble",
"model_version": "1",
"output_log_probs": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
],
"sequence_end": false,
"sequence_id": 0,
"sequence_start": false,
"text_output": "\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6\u68a6"
},
"status_code": 200,
"request_time": 10.642920732498169
}
Additional notes
The issue also persists for
...
--max_batch_size 8 \
--max_input_len 16384 \
...
The issue disappears for
...
--max_batch_size 1 \
--max_input_len 16384 \
...
The issue persists at an actual input concurrency of 1, 2, or 3, and may disappear when the concurrency is >= 4 (for --max_batch_size 8 and --max_batch_size 32).
The issue does not occur with the SmoothQuant-quantized zephyr-7b-beta model for any of the parameter sets reported above.
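Since the failure depends on concurrency, the observations above can be reproduced along these lines (a hypothetical sketch; send_request wraps the same generate call as in step 4, and concurrency is the variable under test):

```python
import concurrent.futures
import requests

URL = "http://localhost:8081/v2/models/ensemble/generate"
long_input = "..."  # placeholder for the ~7000-token input text

def send_request(_):
    payload = {"text_input": long_input, "max_tokens": 512,
               "bad_words": "", "stop_words": ""}
    r = requests.post(URL, json=payload, timeout=300)
    return r.json().get("text_output", "")

# Concurrency 1-3 reproduces the garbage output; >= 4 may not.
concurrency = 2
with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as pool:
    outputs = list(pool.map(send_request, range(concurrency)))

for i, out in enumerate(outputs):
    print(f"[{i}] {out[:80]!r}")
```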