Labels: bug (Something isn't working), triaged (Issue has been triaged by maintainers)
Description
System Info
GPU: NVIDIA GeForce RTX 3090
TensorRT-LLM version: 0.7.1
Who can help?
No response
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
1. Convert the HF checkpoint to the bin model with INT8 KV-cache calibration:
python hf_llama_convert.py -i /root/models/Llama-2-7b/ -o /root/TensorRT-LLM/examples/llama/llama2_7b_w8_int8_kv_cache/ --calibrate-kv-cache -t fp16
2. Build the engine:
python build.py --bin_model_dir /root/TensorRT-LLM/examples/llama/llama2_7b_w8_int8_kv_cache/bin_model_dir/ --dtype float16 --use_gpt_attention_plugin float16 --use_gemm_plugin float16 --output_dir /root/TensorRT-LLM/examples/llama/llama2_7b_w8_int8_kv_cache/1-gpu --int8_kv_cache --use_weight_only
3. Test the engine on MMLU:
python mmlu.py --hf_model_dir /root/models/Llama-2-7b/ --engine_dir /root/TensorRT-LLM/examples/llama/llama2_7b_w8_int8_kv_cache/1-gpu/ --test_trt_llm
(mmlu.py is provided by TensorRT-LLM here: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/mmlu.py; a quick sanity check for step 1 is sketched below.)
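One way to sanity-check step 1 is to confirm that per-layer INT8 KV-cache scaling factors were actually written to the bin model directory. Below is a minimal sketch; the scale_y_quant_orig file-name pattern is an assumption based on the FT-style binary export, not confirmed against the converter code:

```python
import glob
import os

# Output of step 1 / input of step 2 (hf_llama_convert.py with --calibrate-kv-cache).
bin_model_dir = "/root/TensorRT-LLM/examples/llama/llama2_7b_w8_int8_kv_cache/bin_model_dir/"

# Assumption: one KV-cache dequantization scale per layer, stored in files
# whose names contain "scale_y_quant_orig".
scales = sorted(glob.glob(os.path.join(bin_model_dir, "*scale_y_quant_orig*")))
print(f"found {len(scales)} KV-cache scale files")
if not scales:
    print("no KV-cache scales found -- the calibration output from step 1 may be missing")
```

If no scale files show up, the calibration in step 1 would be the first suspect.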
Unfortunately, step 3 gives me:
Average accuracy 0.297 - math
Average accuracy 0.399 - health
Average accuracy 0.300 - physics
Average accuracy 0.519 - business
Average accuracy 0.361 - biology
Average accuracy 0.274 - chemistry
Average accuracy 0.299 - computer science
Average accuracy 0.349 - economics
Average accuracy 0.317 - engineering
Average accuracy 0.367 - philosophy
Average accuracy 0.513 - other
Average accuracy 0.439 - history
Average accuracy 0.404 - geography
Average accuracy 0.475 - politics
Average accuracy 0.380 - psychology
Average accuracy 0.512 - culture
Average accuracy 0.330 - law
Average accuracy 0.306 - STEM
Average accuracy 0.367 - humanities
Average accuracy 0.409 - social sciences
Average accuracy 0.457 - other (business, health, misc.)
**Average accuracy: 0.384**
The final MMLU accuracy is 38.4, while the fp16 baseline is 45.9, a drop of 7.5 points. According to LLM quantization papers, accuracy should not degrade this much in this setting.
The config.json generated by build.py is as follows:
{
"builder_config": {
"autopp_config": null,
"gather_context_logits": false,
"gather_generation_logits": false,
"hf_modules_to_trtllm_modules": {
"down_proj": "mlp_4h_to_h",
"gate_proj": "mlp_h_to_4h",
"k_proj": "attn_k",
"o_proj": "attn_dense",
"q_proj": "attn_q",
"up_proj": "mlp_gate",
"v_proj": "attn_v"
},
"hidden_act": "silu",
"hidden_size": 4096,
"int8": true,
"lora_target_modules": null,
"max_batch_size": 8,
"max_beam_width": 1,
"max_input_len": 2048,
"max_num_tokens": null,
"max_output_len": 512,
"max_position_embeddings": 2048,
"max_prompt_embedding_table_size": 0,
"mlp_hidden_size": 11008,
"name": "llama",
"num_heads": 32,
"num_kv_heads": 32,
"num_layers": 32,
"parallel_build": false,
"pipeline_parallel": 1,
"precision": "float16",
"quant_mode": 66,
"tensor_parallel": 1,
"trtllm_modules_to_hf_modules": {
"attn_dense": "o_proj",
"attn_k": "k_proj",
"attn_q": "q_proj",
"attn_v": "v_proj",
"mlp_4h_to_h": "down_proj",
"mlp_gate": "up_proj",
"mlp_h_to_4h": "gate_proj"
},
"use_refit": false,
"vocab_size": 32000
},
"plugin_config": {
"attention_qk_half_accumulation": false,
"bert_attention_plugin": false,
"context_fmha_type": 0,
"enable_xqa": false,
"gemm_plugin": "float16",
"gpt_attention_plugin": "float16",
"identity_plugin": false,
"layernorm_plugin": false,
"layernorm_quantization_plugin": false,
"lookup_plugin": false,
"lora_plugin": false,
"multi_block_mode": false,
"nccl_plugin": false,
"paged_kv_cache": false,
"quantize_per_token_plugin": false,
"quantize_tensor_plugin": false,
"remove_input_padding": false,
"rmsnorm_plugin": false,
"rmsnorm_quantization_plugin": false,
"smooth_quant_gemm_plugin": false,
"tokens_per_block": 0,
"use_context_fmha_for_generation": false,
"use_custom_all_reduce": false,
"use_paged_context_fmha": false,
"weight_only_groupwise_quant_matmul_plugin": false,
"weight_only_quant_matmul_plugin": "float16"
}
}
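For reference, the quant_mode value 66 decodes to INT8 weight-only plus INT8 KV cache. A minimal decoding sketch is below; the bit layout mirrors my reading of tensorrt_llm/quantization/mode.py and should be treated as an assumption (the real class can be imported as tensorrt_llm.quantization.QuantMode):

```python
from enum import IntFlag, auto

# Assumed bit layout mirroring tensorrt_llm.quantization.QuantMode (0.7.x);
# the flag order here is my assumption, not the authoritative definition.
class QuantMode(IntFlag):
    INT4_WEIGHTS = auto()   # 1
    INT8_WEIGHTS = auto()   # 2
    ACTIVATIONS = auto()    # 4
    PER_CHANNEL = auto()    # 8
    PER_TOKEN = auto()      # 16
    PER_GROUP = auto()      # 32
    INT8_KV_CACHE = auto()  # 64
    FP8_KV_CACHE = auto()   # 128
    FP8_QDQ = auto()        # 256

mode = QuantMode(66)
print([flag.name for flag in QuantMode if flag in mode])
# -> ['INT8_WEIGHTS', 'INT8_KV_CACHE']
```

So the engine was built with the intended quantization mode (INT8 weight-only plus INT8 KV cache); the accuracy loss does not appear to come from a wrong quant_mode in the build config.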
Is there any bug in the quantization code?
Expected behavior
MMLU accuracy should not drop significantly with INT8 weight-only quantization plus an INT8 KV cache.
actual behavior
MMLU accuracy drops from 45.9 (fp16) to 38.4.
additional notes
None.