diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py index 0fab4b58c0..a6c51dffee 100644 --- a/.github/scripts/action_tools.py +++ b/.github/scripts/action_tools.py @@ -100,7 +100,7 @@ def _load_hf_results(test_results: dict, model_name: str): return out -def evaluate(models: List[str], workspace: str): +def evaluate(models: List[str], datasets: List[str], workspace: str): """Evaluate models from lmdeploy using opencompass. Args: @@ -150,6 +150,7 @@ def evaluate(models: List[str], workspace: str): continue logging.info(f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n') with open(config_path_new, 'a') as f: + f.write(f'\ndatasets = {datasets}\n') f.write(f'\nmodels = [ {target_model} ]\n') work_dir = os.path.join(workspace, target_model) diff --git a/.github/scripts/eval_opencompass_config.py b/.github/scripts/eval_opencompass_config.py index 16a2737243..6b2fa43f15 100644 --- a/.github/scripts/eval_opencompass_config.py +++ b/.github/scripts/eval_opencompass_config.py @@ -4,27 +4,25 @@ with read_base(): # choose a list of datasets - # from .datasets.ceval.ceval_gen_5f30c7 import \ - # ceval_datasets # noqa: F401, E501 - # from .datasets.crowspairs.crowspairs_gen_381af0 import \ - # crowspairs_datasets # noqa: F401, E501 + from .datasets.ceval.ceval_gen_5f30c7 import \ + ceval_datasets # noqa: F401, E501 + from .datasets.crowspairs.crowspairs_gen_381af0 import \ + crowspairs_datasets # noqa: F401, E501 from .datasets.gsm8k.gsm8k_gen_1d7fe4 import \ gsm8k_datasets # noqa: F401, E501 from .datasets.mmlu.mmlu_gen_a484b3 import \ mmlu_datasets # noqa: F401, E501 - # from .datasets.race.race_gen_69ee4f import \ - # race_datasets # noqa: F401, E501 - # from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \ - # WiC_datasets # noqa: F401, E501 - # from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \ - # WSC_datasets # noqa: F401, E501 - # from .datasets.triviaqa.triviaqa_gen_2121ce import \ - # triviaqa_datasets # noqa: F401, E501 + from .datasets.race.race_gen_69ee4f import \ + race_datasets # noqa: F401, E501 + from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \ + WiC_datasets # noqa: F401, E501 + from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \ + WSC_datasets # noqa: F401, E501 + from .datasets.triviaqa.triviaqa_gen_2121ce import \ + triviaqa_datasets # noqa: F401, E501 # and output the results in a chosen format from .summarizers.medium import summarizer # noqa: F401, E501 -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - internlm_meta_template = dict(round=[ dict(role='HUMAN', begin='<|User|>:', end='\n'), dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), @@ -46,6 +44,17 @@ ], eos_token_id=2) +llama3_meta_template = dict(round=[ + dict(role='HUMAN', + begin='<|start_header_id|>user<|end_header_id|>\n\n', + end='<|eot_id|>'), + dict(role='BOT', + begin='<|start_header_id|>assistant<|end_header_id|>\n\n', + end='<|eot_id|>', + generate=True), +], + eos_token_id=[128001, 128009]) + qwen_meta_template = dict(round=[ dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), dict(role='BOT', @@ -54,7 +63,7 @@ generate=True), ], ) -qwen1dot5_meta_template = dict( +qwen1_5_meta_template = dict( round=[ dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', @@ -91,104 +100,200 @@ ], eos_token_id=1) +MAX_SESSION_LEN = 2048 +MAX_NEW_TOKENS = 100 + +tb_engine_config_template_max_bs_8 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=8, + 
rope_scaling_factor=1.0) +tb_engine_config_template_max_bs_16 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=16, + rope_scaling_factor=1.0) +tb_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=32, + rope_scaling_factor=1.0) +tb_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128, + rope_scaling_factor=1.0) +tb_engine_config_template_max_bs_128_tp2 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128, + tp=2, + rope_scaling_factor=1.0) + +pt_engine_config_template_max_bs_8 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=8) +pt_engine_config_template_max_bs_16 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=16) +pt_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=32) +pt_engine_config_template_max_bs_64 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=64) +pt_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128) +pt_engine_config_template_max_bs_128_tp2 = dict(session_len=MAX_SESSION_LEN, + tp=2, + max_batch_size=128) +pt_engine_config_template_max_bs_64_tp2 = dict(session_len=MAX_SESSION_LEN, + tp=2, + max_batch_size=64) + +pt_engine_config_template_max_bs_8_prefill = dict(session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=8) +pt_engine_config_template_max_bs_16_prefill = dict(session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=16) +pt_engine_config_template_max_bs_64_prefill = dict(session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=64) +pt_engine_config_template_max_bs_128_prefill = dict( + session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=128) +pt_engine_config_template_max_bs_8_prefill_tp2 = dict( + session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=8, + tp=2) +pt_engine_config_template_max_bs_64_prefill_tp2 = dict( + session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=64, + tp=2) +pt_engine_config_template_max_bs_128_prefill_tp2 = dict( + session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=128, + tp=2) +tb_awq_engine_config_template_max_bs_8 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=8, + model_format='awq', + rope_scaling_factor=1.0) +tb_awq_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=32, + model_format='awq', + rope_scaling_factor=1.0) +tb_awq_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128, + model_format='awq', + rope_scaling_factor=1.0) + +tb_awq_engine_config_template_max_bs_128_tp2 = dict( + session_len=MAX_SESSION_LEN, + max_batch_size=128, + model_format='awq', + tp=2, + rope_scaling_factor=1.0) + +tb_kvint4_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128, + quant_policy=4, + rope_scaling_factor=1.0) + +tb_kvint4_engine_config_template_max_bs_128_tp2 = dict( + session_len=MAX_SESSION_LEN, + max_batch_size=128, + quant_policy=4, + tp=2, + rope_scaling_factor=1.0) + +gen_config_template = dict(top_k=1, + top_p=0.8, + temperature=1.0, + max_new_tokens=MAX_NEW_TOKENS) +qwen_gen_config_template = dict(top_k=1, + top_p=0.8, + temperature=1.0, + stop_words=[151645], + max_new_tokens=MAX_NEW_TOKENS) + 
+tokenizer_kwargs_template = dict(padding_side='left', + truncation_side='left', + use_fast=False, + trust_remote_code=True) +model_kwargs_template = dict(device_map='auto', trust_remote_code=True) + +run_cfg_tp1_template = dict(num_gpus=1, num_procs=1) +run_cfg_tp2_template = dict(num_gpus=2, num_procs=1) + # ===== Configs for internlm/internlm-chat-7b ===== # config for internlm-chat-7b hf_internlm_chat_7b = dict(type=HuggingFaceCausalLM, abbr='internlm-chat-7b-hf', path='internlm/internlm-chat-7b', tokenizer_path='internlm/internlm-chat-7b', - model_kwargs=dict( - trust_remote_code=True, - device_map='auto', - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=256, - max_seq_len=2048, + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # config for internlm-chat-7b tb_internlm_chat_7b = dict(type=TurboMindModel, abbr='internlm-chat-7b-turbomind', path='internlm/internlm-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=32, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=tb_engine_config_template_max_bs_32, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=32, concurrency=32, meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # config for pt internlm-chat-7b pt_internlm_chat_7b = dict(type=LmdeployPytorchModel, abbr='internlm-chat-7b-pytorch', path='internlm/internlm-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') -tb_internlm_chat_7b_w4a16 = dict(type=TurboMindModel, - abbr='internlm-chat-7b-4bits-turbomind', - path='internlm/internlm-chat-7b-4bits', - engine_config=dict(session_len=2048, - max_batch_size=32, - model_format='awq', - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=32, - concurrency=32, - meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='') +tb_internlm_chat_7b_w4a16 = dict( + type=TurboMindModel, + abbr='internlm-chat-7b-4bits-turbomind', + path='internlm/internlm-chat-7b-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_32, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=32, + concurrency=32, + meta_template=internlm_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='') # ===== Configs for internlm/internlm-chat-20b ===== # config for internlm-chat-20b tb_internlm_chat_20b = dict(type=TurboMindModel, abbr='internlm-chat-20b-turbomind', path='internlm/internlm-chat-20b', - engine_config=dict(session_len=2048, - 
max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=tb_engine_config_template_max_bs_8, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=8, concurrency=8, meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # config for internlm-chat-20b @@ -196,79 +301,59 @@ abbr='internlm-chat-20b-hf', path='internlm/internlm-chat-20b', tokenizer_path='internlm/internlm-chat-20b', - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=256, - max_seq_len=2048, + tokenizer_kwargs=tokenizer_kwargs_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=8, batch_padding=False, model_kwargs=dict(trust_remote_code=True, device_map='auto'), - run_cfg=dict(num_gpus=2, num_procs=1), + run_cfg=run_cfg_tp2_template, end_str='') # config for internlm-chat-20b-w4 model -tb_internlm_chat_20b_w4a16 = dict(type=TurboMindModel, - abbr='internlm-chat-20b-4bits-turbomind', - path='internlm/internlm-chat-20b-4bits', - engine_config=dict(session_len=2048, - max_batch_size=8, - model_format='awq', - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='') +tb_internlm_chat_20b_w4a16 = dict( + type=TurboMindModel, + abbr='internlm-chat-20b-4bits-turbomind', + path='internlm/internlm-chat-20b-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_8, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=8, + concurrency=8, + meta_template=internlm_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='') # config for internlm-chat-20b -pt_internlm_chat_20b = dict(type=LmdeployPytorchModel, - abbr='internlm-chat-20b-pytorch', - path='internlm/internlm-chat-20b', - engine_config=dict(session_len=2048, - cache_max_entry_count=0.5, - max_prefill_token_num=4096, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='') +pt_internlm_chat_20b = dict( + type=LmdeployPytorchModel, + abbr='internlm-chat-20b-pytorch', + path='internlm/internlm-chat-20b', + engine_config=pt_engine_config_template_max_bs_8_prefill, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=8, + concurrency=8, + meta_template=internlm_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='') # ===== Configs for internlm/internlm2-chat-7b ===== # config for internlm2-chat-7b tb_internlm2_chat_7b = dict(type=TurboMindModel, abbr='internlm2-chat-7b-turbomind', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=32, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=32, - concurrency=32, + engine_config=tb_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + 
max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # config for internlm2-chat-7b @@ -276,181 +361,162 @@ abbr='internlm2-chat-7b-hf', path='internlm/internlm2-chat-7b', tokenizer_path='internlm/internlm2-chat-7b', - model_kwargs=dict( - trust_remote_code=True, - device_map='auto', - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=256, - max_seq_len=2048, + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # config for internlm2-chat-7b-w4 -tb_internlm2_chat_7b_w4a16 = dict(type=TurboMindModel, - abbr='internlm2-chat-7b-4bits-turbomind', - path='internlm/internlm2-chat-7b-4bits', - engine_config=dict(session_len=2048, - max_batch_size=32, - model_format='awq', - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=32, - concurrency=32, - meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') +tb_internlm2_chat_7b_w4a16 = dict( + type=TurboMindModel, + abbr='internlm2-chat-7b-4bits-turbomind', + path='internlm/internlm2-chat-7b-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +tb_internlm2_chat_7b_kvint4 = dict( + type=TurboMindModel, + abbr='internlm2-chat-7b-turbomind-kvint4', + path='internlm/internlm2-chat-7b', + engine_config=tb_kvint4_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') # config for pt internlm-chat-7b pt_internlm2_chat_7b = dict(type=LmdeployPytorchModel, abbr='internlm2-chat-7b-pytorch', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=16, - concurrency=16, + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # ===== Configs for internlm/internlm2-chat-20b ===== # config for internlm2-chat-20b -tb_internlm2_chat_20b = dict(type=TurboMindModel, - abbr='internlm2-chat-20b-turbomind', - path='internlm/internlm2-chat-20b', - engine_config=dict(session_len=2048, - max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, 
num_procs=1), - end_str='<|im_end|>') +tb_internlm2_chat_20b = dict( + type=TurboMindModel, + abbr='internlm2-chat-20b-turbomind', + path='internlm/internlm2-chat-20b', + engine_config=tb_engine_config_template_max_bs_128_tp2, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp2_template, + end_str='<|im_end|>') # config for internlm2-chat-20b hf_internlm2_chat_20b = dict(type=HuggingFaceCausalLM, abbr='internlm2-chat-20b-hf', path='internlm/internlm2-chat-20b', tokenizer_path='internlm/internlm2-chat-20b', - model_kwargs=dict( - trust_remote_code=True, - device_map='auto', - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=256, - max_seq_len=2048, + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=8, batch_padding=False, meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=2, num_procs=1), + run_cfg=run_cfg_tp2_template, end_str='<|im_end|>') # config for internlm2-chat-20b-w4 model -tb_internlm2_chat_20b_w4a16 = dict(type=TurboMindModel, - abbr='internlm2-chat-20b-4bits-turbomind', - path='internlm/internlm2-chat-20b-4bits', - engine_config=dict(session_len=2048, - max_batch_size=8, - model_format='awq', - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') +tb_internlm2_chat_20b_w4a16 = dict( + type=TurboMindModel, + abbr='internlm2-chat-20b-4bits-turbomind', + path='internlm/internlm2-chat-20b-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_128_tp2, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp2_template, + end_str='<|im_end|>') + +# config for internlm2-chat-20b-w4 model +tb_internlm2_chat_20b_kvint4 = dict( + type=TurboMindModel, + abbr='internlm2-chat-20b-turbomind-kvint4', + path='internlm/internlm2-chat-20b-inner-4bits', + engine_config=tb_kvint4_engine_config_template_max_bs_128_tp2, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp2_template, + end_str='<|im_end|>') # config for pt internlm-chat-20b -pt_internlm2_chat_20b = dict(type=LmdeployPytorchModel, - abbr='internlm2-chat-20b-pytorch', - path='internlm/internlm2-chat-20b', - engine_config=dict(session_len=2048, - cache_max_entry_count=0.5, - max_prefill_token_num=4096, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') +pt_internlm2_chat_20b = dict( + type=LmdeployPytorchModel, + abbr='internlm2-chat-20b-pytorch', + path='internlm/internlm2-chat-20b', + engine_config=pt_engine_config_template_max_bs_64_prefill, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=64, 
+ concurrency=64, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') # ===== Configs for Qwen/Qwen-7B-Chat ===== # config for qwen-chat-7b turbomind tb_qwen_chat_7b = dict(type=TurboMindModel, abbr='qwen-7b-chat-turbomind', path='Qwen/Qwen-7B-Chat', - engine_config=dict(session_len=2048, - max_batch_size=16, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=tb_engine_config_template_max_bs_16, + gen_config=qwen_gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=qwen_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # config for qwen-chat-7b pytorch pt_qwen_chat_7b = dict(type=LmdeployPytorchModel, abbr='qwen-7b-chat-pytorch', path='Qwen/Qwen-7B-Chat', - engine_config=dict(session_len=2048, max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - stop_words=[151645], - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=qwen_gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=qwen_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # config for qwen-chat-7b huggingface @@ -459,18 +525,15 @@ abbr='qwen-7b-chat-hf', path='Qwen/Qwen-7B-Chat', tokenizer_path='Qwen/Qwen-7B-Chat', - model_kwargs=dict(device_map='auto', trust_remote_code=True), - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, pad_token_id=151643, - max_out_len=100, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, meta_template=qwen_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>', ) @@ -479,37 +542,28 @@ tb_llama2_chat_7b = dict(type=TurboMindModel, abbr='llama-2-7b-chat-turbomind', path='meta-llama/Llama-2-7b-chat-hf', - engine_config=dict(session_len=2048, - max_batch_size=16, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=16, - concurrency=16, + engine_config=tb_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, meta_template=llama2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='[INST]') # config for llama2-chat-7b pytorch pt_llama2_chat_7b = dict(type=LmdeployPytorchModel, abbr='llama-2-7b-chat-pytorch', path='meta-llama/Llama-2-7b-chat-hf', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=16, - concurrency=16, + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, meta_template=llama2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, 
end_str='[INST]') # config for llama2-chat-7b huggingface @@ -517,18 +571,14 @@ abbr='llama-2-7b-chat-hf', path='meta-llama/Llama-2-7b-chat-hf', tokenizer_path='meta-llama/Llama-2-7b-chat-hf', - model_kwargs=dict(device_map='auto'), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - ), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=llama2_meta_template, - max_out_len=256, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='[INST]') # ===== Configs for baichuan-inc/Baichuan2-7B-Chat ===== @@ -536,54 +586,41 @@ tb_baichuan2_chat_7b = dict(type=TurboMindModel, abbr='Baichuan2-7B-Chat-turbomind', path='baichuan-inc/Baichuan2-7B-Chat', - engine_config=dict(session_len=2048, - max_batch_size=16, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=tb_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=baichuan2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1)) + run_cfg=run_cfg_tp1_template) # config for baichuan2-chat-7b huggingface hf_baichuan2_chat_7b = dict(type=HuggingFaceCausalLM, abbr='baichuan2-7b-chat-hf', path='baichuan-inc/Baichuan2-7B-Chat', tokenizer_path='baichuan-inc/Baichuan2-7B-Chat', - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False), + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=baichuan2_meta_template, - max_out_len=100, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, - model_kwargs=dict(device_map='auto', - trust_remote_code=True), - run_cfg=dict(num_gpus=1, num_procs=1)) + model_kwargs=model_kwargs_template, + run_cfg=run_cfg_tp1_template) # config for baichuan2-chat-7b pytorch pt_baichuan2_chat_7b = dict(type=LmdeployPytorchModel, abbr='baichuan2-7b-chat-hf', path='baichuan-inc/Baichuan2-7B-Chat', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=baichuan2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str=None) # ===== Configs for mistralai/Mistral-7B-Instruct-v0.1 ===== @@ -591,18 +628,14 @@ pt_mistral_chat_7b = dict(type=LmdeployPytorchModel, abbr='mistral-7b-instruct-v0.1-pytorch', path='mistralai/Mistral-7B-Instruct-v0.1', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=mistral_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # config for hf Mistral-7B-Instruct-v0.1 @@ -610,19 +643,14 @@ type=HuggingFaceCausalLM, 
path='mistralai/Mistral-7B-Instruct-v0.1', tokenizer_path='mistralai/Mistral-7B-Instruct-v0.1', - model_kwargs=dict(device_map='auto', - trust_remote_code=True), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - ), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=mistral_meta_template, - max_out_len=256, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # ===== Configs for mistralai/Mixtral-8x7B-Instruct-v0.1 ===== @@ -632,110 +660,197 @@ type=HuggingFaceCausalLM, path='mistralai/Mixtral-8x7B-Instruct-v0.1', tokenizer_path='mistralai/Mixtral-8x7B-Instruct-v0.1', - model_kwargs=dict(device_map='auto', trust_remote_code=True), - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=mistral_meta_template, - max_out_len=256, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=8, batch_padding=False, - run_cfg=dict(num_gpus=2, num_procs=1), + run_cfg=run_cfg_tp2_template, end_str='') # config for pt Mixtral-8x7B-Instruct-v0.1 -pt_mixtral_chat_8x7b = dict(type=LmdeployPytorchModel, - abbr='mixtral-8x7b-instruct-v0.1-pytorch', - path='mistralai/Mixtral-8x7B-Instruct-v0.1', - engine_config=dict(session_len=2048, - tp=2, - cache_max_entry_count=0.5, - max_prefill_token_num=4096, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=mistral_meta_template, - run_cfg=dict(num_gpus=2, num_procs=1), - end_str='') +pt_mixtral_chat_8x7b = dict( + type=LmdeployPytorchModel, + abbr='mixtral-8x7b-instruct-v0.1-pytorch', + path='mistralai/Mixtral-8x7B-Instruct-v0.1', + engine_config=pt_engine_config_template_max_bs_8_prefill_tp2, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=8, + concurrency=8, + meta_template=mistral_meta_template, + run_cfg=run_cfg_tp2_template, + end_str='') # ===== Configs for Qwen/Qwen1.5-7B-Chat ===== -hf_qwen1dot5_chat_7b = dict(type=HuggingFaceCausalLM, - abbr='qwen1.5-7b-chat-hf', - path='Qwen/Qwen1.5-7B-Chat', - model_kwargs=dict(device_map='auto', - trust_remote_code=True), - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False), - meta_template=qwen1dot5_meta_template, - pad_token_id=151645, - max_out_len=256, - max_seq_len=2048, - batch_size=8, - batch_padding=False, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') - -pt_qwen1dot5_chat_7b = dict(type=LmdeployPytorchModel, - abbr='qwen1.5-7b-chat-pytorch', - path='Qwen/Qwen1.5-7B-Chat', - engine_config=dict(session_len=2048, - cache_max_entry_count=0.5, - max_prefill_token_num=4096, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=16, - concurrency=16, - meta_template=qwen1dot5_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') +hf_qwen1_5_chat_7b = dict(type=HuggingFaceCausalLM, + abbr='qwen1.5-7b-chat-hf', + path='Qwen/Qwen1.5-7B-Chat', + model_kwargs=model_kwargs_template, + 
tokenizer_kwargs=tokenizer_kwargs_template, + meta_template=qwen1_5_meta_template, + pad_token_id=151645, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=8, + batch_padding=False, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +tb_qwen1_5_chat_7b = dict(type=TurboMindModel, + abbr='qwen1.5-7b-chat-turbomind', + path='Qwen/Qwen1.5-7B-Chat', + engine_config=tb_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +tb_qwen1_5_chat_7b_w4a16 = dict( + type=TurboMindModel, + abbr='qwen1.5-7b-chat-4bits-turbomind', + path='Qwen/Qwen1.5-7B-Chat-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +tb_qwen1_5_chat_7b_kvint4 = dict( + type=TurboMindModel, + abbr='qwen1.5-7b-chat-turbomind-kvint4', + path='Qwen/Qwen1.5-7B-Chat', + engine_config=tb_kvint4_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +pt_qwen1_5_chat_7b = dict(type=LmdeployPytorchModel, + abbr='qwen1.5-7b-chat-pytorch', + path='Qwen/Qwen1.5-7B-Chat', + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +pt_qwen1_5_moe_2_7b_chat = dict( + type=LmdeployPytorchModel, + abbr='qwen1.5-moe-2.7b-chat-pytorch', + path='Qwen/Qwen1.5-MoE-A2.7B-Chat', + engine_config=pt_engine_config_template_max_bs_64, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=64, + concurrency=64, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') # ===== Configs for google/gemma-7b-it ===== hf_gemma_chat_7b = dict(type=HuggingFaceCausalLM, abbr='gemma-7b-it-pytorch', path='google/gemma-7b-it', tokenizer_path='google/gemma-7b-it', - model_kwargs=dict(device_map='auto', - trust_remote_code=True), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - ), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=mistral_meta_template, - max_out_len=256, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='end_of_turn') pt_gemma_chat_7b = dict(type=LmdeployPytorchModel, abbr='gemma-7b-it-pytorch', path='google/gemma-7b-it', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=gemma_meta_template, - 
run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') + +# ===== Configs for meta-llama/Meta-Llama-3-8B-Instruct ===== +# config for llama-3-8b-instruct turbomind +tb_llama_3_8b_instruct = dict( + type=TurboMindModel, + abbr='llama-3-8b-instruct-turbomind', + path='meta-llama/Meta-Llama-3-8B-Instruct', + engine_config=tb_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=llama3_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='[INST]') + +tb_llama_3_8b_instruct_w4a16 = dict( + type=TurboMindModel, + abbr='llama-3-8b-instruct-4bits-turbomind', + path='meta-llama/Meta-Llama-3-8B-Instruct-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=llama3_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='[INST]') + +tb_llama_3_8b_instruct_kvint4 = dict( + type=TurboMindModel, + abbr='llama-3-8b-instruct-turbomind-kvint4', + path='meta-llama/Meta-Llama-3-8B-Instruct', + engine_config=tb_kvint4_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=llama3_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='[INST]') + +# config for llama-3-8b-instruct pytorch +pt_llama_3_8b_instruct = dict( + type=LmdeployPytorchModel, + abbr='llama-3-8b-instruct-pytorch', + path='meta-llama/Meta-Llama-3-8B-Instruct', + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=llama3_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='[INST]') diff --git a/.github/scripts/set_benchmark_param.sh b/.github/scripts/set_benchmark_param.sh index 502d53baf0..884e3ab0e9 100644 --- a/.github/scripts/set_benchmark_param.sh +++ b/.github/scripts/set_benchmark_param.sh @@ -12,15 +12,24 @@ else echo "MODEL_FORMAT=" >> "$GITHUB_ENV" fi -if [[ $1 == *"llama"* ]] || [[ $1 == *"Llama"* ]] +if [[ $1 == *"llama2"* ]] || [[ $1 == *"Llama-2"* ]] then echo "MAX_ENTRY_COUNT=--cache-max-entry-count 0.95" >> "$GITHUB_ENV" + else echo "MAX_ENTRY_COUNT=--cache-max-entry-count 0.90" >> "$GITHUB_ENV" fi +if [[ $1 == *"Llama-2-13b"* ]] +then + echo "BATCHES=128" >> "$GITHUB_ENV" + echo "MAX_BATCH_SIZE=" >> "$GITHUB_ENV" +else + echo "BATCHES=128 256" >> "$GITHUB_ENV" + echo "MAX_BATCH_SIZE=--max-batch-size 256" >> "$GITHUB_ENV" +fi -if [[ $1 == *"internlm2-chat-20b"* ]] +if [[ $1 == *"internlm2-chat-20b"* ]] || [[ $1 == *"Qwen1.5-32B-Chat"* ]] then echo "TP_INFO=--tp 2" >> "$GITHUB_ENV" fi diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c981721adb..46900ae7d2 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -52,7 +52,7 @@ on: required: true description: 'Set models run benchmark' type: string - default: "['internlm/internlm2-chat-20b','internlm/internlm2-chat-20b-inner-4bits','meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-7b-chat-hf-inner-4bits']" + default: 
"['internlm/internlm2-chat-20b','internlm/internlm2-chat-20b-inner-4bits','meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-7b-chat-hf-inner-4bits','meta-llama/Meta-Llama-3-8B-Instruct','Qwen/Qwen1.5-32B-Chat']" env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -61,7 +61,7 @@ env: REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} DATASET_FILE: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json TP_INFO: --tp 1 - LOOP_NUM: 3 + LOOP_NUM: 1 TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas @@ -115,7 +115,7 @@ jobs: CUDA_VISIBLE_DEVICES: 6,7 container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -211,7 +211,7 @@ jobs: CUDA_VISIBLE_DEVICES: 4,5 container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -268,8 +268,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -283,8 +283,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -298,8 +298,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -313,8 +313,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -352,7 +352,7 @@ jobs: CUDA_VISIBLE_DEVICES: 6,7 container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -405,7 +405,7 @@ jobs: - name: Start restful api turbomind if: contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | - lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level ${{inputs.log_level}} > turbomind_run.log 2>&1 & + lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MAX_BATCH_SIZE $MODEL_FORMAT $TP_INFO --log-level ${{inputs.log_level}} > turbomind_run.log 2>&1 & echo "restful_pid=$!" 
>> "$GITHUB_ENV" sleep 180s - name: Run restful benchmark @@ -415,8 +415,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -441,7 +441,7 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) + batches=($BATCHES) for batch in "${batches[@]}" do for ((i=1; i<=$LOOP_NUM; i++)) @@ -466,7 +466,7 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) + batches=($BATCHES) for batch in "${batches[@]}" do for ((i=1; i<=$LOOP_NUM; i++)) @@ -491,7 +491,7 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) + batches=($BATCHES) for batch in "${batches[@]}" do for ((i=1; i<=$LOOP_NUM; i++)) @@ -544,7 +544,11 @@ jobs: repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} - name: Set params - if: (contains( matrix.model, 'internlm2-chat-20b')) + run: | + chmod +x .github/scripts/set_benchmark_param.sh + .github/scripts/set_benchmark_param.sh ${{matrix.model}} + - name: Set params - cuda allocate + if: contains( env.TP_INFO, '--tp 2') run: | echo 'DEVICE="device=4,5"' >> "$GITHUB_ENV" - name: Create test container @@ -560,6 +564,7 @@ jobs: --name "lmdeploy-ci-triton-$GITHUB_RUN_ID-$date_today" \ --workdir /__w/lmdeploy/lmdeploy \ --env NCCL_LAUNCH_MODE=GROUP \ + --pull never \ -v $(pwd)/../../:/__w \ -v ${MODEL_PATH}:${MODEL_PATH} \ -v ${WORKDIR}:/root/workspace/workdir \ @@ -575,7 +580,6 @@ jobs: - name: Build lmdeploy from source if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - docker exec $CONTAINER_ID sed -i 's/https:\/\/github.com\/NVIDIA\/cutlass.git/https:\/\/521github.com\/extdomains\/github.com\/NVIDIA\/cutlass.git/g' CMakeLists.txt docker exec $CONTAINER_ID mkdir build docker exec --workdir /__w/lmdeploy/lmdeploy/build \ --env http_proxy=${{secrets.PROXY}} \ @@ -664,7 +668,7 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} -p - batches=(128 256) + batches=($BATCHES) for batch in "${batches[@]}" do for ((i=1; i<=$LOOP_NUM; i++)) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 852ec9b631..523bccc699 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -49,7 +49,7 @@ on: type: boolean default: true schedule: - - cron: '00 20 * * 1-5' + - cron: '00 20 * * 0-4' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -106,7 +106,7 @@ jobs: MODELSCOPE_MODULES_CACHE: /root/modelscope_modules container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -293,6 +293,7 @@ jobs: --name "lmdeploy-ci-triton-$GITHUB_RUN_ID" \ --workdir /__w/lmdeploy/lmdeploy \ --env NCCL_LAUNCH_MODE=GROUP \ + --pull never \ -v $(pwd)/../../:/__w \ -v ${HF_MODEL}:/root/workspace/hf_model \ -v ${WORKDIR}:/root/workspace/workdir \ @@ -431,7 +432,7 @@ jobs: REPORT_DIR: /nvme/qa_test_models/test-reports container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - 
/nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 0ad62d5dd5..010424fd7b 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -17,27 +17,32 @@ on: required: true description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]' type: string - default: '[internlm2_chat_7b,internlm2_chat_20b,internlm2_chat_20b_w4a16,llama2_chat_7b,qwen_chat_7b]' + default: '[tb_internlm2_chat_7b,tb_internlm2_chat_20b,tb_internlm2_chat_20b_w4a16,tb_llama2_chat_7b,tb_qwen1_5_chat_7b,tb_llama_3_8b_instruct,pt_internlm2_chat_7b,pt_internlm2_chat_20b,pt_llama2_chat_7b,pt_qwen1_5_chat_7b,pt_qwen1_5_moe_2_7b_chat,pt_llama_3_8b_instruct,tb_internlm2_chat_7b_kvint4,tb_internlm2_chat_20b_kvint4,tb_qwen1_5_chat_7b_kvint4,tb_llama_3_8b_instruct_kvint4]' + datasets: + required: true + description: 'Tested datasets list. eg. [*mmlu_datasets, *ceval_datasets, *WiC_datasets, *WSC_datasets, *triviaqa_datasets, *gsm8k_datasets, *race_datasets, *crowspairs_datasets]' + type: string + default: '[*mmlu_datasets, *gsm8k_datasets]' devices: required: true description: 'CUDA_VISIBLE_DEVICES.' type: string default: '0,1,2,3,4,5,6,7' + jobs: evaluate: runs-on: [self-hosted, linux-a100] timeout-minutes: 4320 # 72hours - environment: 'prod' container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/evaluation-reports:/root/evaluation-reports - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/root/models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: @@ -102,6 +107,7 @@ jobs: python3 .github/scripts/action_tools.py evaluate \ --models "${{github.event.inputs.models}}" \ + --datasets "${{github.event.inputs.datasets}}" \ --workspace /root/evaluation-reports/$TIME_STAMP - name: Clear workspace if: always() diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index 3921b6b09a..27227f3bec 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -33,7 +33,7 @@ jobs: REPORT_DIR: /nvme/qa_test_models/test-reports container: image: nvcr.io/nvidia/tritonserver:24.03-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip - /nvme/share_data/github-actions/packages:/root/packages @@ -56,7 +56,6 @@ jobs: run: | python3 -m pip install cmake python3 -m pip install -r requirements/build.txt - sed -i 's/https:\/\/github.com\/NVIDIA\/cutlass.git/https:\/\/521github.com\/extdomains\/github.com\/NVIDIA\/cutlass.git/g' CMakeLists.txt mkdir build cd build cmake .. 
\ diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index a2eca9d795..bdd9ecb993 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -36,7 +36,7 @@ jobs: timeout-minutes: 4320 # 72hours container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e CUDA_VISIBLE_DEVICES=2,3" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e CUDA_VISIBLE_DEVICES=2,3 --pull never" volumes: - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip - /nvme/share_data/github-actions/packages:/root/packages @@ -58,7 +58,6 @@ jobs: run: | python3 -m pip install cmake python3 -m pip install -r requirements/build.txt - sed -i 's/https:\/\/github.com\/NVIDIA\/cutlass.git/https:\/\/521github.com\/extdomains\/github.com\/NVIDIA\/cutlass.git/g' CMakeLists.txt mkdir build cd build cmake .. \ diff --git a/autotest/config.yaml b/autotest/config.yaml index a25ffd2ff2..8100705465 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -16,14 +16,12 @@ turbomind_chat_model: - meta-llama/Llama-2-7b-chat-hf - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm2-chat-1_8b - - internlm/internlm-chat-7b - internlm/internlm-chat-20b - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - internlm/internlm2-chat-7b-4bits - internlm/internlm2-chat-20b-4bits - Qwen/Qwen-7B-Chat - - Qwen/Qwen-14B-Chat - Qwen/Qwen1.5-7B-Chat - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat @@ -37,11 +35,12 @@ turbomind_chat_model: - deepseek-ai/deepseek-coder-1.3b-instruct - codellama/CodeLlama-7b-Instruct-hf - Qwen/Qwen1.5-4B-Chat-AWQ + - OpenGVLab/InternVL-Chat-V1-5 + - internlm/internlm-xcomposer2-vl-7b pytorch_chat_model: - meta-llama/Llama-2-7b-chat-hf - meta-llama/Meta-Llama-3-8B-Instruct - - internlm/internlm-chat-7b - internlm/internlm-chat-20b - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b @@ -49,6 +48,7 @@ pytorch_chat_model: - baichuan-inc/Baichuan2-13B-Chat - 01-ai/Yi-6B-Chat - Qwen/Qwen1.5-7B-Chat + - Qwen/Qwen1.5-MoE-A2.7B-Chat - deepseek-ai/deepseek-moe-16b-chat - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mixtral-8x7B-Instruct-v0.1 @@ -73,29 +73,29 @@ vl_model: - liuhaotian/llava-v1.6-vicuna-7b - 01-ai/Yi-VL-6B - deepseek-ai/deepseek-vl-1.3b-chat + - OpenGVLab/InternVL-Chat-V1-5 + - internlm/internlm-xcomposer2-vl-7b quatization_case_config: w4a16: - meta-llama/Llama-2-7b-chat-hf - internlm/internlm-chat-20b - Qwen/Qwen-7B-Chat - - Qwen/Qwen-14B-Chat - internlm/internlm2-chat-20b - baichuan-inc/Baichuan2-7B-Chat - internlm/internlm2-20b - Qwen/Qwen1.5-7B-Chat + - meta-llama/Meta-Llama-3-8B-Instruct kvint: - meta-llama/Llama-2-7b-chat-hf - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm2-chat-1_8b - - internlm/internlm-chat-7b - internlm/internlm-chat-20b - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - internlm/internlm2-chat-7b-4bits - internlm/internlm2-chat-20b-4bits - Qwen/Qwen-7B-Chat - - Qwen/Qwen-14B-Chat - Qwen/Qwen1.5-7B-Chat - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat @@ -106,7 +106,6 @@ quatization_case_config: w8a8: - meta-llama/Llama-2-7b-chat-hf - meta-llama/Meta-Llama-3-8B-Instruct - - internlm/internlm-chat-7b - internlm/internlm-chat-20b - internlm/internlm2-chat-20b - internlm/internlm2-chat-7b
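A note on the `batches=($BATCHES)` change in .github/workflows/benchmark.yml: BATCHES is written to $GITHUB_ENV by .github/scripts/set_benchmark_param.sh as either "128" or "128 256", and the unquoted expansion inside the array literal relies on shell word-splitting to turn that string into one array element per batch size. A minimal sketch of the assumed behavior, with echo standing in for the real benchmark command:

    BATCHES="128 256"      # value exported per model by set_benchmark_param.sh
    batches=($BATCHES)     # unquoted expansion word-splits into two elements: 128, 256
    for batch in "${batches[@]}"
    do
        echo "benchmark run with batch size ${batch}"   # placeholder for the benchmark step
    done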