diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py index 0fab4b58c0..a6c51dffee 100644 --- a/.github/scripts/action_tools.py +++ b/.github/scripts/action_tools.py @@ -100,7 +100,7 @@ def _load_hf_results(test_results: dict, model_name: str): return out -def evaluate(models: List[str], workspace: str): +def evaluate(models: List[str], datasets: List[str], workspace: str): """Evaluate models from lmdeploy using opencompass. Args: @@ -150,6 +150,7 @@ def evaluate(models: List[str], workspace: str): continue logging.info(f'Start evaluating {target_model} ...\\nn{model_cfg}\n\n') with open(config_path_new, 'a') as f: + f.write(f'\ndatasets = {datasets}\n') f.write(f'\nmodels = [ {target_model} ]\n') work_dir = os.path.join(workspace, target_model) diff --git a/.github/scripts/eval_opencompass_config.py b/.github/scripts/eval_opencompass_config.py index 16a2737243..6b2fa43f15 100644 --- a/.github/scripts/eval_opencompass_config.py +++ b/.github/scripts/eval_opencompass_config.py @@ -4,27 +4,25 @@ with read_base(): # choose a list of datasets - # from .datasets.ceval.ceval_gen_5f30c7 import \ - # ceval_datasets # noqa: F401, E501 - # from .datasets.crowspairs.crowspairs_gen_381af0 import \ - # crowspairs_datasets # noqa: F401, E501 + from .datasets.ceval.ceval_gen_5f30c7 import \ + ceval_datasets # noqa: F401, E501 + from .datasets.crowspairs.crowspairs_gen_381af0 import \ + crowspairs_datasets # noqa: F401, E501 from .datasets.gsm8k.gsm8k_gen_1d7fe4 import \ gsm8k_datasets # noqa: F401, E501 from .datasets.mmlu.mmlu_gen_a484b3 import \ mmlu_datasets # noqa: F401, E501 - # from .datasets.race.race_gen_69ee4f import \ - # race_datasets # noqa: F401, E501 - # from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \ - # WiC_datasets # noqa: F401, E501 - # from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \ - # WSC_datasets # noqa: F401, E501 - # from .datasets.triviaqa.triviaqa_gen_2121ce import \ - # triviaqa_datasets # noqa: F401, E501 + from .datasets.race.race_gen_69ee4f import \ + race_datasets # noqa: F401, E501 + from .datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import \ + WiC_datasets # noqa: F401, E501 + from .datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import \ + WSC_datasets # noqa: F401, E501 + from .datasets.triviaqa.triviaqa_gen_2121ce import \ + triviaqa_datasets # noqa: F401, E501 # and output the results in a chosen format from .summarizers.medium import summarizer # noqa: F401, E501 -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) - internlm_meta_template = dict(round=[ dict(role='HUMAN', begin='<|User|>:', end='\n'), dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True), @@ -46,6 +44,17 @@ ], eos_token_id=2) +llama3_meta_template = dict(round=[ + dict(role='HUMAN', + begin='<|start_header_id|>user<|end_header_id|>\n\n', + end='<|eot_id|>'), + dict(role='BOT', + begin='<|start_header_id|>assistant<|end_header_id|>\n\n', + end='<|eot_id|>', + generate=True), +], + eos_token_id=[128001, 128009]) + qwen_meta_template = dict(round=[ dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), dict(role='BOT', @@ -54,7 +63,7 @@ generate=True), ], ) -qwen1dot5_meta_template = dict( +qwen1_5_meta_template = dict( round=[ dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', @@ -91,104 +100,200 @@ ], eos_token_id=1) +MAX_SESSION_LEN = 2048 +MAX_NEW_TOKENS = 100 + +tb_engine_config_template_max_bs_8 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=8, + 
rope_scaling_factor=1.0) +tb_engine_config_template_max_bs_16 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=16, + rope_scaling_factor=1.0) +tb_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=32, + rope_scaling_factor=1.0) +tb_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128, + rope_scaling_factor=1.0) +tb_engine_config_template_max_bs_128_tp2 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128, + tp=2, + rope_scaling_factor=1.0) + +pt_engine_config_template_max_bs_8 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=8) +pt_engine_config_template_max_bs_16 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=16) +pt_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=32) +pt_engine_config_template_max_bs_64 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=64) +pt_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128) +pt_engine_config_template_max_bs_128_tp2 = dict(session_len=MAX_SESSION_LEN, + tp=2, + max_batch_size=128) +pt_engine_config_template_max_bs_64_tp2 = dict(session_len=MAX_SESSION_LEN, + tp=2, + max_batch_size=64) + +pt_engine_config_template_max_bs_8_prefill = dict(session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=8) +pt_engine_config_template_max_bs_16_prefill = dict(session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=16) +pt_engine_config_template_max_bs_64_prefill = dict(session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=64) +pt_engine_config_template_max_bs_128_prefill = dict( + session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=128) +pt_engine_config_template_max_bs_8_prefill_tp2 = dict( + session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=8, + tp=2) +pt_engine_config_template_max_bs_64_prefill_tp2 = dict( + session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=64, + tp=2) +pt_engine_config_template_max_bs_128_prefill_tp2 = dict( + session_len=MAX_SESSION_LEN, + cache_max_entry_count=0.5, + max_prefill_token_num=4096, + max_batch_size=128, + tp=2) +tb_awq_engine_config_template_max_bs_8 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=8, + model_format='awq', + rope_scaling_factor=1.0) +tb_awq_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=32, + model_format='awq', + rope_scaling_factor=1.0) +tb_awq_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128, + model_format='awq', + rope_scaling_factor=1.0) + +tb_awq_engine_config_template_max_bs_128_tp2 = dict( + session_len=MAX_SESSION_LEN, + max_batch_size=128, + model_format='awq', + tp=2, + rope_scaling_factor=1.0) + +tb_kvint4_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN, + max_batch_size=128, + quant_policy=4, + rope_scaling_factor=1.0) + +tb_kvint4_engine_config_template_max_bs_128_tp2 = dict( + session_len=MAX_SESSION_LEN, + max_batch_size=128, + quant_policy=4, + tp=2, + rope_scaling_factor=1.0) + +gen_config_template = dict(top_k=1, + top_p=0.8, + temperature=1.0, + max_new_tokens=MAX_NEW_TOKENS) +qwen_gen_config_template = dict(top_k=1, + top_p=0.8, + temperature=1.0, + stop_words=[151645], + max_new_tokens=MAX_NEW_TOKENS) + 
+tokenizer_kwargs_template = dict(padding_side='left', + truncation_side='left', + use_fast=False, + trust_remote_code=True) +model_kwargs_template = dict(device_map='auto', trust_remote_code=True) + +run_cfg_tp1_template = dict(num_gpus=1, num_procs=1) +run_cfg_tp2_template = dict(num_gpus=2, num_procs=1) + # ===== Configs for internlm/internlm-chat-7b ===== # config for internlm-chat-7b hf_internlm_chat_7b = dict(type=HuggingFaceCausalLM, abbr='internlm-chat-7b-hf', path='internlm/internlm-chat-7b', tokenizer_path='internlm/internlm-chat-7b', - model_kwargs=dict( - trust_remote_code=True, - device_map='auto', - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=256, - max_seq_len=2048, + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # config for internlm-chat-7b tb_internlm_chat_7b = dict(type=TurboMindModel, abbr='internlm-chat-7b-turbomind', path='internlm/internlm-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=32, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=tb_engine_config_template_max_bs_32, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=32, concurrency=32, meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # config for pt internlm-chat-7b pt_internlm_chat_7b = dict(type=LmdeployPytorchModel, abbr='internlm-chat-7b-pytorch', path='internlm/internlm-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') -tb_internlm_chat_7b_w4a16 = dict(type=TurboMindModel, - abbr='internlm-chat-7b-4bits-turbomind', - path='internlm/internlm-chat-7b-4bits', - engine_config=dict(session_len=2048, - max_batch_size=32, - model_format='awq', - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=32, - concurrency=32, - meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='') +tb_internlm_chat_7b_w4a16 = dict( + type=TurboMindModel, + abbr='internlm-chat-7b-4bits-turbomind', + path='internlm/internlm-chat-7b-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_32, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=32, + concurrency=32, + meta_template=internlm_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='') # ===== Configs for internlm/internlm-chat-20b ===== # config for internlm-chat-20b tb_internlm_chat_20b = dict(type=TurboMindModel, abbr='internlm-chat-20b-turbomind', path='internlm/internlm-chat-20b', - engine_config=dict(session_len=2048, - 
max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=tb_engine_config_template_max_bs_8, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=8, concurrency=8, meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # config for internlm-chat-20b @@ -196,79 +301,59 @@ abbr='internlm-chat-20b-hf', path='internlm/internlm-chat-20b', tokenizer_path='internlm/internlm-chat-20b', - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=256, - max_seq_len=2048, + tokenizer_kwargs=tokenizer_kwargs_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=8, batch_padding=False, model_kwargs=dict(trust_remote_code=True, device_map='auto'), - run_cfg=dict(num_gpus=2, num_procs=1), + run_cfg=run_cfg_tp2_template, end_str='') # config for internlm-chat-20b-w4 model -tb_internlm_chat_20b_w4a16 = dict(type=TurboMindModel, - abbr='internlm-chat-20b-4bits-turbomind', - path='internlm/internlm-chat-20b-4bits', - engine_config=dict(session_len=2048, - max_batch_size=8, - model_format='awq', - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='') +tb_internlm_chat_20b_w4a16 = dict( + type=TurboMindModel, + abbr='internlm-chat-20b-4bits-turbomind', + path='internlm/internlm-chat-20b-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_8, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=8, + concurrency=8, + meta_template=internlm_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='') # config for internlm-chat-20b -pt_internlm_chat_20b = dict(type=LmdeployPytorchModel, - abbr='internlm-chat-20b-pytorch', - path='internlm/internlm-chat-20b', - engine_config=dict(session_len=2048, - cache_max_entry_count=0.5, - max_prefill_token_num=4096, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='') +pt_internlm_chat_20b = dict( + type=LmdeployPytorchModel, + abbr='internlm-chat-20b-pytorch', + path='internlm/internlm-chat-20b', + engine_config=pt_engine_config_template_max_bs_8_prefill, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=8, + concurrency=8, + meta_template=internlm_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='') # ===== Configs for internlm/internlm2-chat-7b ===== # config for internlm2-chat-7b tb_internlm2_chat_7b = dict(type=TurboMindModel, abbr='internlm2-chat-7b-turbomind', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=32, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=32, - concurrency=32, + engine_config=tb_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + 
max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # config for internlm2-chat-7b @@ -276,181 +361,162 @@ abbr='internlm2-chat-7b-hf', path='internlm/internlm2-chat-7b', tokenizer_path='internlm/internlm2-chat-7b', - model_kwargs=dict( - trust_remote_code=True, - device_map='auto', - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=256, - max_seq_len=2048, + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # config for internlm2-chat-7b-w4 -tb_internlm2_chat_7b_w4a16 = dict(type=TurboMindModel, - abbr='internlm2-chat-7b-4bits-turbomind', - path='internlm/internlm2-chat-7b-4bits', - engine_config=dict(session_len=2048, - max_batch_size=32, - model_format='awq', - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=32, - concurrency=32, - meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') +tb_internlm2_chat_7b_w4a16 = dict( + type=TurboMindModel, + abbr='internlm2-chat-7b-4bits-turbomind', + path='internlm/internlm2-chat-7b-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +tb_internlm2_chat_7b_kvint4 = dict( + type=TurboMindModel, + abbr='internlm2-chat-7b-turbomind-kvint4', + path='internlm/internlm2-chat-7b', + engine_config=tb_kvint4_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') # config for pt internlm-chat-7b pt_internlm2_chat_7b = dict(type=LmdeployPytorchModel, abbr='internlm2-chat-7b-pytorch', path='internlm/internlm2-chat-7b', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=16, - concurrency=16, + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # ===== Configs for internlm/internlm2-chat-20b ===== # config for internlm2-chat-20b -tb_internlm2_chat_20b = dict(type=TurboMindModel, - abbr='internlm2-chat-20b-turbomind', - path='internlm/internlm2-chat-20b', - engine_config=dict(session_len=2048, - max_batch_size=8, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, 
num_procs=1), - end_str='<|im_end|>') +tb_internlm2_chat_20b = dict( + type=TurboMindModel, + abbr='internlm2-chat-20b-turbomind', + path='internlm/internlm2-chat-20b', + engine_config=tb_engine_config_template_max_bs_128_tp2, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp2_template, + end_str='<|im_end|>') # config for internlm2-chat-20b hf_internlm2_chat_20b = dict(type=HuggingFaceCausalLM, abbr='internlm2-chat-20b-hf', path='internlm/internlm2-chat-20b', tokenizer_path='internlm/internlm2-chat-20b', - model_kwargs=dict( - trust_remote_code=True, - device_map='auto', - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - trust_remote_code=True, - ), - max_out_len=256, - max_seq_len=2048, + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=8, batch_padding=False, meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=2, num_procs=1), + run_cfg=run_cfg_tp2_template, end_str='<|im_end|>') # config for internlm2-chat-20b-w4 model -tb_internlm2_chat_20b_w4a16 = dict(type=TurboMindModel, - abbr='internlm2-chat-20b-4bits-turbomind', - path='internlm/internlm2-chat-20b-4bits', - engine_config=dict(session_len=2048, - max_batch_size=8, - model_format='awq', - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') +tb_internlm2_chat_20b_w4a16 = dict( + type=TurboMindModel, + abbr='internlm2-chat-20b-4bits-turbomind', + path='internlm/internlm2-chat-20b-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_128_tp2, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp2_template, + end_str='<|im_end|>') + +# config for internlm2-chat-20b-w4 model +tb_internlm2_chat_20b_kvint4 = dict( + type=TurboMindModel, + abbr='internlm2-chat-20b-turbomind-kvint4', + path='internlm/internlm2-chat-20b-inner-4bits', + engine_config=tb_kvint4_engine_config_template_max_bs_128_tp2, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp2_template, + end_str='<|im_end|>') # config for pt internlm-chat-20b -pt_internlm2_chat_20b = dict(type=LmdeployPytorchModel, - abbr='internlm2-chat-20b-pytorch', - path='internlm/internlm2-chat-20b', - engine_config=dict(session_len=2048, - cache_max_entry_count=0.5, - max_prefill_token_num=4096, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=internlm2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') +pt_internlm2_chat_20b = dict( + type=LmdeployPytorchModel, + abbr='internlm2-chat-20b-pytorch', + path='internlm/internlm2-chat-20b', + engine_config=pt_engine_config_template_max_bs_64_prefill, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=64, 
+ concurrency=64, + meta_template=internlm2_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') # ===== Configs for Qwen/Qwen-7B-Chat ===== # config for qwen-chat-7b turbomind tb_qwen_chat_7b = dict(type=TurboMindModel, abbr='qwen-7b-chat-turbomind', path='Qwen/Qwen-7B-Chat', - engine_config=dict(session_len=2048, - max_batch_size=16, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=tb_engine_config_template_max_bs_16, + gen_config=qwen_gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=qwen_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # config for qwen-chat-7b pytorch pt_qwen_chat_7b = dict(type=LmdeployPytorchModel, abbr='qwen-7b-chat-pytorch', path='Qwen/Qwen-7B-Chat', - engine_config=dict(session_len=2048, max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - stop_words=[151645], - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=qwen_gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=qwen_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>') # config for qwen-chat-7b huggingface @@ -459,18 +525,15 @@ abbr='qwen-7b-chat-hf', path='Qwen/Qwen-7B-Chat', tokenizer_path='Qwen/Qwen-7B-Chat', - model_kwargs=dict(device_map='auto', trust_remote_code=True), - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, pad_token_id=151643, - max_out_len=100, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, meta_template=qwen_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='<|im_end|>', ) @@ -479,37 +542,28 @@ tb_llama2_chat_7b = dict(type=TurboMindModel, abbr='llama-2-7b-chat-turbomind', path='meta-llama/Llama-2-7b-chat-hf', - engine_config=dict(session_len=2048, - max_batch_size=16, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=16, - concurrency=16, + engine_config=tb_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, meta_template=llama2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='[INST]') # config for llama2-chat-7b pytorch pt_llama2_chat_7b = dict(type=LmdeployPytorchModel, abbr='llama-2-7b-chat-pytorch', path='meta-llama/Llama-2-7b-chat-hf', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=16, - concurrency=16, + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, meta_template=llama2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, 
end_str='[INST]') # config for llama2-chat-7b huggingface @@ -517,18 +571,14 @@ abbr='llama-2-7b-chat-hf', path='meta-llama/Llama-2-7b-chat-hf', tokenizer_path='meta-llama/Llama-2-7b-chat-hf', - model_kwargs=dict(device_map='auto'), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - use_fast=False, - ), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=llama2_meta_template, - max_out_len=256, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='[INST]') # ===== Configs for baichuan-inc/Baichuan2-7B-Chat ===== @@ -536,54 +586,41 @@ tb_baichuan2_chat_7b = dict(type=TurboMindModel, abbr='Baichuan2-7B-Chat-turbomind', path='baichuan-inc/Baichuan2-7B-Chat', - engine_config=dict(session_len=2048, - max_batch_size=16, - rope_scaling_factor=1.0), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=tb_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=baichuan2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1)) + run_cfg=run_cfg_tp1_template) # config for baichuan2-chat-7b huggingface hf_baichuan2_chat_7b = dict(type=HuggingFaceCausalLM, abbr='baichuan2-7b-chat-hf', path='baichuan-inc/Baichuan2-7B-Chat', tokenizer_path='baichuan-inc/Baichuan2-7B-Chat', - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False), + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=baichuan2_meta_template, - max_out_len=100, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, - model_kwargs=dict(device_map='auto', - trust_remote_code=True), - run_cfg=dict(num_gpus=1, num_procs=1)) + model_kwargs=model_kwargs_template, + run_cfg=run_cfg_tp1_template) # config for baichuan2-chat-7b pytorch pt_baichuan2_chat_7b = dict(type=LmdeployPytorchModel, abbr='baichuan2-7b-chat-hf', path='baichuan-inc/Baichuan2-7B-Chat', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=baichuan2_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str=None) # ===== Configs for mistralai/Mistral-7B-Instruct-v0.1 ===== @@ -591,18 +628,14 @@ pt_mistral_chat_7b = dict(type=LmdeployPytorchModel, abbr='mistral-7b-instruct-v0.1-pytorch', path='mistralai/Mistral-7B-Instruct-v0.1', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=mistral_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # config for hf Mistral-7B-Instruct-v0.1 @@ -610,19 +643,14 @@ type=HuggingFaceCausalLM, 
path='mistralai/Mistral-7B-Instruct-v0.1', tokenizer_path='mistralai/Mistral-7B-Instruct-v0.1', - model_kwargs=dict(device_map='auto', - trust_remote_code=True), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - ), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=mistral_meta_template, - max_out_len=256, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') # ===== Configs for mistralai/Mixtral-8x7B-Instruct-v0.1 ===== @@ -632,110 +660,197 @@ type=HuggingFaceCausalLM, path='mistralai/Mixtral-8x7B-Instruct-v0.1', tokenizer_path='mistralai/Mixtral-8x7B-Instruct-v0.1', - model_kwargs=dict(device_map='auto', trust_remote_code=True), - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=mistral_meta_template, - max_out_len=256, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=8, batch_padding=False, - run_cfg=dict(num_gpus=2, num_procs=1), + run_cfg=run_cfg_tp2_template, end_str='') # config for pt Mixtral-8x7B-Instruct-v0.1 -pt_mixtral_chat_8x7b = dict(type=LmdeployPytorchModel, - abbr='mixtral-8x7b-instruct-v0.1-pytorch', - path='mistralai/Mixtral-8x7B-Instruct-v0.1', - engine_config=dict(session_len=2048, - tp=2, - cache_max_entry_count=0.5, - max_prefill_token_num=4096, - max_batch_size=8), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=8, - concurrency=8, - meta_template=mistral_meta_template, - run_cfg=dict(num_gpus=2, num_procs=1), - end_str='') +pt_mixtral_chat_8x7b = dict( + type=LmdeployPytorchModel, + abbr='mixtral-8x7b-instruct-v0.1-pytorch', + path='mistralai/Mixtral-8x7B-Instruct-v0.1', + engine_config=pt_engine_config_template_max_bs_8_prefill_tp2, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=8, + concurrency=8, + meta_template=mistral_meta_template, + run_cfg=run_cfg_tp2_template, + end_str='') # ===== Configs for Qwen/Qwen1.5-7B-Chat ===== -hf_qwen1dot5_chat_7b = dict(type=HuggingFaceCausalLM, - abbr='qwen1.5-7b-chat-hf', - path='Qwen/Qwen1.5-7B-Chat', - model_kwargs=dict(device_map='auto', - trust_remote_code=True), - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False), - meta_template=qwen1dot5_meta_template, - pad_token_id=151645, - max_out_len=256, - max_seq_len=2048, - batch_size=8, - batch_padding=False, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') - -pt_qwen1dot5_chat_7b = dict(type=LmdeployPytorchModel, - abbr='qwen1.5-7b-chat-pytorch', - path='Qwen/Qwen1.5-7B-Chat', - engine_config=dict(session_len=2048, - cache_max_entry_count=0.5, - max_prefill_token_num=4096, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, - batch_size=16, - concurrency=16, - meta_template=qwen1dot5_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<|im_end|>') +hf_qwen1_5_chat_7b = dict(type=HuggingFaceCausalLM, + abbr='qwen1.5-7b-chat-hf', + path='Qwen/Qwen1.5-7B-Chat', + model_kwargs=model_kwargs_template, + 
tokenizer_kwargs=tokenizer_kwargs_template, + meta_template=qwen1_5_meta_template, + pad_token_id=151645, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=8, + batch_padding=False, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +tb_qwen1_5_chat_7b = dict(type=TurboMindModel, + abbr='qwen1.5-7b-chat-turbomind', + path='Qwen/Qwen1.5-7B-Chat', + engine_config=tb_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +tb_qwen1_5_chat_7b_w4a16 = dict( + type=TurboMindModel, + abbr='qwen1.5-7b-chat-4bits-turbomind', + path='Qwen/Qwen1.5-7B-Chat-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +tb_qwen1_5_chat_7b_kvint4 = dict( + type=TurboMindModel, + abbr='qwen1.5-7b-chat-turbomind-kvint4', + path='Qwen/Qwen1.5-7B-Chat', + engine_config=tb_kvint4_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +pt_qwen1_5_chat_7b = dict(type=LmdeployPytorchModel, + abbr='qwen1.5-7b-chat-pytorch', + path='Qwen/Qwen1.5-7B-Chat', + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') + +pt_qwen1_5_moe_2_7b_chat = dict( + type=LmdeployPytorchModel, + abbr='qwen1.5-moe-2.7b-chat-pytorch', + path='Qwen/Qwen1.5-MoE-A2.7B-Chat', + engine_config=pt_engine_config_template_max_bs_64, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=64, + concurrency=64, + meta_template=qwen1_5_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='<|im_end|>') # ===== Configs for google/gemma-7b-it ===== hf_gemma_chat_7b = dict(type=HuggingFaceCausalLM, abbr='gemma-7b-it-pytorch', path='google/gemma-7b-it', tokenizer_path='google/gemma-7b-it', - model_kwargs=dict(device_map='auto', - trust_remote_code=True), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - ), + model_kwargs=model_kwargs_template, + tokenizer_kwargs=tokenizer_kwargs_template, meta_template=mistral_meta_template, - max_out_len=256, - max_seq_len=2048, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, batch_padding=False, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='end_of_turn') pt_gemma_chat_7b = dict(type=LmdeployPytorchModel, abbr='gemma-7b-it-pytorch', path='google/gemma-7b-it', - engine_config=dict(session_len=2048, - max_batch_size=16), - gen_config=dict(top_k=1, - top_p=0.8, - temperature=1.0, - max_new_tokens=256), - max_out_len=256, - max_seq_len=2048, + engine_config=pt_engine_config_template_max_bs_16, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, batch_size=16, concurrency=16, meta_template=gemma_meta_template, - 
run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=run_cfg_tp1_template, end_str='') + +# ===== Configs for meta-llama/Meta-Llama-3-8B-Instruct ===== +# config for llama-3-8b-instruct turbomind +tb_llama_3_8b_instruct = dict( + type=TurboMindModel, + abbr='llama-3-8b-instruct-turbomind', + path='meta-llama/Meta-Llama-3-8B-Instruct', + engine_config=tb_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=llama3_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='[INST]') + +tb_llama_3_8b_instruct_w4a16 = dict( + type=TurboMindModel, + abbr='llama-3-8b-instruct-4bits-turbomind', + path='meta-llama/Meta-Llama-3-8B-Instruct-inner-4bits', + engine_config=tb_awq_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=llama3_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='[INST]') + +tb_llama_3_8b_instruct_kvint4 = dict( + type=TurboMindModel, + abbr='llama-3-8b-instruct-turbomind-kvint4', + path='meta-llama/Meta-Llama-3-8B-Instruct', + engine_config=tb_kvint4_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=llama3_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='[INST]') + +# config for llama-3-8b-instruct pytorch +pt_llama_3_8b_instruct = dict( + type=LmdeployPytorchModel, + abbr='llama-3-8b-instruct-pytorch', + path='meta-llama/Meta-Llama-3-8B-Instruct', + engine_config=pt_engine_config_template_max_bs_128, + gen_config=gen_config_template, + max_out_len=MAX_NEW_TOKENS, + max_seq_len=MAX_SESSION_LEN, + batch_size=128, + concurrency=128, + meta_template=llama3_meta_template, + run_cfg=run_cfg_tp1_template, + end_str='[INST]') diff --git a/.github/scripts/set_benchmark_param.sh b/.github/scripts/set_benchmark_param.sh index 502d53baf0..884e3ab0e9 100644 --- a/.github/scripts/set_benchmark_param.sh +++ b/.github/scripts/set_benchmark_param.sh @@ -12,15 +12,24 @@ else echo "MODEL_FORMAT=" >> "$GITHUB_ENV" fi -if [[ $1 == *"llama"* ]] || [[ $1 == *"Llama"* ]] +if [[ $1 == *"llama2"* ]] || [[ $1 == *"Llama-2"* ]] then echo "MAX_ENTRY_COUNT=--cache-max-entry-count 0.95" >> "$GITHUB_ENV" + else echo "MAX_ENTRY_COUNT=--cache-max-entry-count 0.90" >> "$GITHUB_ENV" fi +if [[ $1 == *"Llama-2-13b"* ]] +then + echo "BATCHES=128" >> "$GITHUB_ENV" + echo "MAX_BATCH_SIZE=" >> "$GITHUB_ENV" +else + echo "BATCHES=128 256" >> "$GITHUB_ENV" + echo "MAX_BATCH_SIZE=--max-batch-size 256" >> "$GITHUB_ENV" +fi -if [[ $1 == *"internlm2-chat-20b"* ]] +if [[ $1 == *"internlm2-chat-20b"* ]] || [[ $1 == *"Qwen1.5-32B-Chat"* ]] then echo "TP_INFO=--tp 2" >> "$GITHUB_ENV" fi diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c981721adb..46900ae7d2 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -52,7 +52,7 @@ on: required: true description: 'Set models run benchmark' type: string - default: "['internlm/internlm2-chat-20b','internlm/internlm2-chat-20b-inner-4bits','meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-7b-chat-hf-inner-4bits']" + default: 
"['internlm/internlm2-chat-20b','internlm/internlm2-chat-20b-inner-4bits','meta-llama/Llama-2-7b-chat-hf','meta-llama/Llama-2-7b-chat-hf-inner-4bits','meta-llama/Meta-Llama-3-8B-Instruct','Qwen/Qwen1.5-32B-Chat']" env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -61,7 +61,7 @@ env: REPORT_DIR: /nvme/qa_test_models/benchmark-reports/${{ github.run_id }} DATASET_FILE: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json TP_INFO: --tp 1 - LOOP_NUM: 3 + LOOP_NUM: 1 TRITON_PTXAS_PATH: /usr/local/cuda/bin/ptxas @@ -115,7 +115,7 @@ jobs: CUDA_VISIBLE_DEVICES: 6,7 container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -211,7 +211,7 @@ jobs: CUDA_VISIBLE_DEVICES: 4,5 container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -268,8 +268,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -283,8 +283,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -298,8 +298,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -313,8 +313,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -352,7 +352,7 @@ jobs: CUDA_VISIBLE_DEVICES: 6,7 container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -405,7 +405,7 @@ jobs: - name: Start restful api turbomind if: contains(fromJSON(github.event.inputs.backend), 'turbomind') run: | - lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MODEL_FORMAT $TP_INFO --log-level ${{inputs.log_level}} > turbomind_run.log 2>&1 & + lmdeploy serve api_server $MODEL_PATH $MAX_ENTRY_COUNT $MAX_BATCH_SIZE $MODEL_FORMAT $TP_INFO --log-level ${{inputs.log_level}} > turbomind_run.log 2>&1 & echo "restful_pid=$!" 
>> "$GITHUB_ENV" sleep 180s - name: Run restful benchmark @@ -415,8 +415,8 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) - for batch in "${batches[@]}" + batches=($BATCHES) + for batch in ${batches[@]} do for ((i=1; i<=$LOOP_NUM; i++)) do @@ -441,7 +441,7 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) + batches=($BATCHES) for batch in "${batches[@]}" do for ((i=1; i<=$LOOP_NUM; i++)) @@ -466,7 +466,7 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) + batches=($BATCHES) for batch in "${batches[@]}" do for ((i=1; i<=$LOOP_NUM; i++)) @@ -491,7 +491,7 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} - batches=(128 256) + batches=($BATCHES) for batch in "${batches[@]}" do for ((i=1; i<=$LOOP_NUM; i++)) @@ -544,7 +544,11 @@ jobs: repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }} ref: ${{github.event.inputs.repo_ref || 'main'}} - name: Set params - if: (contains( matrix.model, 'internlm2-chat-20b')) + run: | + chmod +x .github/scripts/set_benchmark_param.sh + .github/scripts/set_benchmark_param.sh ${{matrix.model}} + - name: Set params - cuda allocate + if: contains( env.TP_INFO, '--tp 2') run: | echo 'DEVICE="device=4,5"' >> "$GITHUB_ENV" - name: Create test container @@ -560,6 +564,7 @@ jobs: --name "lmdeploy-ci-triton-$GITHUB_RUN_ID-$date_today" \ --workdir /__w/lmdeploy/lmdeploy \ --env NCCL_LAUNCH_MODE=GROUP \ + --pull never \ -v $(pwd)/../../:/__w \ -v ${MODEL_PATH}:${MODEL_PATH} \ -v ${WORKDIR}:/root/workspace/workdir \ @@ -575,7 +580,6 @@ jobs: - name: Build lmdeploy from source if: ${{github.event_name == 'schedule' || !inputs.offline_mode}} run: | - docker exec $CONTAINER_ID sed -i 's/https:\/\/github.com\/NVIDIA\/cutlass.git/https:\/\/521github.com\/extdomains\/github.com\/NVIDIA\/cutlass.git/g' CMakeLists.txt docker exec $CONTAINER_ID mkdir build docker exec --workdir /__w/lmdeploy/lmdeploy/build \ --env http_proxy=${{secrets.PROXY}} \ @@ -664,7 +668,7 @@ jobs: run: | rm -rf ${result_dir} mkdir ${result_dir} -p - batches=(128 256) + batches=($BATCHES) for batch in "${batches[@]}" do for ((i=1; i<=$LOOP_NUM; i++)) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 852ec9b631..523bccc699 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -49,7 +49,7 @@ on: type: boolean default: true schedule: - - cron: '00 20 * * 1-5' + - cron: '00 20 * * 0-4' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -106,7 +106,7 @@ jobs: MODELSCOPE_MODULES_CACHE: /root/modelscope_modules container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages @@ -293,6 +293,7 @@ jobs: --name "lmdeploy-ci-triton-$GITHUB_RUN_ID" \ --workdir /__w/lmdeploy/lmdeploy \ --env NCCL_LAUNCH_MODE=GROUP \ + --pull never \ -v $(pwd)/../../:/__w \ -v ${HF_MODEL}:/root/workspace/hf_model \ -v ${WORKDIR}:/root/workspace/workdir \ @@ -431,7 +432,7 @@ jobs: REPORT_DIR: /nvme/qa_test_models/test-reports container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - 
/nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 0ad62d5dd5..010424fd7b 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -17,27 +17,32 @@ on: required: true description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]' type: string - default: '[internlm2_chat_7b,internlm2_chat_20b,internlm2_chat_20b_w4a16,llama2_chat_7b,qwen_chat_7b]' + default: '[tb_internlm2_chat_7b,tb_internlm2_chat_20b,tb_internlm2_chat_20b_w4a16,tb_llama2_chat_7b,tb_qwen1_5_chat_7b,tb_llama_3_8b_instruct,pt_internlm2_chat_7b,pt_internlm2_chat_20b,pt_llama2_chat_7b,pt_qwen1_5_chat_7b,pt_qwen1_5_moe_2_7b_chat,pt_llama_3_8b_instruct,tb_internlm2_chat_7b_kvint4,tb_internlm2_chat_20b_kvint4,tb_qwen1_5_chat_7b_kvint4,tb_llama_3_8b_instruct_kvint4]' + datasets: + required: true + description: 'Tested datasets list. eg. [*mmlu_datasets, *ceval_datasets, *WiC_datasets, *WSC_datasets, *triviaqa_datasets, *gsm8k_datasets, *race_datasets, *crowspairs_datasets]' + type: string + default: '[*mmlu_datasets, *gsm8k_datasets]' devices: required: true description: 'CUDA_VISIBLE_DEVICES.' type: string default: '0,1,2,3,4,5,6,7' + jobs: evaluate: runs-on: [self-hosted, linux-a100] timeout-minutes: 4320 # 72hours - environment: 'prod' container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/github-actions/pip-cache:/root/.cache/pip - /nvme/github-actions/packages:/root/packages - /nvme/github-actions/resources:/root/resources - - /nvme/github-actions/evaluation-reports:/root/evaluation-reports - /nvme/github-actions/opencompass-data:/root/opencompass-data + - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports - /nvme/qa_test_models:/root/models - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro steps: @@ -102,6 +107,7 @@ jobs: python3 .github/scripts/action_tools.py evaluate \ --models "${{github.event.inputs.models}}" \ + --datasets "${{github.event.inputs.datasets}}" \ --workspace /root/evaluation-reports/$TIME_STAMP - name: Clear workspace if: always() diff --git a/.github/workflows/pr_ete_test.yml b/.github/workflows/pr_ete_test.yml index 3921b6b09a..27227f3bec 100644 --- a/.github/workflows/pr_ete_test.yml +++ b/.github/workflows/pr_ete_test.yml @@ -33,7 +33,7 @@ jobs: REPORT_DIR: /nvme/qa_test_models/test-reports container: image: nvcr.io/nvidia/tritonserver:24.03-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never" volumes: - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip - /nvme/share_data/github-actions/packages:/root/packages @@ -56,7 +56,6 @@ jobs: run: | python3 -m pip install cmake python3 -m pip install -r requirements/build.txt - sed -i 's/https:\/\/github.com\/NVIDIA\/cutlass.git/https:\/\/521github.com\/extdomains\/github.com\/NVIDIA\/cutlass.git/g' CMakeLists.txt mkdir build cd build cmake .. 
\ diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index a2eca9d795..bdd9ecb993 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -36,7 +36,7 @@ jobs: timeout-minutes: 4320 # 72hours container: image: nvcr.io/nvidia/tritonserver:22.12-py3 - options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e CUDA_VISIBLE_DEVICES=2,3" + options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e CUDA_VISIBLE_DEVICES=2,3 --pull never" volumes: - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip - /nvme/share_data/github-actions/packages:/root/packages @@ -58,7 +58,6 @@ jobs: run: | python3 -m pip install cmake python3 -m pip install -r requirements/build.txt - sed -i 's/https:\/\/github.com\/NVIDIA\/cutlass.git/https:\/\/521github.com\/extdomains\/github.com\/NVIDIA\/cutlass.git/g' CMakeLists.txt mkdir build cd build cmake .. \ diff --git a/autotest/config.yaml b/autotest/config.yaml index a25ffd2ff2..8100705465 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -16,14 +16,12 @@ turbomind_chat_model: - meta-llama/Llama-2-7b-chat-hf - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm2-chat-1_8b - - internlm/internlm-chat-7b - internlm/internlm-chat-20b - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - internlm/internlm2-chat-7b-4bits - internlm/internlm2-chat-20b-4bits - Qwen/Qwen-7B-Chat - - Qwen/Qwen-14B-Chat - Qwen/Qwen1.5-7B-Chat - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat @@ -37,11 +35,12 @@ turbomind_chat_model: - deepseek-ai/deepseek-coder-1.3b-instruct - codellama/CodeLlama-7b-Instruct-hf - Qwen/Qwen1.5-4B-Chat-AWQ + - OpenGVLab/InternVL-Chat-V1-5 + - internlm/internlm-xcomposer2-vl-7b pytorch_chat_model: - meta-llama/Llama-2-7b-chat-hf - meta-llama/Meta-Llama-3-8B-Instruct - - internlm/internlm-chat-7b - internlm/internlm-chat-20b - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b @@ -49,6 +48,7 @@ pytorch_chat_model: - baichuan-inc/Baichuan2-13B-Chat - 01-ai/Yi-6B-Chat - Qwen/Qwen1.5-7B-Chat + - Qwen/Qwen1.5-MoE-A2.7B-Chat - deepseek-ai/deepseek-moe-16b-chat - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mixtral-8x7B-Instruct-v0.1 @@ -73,29 +73,29 @@ vl_model: - liuhaotian/llava-v1.6-vicuna-7b - 01-ai/Yi-VL-6B - deepseek-ai/deepseek-vl-1.3b-chat + - OpenGVLab/InternVL-Chat-V1-5 + - internlm/internlm-xcomposer2-vl-7b quatization_case_config: w4a16: - meta-llama/Llama-2-7b-chat-hf - internlm/internlm-chat-20b - Qwen/Qwen-7B-Chat - - Qwen/Qwen-14B-Chat - internlm/internlm2-chat-20b - baichuan-inc/Baichuan2-7B-Chat - internlm/internlm2-20b - Qwen/Qwen1.5-7B-Chat + - meta-llama/Meta-Llama-3-8B-Instruct kvint: - meta-llama/Llama-2-7b-chat-hf - meta-llama/Meta-Llama-3-8B-Instruct - internlm/internlm2-chat-1_8b - - internlm/internlm-chat-7b - internlm/internlm-chat-20b - internlm/internlm2-chat-7b - internlm/internlm2-chat-20b - internlm/internlm2-chat-7b-4bits - internlm/internlm2-chat-20b-4bits - Qwen/Qwen-7B-Chat - - Qwen/Qwen-14B-Chat - Qwen/Qwen1.5-7B-Chat - lmdeploy/llama2-chat-7b-w4 - baichuan-inc/Baichuan2-7B-Chat @@ -106,7 +106,6 @@ quatization_case_config: w8a8: - meta-llama/Llama-2-7b-chat-hf - meta-llama/Meta-Llama-3-8B-Instruct - - internlm/internlm-chat-7b - internlm/internlm-chat-20b - internlm/internlm2-chat-20b - internlm/internlm2-chat-7b
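A note on the `batches=($BATCHES)` change in .github/workflows/benchmark.yml: BATCHES is written to $GITHUB_ENV by .github/scripts/set_benchmark_param.sh as either "128" or "128 256", and the unquoted expansion inside the array literal relies on shell word-splitting to turn that string into one array element per batch size. A minimal sketch of the assumed behavior, with echo standing in for the real benchmark command:

    BATCHES="128 256"      # value exported per model by set_benchmark_param.sh
    batches=($BATCHES)     # unquoted expansion word-splits into two elements: 128, 256
    for batch in "${batches[@]}"
    do
        echo "benchmark run with batch size ${batch}"   # placeholder for the benchmark step
    done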