diff --git a/calc_image_size_offline.py b/calc_image_size_offline.py
new file mode 100644
index 000000000..67686e7e7
--- /dev/null
+++ b/calc_image_size_offline.py
@@ -0,0 +1,56 @@
+from concurrent.futures import ThreadPoolExecutor
+from tqdm import tqdm
+import json
+from PIL import Image
+import os
+
+data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/'
+data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json'
+image_folder = data_root + 'llava_images'
+
+
+def load_jsonl(json_file):
+    with open(json_file) as f:
+        lines = f.readlines()
+    data = []
+    for line in lines:
+        data.append(json.loads(line))
+    return data
+
+
+def calc_fn(data_dict):
+    size = {'width': 0, 'height': 0, 'image': 'None'}
+    if data_dict.get('image', None) is not None:
+        image_file = data_dict['image']
+        image = Image.open(os.path.join(image_folder,
+                                        image_file))
+        size['image'] = image_file
+        size['width'] = image.size[0]
+        size['height'] = image.size[1]
+    return size
+
+
+if __name__ == '__main__':
+    print('start calculating image sizes')
+    if data_path.endswith('.json'):
+        json_data = json.load(open(data_path))
+    elif data_path.endswith('.jsonl'):
+        json_data = load_jsonl(data_path)
+    else:
+        raise NotImplementedError
+
+    with ThreadPoolExecutor(max_workers=8) as executor:
+        length_list = list(
+            tqdm(
+                executor.map(calc_fn, json_data),
+                desc='Calculating image size',
+                total=len(json_data)))
+    print('end calculating image sizes')
+
+    new_output_dict = {}
+    for item in length_list:
+        if item['image'] != 'None':
+            new_output_dict[item['image']] = [item['width'], item['height']]
+
+    with open('image_size.json', 'w') as f:
+        json.dump(new_output_dict, f)
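
As an aside (not part of the patch): the script above writes image_size.json as a flat mapping from image file name to [width, height]. A minimal sketch of reading that cache back, assuming it sits in the working directory; the sample key is a placeholder, not a real file from the dataset:

import json

# Load the {image_file: [width, height]} mapping written by
# calc_image_size_offline.py (hypothetical consumer, not in this diff).
with open('image_size.json') as f:
    image_sizes = json.load(f)

width, height = image_sizes.get('some_image.jpg', (0, 0))  # placeholder key
print(width, height)

diff --git a/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_internvl_finetune.py b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_internvl_finetune.py
new file mode 100644
index 000000000..7e88e7f66
--- /dev/null
+++ b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_internvl_finetune.py
@@ -0,0 +1,528 @@
+# Copyright (c) OpenMMLab. All rights reserved.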
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from peft import LoraConfig +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/basemodel/checkpoints/llm/hf_hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/e8cf5276ae3e97cfde8a058e64a636f2cde47820' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +pretrained_pth = 'work_dirs/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain/iter_4871.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +sharegpt4v_caption_data_path = data_root + 'sharegpt4v_instruct_gpt4-vision_cap100k.jsonl' +sharegpt4v_caption_image_folder = data_root + 'data' + +llava_data_path = data_root + 'llava_instruct_150k_zh.jsonl' +llava_image_folder = data_root + 'data/coco' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +geoqa_data_path = data_root + 'geoqa+.jsonl' +geoqa_image_folder = data_root + 'data/geoqa+' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 4 # per_device 32gpu x 4bs +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 4000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 4000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # 
+####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + # attn_implementation='sdpa', + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### + +cache_root='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_llama3/' + +sharegpt4v_caption_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'sharegpt4v_caption_dataset', + data_path=sharegpt4v_caption_data_path, + image_folder=sharegpt4v_caption_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'llava_dataset', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + + +dvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +chartqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +ai2d_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + 
template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +docvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +geoqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'geoqa_dataset', + data_path=geoqa_data_path, + image_folder=geoqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +synthdog_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataset = dict( + type=ConcatDataset, + datasets=[sharegpt4v_caption_dataset, llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + geoqa_dataset, synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every 
iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=5), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +# val_dataset = [ +# dict( +# type=GQADataset, +# data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', +# ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', +# image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# ] +# +# test_dataset = [ +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=TextVQADataset, +# data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', +# ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', +# image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', +# 
prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MMEDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', +# image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# # for_llava_prompt=True, # 开了后,perception 会掉 +# pad_image_to_square=True), +# dict( +# type=HallusionDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=POPEDataset, +# data_file=[ +# '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', +# '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', +# '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' +# ], +# coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=GQADataset, +# data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', +# ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', +# image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# dict( +# type=MultipleChoiceDataset, +# data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', +# prompt_template=prompt_template, +# tokenizer=tokenizer, +# image_processor=image_processor, +# pad_image_to_square=True), +# # dict( +# # type=VQAv2Dataset, +# # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', +# # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', +# # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', +# # prompt_template=PROMPT_TEMPLATE.vicuna, +# # tokenizer=tokenizer, +# # image_processor=image_processor, +# # pad_image_to_square=True), +# ] +# +# # TODO: We are not currently using val_evaluator +# # Don't support num_workers > 0 +# val_dataloader = dict( +# batch_size=1, +# num_workers=0, +# drop_last=False, +# 
sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict(type=ConcatDataset, datasets=val_dataset), +# collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +# val_evaluator = dict() +# val_cfg = dict(type=ValLoop) +# +# # TODO: We are not currently using test_evaluator +# test_dataloader = dict( +# batch_size=1, +# num_workers=0, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict(type=ConcatDataset, datasets=test_dataset), +# collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +# ) +# +# test_evaluator = val_evaluator +# test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py new file mode 100644 index 000000000..3093fd1e1 --- /dev/null +++ b/xtuner/configs/llava/llama3_70b_chat/llava_llama3_70b_chat_clip_e1_gpu16_sharegpt4v_pretrain.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/basemodel/checkpoints/llm/hf_hub/models--meta-llama--Meta-Llama-3-70B-Instruct/snapshots/e8cf5276ae3e97cfde8a058e64a636f2cde47820' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/sharegpt4v/' +data_path = data_root + 'share-captioner_coco_lcs_sam_1246k_1107.json' +image_folder = data_root + 'data' +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(4096 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 8 # per_device 32GPUx8bs +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 5e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + 
type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_llama3/pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + # pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=5), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. 
+ checkpoint=dict( + type=CheckpointHook, + save_optimizer=True, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/llama3_70b_chat/test_config.py b/xtuner/configs/llava/llama3_70b_chat/test_config.py new file mode 100644 index 000000000..9e676d433 --- /dev/null +++ b/xtuner/configs/llava/llama3_70b_chat/test_config.py @@ -0,0 +1,206 @@ +from xtuner.model import OpenAIModel +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn1 +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from peft import LoraConfig +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from xtuner.dataset import LLaVAProxyEvalDataset1 + +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +model = dict(type=OpenAIModel, base_url='http://10.140.24.142:23333/v1') +prompt_template = None + +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +proxy_eval_dataset = dict(type=LLaVAProxyEvalDataset1) + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + 
data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), +] + +# # TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn1, extra_collate_keys=['img_id']) +) + +test_evaluator = {} +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py new file mode 100644 index 000000000..db96bb827 --- /dev/null +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
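
The llava_llama3_8b_chat pretrain config that begins here, like every config in this diff, is a plain mmengine config file. Below is a rough sketch of how such a file could be consumed programmatically; in practice the xtuner CLI (xtuner train <config>) is the intended entry point, so treat this as an illustration of the mmengine mechanics rather than the project's launcher, and note that the work_dir value is a made-up example:

from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'xtuner/configs/llava/llama3_8b_chat/'
    'llava_llama3_8b_chat_clip_e1_gpu8_pretrain.py')
cfg.work_dir = './work_dirs/llava_llama3_8b_chat_clip_e1_gpu8_pretrain'  # example path
runner = Runner.from_cfg(cfg)  # builds the model, dataloader and hooks from the dicts below
runner.train()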
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=None, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + 
num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py new file mode 100644 index 000000000..a6ddcc2f8 --- /dev/null +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
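
The ShareGPT4V pretrain config that begins here sets batch_size = 16 per device with accumulative_counts = 2; assuming the 8 GPUs implied by the gpu8 file name (the GPU count is not encoded in the config itself), that works out to 256 samples per optimizer step. A quick sanity check:

gpus = 8                 # implied by the "gpu8" file name, not set in the config
per_device_batch = 16    # batch_size
grad_accumulation = 2    # accumulative_counts
print(gpus * per_device_batch * grad_accumulation)  # 256 samples per optimizer step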
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/sharegpt4v/' +data_path = data_root + 'share-captioner_coco_lcs_sam_1246k_1107.json' +image_folder = data_root + 'data' +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(4096 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_llama3/pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + 
batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py new file mode 100644 index 000000000..84e68e658 --- /dev/null +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_finetune.py @@ -0,0 +1,399 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
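
The LoRA finetune config that begins here budgets its sequence length around the CLIP ViT-L/14-336 visual tokens: (336 // 14) ** 2 = 576 patch tokens are reserved, leaving 2048 - 576 = 1472 positions for text, which is what the max_length expression below evaluates to. A quick check of the arithmetic:

patch_tokens = (336 // 14) ** 2   # 24 x 24 = 576 visual tokens
max_length = 2048 - patch_tokens  # 1472 positions left for text
print(patch_tokens, max_length)   # 576 1472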
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from peft import LoraConfig +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +pretrained_pth = 'work_dirs/llava_llama3_8b_chat_clip_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + # to speed inference + # attn_implementation='sdpa', + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# 
PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/llama3_8b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + 
prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + 
num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py new file mode 100644 index 000000000..29c04623f --- /dev/null +++ b/xtuner/configs/llava/llama3_8b_chat/llava_llama3_8b_chat_clip_lora_e1_gpu8_internvl_finetune.py @@ -0,0 +1,528 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel +from peft import LoraConfig +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/fanqi/meta-llama/Meta-Llama-3-8B-Instruct' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +pretrained_pth = 'work_dirs/llava_llama3_8b_chat_clip_e1_gpu8_sharegpt4v_pretrain/iter_9742.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +sharegpt4v_caption_data_path = data_root + 'sharegpt4v_instruct_gpt4-vision_cap100k.jsonl' +sharegpt4v_caption_image_folder = data_root + 'data' + +llava_data_path = data_root + 'llava_instruct_150k_zh.jsonl' +llava_image_folder = data_root + 'data/coco' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +geoqa_data_path = data_root + 'geoqa+.jsonl' +geoqa_image_folder = data_root + 'data/geoqa+' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.llama3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 2 
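# Illustrative arithmetic (annotation only, not functional config): `max_length` above
# budgets the context window around the (336 // 14) ** 2 = 576 visual patch tokens that
# the CLIP ViT-L/14-336 encoder contributes, leaving 2048 - 576 = 1472 positions for text.
# Assuming 8 GPUs (inferred from the `gpu8` suffix in the file name), one optimizer step
# then covers batch_size * accumulative_counts * world_size = 8 * 2 * 8 = 128 samples.
num_visual_tokens = (336 // 14) ** 2      # 576 patch tokens per image
text_token_budget = 2048 - num_visual_tokens  # 1472, i.e. the max_length defined above
approx_global_batch = 8 * 2 * 8           # 128 samples per optimizer step, 8 GPUs assumed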
+dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 5000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 5000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + # attn_implementation='sdpa', + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### + +cache_root='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_llama3/' + +sharegpt4v_caption_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'sharegpt4v_caption_dataset', + data_path=sharegpt4v_caption_data_path, + image_folder=sharegpt4v_caption_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'llava_dataset', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + + +dvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +chartqa_dataset = dict( + type=LLaVADataset, + 
offline_processed_text_folder=cache_root+'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +ai2d_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +docvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +geoqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'geoqa_dataset', + data_path=geoqa_data_path, + image_folder=geoqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +synthdog_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root+'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataset = dict( + type=ConcatDataset, + datasets=[sharegpt4v_caption_dataset, llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + geoqa_dataset, synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) 
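# Minimal sketch (assumes a hypothetical `iters_per_epoch` value that the runner derives
# from the train dataloader): with `by_epoch=True` and `convert_to_iter_based=True`, the
# epoch-based begin/end values of the schedulers above are translated into iterations, so
# the LinearLR warmup covers roughly warmup_ratio * max_epochs of the run before
# CosineAnnealingLR takes over for the remainder.
def approx_warmup_iters(warmup_ratio: float, max_epochs: int, iters_per_epoch: int) -> int:
    """Approximate number of warmup iterations for the LinearLR stage configured above."""
    return int(warmup_ratio * max_epochs * iters_per_epoch)

# e.g. approx_warmup_iters(0.03, 1, 10_000) == 300 warmup iterations out of 10_000 total.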
+####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + 
pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # 
test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/official/llava_v15_7b/llava_v15_7b_pretrain.py b/xtuner/configs/llava/official/llava_v15_7b/llava_v15_7b_pretrain.py index a30457cf8..6717089b9 100644 --- a/xtuner/configs/llava/official/llava_v15_7b/llava_v15_7b_pretrain.py +++ b/xtuner/configs/llava/official/llava_v15_7b/llava_v15_7b_pretrain.py @@ -5,7 +5,7 @@ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR from torch.optim import AdamW from transformers import (AutoModelForCausalLM, AutoTokenizer, - CLIPImageProcessor, CLIPVisionModel) + CLIPImageProcessor, CLIPVisionModel,LlamaModel) from xtuner.dataset import LLaVADataset from xtuner.dataset.collate_fns import default_collate_fn diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py new file mode 100644 index 000000000..474679eb8 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_finetune.py @@ -0,0 +1,397 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
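# Generic illustration (simplified, not xtuner's actual ConcatDataset implementation): the
# val/test dataloaders in these configs chain every benchmark into a single ConcatDataset
# and walk it with batch_size=1, so the benchmarks are evaluated back to back in the order
# listed. The index chaining behaves conceptually like this:
class ToyConcatDataset:
    def __init__(self, datasets):
        self.datasets = list(datasets)

    def __len__(self):
        return sum(len(d) for d in self.datasets)

    def __getitem__(self, idx):
        # Walk the datasets in order, subtracting each length until idx falls inside one.
        for d in self.datasets:
            if idx < len(d):
                return d[idx]
            idx -= len(d)
        raise IndexError(idx)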
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import MiniGeminiDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import MiniGeminiModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +from xtuner.model.modules import OpenCLIPVisionTower +from xtuner.dataset import MiniGeminiProxyEvalDataset + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (336 // 14) ** 2) +image_size_aux = 768 + +visual_encoder_aux_name = 'model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup' +visual_encoder_aux_path = '/mnt/petrelfs/share_data/zhaoxiangyu/models--laion--CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/snapshots/39918dfbdf69ccd2172e6510a430e92337ee23e1/' + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=MiniGeminiModel, + visual_encoder_aux=dict( + type=OpenCLIPVisionTower, + vision_tower=visual_encoder_aux_name, + vision_tower_path=visual_encoder_aux_path, + optimize_vision_tower_aux=False, + ), + freeze_llm=False, + 
freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=MiniGeminiDataset, + image_size_aux=image_size_aux, # siglip 864, clip 768 + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['pixel_values_aux'])) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. 
+ checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=TextVQADataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + 
pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # 以下两个需要提交服务器进行在线评测 + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # vqav2 图片大概是 12w,推理要很久 + # dict( + # type=VQAv2Dataset, + # proxy_eval_dataset = dict(type=MiniGeminiProxyEvalDataset, image_size_aux=image_size_aux), + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # 
image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True, + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id','pixel_values_aux'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'pixel_values_aux']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain.py new file mode 100644 index 000000000..c4136acf1 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_clip_p14_384_convnext_e1_gpu8_pretrain.py @@ -0,0 +1,214 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import MiniGeminiDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import MiniGeminiModel +from xtuner.model.modules import OpenCLIPVisionTower + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (336 // 14) ** 2) + +visual_encoder_aux_name = 'model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup' +visual_encoder_aux_path = '/mnt/petrelfs/share_data/zhaoxiangyu/models--laion--CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup/snapshots/39918dfbdf69ccd2172e6510a430e92337ee23e1/' + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 
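# Illustrative arithmetic (assumes 8 GPUs from the `gpu8` file-name suffix and roughly
# 558k samples implied by blip_laion_cc_sbu_558k): one pretrain epoch is about
# ceil(558_000 / (32 * 8)) ≈ 2180 optimizer steps, consistent with the iter_2181.pth
# checkpoint that the finetune configs load, so save_steps=500 yields a few intermediate
# checkpoints before the final one.
approx_steps_per_epoch = -(-558_000 // (32 * 8))  # ceil division -> 2180 (approximate)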
+save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=MiniGeminiModel, + visual_encoder_aux=dict( + type=OpenCLIPVisionTower, + vision_tower=visual_encoder_aux_name, + vision_tower_path=visual_encoder_aux_path, + optimize_vision_tower_aux=False, + ), + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=MiniGeminiDataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', + image_size_aux=768, # siglip 864, clip 768 + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['pixel_values_aux'])) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # 
+####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py new file mode 100644 index 000000000..8bcad7190 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -0,0 +1,364 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
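# Minimal sketch in plain PyTorch (not xtuner's actual implementation): the pretrain
# config above sets freeze_llm=True and freeze_visual_encoder=True, so essentially only
# the projector and other newly added connector modules receive gradients. Freezing a
# submodule amounts to the following:
import torch.nn as nn

def freeze_module(module: nn.Module) -> None:
    """Disable gradient tracking for every parameter of `module`."""
    for param in module.parameters():
        param.requires_grad_(False)

# e.g. freeze_module(model.llm); freeze_module(model.visual_encoder)  # hypothetical names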
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel, BitsAndBytesConfig) +from peft import LoraConfig +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +import torch +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + llm_lora=dict( + type=LoraConfig, + r=512, + lora_alpha=256, + lora_dropout=0.05, + bias='none', + 
task_type='CAUSAL_LM'), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py new file mode 100644 index 000000000..39ddcedf0 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_qlora_siglip_so400m_p14_384_loar_e1_gpu8_finetune.py @@ -0,0 +1,367 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
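+# QLoRA fine-tuning config: the phi-2 LLM is loaded in 4-bit NF4 and trained
+# through a LoRA adapter (r=512, alpha=256), a second, smaller LoRA is attached
+# to the SigLIP vision tower, and the projector is initialised from the stage-1
+# checkpoint in `pretrained_pth`. The lazy `llm=dict(...)` spec below is built
+# by xtuner at runtime; an illustrative eager equivalent (commented out, not
+# part of this config) would be roughly:
+#
+#   import torch
+#   from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+#   llm = AutoModelForCausalLM.from_pretrained(
+#       'microsoft/phi-2',
+#       trust_remote_code=True,
+#       torch_dtype=torch.float16,
+#       quantization_config=BitsAndBytesConfig(
+#           load_in_4bit=True,
+#           bnb_4bit_quant_type='nf4',
+#           bnb_4bit_use_double_quant=True,
+#           bnb_4bit_compute_dtype=torch.float16))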
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel, BitsAndBytesConfig) +from peft import LoraConfig +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +import torch +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + llm_lora=dict( + type=LoraConfig, + r=512, + lora_alpha=256, + lora_dropout=0.05, + bias='none', + 
task_type='CAUSAL_LM'), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py new file mode 100644 index 000000000..ae00d642f --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_all_finetune.py @@ -0,0 +1,357 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
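+# Full fine-tuning config: `freeze_llm=False` and `freeze_visual_encoder=False`,
+# so the phi-2 LLM, the SigLIP tower and the projector are all updated, starting
+# from the stage-1 checkpoint in `pretrained_pth`. The `max_length` below is the
+# text-token budget left after (presumably) reserving one position per vision
+# patch of the 384px / patch-14 encoder, i.e. a 27x27 grid:
+#
+#   384 // 14        # -> 27 patches per side
+#   2048 - 27 ** 2   # -> 2048 - 729 = 1319 text tokens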
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.plain +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + 
offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + 
tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py new file mode 100644 index 000000000..94c3a02e4 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_finetune.py @@ -0,0 +1,356 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
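+# Instruction fine-tuning with the SigLIP tower frozen (`freeze_llm=False`,
+# `freeze_visual_encoder=True`): only the LLM and the projector are trained.
+# Training batches are drawn with LengthGroupedSampler over `modality_length`,
+# which groups samples of comparable length (and modality) to reduce padding.
+# The core idea, as a rough sketch rather than the actual xtuner sampler:
+#
+#   order = sorted(range(len(lengths)), key=lambda i: lengths[i])
+#   batches = [order[i:i + per_device_batch_size]
+#              for i in range(0, len(order), per_device_batch_size)]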
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = 'microsoft/phi-2' +visual_encoder_name_or_path = 'google/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['请描述一下这张照片', 'Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + 
data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py new file mode 100644 index 000000000..ed45a9b4f --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
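+# Stage-1 pretraining config: both the LLM and the vision tower are frozen, so
+# only the projector is trained (hence the larger lr of 1e-3 and no
+# `pretrained_pth`). The `work_dirs/.../iter_*.pth` it produces is what the
+# finetune configs in this folder point at. A typical launch, assuming the
+# standard `xtuner train` entry point (exact flags may differ per setup):
+#
+#   NPROC_PER_NODE=8 xtuner train \
+#       llava_phi2_2_7b_siglip_so400m_p14_384_e1_gpu8_pretrain.py \
+#       --deepspeed deepspeed_zero2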
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + 
num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py new file mode 100644 index 000000000..ed00b7c80 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_384_s2_e1_gpu8_pretrain.py @@ -0,0 +1,207 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
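+# Same stage-1 recipe as the non-S2 pretrain config, but with multi-scale
+# (S2-style) visual features: `s2_scales = [1, 2]` is passed to both the model
+# and the dataset, which presumably means each image is also processed at twice
+# the base SigLIP resolution and the extra features are merged before the
+# projector, e.g.:
+#
+#   base = 384
+#   resolutions = [base * s for s in s2_scales]   # -> [384, 768]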
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.model import LLaVAModel + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.plain +max_length = int(2048 - (384 // 14) ** 2) +s2_scales = [1, 2] + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + s2_scales=s2_scales, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + # phi2 不能用 flash attention, Loss 下降趋势不正常, fp16 推理也有潜在风险 + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + s2_scales=s2_scales, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + 
max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py new file mode 100644 index 000000000..ebc97f3ea --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_all_finetune.py @@ -0,0 +1,383 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import AnyResLLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +from xtuner.dataset import AnyResLLaVAProxyEvalDataset + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please 
describe this picture'] + +image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], + [384, 1152]] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + freeze_llm=False, + freeze_visual_encoder=False, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=4, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size']) +) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # 
tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + 
image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + 
num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'orig_size'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'orig_size']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py new file mode 100644 index 000000000..c693c7a2c --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain.py @@ -0,0 +1,214 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import AnyResLLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +from xtuner.dataset import AnyResLLaVAProxyEvalDataset + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], + [384, 1152]] + 
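+# (editor note, hedged) quick sanity check of the settings above: SigLIP
+# so400m works on 384px tiles with 14px patches, so the base grid is
+# 384 // 14 = 27 and a single tile costs 27 * 27 = 729 visual tokens,
+# leaving max_length = 2048 - 729 = 1319 tokens for text. Every entry in
+# image_grid_pinpoints is a multiple of that 384px tile (grids of two to
+# four tiles); in a LLaVA-NeXT-style AnyRes pipeline these tiles are fed
+# alongside a downscaled overview image, and the actual tiling is handled
+# by AnyResLLaVADataset / AnyResLLaVAModel rather than configured here.
+assert max_length == 2048 - (384 // 14) ** 2 == 1319  # documents the arithmetic only; safe to delete
+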
+####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + freeze_llm=True, + freeze_visual_encoder=True, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=4, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=AnyResLLaVADataset, + pad_image_to_square=True, # change this + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_llava_pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size'])) +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # 
prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py new file mode 100644 index 000000000..33bf6d9f5 --- /dev/null +++ b/xtuner/configs/llava/phi2_2_7b_siglip_so400m_p14_384/llava_phi2_2_7b_siglip_so400m_p14_anyres_pixel_shuffle_e1_gpu8_all_finetune.py @@ -0,0 +1,384 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
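+# (editor note, hedged) This config appears to match the plain anyres
+# all-finetune recipe above, except that the model additionally sets
+# merge_type='pixel_shuffle'. With token_merge_ratio=4, a pixel-shuffle merge
+# folds a (B, H, W, C) patch grid into (B, H/2, W/2, 4*C), quartering the
+# number of visual tokens handed to the LLM. A minimal sketch of that
+# reshaping (names are illustrative, not the actual xtuner implementation):
+#
+#   def pixel_shuffle_merge(x, ratio=4):   # x: torch.Tensor, (B, H, W, C)
+#       r = int(ratio ** 0.5)              # r = 2 when ratio == 4
+#       B, H, W, C = x.shape               # H and W assumed divisible by r
+#       x = x.reshape(B, H // r, r, W // r, r, C)
+#       x = x.permute(0, 1, 3, 2, 4, 5)    # gather each r x r window
+#       return x.reshape(B, H // r, W // r, C * ratio)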
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + SiglipImageProcessor, SiglipVisionModel) + +from xtuner.dataset import AnyResLLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler +from xtuner.dataset import AnyResLLaVAProxyEvalDataset + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/phi-2' +visual_encoder_name_or_path = '/mnt/petrelfs/share_data/huanghaian/model/siglip-so400m-patch14-384' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/llava_phi2_2_7b_siglip_so400m_p14_anyres_e1_gpu8_pretrain/iter_2181.pth' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.vicuna +max_length = int(2048 - (384 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[384, 768], [768, 384], [768, 768], [1152, 384], + [384, 1152]] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=SiglipImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + merge_type='pixel_shuffle', # xxxxxxx + freeze_llm=False, + freeze_visual_encoder=False, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=4, + pretrained_pth=pretrained_pth, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=SiglipVisionModel.from_pretrained, + 
pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi2_2_7b_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size']) +) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, # can save disk memory mmengine >=0.10.3 + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MMEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True) +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + 
prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints), + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=PROMPT_TEMPLATE.vicuna, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'orig_size'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id', 'orig_size']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py b/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py 
new file mode 100644 index 000000000..176059073 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/allava_finetune.py @@ -0,0 +1,539 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/allava_pretrain/iter_4214.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +llava_data_path = data_root1 + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +llava_image_folder = data_root1 + 'llava_images' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_laion_data_path = data_root1 + 'allava_laion/ALLaVA-Instruct-LAION-4V_llava.json' +allava_laion_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/datasets--FreedomIntelligence--ALLaVA-4V/snapshots/624bd4c5fedc2209cf952eedf75712413d8d912c/' + +allava_vflan_data_path = data_root1 + 'allava_vflan/ALLaVA-Instruct-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad 
clip +warmup_ratio = 0.03 + +# Save +save_steps = 3000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 3000 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_root = '/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' +pad_image_to_square = True + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +dvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +chartqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +ai2d_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + 
dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +docvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +synthdog_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +################## +cache_2k_root = data_root1 + 'phi3_mini_2k_offline/' + +allava_laion_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_laion_dataset_sft', + data_path=allava_laion_data_path, + image_folder=allava_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +allava_vflan_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset_sft', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + synthdog_dataset, allava_laion_dataset, allava_vflan_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + 
end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + 
type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + 
dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py b/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py new file mode 100644 index 000000000..f2460a4ee --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/allava_finetune1.py @@ -0,0 +1,399 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = 'work_dirs/allava_pretrain/iter_4214.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/zhaoxiangyu/' +data_path = data_root + 'instruct_llava_allava_doc_dvqa_share_ai2d_1383k.json' +image_folder = '/mnt/petrelfs/share_data/huanghaian/xiangyu_mix_sft_data/' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & 
Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 3000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 3000 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_path = '/mnt/petrelfs/share_data/huanghaian/xiangyu_mix_sft_data/phi3-mini-2k-sft' +pad_image_to_square = True + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_path, + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + 
convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, 
+ data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py new file mode 100644 index 000000000..58cd808d5 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/allava_pretrain.py @@ -0,0 +1,262 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset, ConcatDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/sharegpt4v/' +sharegpt4v_data_path = data_root + 'sharegpt4v_instruct_gpt4-vision_cap100k.json' +sharegpt4v_image_folder = data_root + 'data' + +data_root = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_laion_data_path = data_root + 'allava_laion/ALLaVA-Caption-LAION-4V_llava.json' +allava_laion_image_folder = 's3://xtuner/huanghaian/data/ALLaVA-4V/' + +data_root = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_vflan_data_path = data_root + 'allava_vflan/ALLaVA-Caption-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' + +allava_text_data_path = data_root + 'allava_text/Evol-Instruct-GPT4-Turbo-143K_llava.json' + 
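[Editor's note] A note on the max_length = int(2048 - (336 / 14) ** 2) expression that recurs in these configs: it reserves the CLIP-ViT-L/14-336 patch-token budget out of the LLM context so the tokenized text leaves room for the image tokens (the anyres finetune config applies the same rule to a 4096 context). A minimal sketch of the arithmetic; the helper name below is illustrative and not part of xtuner:

# Illustrative arithmetic only: a 336x336 image cut into 14x14 patches yields
# (336 // 14) ** 2 = 576 visual tokens, so 2048 - 576 = 1472 text tokens remain.
def remaining_text_tokens(context_len=2048, image_size=336, patch_size=14):
    num_patches = (image_size // patch_size) ** 2
    return context_len - num_patches

assert remaining_text_tokens() == 1472  # matches int(2048 - (336 / 14) ** 2)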
+prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device 16gx16 +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_2k_root = data_root + 'phi3_mini_2k_offline/' + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +allava_laion_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_laion_dataset', + data_path=allava_laion_data_path, + image_folder=allava_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +allava_vflan_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +allava_text_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_text_dataset', + data_path=allava_text_data_path, + tokenizer=tokenizer, + image_processor=image_processor, + image_folder=None, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataset = dict( + type=ConcatDataset, + datasets=[ + 
sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, + allava_text_dataset, allava_text_dataset + ]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_allava.py b/xtuner/configs/llava/phi3_mini_chat/llava_allava.py new file mode 100644 index 000000000..af15260cc --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_allava.py @@ -0,0 +1,450 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
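[Editor's note] In the allava_pretrain.py train_dataset above, allava_text_dataset is listed twice, which doubles the per-epoch share of the text-only Evol-Instruct split relative to a single entry (assuming the usual concatenation semantics, where the combined length is the sum of the parts). A minimal sketch of that effect using torch's ConcatDataset; the tensors below are stand-ins, not the real cached datasets:

import torch
from torch.utils.data import ConcatDataset, TensorDataset

# Stand-in datasets: 100 "image-text" samples and 10 "text-only" samples.
image_text = TensorDataset(torch.arange(100))
text_only = TensorDataset(torch.arange(10))

# Repeating a dataset in the list is a simple way to upsample it: the
# concatenated length counts it twice, so a shuffling sampler draws it 2x as often.
train = ConcatDataset([image_text, text_only, text_only])
assert len(train) == 100 + 2 * 10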
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_vflan_data_path = data_root1 + 'allava_vflan/ALLaVA-Instruct-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' + + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + 
type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +cache_2k_root = data_root1 + 'phi3_mini_2k_offline/' + +allava_vflan_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset_sft', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, allava_vflan_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.75), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every 
iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + 
type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + 
type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py b/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py new file mode 100644 index 000000000..2fbcf84cd --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_allava_sharegpt.py @@ -0,0 +1,468 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
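[Editor's note] The finetune train_dataloaders above swap DefaultSampler for LengthGroupedSampler with length_property='modality_length', so each per-device batch draws samples of similar combined text-plus-image length and wastes less compute on padding. A minimal sketch of the grouping idea, assuming the LLaVA convention that text-only samples carry negative lengths; this is illustrative and not xtuner's exact algorithm (which also handles distributed sharding):

import random

def length_grouped_indices(lengths, per_device_batch_size, mega_factor=50, seed=0):
    # Shuffle globally, then sort by |length| inside large "mega-batches" so
    # neighbouring batches hold similarly sized samples without fixing one
    # global order for the whole epoch.
    rng = random.Random(seed)
    indices = list(range(len(lengths)))
    rng.shuffle(indices)
    mega = per_device_batch_size * mega_factor
    grouped = []
    for start in range(0, len(indices), mega):
        chunk = indices[start:start + mega]
        grouped.extend(sorted(chunk, key=lambda i: abs(lengths[i])))
    return grouped

# Example modality lengths in tokens (negative marks text-only samples).
print(length_grouped_indices([620, -80, 1310, 95, 410, -60], per_device_batch_size=4))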
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Specify the pretrained pth +pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root1 = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +llava_data_path = data_root1 + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +llava_image_folder = data_root1 + 'llava_images' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/data/ALLaVA-4V/' +allava_vflan_data_path = data_root1 + 'allava_vflan/ALLaVA-Instruct-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/petrelfs/share_data/zhaoxiangyu/' + +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, 
+ use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +cache_root = '/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' +sharegpt4v_sft_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +cache_2k_root = data_root1 + 'phi3_mini_2k_offline/' +allava_vflan_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_2k_root + 'allava_vflan_dataset_sft', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, allava_vflan_dataset, sharegpt4v_sft_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.75), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test 
setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + 
tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # 
data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py new file mode 100644 index 000000000..0da0fd43a --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain.py @@ -0,0 +1,203 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
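[Editor's note] Several finetune configs above unfreeze the visual encoder and pair use_lldr=True with constructor='LearningRateDecayOptimWrapperConstructor' and a layer_decay_rate of 0.75 or 0.9, i.e. layer-wise learning-rate decay: the deepest ViT blocks keep roughly the base lr while earlier blocks take geometrically smaller steps. A minimal sketch of the standard LLRD scaling rule those settings imply; xtuner's constructor may group parameters differently, so treat this as an assumption about the rule rather than its implementation:

def layerwise_lr(base_lr, layer_id, num_layers, decay=0.75):
    # Standard LLRD rule: the i-th of N layers trains at base_lr * decay ** (N - i),
    # so the last block uses base_lr and earlier blocks get geometrically less.
    return base_lr * decay ** (num_layers - layer_id)

base_lr, num_layers = 2e-5, 24  # CLIP ViT-L/14 has 24 transformer blocks
for layer_id in (24, 12, 1):
    print(f'layer {layer_id}: lr = {layerwise_lr(base_lr, layer_id, num_layers):.2e}')
# With decay 0.75: layer 24 keeps 2e-5, layer 12 gets ~6.3e-7, layer 1 about 2.7e-8.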
+from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='./phi3_mini_llava_pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + 
pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py new file mode 100644 index 000000000..ff16eb0ca --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_finetune.py @@ -0,0 +1,456 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import AnyResLLaVADataset, AnyResLLaVAProxyEvalDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(4096 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], + [336, 1008]] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=-1, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + 
pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +pad_image_to_square = False + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder='./phi3_mini_llava_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size']) +) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. 
+ sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +proxy_eval_dataset = dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints) + +val_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + 
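+        # Editor's note (assumption based on the class name): proxy_eval_dataset wraps each
+        # benchmark sample with AnyResLLaVAProxyEvalDataset so eval images are tiled with the
+        # same image_grid_pinpoints as training, instead of being fed as a single crop.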
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # enabling this makes the perception score drop + pad_image_to_square=pad_image_to_square), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + # dict( + # type=VQAv2Dataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # 
tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + dict( + type=ChartQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + # Problematic: requires the image files + # dict( + # type=GeneralVQADataset, + # proxy_eval_dataset=proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size', 'img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size', 'img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_internvl_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_internvl_finetune.py new file mode 100644 index 000000000..6fa45b613 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_anyres_e1_gpu8_internvl_finetune.py @@ -0,0 +1,605 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
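+# Editor's note: relative to the anyres finetune config above, this file swaps the single
+# llava_v1_5_mix665k dataset for the InternVL-SFT mixture (ShareGPT4V caption + mix665k,
+# LLaVA-zh, DVQA, ChartQA, AI2D, DocVQA, GeoQA+, SynthDog-EN) combined via ConcatDataset.
+# A rough sketch of the AnyRes idea behind image_grid_pinpoints, assuming the usual
+# LLaVA-NeXT convention: each [h, w] entry is a candidate canvas, the image is matched to
+# the best-fitting one and cut into 336x336 tiles, e.g. [336, 1008] -> a 1x3 grid of tiles
+# (plus a global thumbnail view).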
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import AnyResLLaVADataset, AnyResLLaVAProxyEvalDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.model import AnyResLLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +sharegpt4v_caption_data_path = data_root + 'sharegpt4v_instruct_gpt4-vision_cap100k.jsonl' +sharegpt4v_caption_image_folder = data_root + 'data' + +llava_data_path = data_root + 'llava_instruct_150k_zh.jsonl' +llava_image_folder = data_root + 'data/coco' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +geoqa_data_path = data_root + 'geoqa+.jsonl' +geoqa_image_folder = data_root + 'data/geoqa+' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 // 14) ** 2) + +# Scheduler & Optimizer +batch_size = 8 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +image_grid_pinpoints = [[336, 672], [672, 336], 
[672, 672], [1008, 336], + [336, 1008]] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=AnyResLLaVAModel, + image_grid_pinpoints=image_grid_pinpoints, + token_merge_ratio=-1, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +pad_image_to_square = False + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_root='/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' + +sharegpt4v_caption_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'sharegpt4v_caption_dataset', + data_path=sharegpt4v_caption_data_path, + image_folder=sharegpt4v_caption_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +llava_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'llava_dataset', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +sharegpt4v_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + + +dvqa_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +chartqa_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'chartqa_dataset', + 
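+    # Editor's note (assumption): each offline_processed_text_folder under cache_root holds
+    # text annotations that were tokenized ahead of time, so dataset construction can skip
+    # re-tokenizing data_path; the cached folders must match the tokenizer and
+    # prompt_template configured here.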
data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +ai2d_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +docvqa_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +geoqa_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'geoqa_dataset', + data_path=geoqa_data_path, + image_folder=geoqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +synthdog_dataset = dict( + type=AnyResLLaVADataset, + image_grid_pinpoints=image_grid_pinpoints, + offline_processed_text_folder=cache_root+'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataset = dict( + type=ConcatDataset, + datasets=[sharegpt4v_caption_dataset, llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + geoqa_dataset, synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size'])) + + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + 
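+        # Editor's note: together with the LinearLR entry above this forms the usual
+        # warmup-then-cosine schedule: the lr ramps up over the first warmup_ratio *
+        # max_epochs (3% of training) and then decays from lr to eta_min=0.0;
+        # convert_to_iter_based=True maps the epoch-based begin/end to iterations.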
eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +proxy_eval_dataset = dict(type=AnyResLLaVAProxyEvalDataset, + image_grid_pinpoints=image_grid_pinpoints) + +val_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=pad_image_to_square), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + 
image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + # dict( + # type=VQAv2Dataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square), + dict( + type=ChartQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=pad_image_to_square + ), + # 有问题,需要图片 + # dict( + # type=GeneralVQADataset, + # proxy_eval_dataset=proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=pad_image_to_square + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size', 'img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['orig_size', 'img_id']) +) + 
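+# Editor's note: val_evaluator is an empty dict and test_evaluator just aliases it below,
+# so scoring is presumably handled by the benchmark dataset classes and the custom
+# ValLoop/TestLoop rather than an MMEngine evaluator (see the TODOs above); validation runs
+# every val_interval=save_steps iterations, and select_metric='first' appears to pick the
+# first dataset's metric when several benchmarks are concatenated (assumption from the name).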
+test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py new file mode 100644 index 000000000..ae713b691 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_lora_p14_336_e1_gpu8_finetune.py @@ -0,0 +1,424 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + 
type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='./phi3_mini_llava_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. 
+ checkpoint=dict( + type=CheckpointHook, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 
开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py new file mode 100644 index 000000000..1906935dc --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_instruct_full_clip_vit_large_p14_336_lldr_e1_gpu8_finetune.py @@ -0,0 +1,426 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = 
'/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14)**2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1500 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1500 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=True) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=True, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.75), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + 
start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + 
prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # 
data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + 'LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_more_data.py b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_more_data.py new file mode 100644 index 000000000..2a2eaca10 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/llava_phi3_mini_4k_more_data.py @@ -0,0 +1,518 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
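+# Overview: this config fine-tunes a LLaVA-style model pairing
+# Phi-3-mini-4k-instruct with a CLIP ViT-L/14-336 visual encoder on the
+# InternVL SFT mixture (LLaVA-1.5 mix665k, ShareGPT4V, DVQA, ChartQA, AI2D,
+# DocVQA, SynthDoG-EN), with layer-wise learning-rate decay (layer_decay_rate=0.9)
+# applied through `LearningRateDecayOptimWrapperConstructor`.
+#
+# Token-budget sketch: the text budget below is the 2048-token context minus
+# the visual tokens of a single 336x336 image split into 14x14 patches:
+#
+#     visual_tokens = (336 / 14) ** 2        # 24 * 24 = 576
+#     max_length    = 2048 - visual_tokens   # 1472 tokens left for text
+#
+# Example launch (assuming the standard XTuner CLI; adjust GPU count and paths
+# to your environment):
+#     NPROC_PER_NODE=8 xtuner train llava_phi3_mini_4k_more_data.py --deepspeed deepspeed_zero2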
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = '/mnt/petrelfs/huanghaian/code/xtuner/work_dirs/llava_phi3_mini_4k_instruct_clip_vit_large_p14_336_e1_gpu8_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +llava_data_path = data_root1 + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +llava_image_folder = data_root1 + 'llava_images' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 3000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 3000 +SYSTEM = '' +evaluation_images = 'view.jpg' +evaluation_inputs = ['Please describe this picture'] + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + 
type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=LLaVAModel, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_root = '/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' +pad_image_to_square = True +# sharegpt4v_caption_dataset = dict( +# type=LLaVADataset, +# offline_processed_text_folder=cache_root+'sharegpt4v_caption_dataset', +# data_path=sharegpt4v_caption_data_path, +# image_folder=sharegpt4v_caption_image_folder, +# tokenizer=tokenizer, +# image_processor=image_processor, +# dataset_map_fn=llava_map_fn, +# template_map_fn=dict( +# type=template_map_fn_factory, template=prompt_template), +# max_length=max_length, +# pad_image_to_square=pad_image_to_square) + +llava_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +sharegpt4v_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +dvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +chartqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +ai2d_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + 
type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +docvqa_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +synthdog_dataset = dict( + type=LLaVADataset, + offline_processed_text_folder=cache_root + 'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + image_processor=image_processor, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + evaluation_images=evaluation_images, + system=SYSTEM, + prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. 
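+    # Note: with `by_epoch=False`, `interval` below is counted in training
+    # iterations, and `save_optimizer=False` stores model weights only, which
+    # keeps checkpoints small but means optimizer state cannot be resumed exactly.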
+ checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +val_dataset = [ + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # 
for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using 
test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py new file mode 100644 index 000000000..db7ee4beb --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_finetune.py @@ -0,0 +1,456 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import InternVL_V1_5_LLaVADataset, InternVL_v1_5_LLaVAProxyEvalDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import InternVL_v1_5_LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/phi3_internvl_1-5_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 2000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 2000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +min_num = 1 +max_num = 6 +downsample_ratio = 0.5 + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + 
type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=InternVL_v1_5_LLaVAModel, + downsample_ratio=downsample_ratio, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=True, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path), + visual_encoder_lora=dict( + type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none') +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + image_size_json='/mnt/petrelfs/huanghaian/code/mm/xtuner/image_size.json', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # 
prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +eval_num = 6 + +proxy_eval_dataset = dict(type=InternVL_v1_5_LLaVAProxyEvalDataset, min_num=eval_num, max_num=eval_num) + +val_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + 
data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_CN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/CCBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_CN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=MultipleChoiceDataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_TEST_EN.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + # dict( + # type=VQAv2Dataset, + # 
proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test-dev2015.jsonl', + # test_file='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_llava_eval/llava_vqav2_mscoco_test2015.jsonl', + # image_folder='/mnt/petrelfs/share_data/zhaoxiangyu/vqav2_test2015', + # prompt_template=PROMPT_TEMPLATE.vicuna, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True), + dict( + type=ChartQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + # dict( + # type=GeneralVQADataset, + # proxy_eval_dataset = proxy_eval_dataset, + # data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/OCRVQA_TEST.tsv', + # prompt_template=prompt_template, + # tokenizer=tokenizer, + # image_processor=image_processor, + # pad_image_to_square=True + # ), +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data.py new file mode 100644 index 000000000..e0a4ec3cf --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data.py @@ -0,0 +1,532 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
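+# Overview: this config fine-tunes an InternVL-v1.5-style LLaVA model
+# (Phi-3-mini-4k-instruct + CLIP ViT-L/14-336) with dynamic-resolution tiling
+# on the InternVL SFT mixture, again with layer-wise learning-rate decay.
+#
+# Visual-token sketch (assuming the spatial downsampling implied by
+# `downsample_ratio`, as in InternVL v1.5's pixel shuffle): each 336x336 tile
+# yields (336 / 14) ** 2 = 576 patch tokens, reduced by downsample_ratio ** 2:
+#
+#     tokens_per_tile  = 576 * 0.5 ** 2       # 144 visual tokens per tile
+#     max_image_tokens = 6 * tokens_per_tile  # up to 864 with max_num = 6 tiles
+#
+# `min_num` / `max_num` bound how many tiles an image is split into during
+# training, while evaluation fixes the tile count via `eval_num = 6` below.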
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import InternVL_V1_5_LLaVADataset, InternVL_v1_5_LLaVAProxyEvalDataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook, EvaluateChatHook +from xtuner.model import InternVL_v1_5_LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from peft import LoraConfig +from xtuner.dataset.evaluation import MMEDataset, MultipleChoiceDataset, POPEDataset, \ + HallusionDataset, TextVQADataset, GQADataset, VQAv2Dataset, ChartQADataset, GeneralVQADataset +from xtuner.dataset import ConcatDataset +from xtuner.engine.runner import TrainLoop, ValLoop, TestLoop +from mmengine.dataset import DefaultSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' +# Specify the pretrained pth +pretrained_pth = './work_dirs/phi3_internvl_1-5_pretrain/iter_2181.pth' # noqa: E501 + +# Data +data_root = '/mnt/petrelfs/share_data/linzhihao/dataset/internvl_sft/' + +data_root1 = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +llava_data_path = data_root1 + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +llava_image_folder = data_root1 + 'llava_images' + +sharegpt4v_data_path = data_root + 'sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl' +sharegpt4v_image_folder = data_root + 'data' + +dvqa_data_path = data_root + 'dvqa_train_200k.jsonl' +dvqa_image_folder = data_root + 'data/dvqa' + +chartqa_data_path = data_root + 'chartqa_train_18k.jsonl' +chartqa_image_folder = data_root + 'data/chartqa' + +ai2d_data_path = data_root + 'ai2d_train_12k.jsonl' +ai2d_image_folder = data_root + 'data/ai2d' + +docvqa_data_path = data_root + 'docvqa_train_10k.jsonl' +docvqa_image_folder = data_root + 'data/docvqa' + +synthdog_data_path = data_root + 'synthdog_en.jsonl' +synthdog_image_folder = data_root + 'data/synthdog-en' + +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(2048 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 2e-5 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 3000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 3000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +min_num = 1 +max_num = 6 +downsample_ratio = 0.5 + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # 
+####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=InternVL_v1_5_LLaVAModel, + downsample_ratio=downsample_ratio, + use_lldr=True, # xxxxxxx + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=False, + freeze_visual_encoder=False, + pretrained_pth=pretrained_pth, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path) +) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_root = '/mnt/petrelfs/share_data/huanghaian/internvl_finetune_phi3/' +pad_image_to_square = False + +llava_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_finetune', + data_path=llava_data_path, + image_folder=llava_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +sharegpt4v_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +dvqa_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'dvqa_dataset', + data_path=dvqa_data_path, + image_folder=dvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +chartqa_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'chartqa_dataset', + data_path=chartqa_data_path, + image_folder=chartqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +ai2d_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'ai2d_dataset', + data_path=ai2d_data_path, + image_folder=ai2d_image_folder, + 
tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +docvqa_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'docvqa_dataset', + data_path=docvqa_data_path, + image_folder=docvqa_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +synthdog_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_root + 'synthdog_dataset', + data_path=synthdog_data_path, + image_folder=synthdog_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=pad_image_to_square) + +train_dataset = dict( + type=ConcatDataset, + datasets=[llava_dataset, sharegpt4v_dataset, + dvqa_dataset, chartqa_dataset, ai2d_dataset, docvqa_dataset, + synthdog_dataset]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + pin_memory=False, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs, val_interval=save_steps) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = 
dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) + +# ==================== val and test cfg ======================= +eval_num = 6 + +proxy_eval_dataset = dict(type=InternVL_v1_5_LLaVAProxyEvalDataset, min_num=eval_num, max_num=eval_num) + +val_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), +] + +test_dataset = [ + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMBench_DEV_EN.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/SEEDBench_IMG.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ScienceQA_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MMMU_DEV_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/AI2D_TEST.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=TextVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + 
data_file='/mnt/petrelfs/share_data/huanghaian/orig_llava_eval/textvqa/llava_textvqa_val_v051_ocr.jsonl', + ann_file='/mnt/petrelfs/share_data/huanghaian/text_vqa/TextVQA_0.5.1_val.json', + image_folder='/mnt/petrelfs/share_data/huanghaian/text_vqa/train_images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MMEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/MME.tsv', + image_folder='/mnt/petrelfs/share_data/duanhaodong/data/mme/MME_Benchmark_release', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + # for_llava_prompt=True, # 开了后,perception 会掉 + pad_image_to_square=True), + dict( + type=HallusionDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/HallusionBench.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=POPEDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=[ + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_adversarial.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_popular.json', + '/mnt/petrelfs/share_data/linzhihao/dataset/POPE/coco_pope_random.json' + ], + coco_val_path='/mnt/petrelfs/share_data/linzhihao/dataset/coco/val2014/', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=GQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/llava_gqa_testdev_balanced.jsonl', + ann_file='/mnt/petrelfs/share_data/zhaoxiangyu/gqa_llava_eval/testdev_balanced_questions.json', + image_folder='/mnt/petrelfs/share_data/basemodel/dataset/multimodality/gqa/images', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=MultipleChoiceDataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/share_data/zhaoxiangyu/datasets--Lin-Chen--MMStar/snapshots/mmstar/MMStar.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True), + dict( + type=ChartQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file=['/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_human.json', + '/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/test_augmented.json'], + image_folder='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/ChartQA/ChartQA Dataset/test/png', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/DocVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ), + dict( + type=GeneralVQADataset, + proxy_eval_dataset=proxy_eval_dataset, + data_file='/mnt/petrelfs/huanghaian/code/xtuner/LMUData/InfoVQA_VAL.tsv', + prompt_template=prompt_template, + tokenizer=tokenizer, + image_processor=image_processor, + pad_image_to_square=True + ) +] + +# TODO: We are not currently using val_evaluator +# Don't support num_workers > 0 +val_dataloader = dict( + batch_size=1, + num_workers=0, + 
drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=val_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id'])) +val_evaluator = dict() +val_cfg = dict(type=ValLoop) + +# TODO: We are not currently using test_evaluator +test_dataloader = dict( + batch_size=1, + num_workers=0, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict(type=ConcatDataset, datasets=test_dataset), + collate_fn=dict(type=mm_collate_fn, extra_collate_keys=['img_id']) +) + +test_evaluator = val_evaluator +test_cfg = dict(type=TestLoop, select_metric='first') diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py new file mode 100644 index 000000000..1d3e59916 --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_more_data_pretrain.py @@ -0,0 +1,722 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import InternVL_V1_5_LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_v1_5_LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE +from xtuner.dataset import ConcatDataset +from xtuner.dataset.utils import internvl_1_5_encode_fn +from xtuner.dataset.samplers import LengthGroupedSampler + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/hwfile/xtuner/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = '/mnt/hwfile/xtuner/linzhihao/model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +share_data_root = '/mnt/hwfile/xtuner/huanghaian/data/sharegpt4v/' +sharegpt4v_data_path = share_data_root + 'share-captioner_coco_lcs_sam_1246k_1107_llava.json' +sharegpt4v_image_folder = '/mnt/hwfile/xtuner/linzhihao/dataset/sharegpt4v/data' + +data_root = '/mnt/hwfile/xtuner/huanghaian/data/ALLaVA-4V/' +allava_laion_data_path = data_root + 'allava_laion/ALLaVA-Caption-LAION-4V_llava.json' +allava_laion_image_folder = '/mnt/hwfile/openmmlab/zhaoxiangyu/datasets--FreedomIntelligence--ALLaVA-4V/snapshots/624bd4c5fedc2209cf952eedf75712413d8d912c/' + +data_root = '/mnt/hwfile/xtuner/huanghaian/data/ALLaVA-4V/' +allava_vflan_data_path = data_root + 'allava_vflan/ALLaVA-Caption-VFLAN-4V_llava.json' +allava_vflan_image_folder = '/mnt/hwfile/openmmlab/zhaoxiangyu/' + +allava_text_data_path = data_root + 'allava_text/Evol-Instruct-GPT4-Turbo-143K_llava.json' + +laion_data_root = '/mnt/hwfile/xtuner/huanghaian/data/laion-coco/orig_merge_70m_data/' +laion_data_path0 = laion_data_root + 'filter_data_0_llava.json' +laion_data_path1 = laion_data_root + 'filter_data_1_llava.json' +laion_data_path2 = laion_data_root + 'filter_data_2_llava.json' 
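+# NOTE: the eight LAION-COCO shard annotations defined here follow a single
+# naming pattern, so they could equally be collected in a loop, e.g. (a sketch,
+# not used by this config):
+#   laion_data_paths = [laion_data_root + f'filter_data_{i}_llava.json' for i in range(8)]
+# The explicit per-shard variables are kept because each shard is later paired
+# with its own `offline_processed_text_folder` cache directory.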
+laion_data_path3 = laion_data_root + 'filter_data_3_llava.json' +laion_data_path4 = laion_data_root + 'filter_data_4_llava.json' +laion_data_path5 = laion_data_root + 'filter_data_5_llava.json' +laion_data_path6 = laion_data_root + 'filter_data_6_llava.json' +laion_data_path7 = laion_data_root + 'filter_data_7_llava.json' +laion_image_folder = 'public:s3://public-dataset/laion-coco/images/' + +# laion-coco-ocr +laion_ocr_data_root = '/mnt/hwfile/xtuner/huanghaian/data/laion-coco/orig_merge_17m_ocr_data/' +laion_ocr_data_path0 = laion_ocr_data_root + 'filter_data_0_llava.json' +laion_ocr_data_path1 = laion_ocr_data_root + 'filter_data_1_llava.json' + +# coyo_data_root = '/mnt/hwfile/xtuner/huanghaian/data/COYO-700M/' +# coyo_data_path1 = coyo_data_root + 'filter_rand_20m_llava_1.json' +# coyo_data_path2 = coyo_data_root + 'filter_rand_20m_llava_2.json' +# coyo_data_path3 = coyo_data_root + 'filter_rand_20m_llava_3.json' +# coyo_image_folder = 'public:s3://public-dataset/COYO-700M/' + +coco_caption_data_root = '/mnt/hwfile/xtuner/huanghaian/data/coco_caption/' +coco_caption_data_path = coco_caption_data_root + 'coco_karpathy_train_val_llava.json' +coco_caption_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/coco_caption/' + +max_length = 4096 + +prompt_template = PROMPT_TEMPLATE.phi3_chat + +# Scheduler & Optimizer +batch_size = 16 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 10000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 10000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + +min_num = 1 +max_num = 6 +downsample_ratio = 0.5 + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=InternVL_v1_5_LLaVAModel, + use_lldr=True, # xxxxxxx + downsample_ratio=downsample_ratio, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=False, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +cache_4k_root = laion_data_root + 'phi3_mini_4k_offline/' +laion_coco_dataset0 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_0', + data_path=laion_data_path0, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + 
dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset1 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_1', + data_path=laion_data_path1, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset2 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_2', + data_path=laion_data_path2, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset3 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_3', + data_path=laion_data_path3, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset4 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_4', + data_path=laion_data_path4, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset5 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_5', + data_path=laion_data_path5, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset6 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_6', + data_path=laion_data_path6, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + 
template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_dataset7 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_dataset_10m_7', + data_path=laion_data_path7, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +###############################################################################333 +cache_4k_root = laion_ocr_data_root + 'phi3_mini_4k_offline/' +laion_coco_ocr_dataset0 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_ocr_dataset_10m_0', + data_path=laion_ocr_data_path0, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) +laion_coco_ocr_dataset1 = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_coco_ocr_dataset_10m_1', + data_path=laion_ocr_data_path1, + image_folder=laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +# cache_2k_root = coyo_data_root + 'phi3_mini_2k_offline/' +# coyo_dataset1 = dict( +# type=InternVL_V1_5_LLaVADataset, +# use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 +# min_num=min_num, +# max_num=max_num, +# downsample_ratio=downsample_ratio, +# offline_processed_text_folder=cache_2k_root + 'coyo_dataset_20m_1', +# data_path=coyo_data_path1, +# image_folder=coyo_image_folder, +# tokenizer=tokenizer, +# image_processor=image_processor, +# dataset_map_fn=llava_map_fn, +# encode_map_fn=dict( +# type=internvl_1_5_encode_fn, +# min_num=min_num, +# max_num=max_num, +# use_patch=False), # 核心参数 +# template_map_fn=dict( +# type=template_map_fn_factory, template=prompt_template), +# max_length=2048) +# +# coyo_dataset2 = dict( +# type=InternVL_V1_5_LLaVADataset, +# use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 +# min_num=min_num, +# max_num=max_num, +# downsample_ratio=downsample_ratio, +# offline_processed_text_folder=cache_2k_root + 'coyo_dataset_20m_2', +# data_path=coyo_data_path2, +# image_folder=coyo_image_folder, +# tokenizer=tokenizer, +# image_processor=image_processor, +# dataset_map_fn=llava_map_fn, +# encode_map_fn=dict( +# type=internvl_1_5_encode_fn, +# min_num=min_num, +# max_num=max_num, +# use_patch=False), # 核心参数 +# template_map_fn=dict( +# type=template_map_fn_factory, template=prompt_template), +# max_length=2048) +# +# coyo_dataset3 = dict( +# type=InternVL_V1_5_LLaVADataset, +# use_patch=False, # 由于 image token 很少,所以可能也不需要 4k 上下文 +# min_num=min_num, +# max_num=max_num, +# downsample_ratio=downsample_ratio, +# 
offline_processed_text_folder=cache_2k_root + 'coyo_dataset_20m_3', +# data_path=coyo_data_path3, +# image_folder=coyo_image_folder, +# tokenizer=tokenizer, +# image_processor=image_processor, +# dataset_map_fn=llava_map_fn, +# encode_map_fn=dict( +# type=internvl_1_5_encode_fn, +# min_num=min_num, +# max_num=max_num, +# use_patch=False), # 核心参数 +# template_map_fn=dict( +# type=template_map_fn_factory, template=prompt_template), +# max_length=2048) + +cache_4k_root = share_data_root + 'phi3_mini_4k_offline/' +sharegpt4v_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'sharegpt4v_dataset', + data_path=sharegpt4v_data_path, + image_folder=sharegpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +cache_4k_root = data_root + 'phi3_mini_4k_offline/' +allava_laion_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'allava_laion_dataset', + data_path=allava_laion_data_path, + image_folder=allava_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +cache_4k_root = data_root + 'phi3_mini_4k_offline/' +allava_vflan_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'allava_vflan_dataset', + data_path=allava_vflan_data_path, + image_folder=allava_vflan_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +allava_text_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'allava_text_dataset', + data_path=allava_text_data_path, + tokenizer=tokenizer, + image_processor=image_processor, + image_folder=None, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +cache_4k_root = coco_caption_data_root + 'phi3_mini_4k_offline/' +coco_caption_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'coco_karpathy_train_val_llava', + data_path=coco_caption_data_path, + image_folder=coco_caption_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + 
max_length=max_length) + +laion_gpt4v_root = '/mnt/hwfile/xtuner/huanghaian/data/laion_gpt4v/' +laion_gpt4v_data_path = laion_gpt4v_root + 'laion_gpt4v_llava.json' +laion_gpt4v_image_folder = laion_gpt4v_root + 'images/' + +cache_4k_root = laion_gpt4v_root + 'phi3_mini_4k_offline/' +laion_gpt4v_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'laion_gpt4v_llava', + data_path=laion_gpt4v_data_path, + image_folder=laion_gpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +coco_text_root = '/mnt/hwfile/xtuner/huanghaian/data/coco_text/' +coco_text_data_path = coco_text_root + 'cocotext_v2_llava.json' +coco_text_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/coco_text/' + +cache_4k_root = coco_text_root + 'phi3_mini_4k_offline/' +coco_text_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'coco_text_dataset', + data_path=coco_text_data_path, + image_folder=coco_text_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +textcap_root = '/mnt/hwfile/xtuner/huanghaian/data/TextCaps/' +textcap_data_path = textcap_root + 'TextCaps_0.1_train_val_llava.json' +textcap_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/TextCaps/' +cache_4k_root = textcap_root + 'phi3_mini_4k_offline/' + +text_cap_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'text_cap_dataset', + data_path=textcap_data_path, + image_folder=textcap_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +textocr_gpt4v_root = '/mnt/hwfile/xtuner/huanghaian/data/TextOCR-GPT4V/' +textocr_gpt4v_data_path = textocr_gpt4v_root + 'train_llava.json' +textocr_gpt4v_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/TextOCR-GPT4V/' +cache_4k_root = textocr_gpt4v_root + 'phi3_mini_4k_offline/' + +textocr_gpt4v_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'textocr_gpt4v_dataset', + data_path=textocr_gpt4v_data_path, + image_folder=textocr_gpt4v_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +bunny_laion_root = '/mnt/hwfile/xtuner/huanghaian/data/Bunny-v1_0-data/pretrain/' +bunny_laion_data_path = bunny_laion_root + 'bunny_pretrain_laion_2m_llava.json' 
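+# NOTE: every dataset block in this config (including `bunny_laion_dataset`
+# below) repeats the same InternVL_V1_5_LLaVADataset fields and only varies the
+# data path, image folder and offline cache folder. A helper such as the
+# commented sketch below could build them; it is kept disabled so the config is
+# unchanged, and the name `_internvl_dataset` is purely illustrative.
+# def _internvl_dataset(data_path, image_folder, cache_folder):
+#     return dict(
+#         type=InternVL_V1_5_LLaVADataset,
+#         min_num=min_num,
+#         max_num=max_num,
+#         downsample_ratio=downsample_ratio,
+#         offline_processed_text_folder=cache_folder,
+#         data_path=data_path,
+#         image_folder=image_folder,
+#         tokenizer=tokenizer,
+#         image_processor=image_processor,
+#         dataset_map_fn=llava_map_fn,
+#         encode_map_fn=dict(
+#             type=internvl_1_5_encode_fn, min_num=min_num, max_num=max_num),
+#         template_map_fn=dict(
+#             type=template_map_fn_factory, template=prompt_template),
+#         max_length=max_length)
+#
+# Example usage (equivalent to the `sharegpt4v_dataset` block defined above):
+#   sharegpt4v_dataset = _internvl_dataset(
+#       sharegpt4v_data_path, sharegpt4v_image_folder,
+#       share_data_root + 'phi3_mini_4k_offline/' + 'sharegpt4v_dataset')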
+bunny_laion_image_folder = '/mnt/hwfile/xtuner/huanghaian/data/Bunny-v1_0-data/pretrain/images' +cache_4k_root = bunny_laion_root + 'phi3_mini_4k_offline/' + +bunny_laion_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder=cache_4k_root + 'bunny_laion_dataset', + data_path=bunny_laion_data_path, + image_folder=bunny_laion_image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + encode_map_fn=dict( + type=internvl_1_5_encode_fn, + min_num=min_num, + max_num=max_num), + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length) + +# 42m +train_dataset = dict( + type=ConcatDataset, + datasets=[ + # laion_coco_dataset0, laion_coco_dataset1, laion_coco_dataset2, laion_coco_dataset3, + # laion_coco_dataset4, laion_coco_dataset5, laion_coco_dataset6, laion_coco_dataset7, + laion_coco_dataset0, laion_coco_dataset5, + laion_coco_ocr_dataset0, laion_coco_ocr_dataset1, coco_caption_dataset, + sharegpt4v_dataset, allava_laion_dataset, allava_vflan_dataset, laion_gpt4v_dataset, + allava_text_dataset, coco_text_dataset, text_cap_dataset, textocr_gpt4v_dataset, + bunny_laion_dataset + ]) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + constructor='LearningRateDecayOptimWrapperConstructor', # ==================== + paramwise_cfg=dict(layer_decay_rate=0.9), # vit-l + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. 
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=50), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=True, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py new file mode 100644 index 000000000..c92e3cecf --- /dev/null +++ b/xtuner/configs/llava/phi3_mini_chat/phi3_internvl_1-5_pretrain.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from torch.optim import AdamW +from transformers import (AutoModelForCausalLM, AutoTokenizer, + CLIPImageProcessor, CLIPVisionModel) + +from xtuner.dataset import InternVL_V1_5_LLaVADataset +from xtuner.dataset.collate_fns import mm_collate_fn +from xtuner.dataset.map_fns import llava_map_fn, template_map_fn_factory +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_v1_5_LLaVAModel +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +llm_name_or_path = '/mnt/petrelfs/share_data/gaojianfei/Phi-3-mini-4k-instruct/models--microsoft--Phi-3-mini-4k-instruct/snapshots/3a811845d89f3c1b3f41b341d0f9f05104769f35' +visual_encoder_name_or_path = 'model/models--openai--clip-vit-large-patch14-336/snapshots/ce19dc912ca5cd21c8a653c79e251e808ccabcd1' + +# Data +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Pretrain/blip_laion_cc_sbu_558k.json' +image_folder = data_root + 'LLaVA-Pretrain/images' +prompt_template = PROMPT_TEMPLATE.phi3_chat +max_length = int(4094 - (336 / 14) ** 2) + +# Scheduler & Optimizer +batch_size = 32 # per_device +accumulative_counts = 1 +dataloader_num_workers = 4 +max_epochs = 1 +optim_type = AdamW +lr = 1e-3 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 1000 +save_total_limit = 1 # Maximum checkpoints to keep (-1 means unlimited) + +# Evaluate the generation performance during the training +evaluation_freq = 1000 +SYSTEM = '' +evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg' +evaluation_inputs = ['Please describe this picture'] + + +min_num = 1 +max_num = 6 +downsample_ratio = 
0.5 + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True, + padding_side='right') + +image_processor = dict( + type=CLIPImageProcessor.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path, + trust_remote_code=True) + +model = dict( + type=InternVL_v1_5_LLaVAModel, + downsample_ratio=downsample_ratio, + tokenizer=tokenizer, + template=prompt_template, + image_processor=image_processor, + freeze_llm=True, + freeze_visual_encoder=True, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=llm_name_or_path, + trust_remote_code=True), + visual_encoder=dict( + type=CLIPVisionModel.from_pretrained, + pretrained_model_name_or_path=visual_encoder_name_or_path)) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_LLaVADataset, + min_num=min_num, + max_num=max_num, + downsample_ratio=downsample_ratio, + offline_processed_text_folder='/mnt/petrelfs/huanghaian/code/xtuner/phi3_mini_llava_pretrain', + data_path=data_path, + image_folder=image_folder, + tokenizer=tokenizer, + image_processor=image_processor, + dataset_map_fn=llava_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + max_length=max_length, + pad_image_to_square=False) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=mm_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + # dict( + # type=EvaluateChatHook, + # tokenizer=tokenizer, + # image_processor=image_processor, + # every_n_iters=evaluation_freq, + # evaluation_inputs=evaluation_inputs, + # evaluation_images=evaluation_images, + # system=SYSTEM, + # prompt_template=prompt_template) +] + +# configure default hooks +default_hooks = dict( + # record the 
time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False) diff --git a/xtuner/dataset/__init__.py b/xtuner/dataset/__init__.py index bcfe0dcc3..14fd06d81 100644 --- a/xtuner/dataset/__init__.py +++ b/xtuner/dataset/__init__.py @@ -6,14 +6,20 @@ from .intern_repo import (build_packed_dataset, load_intern_repo_tokenized_dataset, load_intern_repo_untokenized_dataset) + +from .llava import LLaVADataset, AnyResLLaVADataset, InternVL_V1_5_LLaVADataset from .json_dataset import load_json_file -from .llava import LLaVADataset from .modelscope import process_ms_dataset from .moss_sft import MOSSSFTDataset from .refcoco_json import (InvRefCOCOJsonDataset, RefCOCOJsonDataset, RefCOCOJsonEvalDataset) -from .utils import decode_base64_to_image, expand2square, load_image - +from .utils import decode_base64_to_image, expand2square, load_image, internvl_1_5_encode_fn +from .llava_proxy_eval_dataset import LLaVAProxyEvalDataset +from .anyres_llava_proxy_eval_dataset import AnyResLLaVAProxyEvalDataset +from .mini_gemini_dataset import MiniGeminiDataset +from .mini_gemini_proxy_eval_dataset import MiniGeminiProxyEvalDataset +from .internvl_v1_5_llava_proxy_eval_dataset import InternVL_v1_5_LLaVAProxyEvalDataset +from .llava_proxy_eval_dataset1 import LLaVAProxyEvalDataset1 # ignore FutureWarning in hf datasets warnings.simplefilter(action='ignore', category=FutureWarning) @@ -22,7 +28,18 @@ 'process_ms_dataset', 'LLaVADataset', 'expand2square', 'decode_base64_to_image', 'load_image', 'process_ms_dataset', 'load_intern_repo_tokenized_dataset', - 'load_intern_repo_untokenized_dataset', 'build_packed_dataset', - 'RefCOCOJsonDataset', 'RefCOCOJsonEvalDataset', 'InvRefCOCOJsonDataset', - 'load_json_file' + 'load_intern_repo_untokenized_dataset', + 'build_packed_dataset', + 'RefCOCOJsonDataset', + 'RefCOCOJsonEvalDataset', + 'InvRefCOCOJsonDataset', + 'AnyResLLaVADataset', + 'load_json_file', + 'LLaVAProxyEvalDataset', + 'AnyResLLaVAProxyEvalDataset', + 'MiniGeminiDataset', + 'MiniGeminiProxyEvalDataset', + 'InternVL_V1_5_LLaVADataset', + 'InternVL_v1_5_LLaVAProxyEvalDataset', + 'internvl_1_5_encode_fn' ] diff --git a/xtuner/dataset/anyres_llava_proxy_eval_dataset.py b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py new file mode 100644 index 000000000..9c3f9351c --- /dev/null +++ b/xtuner/dataset/anyres_llava_proxy_eval_dataset.py @@ -0,0 +1,100 @@ +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) 
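+# AnyResLLaVAProxyEvalDataset (defined below) prepares evaluation samples on
+# behalf of the benchmark datasets in xtuner/dataset/evaluation: it formats the
+# prompt expected by the owning eval dataset, tokenizes it around the image
+# token placeholder, and converts the image into any-resolution patches via
+# `process_anyres_image`.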
+import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string +from .utils import process_anyres_image + + +class AnyResLLaVAProxyEvalDataset: + def __init__(self, eval_dataset, image_grid_pinpoints): + self.eval_ds = eval_dataset + self.image_grid_pinpoints = image_grid_pinpoints + + # TODO: Assuming they are all squares. + if hasattr(eval_dataset.image_processor, 'crop_size'): + self._crop_size = eval_dataset.image_processor.crop_size + else: + self._crop_size = eval_dataset.image_processor.size + self._patch_size = self._crop_size['height'] + self._shortest_edge = self._crop_size['height'] + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + # TODO prompt are different of vlmevalkit + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + elif self.eval_ds.metainfo['name'] in ['chartqa', 'gvqa']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nAnswer the question using a single word or phrase.' + text = DEFAULT_IMAGE_TOKEN + '\n' + text + elif self.eval_ds.metainfo['name'] in ['hullusion', 'pope']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nPlease answer yes or no.' + text = DEFAULT_IMAGE_TOKEN + '\n' + text + else: + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.eval_ds.tokenizer.encode(chunk) + else: + cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2', 'chartqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])).convert('RGB') + else: + image = self.eval_ds.get_image(data['img']).convert('RGB') + + orig_size = image.size + # use to remove padding + data_dict['orig_size'] = orig_size + image = process_anyres_image(image, self.eval_ds.image_processor, + self.image_grid_pinpoints, + self._patch_size, self._shortest_edge, + pad_mean=tuple(int(x * 255) for x in self.eval_ds.image_processor.image_mean), + orig_img_pad_to_square=self.eval_ds.pad_image_to_square) + data_dict['pixel_values'] = image + + return data_dict diff --git a/xtuner/dataset/collate_fns/__init__.py b/xtuner/dataset/collate_fns/__init__.py index 
96652b259..9ddfd5fc1 100644 --- a/xtuner/dataset/collate_fns/__init__.py +++ b/xtuner/dataset/collate_fns/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .default_collate_fn import default_collate_fn from .mmlu_collate_fn import mmlu_collate_fn +from .mm_collate_fn import mm_collate_fn,mm_collate_fn1 -__all__ = ['default_collate_fn', 'mmlu_collate_fn'] +__all__ = ['default_collate_fn', 'mmlu_collate_fn', 'mm_collate_fn'] diff --git a/xtuner/dataset/collate_fns/mm_collate_fn.py b/xtuner/dataset/collate_fns/mm_collate_fn.py new file mode 100644 index 000000000..1e4c1704d --- /dev/null +++ b/xtuner/dataset/collate_fns/mm_collate_fn.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Sequence + +import torch +from torch.nn.utils.rnn import pad_sequence +from xtuner.utils import DEFAULT_PAD_TOKEN_INDEX, IGNORE_INDEX + + +def mm_collate_fn(instances: Sequence[Dict], + pad_index: int = DEFAULT_PAD_TOKEN_INDEX, + return_hf_format: bool = False, + extra_collate_keys=None): + input_ids = [] + labels = [] + cumulative_len = [] + position_ids = [] + + has_image = any(inst.get('pixel_values') is not None for inst in instances) + has_labels = any(inst.get('labels') is not None for inst in instances) + mode = 'train' if has_labels else 'eval' + + if has_image: + pixel_values = [] + + for i, data in enumerate(instances): + input_ids.append(torch.LongTensor(data['input_ids'])) + if mode == 'train': + labels.append(torch.LongTensor(data['labels'])) + + if 'cumulative_len' in data: + cumulative_len.append(torch.IntTensor(data['cumulative_len'])) + + if has_image: + pixel_values.append(data['pixel_values']) + + ori_length = [len(ids) for ids in input_ids] + if len(instances) > 1: + input_ids = pad_sequence( + input_ids, batch_first=True, padding_value=pad_index) + labels = pad_sequence( + labels, batch_first=True, padding_value=IGNORE_INDEX) + else: + input_ids = torch.stack(input_ids) + if mode == 'train': + labels = torch.stack(labels) + + # Some tokenizers have the same eos token and pad token, so input_ids + # cannot be masked directly based on the pad token id. 
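+    # `ori_length` keeps each sample's unpadded length for this purpose. Note
+    # that `attention_mask[:i] = True` below indexes whole rows of the
+    # (batch, seq_len) mask; if the intent is to unmask only each sample's valid
+    # prefix, the per-sample form would be:
+    #     for idx, length in enumerate(ori_length):
+    #         attention_mask[idx, :length] = True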
+ attention_mask = torch.zeros_like(input_ids).bool() + for i in ori_length: + attention_mask[:i] = True + + if mode == 'train': + bs, seq_len = input_ids.shape + position_ids = torch.arange(seq_len).unsqueeze(0).long().repeat(bs, 1) + + if len(cumulative_len) == 0: + cumulative_len = None + + if mode == 'train': + data_dict = { + 'input_ids': input_ids, + 'position_ids': position_ids, + 'attention_mask': attention_mask, + 'labels': labels, + 'cumulative_len': cumulative_len, + } + else: + data_dict = { + 'input_ids': input_ids, + } + + if has_image: + # if all images have the same size, stack them into a single tensor + # else, keep them as a list of tensors + if all(x.shape == pixel_values[0].shape for x in pixel_values): + pixel_values = torch.stack(pixel_values, dim=0) + data_dict['pixel_values'] = pixel_values + + if extra_collate_keys is not None: + for key in extra_collate_keys: + data_dict[key] = [inst[key] for inst in instances] + + if return_hf_format: + return data_dict + else: + return {'data': data_dict, 'data_samples': None} + + +def mm_collate_fn1(instances: Sequence[Dict], + pad_index: int = DEFAULT_PAD_TOKEN_INDEX, + return_hf_format: bool = False, + extra_collate_keys=None): + data_dict = {'pixel_values': [inst['pixel_values'] for inst in instances], + 'text': [inst['text'] for inst in instances], + 'img_id': [inst['img_id'] for inst in instances]} + return {'data': data_dict, 'data_samples': None} diff --git a/xtuner/dataset/evaluation/__init__.py b/xtuner/dataset/evaluation/__init__.py new file mode 100644 index 000000000..652ae88e4 --- /dev/null +++ b/xtuner/dataset/evaluation/__init__.py @@ -0,0 +1,12 @@ +from .mme_dataset import MMEDataset +from .multiple_choice_dataset import MultipleChoiceDataset +from .pope_dataset import POPEDataset +from .hallusion_dataset import HallusionDataset +from .textvqa_dataset import TextVQADataset +from .gqa_dataset import GQADataset +from .vqav2_dataset import VQAv2Dataset +from .chartqa_dataset import ChartQADataset +from .general_vqa_dataset import GeneralVQADataset + +__all__ = ['MMEDataset', 'MultipleChoiceDataset', 'POPEDataset', 'HallusionDataset', 'TextVQADataset', 'GQADataset', + 'VQAv2Dataset', 'ChartQADataset', 'GeneralVQADataset'] diff --git a/xtuner/dataset/evaluation/base_eval_dataset.py b/xtuner/dataset/evaluation/base_eval_dataset.py new file mode 100644 index 000000000..99c778245 --- /dev/null +++ b/xtuner/dataset/evaluation/base_eval_dataset.py @@ -0,0 +1,69 @@ +from torch.utils.data import Dataset +import copy +from collections.abc import Mapping +from typing import Union +from mmengine.config import Config +import logging +from mmengine.fileio import list_from_file +from mmengine.logging import print_log +from abc import abstractmethod + + +class BaseEvalDataset(Dataset): + + METAINFO: dict = dict(name='default') + + def __init__(self, metainfo: Union[Mapping, Config, None] = None): + self._metainfo = self._load_metainfo(copy.deepcopy(metainfo)) + + @classmethod + def _load_metainfo(cls, + metainfo: Union[Mapping, Config, None] = None) -> dict: + """Collect meta information from the dictionary of meta. + + Args: + metainfo (Mapping or Config, optional): Meta information dict. + If ``metainfo`` contains existed filename, it will be + parsed by ``list_from_file``. + + Returns: + dict: Parsed meta information. 
+ """ + # avoid `cls.METAINFO` being overwritten by `metainfo` + cls_metainfo = copy.deepcopy(cls.METAINFO) + if metainfo is None: + return cls_metainfo + if not isinstance(metainfo, (Mapping, Config)): + raise TypeError('metainfo should be a Mapping or Config, ' + f'but got {type(metainfo)}') + + for k, v in metainfo.items(): + if isinstance(v, str): + # If type of value is string, and can be loaded from + # corresponding backend. it means the file name of meta file. + try: + cls_metainfo[k] = list_from_file(v) + except (TypeError, FileNotFoundError): + print_log( + f'{v} is not a meta file, simply parsed as meta ' + 'information', + logger='current', + level=logging.WARNING) + cls_metainfo[k] = v + else: + cls_metainfo[k] = v + return cls_metainfo + + @property + def metainfo(self) -> dict: + """Get meta information of dataset. + + Returns: + dict: meta information collected from ``BaseDataset.METAINFO``, + annotation file and metainfo argument during instantiation. + """ + return copy.deepcopy(self._metainfo) + + @abstractmethod + def evaluate(self, results, work_dir): + pass diff --git a/xtuner/dataset/evaluation/chartqa_dataset.py b/xtuner/dataset/evaluation/chartqa_dataset.py new file mode 100644 index 000000000..c8b47dc7f --- /dev/null +++ b/xtuner/dataset/evaluation/chartqa_dataset.py @@ -0,0 +1,177 @@ +import os +import os.path as osp +from typing import Optional +import json +from mmengine.dist import master_only +from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset + + +def relaxed_correctness(prediction: str, + target: str, + max_relative_change: float = 0.05) -> bool: + """Calculates relaxed correctness. + + The correctness tolerates certain error ratio defined by max_relative_change. + See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1: + “Following Methani et al. (2020), we use a relaxed accuracy measure for the + numeric answers to allow a minor inaccuracy that may result from the automatic + data extraction process. We consider an answer to be correct if it is within + 5% of the gold answer. For non-numeric answers, we still need an exact match + to consider an answer to be correct.” + + Args: + prediction: Predicted string. + target: Target string. + max_relative_change: Maximum relative change. + + Returns: + Whether the prediction was correct given the specified tolerance. + """ + + def _to_float(text: str) -> Optional[float]: + try: + if text.endswith('%'): + # Convert percentages to floats. 
+ return float(text.rstrip('%')) / 100.0 + else: + return float(text) + except ValueError: + return None + + prediction_float = _to_float(prediction) + target_float = _to_float(target) + if prediction_float is not None and target_float: + relative_change = abs(prediction_float - + target_float) / abs(target_float) + return relative_change <= max_relative_change + else: + return prediction.lower() == target.lower() + + +def evaluate_relaxed_accuracy(entries): + scores = [] + for elem in entries: + if isinstance(elem['label'], str): + elem['label'] = [elem['label']] + score = max([ + relaxed_correctness(elem['prediction'].strip(), ann) + for ann in elem['label'] + ]) + scores.append(score) + return scores, sum(scores) / len(scores) + + +class ChartQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='chartqa') + + def __init__( + self, + data_file, + image_folder, + prompt_template, + image_processor, + tokenizer, + pad_image_to_square=True, + use_system=False, + for_llava_prompt=False, + metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset), + ): + super().__init__(metainfo) + self.use_system=use_system + self.for_llava_prompt = for_llava_prompt + + if isinstance(data_file, str): + data_file = [data_file] + self.raw_data = [json.load(open(f)) for f in data_file] + # test_human, test_augmented + self.name = [ + os.path.splitext(os.path.basename(f))[0] for f in data_file + ] + self.name_map = {name: i for i, name in enumerate(self.name)} + self.revert_name_map = {i: name for i, name in enumerate(self.name)} + + template = prompt_template + self.template = template + + self.image_folder = image_folder + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def load_data_list(self): + data_list = [] + idx = 0 + + for data_idx in range(len(self.raw_data)): + for sample_idx in range(len(self.raw_data[data_idx])): + sample = self.raw_data[data_idx][sample_idx] + image_path = sample['imgname'] + question = sample['query'] + answer = sample['label'] + category = self.name[data_idx] + data = { + 'img_id': idx, + 'image_path': image_path, + 'question': question, + 'answer': answer, + 'category': category + } + data_list.append(data) + idx += 1 + return data_list + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, result, work_dir): + orig_index = [x['img_id'] for x in self.data] + results = [[] for _ in range(len(self.name))] + for pred_dict in result: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + cur_result = {} + cur_result['query'] = filtered_rows.get('question') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['label'] = filtered_rows.get('answer') + + index = self.name_map[filtered_rows['category']] + results[index].append(cur_result) + + print_log('============================================', 'current') + acc_list = [] + for i, result in enumerate(results): + scores, _accuracy = evaluate_relaxed_accuracy(result) + + for res, score in zip(result, scores): + res['score'] = score + prediction_file = osp.join(work_dir, self.revert_name_map[i] + '.json') + with open(prediction_file, 'w') as f: + 
json.dump(result, f) + + print_log('Acc: {}, Category: {}, # samples: {}'.format(_accuracy, self.revert_name_map[i], + len(result)), 'current') + acc_list.append(_accuracy) + + print_log('============================================', 'current') + acc = sum(acc_list) / len(acc_list) + print_log('Overall Acc: {}'.format(acc), 'current') + print_log('============================================', 'current') + print_log('ChartQA successfully finished evaluating', 'current') + + return {'Acc': acc} diff --git a/xtuner/dataset/evaluation/general_vqa_dataset.py b/xtuner/dataset/evaluation/general_vqa_dataset.py new file mode 100644 index 000000000..c87a4d7ac --- /dev/null +++ b/xtuner/dataset/evaluation/general_vqa_dataset.py @@ -0,0 +1,175 @@ +import os +import os.path as osp +from mmengine.dist import master_only +from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset +import pandas as pd +from xtuner.dataset.utils import decode_base64_to_image +import numpy as np + + +def levenshtein_distance(s1, s2): + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2 + 1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) + distances = distances_ + return distances[-1] + + +def anls_compute(groundtruth, prediction): + gt_answer = ' '.join(groundtruth.strip().lower().split()) + det_answer = ' '.join(prediction.strip().lower().split()) + dist = levenshtein_distance(gt_answer, det_answer) + length = max(len(groundtruth.upper()), len(prediction.upper())) + values = 0.0 if length == 0 else float(dist) / float(length) + return values + + +def hit_calculate(result, dataset_name, anls_threshold=0.5): + if 'DocVQA' in dataset_name or 'InfoVQA' in dataset_name: + # return [1 - np.min(x['match']) >= anls_threshold for x in result] + return [0.0 if 1 - np.min(x['match']) < anls_threshold else 1 - np.min(x['match']) for x in result] + elif 'OCRVQA' in dataset_name: + return [np.max(x['match']) for x in result] + else: + raise NotImplementedError(f"Dataset {dataset_name} not supported for hit calculation") + + +def istype(s, type): + if isinstance(s, type): + return True + try: + return isinstance(eval(s), type) + except Exception as _: + return False + + +class GeneralVQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='gvqa') + + def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + anls_threshold=0.5, use_system=False, metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): + super().__init__(metainfo) + self.anls_threshold = anls_threshold + self.use_system = use_system + self.data_file = data_file + self.df = pd.read_csv(data_file, sep='\t') + + skip_noimg = True + if skip_noimg: + self.df = self.df[~pd.isna(self.df['image'])] + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + 
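For clarity, a small sketch of the ANLS scoring path used by this dataset, assuming the helpers above are in scope; the strings and dataset names are illustrative.

# anls_compute returns a normalised edit distance (0.0 = exact match);
# hit_calculate turns the per-answer distances into a per-sample score:
#   DocVQA / InfoVQA: 1 - min(distance) when it clears the 0.5 ANLS threshold, else 0
#   OCRVQA:           max of the exact-match flags
dist = anls_compute('hello world', 'helo world')            # 1 edit / 11 chars ≈ 0.09
print(hit_calculate([{'match': [dist]}], 'DocVQA_VAL'))     # [≈0.91]
print(hit_calculate([{'match': [1.0]}], 'OCRVQA_TEST'))     # [1.0]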
self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def get_image(self, image): + while len(image) < 16: + image = self.df[self.df['index'] == int(image)]['image'].values + assert len(image) == 1 + image = image[0] + image = decode_base64_to_image(image) + return image + + def __len__(self): + return len(self.df) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + def load_data_list(self): + data_list = [] + for idx in range(len(self.df)): + index = self.df.iloc[idx]['index'] + image = self.df.iloc[idx]['image'] + question = self.df.iloc[idx]['question'] + split = self.df.iloc[idx]['split'] if 'split' in self.df.iloc[ + 0].keys() else None + answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ + 0].keys() else None + + data = { + 'img': image, + 'question': question, + 'answer': answer, + 'index': index, + 'img_id': idx + } + if split is not None: + data['split'] = split + + data_list.append(data) + return data_list + + @master_only + def evaluate(self, results, work_dir): + orig_index = [x['img_id'] for x in self.data] + new_results = [] + for pred_dict in results: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result['split'] = filtered_rows.get('split') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['index'] = filtered_rows.get('index') + cur_result['index'] = filtered_rows.get('answer') + answers = filtered_rows.get('answer') + if istype(answers, list): + answers = eval(answers) + else: + answers = [answers] + if 'OCRVQA' in self.name: + match = [(1.0 if (x.strip().lower() == cur_result['prediction'].strip().lower()) else 0.0) for x in + answers] + else: + match = [anls_compute(x, cur_result['prediction']) for x in answers] + cur_result['match'] = match + + new_results.append(cur_result) + + results_df = pd.DataFrame(new_results) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + ret = dict() + if 'split' in results_df: + splits = list(set(results_df['split'])) + for sp in splits: + sub = [new_results[i] for i, x in enumerate(new_results) if x['split'] == sp] + hit = hit_calculate(sub, self.name) + ret[sp] = np.mean(hit) * 100 + else: + hit = hit_calculate(new_results, self.name) + ret['overall'] = np.mean(hit) * 100 + + print_log('============================================', 'current') + print_log(ret, 'current') + print_log('============================================', 'current') + print_log(f'{self.name} successfully finished evaluating', 'current') + return ret diff --git a/xtuner/dataset/evaluation/gqa_dataset.py b/xtuner/dataset/evaluation/gqa_dataset.py new file mode 100644 index 000000000..fd16ec12b --- /dev/null +++ b/xtuner/dataset/evaluation/gqa_dataset.py @@ -0,0 +1,118 @@ +import os +import os.path as osp +import json +from mmengine.dist import master_only +from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset +from .gqa_eval_utils import eval_gqa + + +class GQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='gqa') + + def __init__( + self, + data_file, + ann_file, + image_folder, + prompt_template, + image_processor, + tokenizer, + 
pad_image_to_square=True, + use_system=False, + for_llava_prompt=False, + metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset), + ): + super().__init__(metainfo) + self.data_file = data_file + self.ann_file = ann_file + # Save detailed information for easy viewing + self.answer_file = 'answer_gqa_results.jsonl' + # solely for evaluation purposes + self.prediction_file = 'pred_gqa_results.jsonl' + + self.image_folder = image_folder + self.use_system = use_system + self.for_llava_prompt = for_llava_prompt + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def load_data_list(self): + question_data = [json.loads(q) for q in open(os.path.expanduser(self.data_file), "r")] + data_list = [] + for idx in range(len(question_data)): + sample = question_data[idx] + index = sample['question_id'] + image_path = sample['image'] + question = sample['text'] + category = sample['category'] + + data = { + 'img_id': idx, + 'index': index, + 'image_path': image_path, + 'question': question, + 'category': category, + } + data_list.append(data) + return data_list + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, results, work_dir): + answers_file = osp.join(work_dir, self.answer_file) + ans_file = open(answers_file, "w") + + for pred_dict in results: + idx = pred_dict["img_id"] + gt_data = self.data[idx] + + ans_file.write( + json.dumps( + { + "question_id": gt_data['index'], + "prompt": gt_data['question'], + "text": pred_dict['prediction'], + "metadata": {}, + } + ) + + "\n" + ) + ans_file.close() + + all_preds = [] + for line_idx, line in enumerate(open(answers_file)): + res = json.loads(line) + question_id = res['question_id'] + text = res['text'].rstrip('.').lower() + all_preds.append({"questionId": question_id, "prediction": text}) + + prediction_file = osp.join(work_dir, self.prediction_file) + with open(prediction_file, 'w') as f: + json.dump(all_preds, f) + + evaluator = eval_gqa(questions=self.ann_file, predictions=prediction_file) + print_log('============================================', 'current') + scores = evaluator.forward() + print_log('============================================', 'current') + print_log(f'GQA successfully finished evaluating', 'current') + return scores diff --git a/xtuner/dataset/evaluation/gqa_eval_utils.py b/xtuner/dataset/evaluation/gqa_eval_utils.py new file mode 100644 index 000000000..9e97e26e8 --- /dev/null +++ b/xtuner/dataset/evaluation/gqa_eval_utils.py @@ -0,0 +1,499 @@ +# Evaluation code for GQA. +# Computes a suite of metrics such as accuracy, consistency, plausibility and scores per question type and length. +# Visit https://gqadataset.org/ for all information about the dataset, including examples, visualizations, paper and slides. +# +# +# Metrics: +# - Accuracy: Standard accuracy, computed over the balanced version of the dataset, which is more robust against +# cheating by making educated guesses. For each question-answer pair (q,a), we give 1 point if the +# predicted answer p matches a and 0 otherwise, and average over all questions in the dataset. 
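A sketch of the hand-off performed by GQADataset.evaluate above; the question ID and answer are made up. Raw generations go to a JSON-lines answer file for inspection, and a normalised JSON array is what the GQA evaluator below consumes.

raw_record = {"question_id": "201307251", "prompt": "Is the sky blue?",
              "text": "Yes.", "metadata": {}}                  # one line of answer_gqa_results.jsonl
normalized = {"questionId": raw_record["question_id"],
              "prediction": raw_record["text"].rstrip('.').lower()}   # -> "yes"
# pred_gqa_results.jsonl stores the JSON array [normalized, ...] read by eval_gqa.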
+# +# - Consistency: A metric for the level of model's consistency across different questions. For each question-answer +# pair (q,a), we define a set Eq={q1, q2, ..., qn} of entailed questions, the answers to which can +# be unambiguously inferred given (q,a). +# Denote Q the set of all questions the model answered correctly. For each question q in Q, we +# measure the model's accuracy over the entailed questions Eq to get the score sq and finally +# average these results across all questions in Q. +# +# - Validity: Measures whether the model gives a "valid" answer - one that can theoretically be an answer +# to the question (e.g. a color to a color question, yes/no to a binary question etc.). +# We provide a set of valid answers to each questions over the final answer vocabulary, in +# the choices file, and use it to compute average validity across the dataset. +# +# - Plausibility: Measures whether the model answers are plausible, e.g. one that make sense in the real world, +# e.g. not answering "purple" to a question about apple color (unless it's really purple). +# We provide a set of all plausible answers to each questions, computed by looking at all +# attributes and relations hold for various objects throughout the whole dataset scene graphs, +# and use it to compute average model plausibility across the data. +# +# - Grounding: Only for attention models. Measures whether the model looks at the relevant regions in the +# image when answering a question. Each question in the dataset is annotated with the visual regions +# they refer to, which are then used to compute the level to which the model has a correct visual attention, +# which will allow to identify whether it really answers based on the image of by language-based guesses. +# Supports both spatial features and object-based features. +# +# - Distribution: Measures the overall match between the true answer distribution for different questions, +# vs the overall distribution predicted by the model through its answers for all the data. +# We use chi-square statistic to measure the degree of similarity between the distributions, +# giving indication to the level of overall world-knowledge of the model +# +# - Accuracy per type: accuracy per question structural types (logic, compare, choose), and semantic type +# (questions about attributes, relations, categories, objects or the whole scene). +# +# - Accuracy for length: accuracy as a function of the question length, in terms of (1) words number, and semantic +# complexity - number of reasoning steps. +# +# We may support additional metrics (e.g. coverage) in the future. +# +# +# Files format: +# - predictions file format: JSON array: [{"questionId": str, "prediction": str}] +# - attentions file format: JSON array: +# Spatial attention: [{"questionId": str, "attention": [mapSize x mapSize: float] }]. +# Object-based attention:[{"questionId": str, "attention": [[x0, y0, x1, y1, float] x #regions] }]. 0 < x,y < 1. +# - questions and choices files are provided as part of the dataset. +# see https://gqadataset.org/download.html for information about their format. +# +# +# If you have any questions or comments, please feel free to send an email, +# at dorarad@cs.stanford.edu. We hope you'll enjoy using the GQA dataset! 
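A hedged usage sketch of the eval_gqa class defined below, mirroring how GQADataset.evaluate drives it; the file names are placeholders. Only the questions and predictions files are required; the scene graphs, choices and attentions are optional extras for grounding, validity and plausibility.

evaluator = eval_gqa(questions='testdev_balanced_questions.json',   # placeholder paths
                     predictions='pred_gqa_results.jsonl')
scores = evaluator.forward()
print(scores['accuracy'], scores['binary'], scores['open'])          # percentages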
:) +# +# +# import torch.nn as nn +from collections import defaultdict +from tqdm import tqdm +import os.path +import glob +import json +from mmengine.logging import print_log + + +########################################################################################## +class eval_gqa(): + + def __init__( + self, + tier="val", + scenes="{tier}_sceneGraphs.json", + questions="{tier}_all_questions.json", + choices="{tier}_choices.json", + predictions="{tier}_predictions.json", + attentions="{tier}_attentions.json", + consistency=False, + grounding=False, + objectFeatures=False, + mapSize=7, + ): + + self.consistency = consistency + self.grounding = grounding + self.objectFeatures = objectFeatures + self.mapSize = mapSize + if not consistency: + print_log("Please consider using --consistency to compute consistency scores for entailed questions.", + 'current') + print_log("If you do so, please provide answers to all questions in val_all_questions.json.\n", 'current') + + if not grounding: + print_log("Please consider using --grounding to compute attention scores.", 'current') + print_log("If you do so, please provide attention maps through --attentions.\n", 'current') + + ##### Files Loading + ########################################################################################## + # Load scene graphs + print_log("Loading scene graphs...", 'current') + try: + self.scenes = self.loadFile(scenes.format(tier=self.tier)) + except: + print_log('Failed to load scene graphs -- cannot evaluate grounding') + self.scenes = None # for testdev + + # Load questions + print_log("Loading questions...", 'current') + self.questions = self.loadFile(questions) + + # Load choices + print_log("Loading choices...", 'current') + try: + self.choices = self.loadFile(choices.format(tier=self.tier)) + except: + print_log('Failed to load choices -- cannot evaluate validity or plausibility', 'current') + self.choices = None # for testdev + + # Load predictions and turn them into a dictionary + print_log("Loading predictions...", 'current') + predictions = self.loadFile(predictions.format(tier=tier)) + self.predictions = {p["questionId"]: p["prediction"] for p in predictions} + + # Make sure all question have predictions + for qid in self.questions: + if (qid not in self.predictions) and (consistency or self.questions[qid]["isBalanced"]): + print_log("no prediction for question {}. Please add prediction for all questions.".format(qid), + 'current') + raise Exception("missing predictions") + + # Load attentions and turn them into a dictionary + self.attentions = None + if grounding: + with open(attentions.format(tier=tier)) as attentionsFile: + attentions = json.load(attentionsFile) + self.attentions = {a["questionId"]: a["attention"] for a in attentions} + + def forward(self): + # Initialize data structure to track all metrics: e.g. accuracy, validity and plausibility, as well as + # accuracy per question type, length and number of reasoning steps. + scores = { + "accuracy": [], # list of accuracies per question (1 if correct else 0). Will be averaged ultimately. + "binary": [], + # list of accuracies per a binary question (1 if correct else 0). Will be averaged ultimately. + "open": [], # list of accuracies per an open question (1 if correct else 0). Will be averaged ultimately. + "validity": [], # list of validity per question (1 if valid else 0). + "plausibility": [], # list of plausibility per question (1 if plausible else 0). + "consistency": [], # list of consistency scores for entailed questions. 
+ "accuracyPerStructuralType": defaultdict(list), + # list of question accuracies for each structural type (e.g. compare, logic questions). + "accuracyPerSemanticType": defaultdict(list), + # list of question accuracies for each semantic type (e.g. questions about an object, an attribute, a relation). + "accuracyPerLength": defaultdict(list), # list of question accuracies per question's word number. + "accuracyPerSteps": defaultdict(list), + # list of question accuracies per question's reasoning length (steps number). + "grounding": [], # list of grounding scores for each question. + } + + # Initialize golden and predicted histograms per each question group. Used to compute the distribution metric. + dist = {"gold": defaultdict(lambda: defaultdict(int)), "predicted": defaultdict(lambda: defaultdict(int))} + ##### Main score computation + ########################################################################################## + + # Loop over the questions and compute mterics + for qid, question in tqdm(self.questions.items()): + + # Compute scores over the balanced dataset (more robust against cheating by making educated guesses) + if question["isBalanced"]: + gold = question["answer"] + predicted = self.predictions[qid] + + correct = predicted == gold + score = self.toScore(correct) + + wordsNum = self.getWordsNum(question) + stepsNum = self.getStepsNum(question) + + # Update accuracy + scores["accuracy"].append(score) + scores["accuracyPerLength"][wordsNum].append(score) + scores["accuracyPerSteps"][stepsNum].append(score) + scores["accuracyPerStructuralType"][question["types"]["structural"]].append(score) + scores["accuracyPerSemanticType"][question["types"]["semantic"]].append(score) + answerType = "open" if question["types"]["structural"] == "query" else "binary" + scores[answerType].append(score) + + # Update validity score + valid = ( + self.belongs(predicted, self.choices[qid]["valid"], question) if self.choices else False + ) + scores["validity"].append(self.toScore(valid)) + + # Update plausibility score + plausible = ( + self.belongs(predicted, self.choices[qid]["plausible"], question) + if self.choices + else False + ) + scores["plausibility"].append(self.toScore(plausible)) + + # Optionally compute grounding (attention) score + if self.attentions is not None: + groundingScore = self.computeGroundingScore( + question, self.scenes[question["imageId"]], self.attentions[qid] + ) + if groundingScore is not None: + scores["grounding"].append(groundingScore) + + # Update histograms for gold and predicted answers + globalGroup = question["groups"]["global"] + if globalGroup is not None: + dist["gold"][globalGroup][gold] += 1 + dist["predicted"][globalGroup][predicted] += 1 + + if self.consistency: + # Compute consistency (for entailed questions) + scores = self.updateConsistency(qid, question, self.questions, correct, scores) + + # Compute distribution score + scores["distribution"] = self.chiSquare(dist["gold"], dist["predicted"]) / 100 + + # Average scores over all questions (in the balanced dataset) and print_log scores + + metrics = [ + "binary", + "open", + "accuracy", + "consistency", + "validity", + "plausibility", + "grounding", + "distribution", + ] + + detailedMetrics = [ + ("accuracyPerStructuralType", "Accuracy / structural type"), + ("accuracyPerSemanticType", "Accuracy / semantic type"), + ("accuracyPerSteps", "Accuracy / steps number"), + ("accuracyPerLength", "Accuracy / words number"), + ] + + subMetrics = {"attr": "attribute", "cat": "category", "global": "scene", 
"obj": "object", "rel": "relation"} + # average + for k in metrics: + if isinstance(scores[k], list): + scores[k] = self.avg(scores[k]) * 100 + + for k, _ in detailedMetrics: + for t in scores[k]: + scores[k][t] = self.avg(scores[k][t]) * 100, len(scores[k][t]) + + # print_log + for m in metrics: + # skip grounding and consistency scores if not requested + if m == "grounding" and not self.grounding: + continue + if m == "consistency" and not self.consistency: + continue + + # print_log score + print_log( + "{title}: {score:.2f}{suffix}".format( + title=m.capitalize(), + score=scores[m], + suffix=" (lower is better)" if m == "distribution" else "%", + ) + , 'current') + + for m, mPrintName in detailedMetrics: + print_log("") + # print_log metric title + print_log("{}:".format(mPrintName)) + + for t in sorted(list(scores[m].keys())): + # set sub-metric title + tName = t + if isinstance(scores[k], list): + tName = subMetrics.get(t, t).capitalize() + + # print_log score + print_log( + " {title}: {score:.2f}{suffix} ({amount} questions)".format( + title=tName, score=scores[m][t][0], suffix="%", amount=scores[m][t][1] + ) + , 'current') + return scores + + def loadFile(self, name): + # load standard json file + if os.path.isfile(name): + with open(name) as file: + data = json.load(file) + # load file chunks if too big + elif os.path.isdir(name.split(".")[0]): + data = {} + chunks = glob.glob('{dir}/{dir}_*.{ext}'.format(dir=name.split(".")[0], ext=name.split(".")[1])) + for chunk in chunks: + with open(chunk) as file: + data.update(json.load(file)) + else: + raise Exception("Can't find {}".format(name)) + return data + + ##### Scores data structures initialization + ########################################################################################## + + # book to float + def toScore(self, b): + return float(1 if b else 0) + + # Compute average of a list + def avg(self, l): + if len(l) == 0: + return 0 + return float(sum(l)) / len(l) + + def wavg(self, l, w): + if sum(w) == 0: + return None + return float(sum(l[i] * w[i] for i in range(len(l)))) / sum(w) + + ##### Question lengths - words numbers and reasoning steps number + ########################################################################################## + + # Compute question length (words number) + def getWordsNum(self, question): + return len(question["question"].split()) + + # Compute number of reasoning steps (excluding the final "querying" step which doesn't increase effective reasoning length) + def getStepsNum(self, question): + return len( + [ + c + for c in question["semantic"] + if not ( + any( + [ + o in "{}: {}".format(c["operation"], c["argument"]) + for o in ["exist", "query: name", "choose name"] + ] + ) + ) + ] + ) + + ##### Functions for question annotations + ########################################################################################## + + # # Utility function for converting question annotations string keys to slices + # def toSlice(strSlice): + # sliceLims = (int(n) for n in strSlice.split(':')) + # return apply(slice, sliceLims) + + # # Utility function for converting question annotations string keys to indexes list: + # # "1" => [0] + # # "1:3" => [1, 2] + # # "4:9:2" => [4, 6, 8] + # def intsFromSlice(strSlice): + # slice_obj = get_slice_obj(slicearg) + # return range(slice_obj.start or 0, slice_obj.stop or -1, slice_obj.step or 1) + + ##### Functions for validity and plausibility + ########################################################################################## + + def belongs(self, 
element, group, question): + # normalization () + if "Common" in question["types"]["detailed"]: + group = ["color", "material", "shape"] + + return element in group + + ##### Functions for consistency scores (for entailed questions ("inferred")) + ########################################################################################## + + def updateConsistency(self, questionId, question, questions, correct, scores): + inferredQuestions = [eid for eid in question["entailed"] if eid != questionId] + + if correct and len(inferredQuestions) > 0: + + cosnsitencyScores = [] + for eid in inferredQuestions: + gold = questions[eid]["answer"] + predicted = self.predictions[eid] + score = self.toScore(predicted == gold) + cosnsitencyScores.append(score) + + scores["consistency"].append(self.avg(cosnsitencyScores)) + return scores + + ##### Functions for grounding score (optional, only for attention models) + ########################################################################################## + + # Utility functions for working with bounding boxes. + # c = (x0, y0, x1, y1), r = (r0, r1) + + def yrange(self, c): + return (c[1], c[3]) + + def xrange(self, c): + return (c[0], c[2]) + + def length(self, r): + if r is None: + return 0 + return float(r[1] - r[0]) + + def size(self, c): + return self.length(self.xrange(c)) * self.length(self.yrange(c)) + + def intersection(self, r1, r2): + ir = (max(r1[0], r2[0]), min(r1[1], r2[1])) + if ir[1] > ir[0]: + return ir + return None + + def intersectionSize(self, c1, c2): + return self.length(self.intersection(self.xrange(c1), self.xrange(c2))) * self.length( + self.intersection(self.yrange(c1), self.yrange(c2)) + ) + + def intersectionRate(self, c1, c2): + return float(self.intersectionSize(c1, c2)) / self.size(c1) + + # Get spatial cell + def getCell(self, i, j): + edge = float(1) / self.mapSize + return (edge * i, edge * j, edge * (i + 1), edge * (j + 1)) + + # Get bounding box of objectId in sceneGraph + def getRegion(self, sceneGraph, objectId): + obj = sceneGraph["objects"][objectId] + x0 = float(obj["x"]) / sceneGraph["width"] + y0 = float(obj["y"]) / sceneGraph["height"] + x1 = float(obj["x"] + obj["w"]) / sceneGraph["width"] + y1 = float(obj["y"] + obj["h"]) / sceneGraph["height"] + return (x0, y0, x1, y1) + + # Compute grounding score. Computer amount of attention (probability) given to each of the regions + # the question and answers refer to. 
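A worked example of the consistency update implemented just above, with made-up question IDs; it only fires when the source question was answered correctly, and it contributes the model's accuracy over that question's entailed set.

entailed = ['q2', 'q3']              # question['entailed'] minus the question itself
per_entailed = [1.0, 0.0]            # toScore(prediction == gold) for q2 and q3
contribution = sum(per_entailed) / len(per_entailed)   # 0.5, appended to scores['consistency']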
+ def computeGroundingScore(self, question, sceneGraph, attentionMap): + ## prepare gold regions + regions = [] + # add question regions + regions += [ + self.getRegion(sceneGraph, pointer) for pointer in question["annotations"]["question"].values() + ] + # add answer regions + regions += [ + self.getRegion(sceneGraph, pointer) for pointer in question["annotations"]["fullAnswer"].values() + ] + # add all the image if the question refers to the whole scene + if any(("scene" in c) for c in question["semantic"]): + regions.append((0, 0, 1, 1)) + + # prepare attention map + if self.objectFeatures: + # cells = [((x0, y0, x1, y1), attention) for x0, y0, x1, y1, attention in cells] + pass + else: + cells = [ + (self.getCell(i, j), attentionMap[i][j]) + for i in range(self.mapSize) + for j in range(self.mapSize) + ] + + # compare attention map to gold regions + scores = [] + for region in regions: + for cell, attention in cells: + scores.append(attention * self.intersectionRate(cell, region)) + return sum(scores) + + ##### Functions for distribution score + ########################################################################################## + + # Compute chi square statistic of gold distribution vs predicted distribution, + # averaged over all question groups + def chiSquare(self, goldDist, predictedDist): + sumScore, sumOverall = 0, 0 + + for group in goldDist: + score, overall = 0, 0 + + for ans in goldDist[group]: + e = goldDist[group][ans] + o = predictedDist[group].get(ans, 0) + score += (float(o - e) ** 2) / e + overall += goldDist[group][ans] + + sumScore += score * overall + sumOverall += overall + + avgScore = float(sumScore) / sumOverall + + return avgScore diff --git a/xtuner/dataset/evaluation/hallusion_dataset.py b/xtuner/dataset/evaluation/hallusion_dataset.py new file mode 100644 index 000000000..cd5fd2dc6 --- /dev/null +++ b/xtuner/dataset/evaluation/hallusion_dataset.py @@ -0,0 +1,125 @@ +import os +import os.path as osp + +import pandas as pd +from mmengine.dist import (master_only) +from .base_eval_dataset import BaseEvalDataset + +from xtuner.dataset.utils import decode_base64_to_image +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from .utils import YOrN_Extraction, Hallusion_rating +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset + + +class HallusionDataset(BaseEvalDataset): + + METAINFO: dict = dict(name='hullusion') + + def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + use_system=False, metainfo=None, proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): + super().__init__(metainfo) + self.use_system = use_system + self.data_file = data_file + self.df = pd.read_csv(data_file, sep='\t') + + skip_noimg = True + if skip_noimg: + self.df = self.df[~pd.isna(self.df['image'])] + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def get_image(self, image): + while len(image) < 16: + image = self.df[self.df['index'] == int(image)]['image'].values + assert len(image) == 1 + image = image[0] + image = 
decode_base64_to_image(image) + return image + + def __len__(self): + return len(self.df) + + def load_data_list(self): + data_list = [] + for idx in range(len(self.df)): + index = self.df.iloc[idx]['index'] + image = self.df.iloc[idx]['image'] + image_path = self.df.iloc[idx]['image_path'] + question = self.df.iloc[idx]['question'] + category = self.df.iloc[idx]['category'] + l2_category = self.df.iloc[idx]['l2-category'] + answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ + 0].keys() else None + + data = { + 'img': image, + 'image_path': image_path, + 'question': question, + 'answer': answer, + 'category': category, + 'index': index, + 'l2-category': l2_category, + 'img_id': idx + } + data_list.append(data) + return data_list + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, result, work_dir): + orig_index = [x['img_id'] for x in self.data] + results = [] + for pred_dict in result: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['category'] = filtered_rows['category'] + cur_result['index'] = filtered_rows.get('index') + cur_result['answer'] = filtered_rows.get('answer') + cur_result['image_path'] = filtered_rows.get('image_path') + cur_result['l2-category'] = filtered_rows.get('l2-category') + results.append(cur_result) + + results_df = pd.DataFrame(results) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + data = results_df.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + + ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])} + # 不使用 gpt + data['extracted'] = [ans_map[x] for x in data['index']] + data['score'] = (data['answer'] == data['extracted']) + + results_df = pd.DataFrame(data) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + score = Hallusion_rating(data) + print_log('============================================', 'current') + print_log(score, 'current') + print_log('============================================', 'current') + print_log(f'YOrN_eval successfully finished evaluating', 'current') + return score + diff --git a/xtuner/dataset/evaluation/mme_dataset.py b/xtuner/dataset/evaluation/mme_dataset.py new file mode 100644 index 000000000..94f68f9b6 --- /dev/null +++ b/xtuner/dataset/evaluation/mme_dataset.py @@ -0,0 +1,129 @@ +import os +import os.path as osp + +import pandas as pd +from mmengine.dist import (master_only) +from .base_eval_dataset import BaseEvalDataset + +from xtuner.dataset.utils import decode_base64_to_image +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from .utils import YOrN_Extraction, MME_rating +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset + + +class MMEDataset(BaseEvalDataset): + + METAINFO: dict = dict(name='mme') + + def __init__(self, data_file, image_folder, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + use_system=False, for_llava_prompt=False, metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): + super().__init__(metainfo) + self.image_folder = image_folder + 
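The Hallusion (and MME) evaluators above score answers with a purely rule-based yes/no extractor rather than an LLM judge; YOrN_Extraction lives in xtuner/dataset/evaluation/utils.py, added later in this diff. A small behaviour sketch, with made-up model outputs:

YOrN_Extraction('Yes, there is a dog in the image.')    # -> 'Yes'
YOrN_Extraction('no.')                                  # -> 'No'
YOrN_Extraction('Yes and no, it depends.')              # -> 'Unknown' (both words present)
YOrN_Extraction('Probably.')                            # -> 'Unknown'
# The score column is then a strict equality check against the ground-truth
# column, so the TSV answers are assumed to be the capitalised strings 'Yes'/'No'.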
self.use_system = use_system + self.for_llava_prompt = for_llava_prompt + self.data_file = data_file + self.df = pd.read_csv(data_file, sep='\t') + + skip_noimg = True + if skip_noimg: + self.df = self.df[~pd.isna(self.df['image'])] + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def load_data_list(self): + data_list = [] + for idx in range(len(self.df)): + index = self.df.iloc[idx]['index'] + image = self.df.iloc[idx]['image'] + image_path = self.df.iloc[idx]['image_path'] + + question = self.df.iloc[idx]['question'] + if self.for_llava_prompt: + question = question.replace(' Please answer yes or no.', + '\nAnswer the question using a single word or phrase.') + + category = self.df.iloc[idx]['category'] + answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ + 0].keys() else None + + data = { + 'img': image, + 'image_path': image_path, + 'question': question, + 'answer': answer, + 'category': category, + 'index': index, + 'img_id': idx + } + data_list.append(data) + return data_list + + def get_image(self, image): + while len(image) < 16: + image = self.df[self.df['index'] == int(image)]['image'].values + assert len(image) == 1 + image = image[0] + image = decode_base64_to_image(image) + return image + + def __len__(self): + return len(self.df) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, results, work_dir): + orig_index = [x['img_id'] for x in self.data] + new_results = [] + for pred_dict in results: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['category'] = filtered_rows['category'] + cur_result['index'] = filtered_rows.get('index') + cur_result['answer'] = filtered_rows.get('answer') + cur_result['image_path'] = filtered_rows.get('image_path') + new_results.append(cur_result) + + results_df = pd.DataFrame(new_results) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + data = results_df.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + + ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])} + # 不使用 gpt + data['extracted'] = [ans_map[x] for x in data['index']] + data['score'] = (data['answer'] == data['extracted']) + + results_df = pd.DataFrame(data) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + score = MME_rating(data) + print_log('============================================', 'current') + print_log(score, 'current') + print_log('============================================', 'current') + print_log(f'MME YOrN_eval successfully finished evaluating', 'current') + return score diff --git a/xtuner/dataset/evaluation/multiple_choice_dataset.py 
b/xtuner/dataset/evaluation/multiple_choice_dataset.py new file mode 100644 index 000000000..e9e139e34 --- /dev/null +++ b/xtuner/dataset/evaluation/multiple_choice_dataset.py @@ -0,0 +1,267 @@ +import os +import os.path as osp +import re +import string + +import numpy as np +import pandas as pd +from mmengine.dist import (master_only) +from rich.console import Console +from rich.table import Table +from .base_eval_dataset import BaseEvalDataset + +from xtuner.dataset.utils import decode_base64_to_image +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset + + +def MMMU_preproc(data): + cnt = 0 + As, Bs, Ans = list(data['A']), list(data['B']), list(data['answer']) + lt = len(data) + for i in range(lt): + if pd.isna(As[i]): + As[i] = Ans[i] + Bs[i] = 'Other Answers' + cnt += 1 + print_log(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones. ', 'current') + data['A'] = As + data['B'] = Bs + return data + + +class MultipleChoiceDataset(BaseEvalDataset): + # 'mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d' + METAINFO: dict = dict(name='multiple_choice') + + def __init__(self, data_file, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + use_system=False, metainfo=None, proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): + super().__init__(metainfo) + self.use_system = use_system + self.data_file = data_file + self.df = pd.read_csv(data_file, sep='\t') + + if 'MMMU' in os.path.basename(data_file): + self.df = MMMU_preproc(self.df) + self.split = 'dev' if 'answer' in self.df.iloc[0].keys() else 'test' + self.has_l2_category = 'l2-category' in self.df.columns.to_list() + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_xlsx_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.xlsx' + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def get_image(self, image): + while len(image) < 16: + image = self.df[self.df['index'] == int(image)]['image'].values + assert len(image) == 1 + image = image[0] + image = decode_base64_to_image(image) + return image + + def __len__(self): + return len(self.df) + + def load_data_list(self): + data_list = [] + for idx in range(len(self.df)): + index = self.df.iloc[idx]['index'] + image = self.df.iloc[idx]['image'] + question = self.df.iloc[idx]['question'] + answer = self.df.iloc[idx]['answer'] if 'answer' in self.df.iloc[ + 0].keys() else None + category = self.df.iloc[idx]['category'] + split = self.df.iloc[idx]['split'] if 'split' in self.df.iloc[ + 0].keys() else None + + options = { + cand: self.load_from_df(idx, cand) + for cand in string.ascii_uppercase + if self.load_from_df(idx, cand) is not None + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. 
{item}\n' + + hint = self.load_from_df(idx, 'hint') + data = { + 'img': image, + 'question': question, + 'answer': answer, + 'options': options_prompt, + 'category': category, + 'options_dict': options, + 'index': index, + 'context': hint, + 'img_id': idx + } + if split is not None: + data['split'] = split + + if self.has_l2_category: + data.update({'l2-category': self.df.iloc[idx]['l2-category']}) + data_list.append(data) + return data_list + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + def load_from_df(self, idx, key): + if key in self.df.iloc[idx] and not pd.isna(self.df.iloc[idx][key]): + return self.df.iloc[idx][key] + else: + return None + + @master_only + def evaluate(self, results, work_dir): + + def calc_acc(df, split, group='category'): + assert group in ['overall', 'category', 'l2-category'] + if group == 'overall': + if split is None: + res = {'Average': np.mean(df['hit'])} + else: + res = {'Average': np.mean(df[df['split'] == split]['hit'])} + else: + res = {} + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + sub_df = df[df[group] == ab] + if split is None: + res[ab] = np.mean(sub_df['hit']) + else: + res[ab] = np.mean(sub_df[sub_df['split'] == split]['hit']) + return res + + def eval_sub_data(sub_data, answer_map): + lt = len(sub_data) + for i in range(lt): + item = sub_data.iloc[i] + match = re.search(r'([A-D]+)', item['prediction']) + pred = match.group(1) if match else '' + gt = answer_map[item['index']] + if gt != pred: + return 0 + return 1 + + def show_result(ret_json, split): + show_dict = ret_json.copy() + table = Table(title=f' Multiple Choice ({self.data_file}) ') + console = Console() + if split is not None: + table.add_column(f'Category ({split})', justify='left') + else: + table.add_column('Category', justify='left') + table.add_column('Accuracy (%)', justify='right') + average = show_dict.pop('Average') * 100 + table.add_row('Average', f'{average:.1f}') + table.add_section() + for cat_name, cat_acc in show_dict.items(): + table.add_row(cat_name, f'{cat_acc * 100:.1f}') + with console.capture() as capture: + console.print(table, end='') + print_log('\n' + capture.get(), 'current') + print_log('Note: Please be cautious if you use the results in papers, ' + "since we don't use ChatGPT as a helper for choice " + 'extraction', 'current') + + orig_index = [x['img_id'] for x in self.data] + new_results = [] + for pred_dict in results: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result.update(filtered_rows.get('options_dict')) + cur_result['prediction'] = pred_dict['prediction'] + if filtered_rows.get('category') is not None: + cur_result['category'] = filtered_rows.get('category') + if filtered_rows.get('l2-category') is not None: + cur_result['l2-category'] = filtered_rows.get('l2-category') + cur_result['index'] = filtered_rows.get('index') + cur_result['split'] = filtered_rows.get('split') + cur_result['answer'] = filtered_rows.get('answer') + new_results.append(cur_result) + + results_df = pd.DataFrame(new_results) + with pd.ExcelWriter(osp.join(work_dir, self.results_xlsx_path), engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + if self.split != 'dev': + print_log('Test set does not have answers, skip evaluation', 'current') + return {'Average': 0} + + data = 
results_df.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + for k in data.keys(): + data[k.lower() if k not in 'ABCD' else k] = data.pop(k) + + data_main = data[data['index'] < int(1e6)] + cate_map = { + i: c + for i, c in zip(self.df['index'], self.df['category']) + } + if self.has_l2_category: + l2_cate_map = { + i: c + for i, c in zip(self.df['index'], self.df['l2-category']) + } + answer_map = { + i: c + for i, c in zip(self.df['index'], self.df['answer']) + } + + lt = len(data_main) + hit, tot = 0, 0 + result = {} + for i in range(lt): + item_main = data_main.iloc[i] + idx = item_main['index'] + assert idx not in result + sub_data = data[data['index'] % int(1e6) == idx] + ret = eval_sub_data(sub_data, answer_map) + result[idx] = ret + hit += ret + tot += 1 + + indices = data_main['index'] + data_main = data_main.copy() + data_main['hit'] = [result[i] for i in indices] + main_idx = data_main['index'] + data_main['category'] = [cate_map[i] for i in main_idx] + + if 'split' in data_main: + splits = list(set(data_main['split'])) + else: + splits = [None] + + for split in splits: + ret_json = calc_acc(data_main, split, 'overall') + + if self.has_l2_category: + data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] + l2 = calc_acc(data_main, split, 'l2-category') + ret_json.update(l2) + + leaf = calc_acc(data_main, split, 'category') + ret_json.update(leaf) + + print_log('============================================', 'current') + show_result(ret_json,split) + print_log('============================================', 'current') + print_log('Multiple Choice successfully finished evaluating' 'current') + return ret_json diff --git a/xtuner/dataset/evaluation/pope_dataset.py b/xtuner/dataset/evaluation/pope_dataset.py new file mode 100644 index 000000000..780784eb8 --- /dev/null +++ b/xtuner/dataset/evaluation/pope_dataset.py @@ -0,0 +1,155 @@ +import os + +import pandas as pd +from mmengine.dist import master_only +from PIL import Image + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from .base_eval_dataset import BaseEvalDataset + +from .utils import YOrN_Extraction, load_jsonl +from ..llava_proxy_eval_dataset import LLaVAProxyEvalDataset + + +def eval_func(pred_list, label_list): + pos = 1 + neg = 0 + yes_ratio = pred_list.count(1) / len(pred_list) + + TP, TN, FP, FN = 0, 0, 0, 0 + for pred, label in zip(pred_list, label_list): + if pred == pos and label == pos: + TP += 1 + elif pred == pos and label == neg: + FP += 1 + elif pred == neg and label == neg: + TN += 1 + elif pred == neg and label == pos: + FN += 1 + + print_log('TP\tFP\tTN\tFN\t', 'current') + print_log(f'{TP}\t{FP}\t{TN}\t{FN}', 'current') + + precision = float(TP) / float(TP + FP) + recall = float(TP) / float(TP + FN) + f1 = 2 * precision * recall / (precision + recall) + acc = (TP + TN) / (TP + TN + FP + FN) + print_log(f'Accuracy: {acc}', 'current') + print_log(f'Precision: {precision}', 'current') + print_log(f'Recall: {recall}', 'current') + print_log(f'F1 score: {f1}', 'current') + print_log(f'Yes ratio: {yes_ratio}', 'current') + return f1 + + +class POPEDataset(BaseEvalDataset): + METAINFO: dict = dict(name='pope') + + def __init__(self, data_file, coco_val_path, prompt_template, image_processor, tokenizer, pad_image_to_square=True, + use_system=False, metainfo=None, proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): + super().__init__(metainfo) + self.use_system = use_system + if isinstance(data_file, str): + data_file = [data_file] + 
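A sketch of the circular (MMBench-style) scoring rule implemented by eval_sub_data above: a base question with index i also appears as option-rotated copies with indices i + k*1e6, and the sample only counts as a hit when every rotation is answered correctly. The indices and letters below are illustrative.

answer_map  = {42: 'B', 1000042: 'C', 2000042: 'D'}   # gold letter per rotation
predictions = {42: 'B', 1000042: 'C', 2000042: 'A'}   # last rotation is wrong
hit = all(predictions[i] == answer_map[i] for i in answer_map)   # False -> scored 0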
self.raw_data = [load_jsonl(f) for f in data_file] + + self.name = [ + os.path.splitext(os.path.basename(f))[0] for f in data_file + ] + + self.coco_val_path = coco_val_path + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + + self.results_xlsx_path = 'pope-results.xlsx' + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def get_image(self, image): + image = Image.open(os.path.join(self.coco_val_path, image)) + return image + + def __len__(self): + return len(self.data) + + def load_data_list(self): + data_list = [] + idx = 0 + for data_idx in range(len(self.raw_data)): + for sample_idx in range(len(self.raw_data[data_idx])): + sample = self.raw_data[data_idx][sample_idx] + index = sample['question_id'] + image_path = sample['image'] + question = sample['text'] + answer = sample['label'] + category = self.name[data_idx] + assert answer in ['yes', 'no'] + data = { + 'img_id': idx, + 'index': index, + 'img': image_path, + 'question': question, + 'answer': answer, + 'category': category + } + data_list.append(data) + idx += 1 + return data_list + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, result, work_dir, show=True): + orig_index = [x['img_id'] for x in self.data] + results = [] + for pred_dict in result: + index = pred_dict['img_id'] + new_index = orig_index.index(index) + filtered_rows = self.data[new_index] + cur_result = {} + cur_result['question'] = filtered_rows.get('question') + cur_result['prediction'] = pred_dict['prediction'] + cur_result['category'] = filtered_rows['category'] + cur_result['index'] = filtered_rows.get('index') + cur_result['answer'] = filtered_rows.get('answer') + results.append(cur_result) + + results_df = pd.DataFrame(results) + with pd.ExcelWriter( + os.path.join(work_dir, self.results_xlsx_path), + engine='openpyxl') as writer: + results_df.to_excel(writer, index=False) + + score = 0 + for sub_name in self.name: + sub_results = [x for x in results if x['category'] == sub_name] + pred_list = [ + int(YOrN_Extraction(x['prediction']) == 'Yes') + for x in sub_results + ] + label_list = [ + int(YOrN_Extraction(x['answer']) == 'Yes') for x in sub_results + ] + print_log('============================================', 'current') + print_log('Category: {}, # samples: {}'.format(sub_name, + len(sub_results)), 'current') + cur_f1 = eval_func(pred_list, label_list) + score += cur_f1 + + score /= len(self.name) + print_log('============================================', 'current') + print_log(f'Average F1-score: {score}', 'current') + print_log('============================================', 'current') + print_log('POPE successfully finished evaluating', 'current') + return score diff --git a/xtuner/dataset/evaluation/textvqa_dataset.py b/xtuner/dataset/evaluation/textvqa_dataset.py new file mode 100644 index 000000000..e786c2f8f --- /dev/null +++ b/xtuner/dataset/evaluation/textvqa_dataset.py @@ -0,0 +1,108 @@ +import os +import os.path as osp +import re + +from .base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +import json +from mmengine.dist import (master_only) +from .textvqa_utils import TextVQAAccuracyEvaluator +from mmengine.logging import print_log +from 
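A minimal sketch of the POPE scoring above, assuming eval_func is in scope: predictions and labels are first collapsed to binary yes/no via YOrN_Extraction, then a confusion matrix and F1 are computed per subset (typically random / popular / adversarial) and averaged. The lists below are made up.

pred_list  = [1, 1, 0, 0, 1]   # 1 = model answered "yes"
label_list = [1, 0, 0, 0, 1]   # 1 = object actually present
f1 = eval_func(pred_list, label_list)   # TP=2 FP=1 TN=2 FN=0 -> precision 2/3, recall 1.0, F1 0.8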
..llava_proxy_eval_dataset import LLaVAProxyEvalDataset + + +def prompt_processor(prompt): + if prompt.startswith('OCR tokens: '): + pattern = r"Question: (.*?) Short answer:" + match = re.search(pattern, prompt, re.DOTALL) + question = match.group(1) + elif 'Reference OCR token: ' in prompt and len(prompt.split('\n')) == 3: + if prompt.startswith('Reference OCR token:'): + question = prompt.split('\n')[1] + else: + question = prompt.split('\n')[0] + elif len(prompt.split('\n')) == 2: + question = prompt.split('\n')[0] + else: + assert False + + return question.lower() + + +class TextVQADataset(BaseEvalDataset): + METAINFO: dict = dict(name='textvqa') + + def __init__(self, data_file, ann_file, image_folder, prompt_template, image_processor, tokenizer, + pad_image_to_square=True, use_system=False, metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset)): + super().__init__(metainfo) + self.use_system = use_system + self.data_file = data_file + self.ann_file = ann_file + self.image_folder = image_folder + + template = prompt_template + self.template = template + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + self.pad_image_to_square = pad_image_to_square + self.name = os.path.splitext(os.path.basename(data_file))[0] + self.results_path = os.path.splitext(os.path.basename(data_file))[0] + '-results.jsonl' + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def load_data_list(self): + data = [json.loads(q) for q in open(os.path.expanduser(self.data_file), "r")] + for i, d in enumerate(data): + d['img_id'] = i + d['image_path'] = d['image'] + d['question'] = d['text'] + return data + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, result, work_dir, show=True): + answers_file = osp.join(work_dir, self.results_path) + ans_file = open(answers_file, "w") + + for pred_dict in result: + idx = pred_dict["img_id"] + gt_data = self.data[idx] + + ans_file.write(json.dumps({"question_id": gt_data['question_id'], + "prompt": gt_data['text'], + "text": pred_dict['prediction'], + "metadata": {}}) + "\n") + ans_file.close() + + annotations = json.load(open(self.ann_file))['data'] + annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in + annotations} + results = [json.loads(line) for line in open(answers_file)] + + pred_list = [] + for result in results: + annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))] + pred_list.append({ + "pred_answer": result['text'], + "gt_answers": annotation['answers'], + }) + + evaluator = TextVQAAccuracyEvaluator() + acc = 100. 
* evaluator.eval_pred_list(pred_list) + print_log('============================================', 'current') + print_log('Samples: {}, Accuracy: {:.2f}%'.format(len(pred_list), acc), 'current') + print_log('============================================', 'current') + print_log(f'TextVQA successfully finished evaluating', 'current') + return {'acc': acc} diff --git a/xtuner/dataset/evaluation/textvqa_utils.py b/xtuner/dataset/evaluation/textvqa_utils.py new file mode 100644 index 000000000..c3e5887e0 --- /dev/null +++ b/xtuner/dataset/evaluation/textvqa_utils.py @@ -0,0 +1,255 @@ +from tqdm import tqdm +import re + + +class EvalAIAnswerProcessor: + """ + Processes an answer similar to Eval AI + copied from + https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897 + """ + + CONTRACTIONS = { + "aint": "ain't", + "arent": "aren't", + "cant": "can't", + "couldve": "could've", + "couldnt": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didnt": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadnt": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasnt": "hasn't", + "havent": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isnt": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "oclock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + "shouldve": "should've", + "shouldnt": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "thats": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "theres": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasnt": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "werent": "weren't", + "whatll": "what'll", + "whatre": "what're", + "whats": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "wont": "won't", + "wouldve": "would've", + "wouldnt": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + 
"yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", + } + + NUMBER_MAP = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + ARTICLES = ["a", "an", "the"] + PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)") + COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)") + PUNCTUATIONS = [ + ";", + r"/", + "[", + "]", + '"', + "{", + "}", + "(", + ")", + "=", + "+", + "\\", + "_", + "-", + ">", + "<", + "@", + "`", + ",", + "?", + "!", + ] + + def __init__(self, *args, **kwargs): + pass + + def word_tokenize(self, word): + word = word.lower() + word = word.replace(",", "").replace("?", "").replace("'s", " 's") + return word.strip() + + def process_punctuation(self, in_text): + out_text = in_text + for p in self.PUNCTUATIONS: + if (p + " " in in_text or " " + p in in_text) or ( + re.search(self.COMMA_STRIP, in_text) is not None + ): + out_text = out_text.replace(p, "") + else: + out_text = out_text.replace(p, " ") + out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE) + return out_text + + def process_digit_article(self, in_text): + out_text = [] + temp_text = in_text.lower().split() + for word in temp_text: + word = self.NUMBER_MAP.setdefault(word, word) + if word not in self.ARTICLES: + out_text.append(word) + else: + pass + for word_id, word in enumerate(out_text): + if word in self.CONTRACTIONS: + out_text[word_id] = self.CONTRACTIONS[word] + out_text = " ".join(out_text) + return out_text + + def __call__(self, item): + item = self.word_tokenize(item) + item = item.replace("\n", " ").replace("\t", " ").strip() + item = self.process_punctuation(item) + item = self.process_digit_article(item) + return item + + +class TextVQAAccuracyEvaluator: + def __init__(self): + self.answer_processor = EvalAIAnswerProcessor() + + def _compute_answer_scores(self, raw_answers): + """ + compute the accuracy (soft score) of human answers + """ + answers = [self.answer_processor(a) for a in raw_answers] + assert len(answers) == 10 + gt_answers = list(enumerate(answers)) + unique_answers = set(answers) + unique_answer_scores = {} + + for unique_answer in unique_answers: + accs = [] + for gt_answer in gt_answers: + other_answers = [item for item in gt_answers if item != gt_answer] + matching_answers = [ + item for item in other_answers if item[1] == unique_answer + ] + acc = min(1, float(len(matching_answers)) / 3) + accs.append(acc) + unique_answer_scores[unique_answer] = sum(accs) / len(accs) + + return unique_answer_scores + + def eval_pred_list(self, pred_list): + pred_scores = [] + for entry in tqdm(pred_list): + pred_answer = self.answer_processor(entry["pred_answer"]) + unique_answer_scores = self._compute_answer_scores(entry["gt_answers"]) + score = unique_answer_scores.get(pred_answer, 0.0) + pred_scores.append(score) + + accuracy = sum(pred_scores) / len(pred_scores) + return accuracy diff --git a/xtuner/dataset/evaluation/utils.py b/xtuner/dataset/evaluation/utils.py new file mode 100644 index 000000000..4b02eda82 --- /dev/null +++ b/xtuner/dataset/evaluation/utils.py @@ -0,0 +1,135 @@ +import numpy as np +from collections import defaultdict +import json + + +def process_punctuation(inText): + import re + outText = inText + punct = [ + ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-', + 
'>', '<', '@', '`', ',', '?', '!' + ] + commaStrip = re.compile('(\d)(,)(\d)') # noqa: W605 + periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') # noqa: W605 + for p in punct: + if (p + ' ' in inText or ' ' + p in inText) or (re.search( + commaStrip, inText) is not None): + outText = outText.replace(p, '') + else: + outText = outText.replace(p, ' ') + outText = periodStrip.sub('', outText, re.UNICODE) + return outText + + +def YOrN_Extraction(output): + s = output.lower() + words = process_punctuation(s).split() + if 'yes' in words and 'no' not in words: + return 'Yes' + if 'yes' not in words and 'no' in words: + return 'No' + return 'Unknown' + + +def MME_rating(data): + stats = defaultdict(dict) + lt = len(data) + for i in range(lt): + item = data.iloc[i] + category = item['category'] + image_path = item['image_path'] + score = item['score'] + if image_path not in stats[category]: + stats[category][image_path] = [] + stats[category][image_path].append(score) + + def acc(key, mode='normal'): + res = stats[key] + values = [] + for val in res.values(): + if mode == 'normal': + values.extend(val) + elif mode == 'plus': + values.append(val[0] * val[1]) + return np.mean(values) * 100 + + scores = {} + for k in stats: + scores[k] = acc(k) + acc(k, 'plus') + + super_cates = dict( + perception=[ + 'OCR', 'artwork', 'celebrity', 'color', 'count', 'existence', + 'landmark', 'position', 'posters', 'scene' + ], + reasoning=['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation'] + ) + + ret = {} + for sc, cate_list in super_cates.items(): + base = 0 + for c in cate_list: + base += scores[c] + ret[sc] = base + ret.update(scores) + return ret + + +def Hallusion_rating(data): + def calc_fAcc(data): + res = defaultdict(list) + lt = len(data) + for i in range(lt): + line = data.iloc[i] + res[f"{line['l2-category']}_{line['set_id']}_{line['figure_id']}"].append(line['score']) + return np.mean([np.all(x) for x in res.values()]) * 100 + + def calc_qAcc(data): + res = defaultdict(list) + lt = len(data) + for i in range(lt): + line = data.iloc[i] + res[f"{line['l2-category']}_{line['set_id']}_{line['question_id']}"].append(line['score']) + return np.mean([np.all(x) for x in res.values()]) * 100 + + def calc_aAcc(data): + return np.mean(data['score']) * 100 + + data['set_id'] = [x.split('_')[3] for x in data['index']] + data['figure_id'] = [x.split('_')[4] for x in data['index']] + data['question_id'] = [x.split('_')[5] for x in data['index']] + + res = dict(split=[], aAcc=[], fAcc=[], qAcc=[]) + res['split'].append('Overall') + res['aAcc'].append(calc_aAcc(data)) + res['fAcc'].append(calc_fAcc(data)) + res['qAcc'].append(calc_qAcc(data)) + + if 'category' in data: + cates = list(set(data['category'])) + for c in cates: + sub = data[data['category'] == c] + res['split'].append(c) + res['aAcc'].append(calc_aAcc(sub)) + res['fAcc'].append(calc_fAcc(sub)) + res['qAcc'].append(calc_qAcc(sub)) + + if 'l2-category' in data: + cates = list(set(data['l2-category'])) + for c in cates: + sub = data[data['l2-category'] == c] + res['split'].append(c) + res['aAcc'].append(calc_aAcc(sub)) + res['fAcc'].append(calc_fAcc(sub)) + res['qAcc'].append(calc_qAcc(sub)) + return res + + +def load_jsonl(json_file): + with open(json_file) as f: + lines = f.readlines() + data = [] + for line in lines: + data.append(json.loads(line)) + return data diff --git a/xtuner/dataset/evaluation/vqav2_dataset.py b/xtuner/dataset/evaluation/vqav2_dataset.py new file mode 100644 index 000000000..f9d4fb7d0 --- /dev/null 
+++ b/xtuner/dataset/evaluation/vqav2_dataset.py @@ -0,0 +1,139 @@ +import os +import os.path as osp +import json +from mmengine.dist import master_only +from xtuner.dataset.evaluation.base_eval_dataset import BaseEvalDataset + +from xtuner.registry import BUILDER +from mmengine.logging import print_log +from xtuner.dataset.llava_proxy_eval_dataset import LLaVAProxyEvalDataset +from .vqav2_utils import EvalAIAnswerProcessor + + +class VQAv2Dataset(BaseEvalDataset): + + METAINFO: dict = dict(name='vqa_v2') + + def __init__( + self, + data_file, + test_file, + image_folder, + prompt_template, + image_processor, + tokenizer, + pad_image_to_square=True, + use_system=False, + for_llava_prompt=False, + metainfo=None, + proxy_eval_dataset=dict(type=LLaVAProxyEvalDataset), + ): + super().__init__(metainfo) + self.data_file = data_file + self.test_file = test_file + self.image_folder = image_folder + # Save detailed information for easy viewing + self.answer_file = 'answer_vqav2_results.json' + # solely for evaluation purposes + self.prediction_file = 'pred_vqav2_results.json' + self.answer_processor = EvalAIAnswerProcessor() + + self.use_system = use_system + self.for_llava_prompt = for_llava_prompt + self.template = prompt_template + self.pad_image_to_square = pad_image_to_square + + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = BUILDER.build(image_processor) + + self.data = self.load_data_list() + + proxy_eval_dataset['eval_dataset'] = self + self.proxy_eval_dataset = BUILDER.build(proxy_eval_dataset) + + def load_data_list(self): + question_data = [json.loads(q) for q in open(os.path.expanduser(self.data_file), "r")] + data_list = [] + for idx in range(len(question_data)): + sample = question_data[idx] + index = sample['question_id'] + image_path = sample['image'] + question = sample['text'] + category = sample['category'] + + data = { + 'img_id': idx, + 'index': index, + 'image_path': image_path, + 'question': question, + 'category': category, + } + data_list.append(data) + + return data_list + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + data = self.data[idx] + data_dict = self.proxy_eval_dataset.getitem(idx, data) + return data_dict + + @master_only + def evaluate(self, results, work_dir): + answers_file = osp.join(work_dir, self.answer_file) + ans_file = open(answers_file, "w") + + for pred_dict in results: + idx = pred_dict["img_id"] + gt_data = self.data[idx] + + ans_file.write( + json.dumps( + { + "question_id": gt_data['index'], + "prompt": gt_data['question'], + "text": pred_dict['prediction'], + "metadata": {}, + } + ) + + "\n" + ) + ans_file.close() + + results = [] + error_line = 0 + for line_idx, line in enumerate(open(answers_file)): + try: + results.append(json.loads(line)) + except: + error_line += 1 + + results = {x['question_id']: x['text'] for x in results} + test_split = [json.loads(line) for line in open(self.test_file)] + + all_answers = [] + + for x in test_split: + if x['question_id'] not in results: + all_answers.append({ + 'question_id': x['question_id'], + 'answer': '' + }) + else: + all_answers.append({ + 'question_id': x['question_id'], + 'answer': self.answer_processor(results[x['question_id']]) + }) + + prediction_file = osp.join(work_dir, self.prediction_file) + with open(prediction_file, 'w') as f: + json.dump(all_answers, f) + + print_log('============================================', 'current') + print(f'total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}') + 
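# NOTE (illustrative): the EvalAIAnswerProcessor applied above normalizes the
# free-form predictions before they are written to the submission file, e.g.
#   >>> processor = EvalAIAnswerProcessor()
#   >>> processor('Two small dogs.')
#   '2 small dogs'            # number words mapped to digits, period stripped
#   >>> processor('a cat')
#   'cat'                     # articles ('a', 'an', 'the') are dropped
#   >>> processor('dont know')
#   "don't know"              # common contractions are restored
# The resulting list of {'question_id': ..., 'answer': ...} entries is what
# gets dumped to prediction_file and submitted to the server (see log below).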
print_log(f'Please submit the generated {prediction_file} file to the official server for evaluation.', + 'current') + print_log('============================================', 'current') + return {'acc': 0} diff --git a/xtuner/dataset/evaluation/vqav2_utils.py b/xtuner/dataset/evaluation/vqav2_utils.py new file mode 100644 index 000000000..51566338b --- /dev/null +++ b/xtuner/dataset/evaluation/vqav2_utils.py @@ -0,0 +1,216 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import re + + +class EvalAIAnswerProcessor: + """ + Processes an answer similar to Eval AI + copied from + https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897 + """ + + CONTRACTIONS = { + "aint": "ain't", + "arent": "aren't", + "cant": "can't", + "couldve": "could've", + "couldnt": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didnt": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadnt": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasnt": "hasn't", + "havent": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isnt": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "oclock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + "shouldve": "should've", + "shouldnt": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "thats": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "theres": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasnt": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "werent": "weren't", + "whatll": "what'll", + "whatre": "what're", + "whats": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "wont": "won't", + "wouldve": "would've", + "wouldnt": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + 
"you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", + } + + NUMBER_MAP = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + ARTICLES = ["a", "an", "the"] + PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)") + COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)") + PUNCTUATIONS = [ + ";", + r"/", + "[", + "]", + '"', + "{", + "}", + "(", + ")", + "=", + "+", + "\\", + "_", + "-", + ">", + "<", + "@", + "`", + ",", + "?", + "!", + ] + + def __init__(self, *args, **kwargs): + pass + + def word_tokenize(self, word): + word = word.lower() + word = word.replace(",", "").replace("?", "").replace("'s", " 's") + return word.strip() + + def process_punctuation(self, in_text): + out_text = in_text + for p in self.PUNCTUATIONS: + if (p + " " in in_text or " " + p in in_text) or ( + re.search(self.COMMA_STRIP, in_text) is not None + ): + out_text = out_text.replace(p, "") + else: + out_text = out_text.replace(p, " ") + out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE) + return out_text + + def process_digit_article(self, in_text): + out_text = [] + temp_text = in_text.lower().split() + for word in temp_text: + word = self.NUMBER_MAP.setdefault(word, word) + if word not in self.ARTICLES: + out_text.append(word) + else: + pass + for word_id, word in enumerate(out_text): + if word in self.CONTRACTIONS: + out_text[word_id] = self.CONTRACTIONS[word] + out_text = " ".join(out_text) + return out_text + + def __call__(self, item): + item = self.word_tokenize(item) + item = item.replace("\n", " ").replace("\t", " ").strip() + item = self.process_punctuation(item) + item = self.process_digit_article(item) + return item diff --git a/xtuner/dataset/huggingface.py b/xtuner/dataset/huggingface.py index c44e88688..c3e2c93ad 100644 --- a/xtuner/dataset/huggingface.py +++ b/xtuner/dataset/huggingface.py @@ -12,11 +12,23 @@ from torch import distributed as dist from xtuner.registry import BUILDER, MAP_FUNC -from .utils import Packer, encode_fn +from .utils import Packer +from .utils import encode_fn as default_encode_fn +from .utils import total_image_token def get_lengths(example): - return {'length': len(example['input_ids'])} + cur_len = len(example['input_ids']) + if example.get('image', None) is not None: + assert 'image_wh' in example + image_wh = example['image_wh'] + if image_wh is not None: + if isinstance(image_wh[0], int): + image_wh = [image_wh] + num_image_token = total_image_token(image_wh[0], 1, 12, 448, 16) + cur_len += num_image_token + cur_len = -cur_len + return {'length': cur_len} def build_origin_dataset(dataset, split): @@ -66,12 +78,23 @@ def add_template_to_dataset(dataset, template_map_fn, map_num_proc): def tokenize_dataset(dataset, tokenizer, max_length, with_image_token, input_ids_with_output, remove_unused_columns, - map_num_proc): + map_num_proc, encode_map_fn=None): assert (tokenizer is not None) and (max_length is not None), \ f'({tokenizer}, {max_length})' if isinstance(tokenizer, dict) or isinstance( tokenizer, Config) or isinstance(tokenizer, ConfigDict): tokenizer = BUILDER.build(tokenizer) + if encode_map_fn is None: + encode_fn = default_encode_fn + else: + if isinstance(encode_map_fn, + dict) or isinstance(encode_map_fn, Config) or \ + isinstance(encode_map_fn, ConfigDict): + encode_fn = encode_map_fn.pop('type') + if len(encode_map_fn) != 0: + encode_fn = partial(encode_fn, **encode_map_fn) + else: + encode_fn = 
encode_map_fn dataset = dataset.map( partial( encode_fn, @@ -103,6 +126,7 @@ def process(dataset, max_length=None, dataset_map_fn=None, template_map_fn=None, + encode_map_fn=None, max_dataset_length=None, split='train', remove_unused_columns=False, @@ -198,7 +222,8 @@ def process(dataset, if do_dataset_tokenization: dataset = tokenize_dataset(dataset, tokenizer, max_length, with_image_token, input_ids_with_output, - remove_unused_columns, map_num_proc) + remove_unused_columns, map_num_proc, + encode_map_fn=encode_map_fn) if input_ids_with_output: assert {'input_ids', 'labels'}.issubset(dataset.column_names) @@ -214,6 +239,7 @@ def process(dataset, # add 'length' dataset = dataset.map(get_lengths, num_proc=map_num_proc) + setattr(dataset, 'modality_length', dataset['length']) setattr(dataset, 'length', dataset['length']) return dataset @@ -226,6 +252,7 @@ def process_hf_dataset(dataset, dataset_map_fn=None, template_map_fn=None, max_dataset_length=None, + encode_map_fn=None, split='train', remove_unused_columns=False, rename_maps=[], @@ -284,6 +311,7 @@ def process_hf_dataset(dataset, max_length=max_length, dataset_map_fn=dataset_map_fn, template_map_fn=template_map_fn, + encode_map_fn=encode_map_fn, max_dataset_length=max_dataset_length, split=split, remove_unused_columns=remove_unused_columns, diff --git a/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py b/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py new file mode 100644 index 000000000..4a8cd4985 --- /dev/null +++ b/xtuner/dataset/internvl_v1_5_llava_proxy_eval_dataset.py @@ -0,0 +1,117 @@ +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string +from .utils import dynamic_preprocess + +from torchvision.transforms.functional import InterpolationMode +import torchvision.transforms as T + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +class InternVL_v1_5_LLaVAProxyEvalDataset: + def __init__(self, eval_dataset, min_num, max_num, custom=False): + self.eval_ds = eval_dataset + self.min_num = min_num + self.max_num = max_num + + self.custom = custom + if custom: + self.image_processor = build_transform(448) + self._crop_size = {'height': 448, 'width': 448} + else: + # TODO: Assuming they are all squares. 
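# NOTE (illustrative): the crops built in getitem() below come from
# dynamic_preprocess (xtuner/dataset/utils.py, added in this diff). For
# example, with min_num=1, max_num=6 and a 448x448 tile size, a 1000x500
# image (aspect ratio 2.0) is matched to the (2, 1) grid, yielding 2 tiles
# plus 1 thumbnail, i.e. a stacked pixel_values tensor of shape
# (3, 3, 448, 448).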
+ if hasattr(eval_dataset.image_processor, 'crop_size'): + self._crop_size = eval_dataset.image_processor.crop_size + else: + self._crop_size = eval_dataset.image_processor.size + + self._image_size = self._crop_size['height'] + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + # TODO prompt are different of vlmevalkit + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + elif self.eval_ds.metainfo['name'] in ['chartqa', 'gvqa']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nAnswer the question using a single word or phrase.' + text = DEFAULT_IMAGE_TOKEN + '\n' + text + elif self.eval_ds.metainfo['name'] in ['hullusion', 'pope']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nPlease answer yes or no.' + text = DEFAULT_IMAGE_TOKEN + '\n' + text + else: + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.eval_ds.tokenizer.encode(chunk) + else: + cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2', 'chartqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])).convert('RGB') + else: + image = self.eval_ds.get_image(data['img']).convert('RGB') + + images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size) + for i, image in enumerate(images): + if self.custom: + image = self.image_processor(image) + else: + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + images[i] = image + images = torch.stack(images, dim=0) + data_dict['pixel_values'] = images + return data_dict diff --git a/xtuner/dataset/llava.py b/xtuner/dataset/llava.py index 0fab0258a..25c21f7d7 100644 --- a/xtuner/dataset/llava.py +++ b/xtuner/dataset/llava.py @@ -2,7 +2,7 @@ import json import logging import os - +import io import torch from datasets import Dataset as HFDataset from datasets import DatasetDict, load_from_disk @@ -13,8 +13,11 @@ from xtuner.registry import BUILDER from .huggingface import process_hf_dataset -from .utils import expand2square - +from .utils import expand2square, process_anyres_image, total_image_token, dynamic_preprocess +from mmengine.fileio import get +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor +import numpy as np def load_jsonl(json_file): with open(json_file) as f: @@ -36,7 +39,9 @@ def 
__init__(self, max_dataset_length=None, dataset_map_fn=None, template_map_fn=None, + encode_map_fn=None, max_length=2048, + s2_scales=None, # [1, 2] or [1,2,3] pad_image_to_square=False): super().__init__() @@ -61,7 +66,7 @@ def __init__(self, raise NotImplementedError for idx in range(len(json_data)): - if isinstance(json_data[idx]['id'], int): + if 'id' in json_data[idx] and isinstance(json_data[idx]['id'], int): json_data[idx]['id'] = str(json_data[idx]['id']) json_data = DatasetDict({'train': HFDataset.from_list(json_data)}) self.text_data = process_hf_dataset( @@ -70,6 +75,7 @@ def __init__(self, max_length=max_length, dataset_map_fn=dataset_map_fn, template_map_fn=template_map_fn, + encode_map_fn=encode_map_fn, split='train', max_dataset_length=max_dataset_length, remove_unused_columns=False, @@ -85,6 +91,17 @@ def __init__(self, self.image_processor = image_processor self.pad_image_to_square = pad_image_to_square + self.max_s2_scale = s2_scales + if s2_scales is not None: + self.max_s2_scale = max(s2_scales) + if hasattr(self.image_processor, 'crop_size'): + self.image_processor.crop_size['height'] *= self.max_s2_scale + self.image_processor.crop_size['width'] *= self.max_s2_scale + self.image_processor.size['shortest_edge'] *= self.max_s2_scale + else: + self.image_processor.size['height'] *= self.max_s2_scale + self.image_processor.size['width'] *= self.max_s2_scale + @property def modality_length(self): length_list = [] @@ -98,12 +115,20 @@ def modality_length(self): def __len__(self): return len(self.text_data) + def get_image(self, path): + if "s3://" in path: + img_bytes = get(path) + with io.BytesIO(img_bytes) as buff: + img = Image.open(buff).convert('RGB') + return img + else: + return Image.open(path).convert('RGB') + def __getitem__(self, index): data_dict = self.text_data[index] if data_dict.get('image', None) is not None: image_file = data_dict['image'] - image = Image.open(os.path.join(self.image_folder, - image_file)).convert('RGB') + image = self.get_image(os.path.join(self.image_folder, image_file)) if self.pad_image_to_square: image = expand2square( image, @@ -120,3 +145,171 @@ def __getitem__(self, index): data_dict['pixel_values'] = torch.zeros(3, crop_size['height'], crop_size['width']) return data_dict + + +class AnyResLLaVADataset(LLaVADataset): + + def __init__(self, image_grid_pinpoints, *args, **kwargs): + self.image_grid_pinpoints = image_grid_pinpoints + super().__init__(*args, **kwargs) + # TODO: Assuming they are all squares. 
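# NOTE (illustrative): image_grid_pinpoints lists the candidate padded
# resolutions for the any-resolution pipeline, e.g. something like
# [[336, 672], [672, 336], [672, 672]]. process_anyres_image picks the
# candidate that keeps the most effective resolution with the least padding
# waste (select_best_resolution), resizes/pads to it, cuts it into
# patch-size tiles and prepends a resized copy of the whole image; with the
# 336-px CLIP image processor and the pinpoints above, an 800x600 input
# would come out as a (5, 3, 336, 336) tensor (4 grid tiles + 1 global view).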
+ if hasattr(self.image_processor, 'crop_size'): + self._crop_size = self.image_processor.crop_size + else: + self._crop_size = self.image_processor.size + self._patch_size = self._crop_size['height'] + self._shortest_edge = self._crop_size['height'] + + def __getitem__(self, index): + data_dict = self.text_data[index] + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + image = self.get_image(os.path.join(self.image_folder, image_file)) + orig_size = image.size + # use to remove padding + data_dict['orig_size'] = orig_size + image = process_anyres_image(image, self.image_processor, + self.image_grid_pinpoints, + self._patch_size, self._shortest_edge, + pad_mean=tuple(int(x * 255) for x in self.image_processor.image_mean), + # keep the same as the original implementation + orig_img_pad_to_square=self.pad_image_to_square) + data_dict['pixel_values'] = image + else: + data_dict['orig_size'] = self._crop_size + data_dict['pixel_values'] = torch.zeros(1, 3, self._crop_size['height'], + self._crop_size['width']) + return data_dict + + +from torchvision.transforms.functional import InterpolationMode +import torchvision.transforms as T + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +class InternVL_V1_5_LLaVADataset(LLaVADataset): + def __init__(self, min_num, max_num, downsample_ratio=0.5, image_size=336, use_patch=True, custom=False, *args, + **kwargs): + self.min_num = min_num + self.max_num = max_num + self.downsample_ratio = downsample_ratio + self.use_patch = use_patch + super().__init__(*args, **kwargs) + + self.custom = custom + + if custom: + self.image_processor = build_transform(448) + self._crop_size = {'height': 448, 'width': 448} + else: + if hasattr(self.image_processor, 'crop_size'): + self._crop_size = self.image_processor.crop_size + else: + self._crop_size = self.image_processor.size + + self._patch_size = self._crop_size['height'] + self._shortest_edge = self._crop_size['height'] + + # clip + self._image_size = image_size + self._patch_size = (self._image_size // 14) * downsample_ratio # 12, 16 + + self.max_refetch = 1000 + + def __calc_fn(self, data_dict): + cur_len = data_dict['length'] + # cur_len = len(data_dict['input_ids']) + # if data_dict.get('image', None) is not None: + # cur_len = len(data_dict['input_ids']) + # if data_dict.get('image', None) is not None: + # image_file = data_dict['image'] + # assert 'image_wh' in data_dict + # if 'image_wh' in data_dict: + # size = data_dict['image_wh'][0] + # else: + # try: + # image = self.get_image(os.path.join(self.image_folder, image_file)) + # size = image.size + # except Exception as e: + # print(f'Error: {e}', flush=True) + # print_log(f'Error: {e}', logger='current') + # size = [1, 1] + # if self.use_patch: + # num_image_token = total_image_token(size, self.min_num, self.max_num, self._image_size, + # self._patch_size) + # else: + # num_image_token = self._patch_size * self._patch_size + # cur_len += num_image_token + # cur_len = -cur_len + return cur_len + + # 太慢了,改离线吧 + @property + def modality_length(self): + # 可以超级加速 + print_log('start calculating modality length', logger='current') + # with ThreadPoolExecutor(max_workers=16) 
as executor: + # length_list = list( + # tqdm( + # executor.map(self.__calc_fn, self.text_data), + # desc='Calculating modality length', + # total=len(self.text_data))) + # print_log('end calculating modality length', logger='current') + + length_list = self.text_data['length'] + print_log('end calculating modality length', logger='current') + return length_list + + def __getitem__(self, index): + for _ in range(self.max_refetch + 1): + data = self.prepare_data(index) + # Broken images may cause the returned data to be None + if data is None: + index = self._rand_another() + continue + return data + + def prepare_data(self, index): + data_dict = self.text_data[index] + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + if isinstance(image_file, list): + if len(image_file) > 1: + return None + image_file = image_file[0] + try: + image = self.get_image(os.path.join(self.image_folder, image_file)) + except Exception as e: + print(f'Error: {e}', flush=True) + print_log(f'Error: {e}', logger='current') + return None + images = dynamic_preprocess(image, self.min_num, self.max_num, self._image_size, use_patch=self.use_patch) + for i, image in enumerate(images): + if self.custom: + image = self.image_processor(image) + else: + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + images[i] = image + images = torch.stack(images, dim=0) + data_dict['pixel_values'] = images + else: + data_dict['pixel_values'] = torch.zeros(1, 3, self._crop_size['height'], + self._crop_size['width']) + return data_dict + + def _rand_another(self) -> int: + return np.random.randint(0, len(self.text_data)) diff --git a/xtuner/dataset/llava_proxy_eval_dataset.py b/xtuner/dataset/llava_proxy_eval_dataset.py new file mode 100644 index 000000000..e04f8391a --- /dev/null +++ b/xtuner/dataset/llava_proxy_eval_dataset.py @@ -0,0 +1,88 @@ +from xtuner.dataset.utils import expand2square +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string + + +class LLaVAProxyEvalDataset: + def __init__(self, eval_dataset): + self.eval_ds = eval_dataset + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + # TODO prompt are different of vlmevalkit + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + elif self.eval_ds.metainfo['name'] in ['chartqa', 'gvqa']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nAnswer the question using a single word or phrase.' + text = DEFAULT_IMAGE_TOKEN + '\n' + text + elif self.eval_ds.metainfo['name'] in ['hullusion', 'pope']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nPlease answer yes or no.' 
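# NOTE (illustrative): these yes/no benchmarks expect a bare Yes/No reply;
# the companion helper YOrN_Extraction (xtuner/dataset/evaluation/utils.py,
# added in this diff) normalizes such free-form replies into labels, e.g.
#   >>> YOrN_Extraction('Yes, there is a dog in the image.')
#   'Yes'
#   >>> YOrN_Extraction('no')
#   'No'
#   >>> YOrN_Extraction('I cannot tell.')
#   'Unknown'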
+ text = DEFAULT_IMAGE_TOKEN + '\n' + text + else: + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + # add bos token + bos_token_id = self.eval_ds.tokenizer.bos_token_id + cur_encode = [bos_token_id] + cur_encode += self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + else: + cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2', 'chartqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])).convert('RGB') + else: + image = self.eval_ds.get_image(data['img']).convert('RGB') + + if self.eval_ds.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.eval_ds.image_processor.image_mean)) + image = self.eval_ds.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict + diff --git a/xtuner/dataset/llava_proxy_eval_dataset1.py b/xtuner/dataset/llava_proxy_eval_dataset1.py new file mode 100644 index 000000000..570555e24 --- /dev/null +++ b/xtuner/dataset/llava_proxy_eval_dataset1.py @@ -0,0 +1,106 @@ +from xtuner.dataset.utils import expand2square +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string +import asyncio +from openai import AsyncOpenAI +from typing import List + +import base64 +from io import BytesIO +from typing import Union + +import requests +from PIL import Image + + +def encode_image_base64(image: Image.Image) -> str: + """encode image to base64 format.""" + buffered = BytesIO() + image.save(buffered, format='PNG') + + return f"data:image/jpeg;base64,{base64.b64encode(buffered.getvalue()).decode('utf-8')}" + + +class LLaVAProxyEvalDataset1: + def __init__(self, eval_dataset): + self.eval_ds = eval_dataset + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + # text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + # TODO prompt are different of vlmevalkit + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + elif self.eval_ds.metainfo['name'] in ['chartqa', 'gvqa']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nAnswer the question using a single word or phrase.' 
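# NOTE (illustrative sketch, not code used elsewhere in this diff): this
# proxy keeps the raw question text and a base64 data URI so an
# OpenAI-compatible endpoint can be queried instead of a local model. One
# possible consumer using the AsyncOpenAI client imported above (the model
# name is a placeholder assumption):
#   >>> client = AsyncOpenAI()
#   >>> async def ask(item):
#   ...     rsp = await client.chat.completions.create(
#   ...         model='gpt-4o-mini',
#   ...         messages=[{'role': 'user', 'content': [
#   ...             {'type': 'text', 'text': item['text']},
#   ...             {'type': 'image_url',
#   ...              'image_url': {'url': item['pixel_values']}},
#   ...         ]}])
#   ...     return rsp.choices[0].message.content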
+ # text = DEFAULT_IMAGE_TOKEN + '\n' + text + elif self.eval_ds.metainfo['name'] in ['hullusion', 'pope']: + # TODO prompt are different of vlmevalkit + text = data['question'] + '\nPlease answer yes or no.' + # text = DEFAULT_IMAGE_TOKEN + '\n' + text + else: + text = data['question'] + # text = DEFAULT_IMAGE_TOKEN + '\n' + text + data_dict['text'] = text + + # if self.eval_ds.use_system: + # inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + # else: + # inputs = '' + # inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + # chunk_encode = [] + # for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + # if idx == 0: + # # add bos token + # bos_token_id = self.eval_ds.tokenizer.bos_token_id + # cur_encode = [bos_token_id] + # cur_encode += self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + # else: + # cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + # chunk_encode.append(cur_encode) + # assert len(chunk_encode) == 2 + # ids = [] + # for idx, cur_chunk_encode in enumerate(chunk_encode): + # ids.extend(cur_chunk_encode) + # if idx != len(chunk_encode) - 1: + # ids.append(IMAGE_TOKEN_INDEX) + # ids = torch.tensor(ids) + # data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa', 'vqa_v2', 'chartqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])) + else: + image = self.eval_ds.get_image(data['img']) + image = encode_image_base64(image) + # if self.eval_ds.pad_image_to_square: + # image = expand2square( + # image, + # tuple( + # int(x * 255) for x in self.eval_ds.image_processor.image_mean)) + # image = self.eval_ds.image_processor.preprocess( + # image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + return data_dict diff --git a/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py b/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py index a08ca395b..5449588e5 100644 --- a/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py +++ b/xtuner/dataset/map_fns/dataset_map_fns/llava_map_fn.py @@ -29,6 +29,7 @@ def llava_map_fn(example): while messages and messages[0]['from'] == 'gpt': # Skip the first one if it is from gpt messages = messages[1:] + # system_v = '' for msg in messages: if msg['from'] == 'human': if DEFAULT_IMAGE_TOKEN in msg['value']: @@ -40,7 +41,11 @@ def llava_map_fn(example): elif msg['from'] == 'gpt': conversation.append({'input': input, 'output': msg['value']}) + # conversation.append({'input': input, 'output': msg['value'], 'system': system_v}) input = '' + # system_v = '' + # elif msg['from'] == 'system': + # system_v = msg['value'] else: raise NotImplementedError return {'conversation': conversation} diff --git a/xtuner/dataset/mini_gemini_dataset.py b/xtuner/dataset/mini_gemini_dataset.py new file mode 100644 index 000000000..33ad914ec --- /dev/null +++ b/xtuner/dataset/mini_gemini_dataset.py @@ -0,0 +1,77 @@ +from .llava import LLaVADataset +import torch +from PIL import Image +import os +from .utils import expand2square +import numpy as np + + +class MiniGeminiDataset(LLaVADataset): + # siglip 864 + # clip 768 + def __init__(self, *args, image_size_aux=768, **kwargs): + self.image_size_aux = image_size_aux + super().__init__(*args, **kwargs) + + self._model_name = type(self.image_processor).__name__ + + if self._model_name == 'CLIPImageProcessor': + self.crop_size_raw = 
self.image_processor.crop_size.copy() + self.image_processor.crop_size['height'] = image_size_aux + self.image_processor.crop_size['width'] = image_size_aux + self.image_processor.size['shortest_edge'] = image_size_aux + else: + self.aux_mean = np.array([0.48145466, 0.4578275, 0.40821073]) + self.aux_std = np.array([0.26862954, 0.26130258, 0.27577711]) + + def __getitem__(self, index): + data_dict = self.text_data[index] + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + image = Image.open(os.path.join(self.image_folder, + image_file)).convert('RGB') + + if self._model_name == 'CLIPImageProcessor': + # clip 和 convnext 均值和方差一样,前处理相同,但是 siglip 不一致 + if self.pad_image_to_square: + image = expand2square(image, tuple(int(x * 255) for x in self.image_processor.image_mean)) + + image_aux = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values_aux'] = image_aux + + image = image_aux.clone() + image = torch.nn.functional.interpolate( + image[None], size=[self.crop_size_raw['height'], self.crop_size_raw['width']], mode='bilinear', + align_corners=False + )[0] + data_dict['pixel_values'] = image + else: + # siglip + image_aux = image + if self.pad_image_to_square: + image = expand2square( + image, + tuple( + int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values'] = image + + # aux image + if self.pad_image_to_square: + image_aux = expand2square( + image_aux, + tuple( + int(x * 255) for x in self.aux_mean)) + image_aux = image_aux.resize((self.image_size_aux, self.image_size_aux), resample=Image.BILINEAR) + image_aux = np.array(image_aux) # H, W, 3 + image_aux = image_aux / 255.0 + image_aux = (image_aux - self.aux_mean) / self.aux_std + image_aux = torch.tensor(image_aux).permute(2, 0, 1) + data_dict['pixel_values_aux'] = image_aux + else: + data_dict['pixel_values_aux'] = torch.zeros(3, self.image_size_aux, self.image_size_aux) + if self._model_name == 'CLIPImageProcessor': + data_dict['pixel_values'] = torch.zeros(3, self.crop_size_raw['height'], + self.crop_size_raw['width']) + return data_dict diff --git a/xtuner/dataset/mini_gemini_proxy_eval_dataset.py b/xtuner/dataset/mini_gemini_proxy_eval_dataset.py new file mode 100644 index 000000000..b7d93cd74 --- /dev/null +++ b/xtuner/dataset/mini_gemini_proxy_eval_dataset.py @@ -0,0 +1,95 @@ +from xtuner.dataset.utils import expand2square +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX) +import torch +from PIL import Image +import os +from xtuner.tools.utils import is_cn_string +import numpy as np + + +class MiniGeminiProxyEvalDataset: + def __init__(self, eval_dataset, image_size_aux=768): + self.eval_ds = eval_dataset + + self._model_name = type(eval_dataset.image_processor).__name__ + + if self._model_name == 'CLIPImageProcessor': + self.crop_size_raw = eval_dataset.image_processor.crop_size.copy() + self.eval_ds.image_processor.crop_size['height'] = image_size_aux + self.eval_ds.image_processor.crop_size['width'] = image_size_aux + self.eval_ds.image_processor.size['shortest_edge'] = image_size_aux + else: + self.aux_mean = np.array([0.48145466, 0.4578275, 0.40821073]) + self.aux_std = np.array([0.26862954, 0.26130258, 0.27577711]) + + def getitem(self, idx, data): + data_dict = {'img_id': data['img_id']} + + # 1 prepare text + if self.eval_ds.metainfo['name'] == 'multiple_choice': + # MultipleChoiceDataset + if data['context'] is 
not None: + text = data['context'] + '\n' + data[ + 'question'] + '\n' + data['options'] + else: + text = data['question'] + '\n' + data['options'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if is_cn_string(text): + text = text + '请直接回答选项字母。' + else: + text = text + ("Answer with the option's letter from the " + 'given choices directly.') + else: + text = data['question'] + text = DEFAULT_IMAGE_TOKEN + '\n' + text + + if self.eval_ds.use_system: + inputs = self.eval_ds.template.get('SYSTEM', '{system}').format(system='') + else: + inputs = '' + inputs += self.eval_ds.template['INSTRUCTION'].format(input=text, round=1) + + # 2 tokenize inputs + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.eval_ds.tokenizer.encode(chunk) + else: + cur_encode = self.eval_ds.tokenizer.encode(chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + ids.append(IMAGE_TOKEN_INDEX) + ids = torch.tensor(ids) + data_dict['input_ids'] = ids + + # 3 process image + if self.eval_ds.metainfo['name'] in ['mme', 'textvqa', 'gqa']: + # MMEDataset or TextVQADataset + image = Image.open(os.path.join(self.eval_ds.image_folder, + data['image_path'])).convert('RGB') + else: + image = self.eval_ds.get_image(data['img']).convert('RGB') + + if self._model_name == 'CLIPImageProcessor': + # clip 和 convnext 均值和方差一样,前处理相同,但是 siglip 不一致 + if self.eval_ds.pad_image_to_square: + image = expand2square(image, tuple(int(x * 255) for x in self.eval_ds.image_processor.image_mean)) + + image_aux = self.eval_ds.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + data_dict['pixel_values_aux'] = image_aux + + image = image_aux.clone() + image = torch.nn.functional.interpolate( + image[None], size=[self.crop_size_raw['height'], self.crop_size_raw['width']], mode='bilinear', + align_corners=False + )[0] + data_dict['pixel_values'] = image + else: + raise NotImplementedError + + return data_dict diff --git a/xtuner/dataset/utils.py b/xtuner/dataset/utils.py index 84336ddb2..f044e5c89 100644 --- a/xtuner/dataset/utils.py +++ b/xtuner/dataset/utils.py @@ -4,7 +4,8 @@ import io from io import BytesIO from itertools import chain - +import torch +import math import numpy as np import requests from PIL import Image @@ -269,3 +270,318 @@ def decode_base64_to_image(base64_string): image_data = base64.b64decode(base64_string) image = Image.open(io.BytesIO(image_data)) return image + + +# ---------------------------------------------------------------------- +# ref: https://github.com/haotian-liu/LLaVA +def select_best_resolution(original_size, possible_resolutions): + """Selects the best resolution from a list of possible resolutions based on + the original size. + + Args: + original_size (tuple): The original size of the image in the format + (width, height). + possible_resolutions (list): A list of possible resolutions in + the format [(width1, height1), (width2, height2), ...]. + + Returns: + tuple: The best fit resolution in the format (width, height). 
+ """ + original_width, original_height = original_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float('inf') + + for width, height in possible_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int(original_width * scale), int( + original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, + original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or ( + effective_resolution == max_effective_resolution + and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + +def resize_and_pad_image(image, target_resolution,pad_mean): + """Resize and pad an image to a target resolution while maintaining aspect + ratio. + + Args: + image (PIL.Image.Image): The input image. + target_resolution (tuple): The target resolution (width, height) of + the image. + + Returns: + PIL.Image.Image: The resized and padded image. + """ + original_width, original_height = image.size + target_width, target_height = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + # Resize the image + resized_image = image.resize((new_width, new_height)) + + new_image = Image.new('RGB', (target_width, target_height), pad_mean) + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + # 居中 padding + new_image.paste(resized_image, (paste_x, paste_y)) + + return new_image + + +def divide_to_patches(image, patch_size): + """Divides an image into patches of a specified size. + + Args: + image (PIL.Image.Image): The input image. + patch_size (int): The size of each patch. + + Returns: + list: A list of PIL.Image.Image objects representing the patches. + """ + patches = [] + width, height = image.size + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + box = (j, i, j + patch_size, i + patch_size) + patch = image.crop(box) + patches.append(patch) + + return patches + + +def process_anyres_image(image, processor, possible_resolutions, patch_size, shortest_edge, pad_mean=(0, 0, 0), orig_img_pad_to_square=False): + """Process an image with variable resolutions. + + Args: + image (PIL.Image.Image): The input image to be processed. + processor: The image processor object. + possible_resolutions (str): A string representation of a list of + possible resolutions. + + Returns: + torch.Tensor: A tensor containing the processed image patches. 
+ """ + best_resolution = select_best_resolution(image.size, possible_resolutions) + image_padded = resize_and_pad_image(image, best_resolution, pad_mean) + + patches = divide_to_patches(image_padded, patch_size) + + if orig_img_pad_to_square: + # 不是居中 padding + image = expand2square(image, pad_mean) + + image_original_resize = image.resize((shortest_edge, shortest_edge)) + + image_patches = [image_original_resize] + patches + image_patches = [ + processor.preprocess(image_patch, + return_tensors='pt')['pixel_values'][0] + for image_patch in image_patches + ] + return torch.stack(image_patches, dim=0) + + +def get_anyres_image_grid_shape(image_size, possible_resolutions, patch_size): + """Calculate the shape of the image patch grid after the preprocessing for + images of any resolution. + + Args: + image_size (tuple): The size of the input image in the format + (width, height). + possible_resolutions (list): A string representation of a list of + possible resolutions. + patch_size (int): The size of each image patch. + + Returns: + tuple: The shape of the image patch grid in the format (width, height). + """ + width, height = select_best_resolution(image_size, possible_resolutions) + return width // patch_size, height // patch_size + + +def unpad_image(tensor, original_size): + """Unpads a PyTorch tensor of a padded and resized image. + + Args: + tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format. + original_size (tuple): The original size of the image (height, width). + + Returns: + torch.Tensor: The unpadded image tensor. + """ + original_width, original_height = original_size + current_height, current_width = tensor.shape[1:] + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + unpadded_tensor = tensor[:, padding:current_height - padding, :] + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + unpadded_tensor = tensor[:, :, padding:current_width - padding] + + return unpadded_tensor +# ---------------------------------------------------------------------- + +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def total_image_token(orig_size, min_num=1, max_num=6, image_size=336, patch_size=24, use_thumbnail=True): + orig_width, orig_height = orig_size + + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + max_num >= i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + blocks = 
target_aspect_ratio[0] * target_aspect_ratio[1] + + if use_thumbnail: + blocks += 1 + + return blocks*patch_size*patch_size + + +def dynamic_preprocess(image, min_num=1, max_num=6, image_size=336, use_thumbnail=True, use_patch=True): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + if not use_patch: + processed_images = [] + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + max_num >= i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def internvl_1_5_encode_fn(example, + tokenizer, + max_length, + input_ids_with_output=True, + with_image_token=False, + min_num=1, + max_num=6, + image_size=336, + patch_size=12, + use_patch=True): + """We only support the following three scenarios: + + 1. Incremental pretraining dataset. + example['conversation'] = [ + { + 'input': '', + 'output': '### Human: Can you write xxx' + } + ] + + 2. Single-turn conversation dataset. + example['conversation'] = [ + { + 'input': 'Give three tips for staying healthy.', + 'output': '1.Eat a balanced diet xxx' + } + ] + + 3. Multi-turn conversation dataset. 
+ example['conversation'] = [ + { + 'input': 'Give three tips for staying healthy.', + 'output': '1.Eat a balanced diet xxx' + }, + { + 'input': 'Please expand on the second point.', + 'output': 'Here is an expanded explanation of the xxx' + } + ] + """ + img_token = 0 + if 'image' in example: + if use_patch: + assert 'image_wh' in example + image_wh = example['image_wh'] + if image_wh is not None: + if isinstance(image_wh[0], int): + image_wh = [image_wh] + img_token = total_image_token(image_wh[0], min_num, max_num, image_size, patch_size) + else: + # clip + img_token = patch_size * patch_size + max_length = max_length - img_token + return encode_fn(example, tokenizer, max_length, input_ids_with_output, with_image_token) diff --git a/xtuner/engine/__init__.py b/xtuner/engine/__init__.py index 4f50972ea..18fd1d3c8 100644 --- a/xtuner/engine/__init__.py +++ b/xtuner/engine/__init__.py @@ -2,9 +2,11 @@ from ._strategy import DeepSpeedStrategy from .hooks import (DatasetInfoHook, EvaluateChatHook, ThroughputHook, VarlenAttnArgsToMessageHubHook) -from .runner import TrainLoop +from .runner import TrainLoop, ValLoop, TestLoop +from .optimizers import LearningRateDecayOptimWrapperConstructor __all__ = [ 'EvaluateChatHook', 'DatasetInfoHook', 'ThroughputHook', - 'VarlenAttnArgsToMessageHubHook', 'DeepSpeedStrategy', 'TrainLoop' + 'VarlenAttnArgsToMessageHubHook', 'DeepSpeedStrategy', 'TrainLoop', + 'ValLoop', 'TestLoop', 'LearningRateDecayOptimWrapperConstructor' ] diff --git a/xtuner/engine/hooks/dataset_info_hook.py b/xtuner/engine/hooks/dataset_info_hook.py index 76b49e6a9..d997373ec 100644 --- a/xtuner/engine/hooks/dataset_info_hook.py +++ b/xtuner/engine/hooks/dataset_info_hook.py @@ -42,16 +42,12 @@ def log(self, runner, dataset, mode='train'): def before_train(self, runner) -> None: do_train = runner.train_loop is not None do_eval = runner.val_loop is not None - do_test = runner.test_loop is not None if do_train: train_dataset = runner.train_dataloader.dataset self.log(runner, train_dataset, mode='train') if do_eval: eval_dataset = runner.val_dataloader.dataset self.log(runner, eval_dataset, mode='eval') - if do_test: - test_dataset = runner.test_dataloader.dataset - self.log(runner, test_dataset, mode='test') def before_val(self, runner) -> None: eval_dataset = runner.val_dataloader.dataset diff --git a/xtuner/engine/hooks/evaluate_chat_hook.py b/xtuner/engine/hooks/evaluate_chat_hook.py index 8e6a86822..fd6650705 100644 --- a/xtuner/engine/hooks/evaluate_chat_hook.py +++ b/xtuner/engine/hooks/evaluate_chat_hook.py @@ -16,7 +16,6 @@ class EvaluateChatHook(Hook): - priority = 'LOW' def __init__(self, @@ -108,52 +107,14 @@ def _eval_images(self, for sample_image, sample_input in zip(self.evaluation_images, self.evaluation_inputs): - image = expand2square( - sample_image, - tuple(int(x * 255) for x in self.image_processor.image_mean)) - image = self.image_processor.preprocess( - image, return_tensors='pt')['pixel_values'][0] - image = image.to(device) - sample_input = DEFAULT_IMAGE_TOKEN + '\n' + sample_input - inputs = (self.system + self.instruction).format( - input=sample_input, round=1, **runner.cfg) - chunk_encode = [] - for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): - if idx == 0: - cur_encode = self.tokenizer.encode(chunk) - else: - cur_encode = self.tokenizer.encode( - chunk, add_special_tokens=False) - chunk_encode.append(cur_encode) - assert len(chunk_encode) == 2 - input_ids = [] - for idx, cur_chunk_encode in enumerate(chunk_encode): - 
input_ids.extend(cur_chunk_encode) - if idx != len(chunk_encode) - 1: - input_ids.append(IMAGE_TOKEN_INDEX) - input_ids = torch.tensor(input_ids).to(device) - visual_outputs = model.visual_encoder( - image.unsqueeze(0).to(model.visual_encoder.dtype), - output_hidden_states=True) - pixel_values = model.projector( - visual_outputs.hidden_states[model.visual_select_layer][:, 1:]) - mm_inputs = prepare_inputs_labels_for_multimodal( - llm=model.llm, - input_ids=input_ids.unsqueeze(0), - pixel_values=pixel_values) - - generation_output = model.generate( - **mm_inputs, - max_new_tokens=max_new_tokens, - generation_config=self.gen_config, - bos_token_id=self.tokenizer.bos_token_id, - stopping_criteria=self.stop_criteria) - generation_output = self.tokenizer.decode(generation_output[0]) + generation_output = model.chat({'image': sample_image, 'text': sample_input}) + inputs = generation_output['inputs'] + prediction = generation_output['prediction'] runner.logger.info(f'Sample output:\n' - f'{inputs + generation_output}\n') + f'{inputs + prediction}\n') if save_eval_output: - eval_outputs.append(f'{inputs + generation_output}\n') + eval_outputs.append(f'{inputs + prediction}\n') if save_eval_output: self._save_eval_output(runner, eval_outputs) @@ -196,13 +157,11 @@ def _generate_samples(self, model = model.module device = next(iter(model.parameters())).device - is_checkpointing = model.llm.is_gradient_checkpointing - use_cache = model.llm.config.use_cache - # Cast to inference mode - model.activation_checkpointing_disable() - model.llm.config.use_cache = True + model.gradient_checkpointing_disable() model.eval() + model.preparing_for_generation({'generation_kwargs': {'max_new_tokens': max_new_tokens}}) + if self.evaluation_images is not None: self._eval_images(runner, model, device, max_new_tokens, save_eval_output) @@ -211,9 +170,7 @@ def _generate_samples(self, save_eval_output) # Cast to training mode - if is_checkpointing: - model.activation_checkpointing_enable() - model.llm.config.use_cache = use_cache + model.gradient_checkpointing_enable() model.train() def before_train(self, runner): @@ -231,7 +188,7 @@ def _is_save_checkpoint(self, runner): return False if checkpoint_hook.every_n_train_iters( - runner, checkpoint_hook.interval, checkpoint_hook.save_begin) or \ + runner, checkpoint_hook.interval, checkpoint_hook.save_begin) or \ (checkpoint_hook.save_last and checkpoint_hook.is_last_train_iter(runner)): return True @@ -249,8 +206,8 @@ def after_train_iter(self, save_eval_output = self._is_save_checkpoint(runner) do_chat = ( - save_eval_output - or self.every_n_train_iters(runner, self.every_n_iters)) + save_eval_output + or self.every_n_train_iters(runner, self.every_n_iters)) if not do_chat: return diff --git a/xtuner/engine/optimizers/__init__.py b/xtuner/engine/optimizers/__init__.py new file mode 100644 index 000000000..adf3acc80 --- /dev/null +++ b/xtuner/engine/optimizers/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
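# Illustrative sketch (not part of the patch): the dynamic tiling used by
# `dynamic_preprocess` / `total_image_token` / `internvl_1_5_encode_fn` earlier in
# this patch budgets roughly `tiles * patch_size**2` visual tokens per image. The
# helper below re-implements that estimate in a simplified, standalone form; the
# real `find_closest_aspect_ratio` may break ties differently.
def estimate_image_tokens(width, height, min_num=1, max_num=6,
                          patch_size=12, use_thumbnail=True):
    aspect_ratio = width / height
    # candidate (cols, rows) grids whose tile count lies in [min_num, max_num]
    ratios = sorted(
        {(i, j) for n in range(min_num, max_num + 1)
         for i in range(1, n + 1) for j in range(1, n + 1)
         if min_num <= i * j <= max_num},
        key=lambda r: r[0] * r[1])
    # pick the grid whose aspect ratio is closest to the input image
    cols, rows = min(ratios, key=lambda r: abs(aspect_ratio - r[0] / r[1]))
    blocks = cols * rows + (1 if use_thumbnail else 0)  # +1 global thumbnail tile
    return blocks * patch_size * patch_size


print(estimate_image_tokens(1024, 768))  # 3x2 grid + thumbnail -> 7 tiles * 144 tokens = 1008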
+from .layer_decay_optim_wrapper_constructor import LearningRateDecayOptimWrapperConstructor
+from .utils import get_layer_depth_for_CLIPVisionModel, get_layer_depth_for_InternVisionModel
+__all__ = [
+    'LearningRateDecayOptimWrapperConstructor', 'get_layer_depth_for_CLIPVisionModel',
+    'get_layer_depth_for_InternVisionModel'
+]
diff --git a/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py b/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py
new file mode 100644
index 000000000..707ea4556
--- /dev/null
+++ b/xtuner/engine/optimizers/layer_decay_optim_wrapper_constructor.py
@@ -0,0 +1,164 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import defaultdict
+from typing import Callable, List, Optional
+
+from mmengine.logging import MMLogger
+from mmengine.optim import DefaultOptimWrapperConstructor
+from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm
+from torch import nn
+from torch.nn import GroupNorm, LayerNorm
+from mmengine.registry import OPTIM_WRAPPER_CONSTRUCTORS
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class LearningRateDecayOptimWrapperConstructor(DefaultOptimWrapperConstructor):
+    """Set different learning rates for different layers of the backbone.
+
+    By default, each parameter shares the same optimizer settings, and we
+    provide an argument ``paramwise_cfg`` to specify parameter-wise settings.
+    It is a dict and may contain the following fields:
+
+    - ``layer_decay_rate`` (float): The learning rate of a parameter is
+      scaled by this factor according to the layer depth of the
+      parameter. It is usually less than 1, so that earlier layers
+      get a lower learning rate. Defaults to 1.
+    - ``bias_decay_mult`` (float): Multiplier for the weight decay of all
+      bias parameters (except for those in normalization layers).
+    - ``norm_decay_mult`` (float): Multiplier for the weight decay of all
+      weight and bias parameters of normalization layers.
+    - ``flat_decay_mult`` (float): Multiplier for the weight decay of all
+      one-dimensional parameters.
+    - ``custom_keys`` (dict): Specifies parameter-wise settings by keys. If
+      one of the keys in ``custom_keys`` is a substring of the name of one
+      parameter, then the setting of the parameter will be specified by
+      ``custom_keys[key]`` and other settings like ``bias_decay_mult`` will be
+      ignored. It should be a dict and may contain the field ``decay_mult``.
+      (``lr_mult`` is disabled in this constructor.)
+
+    Example:
+
+    In the config file, you can use this constructor as below:
+
+    .. code:: python
+
+        optim_wrapper = dict(
+            optimizer=dict(
+                type='AdamW',
+                lr=4e-3,
+                weight_decay=0.05,
+                eps=1e-8,
+                betas=(0.9, 0.999)),
+            constructor='LearningRateDecayOptimWrapperConstructor',
+            paramwise_cfg=dict(
+                layer_decay_rate=0.75,  # layer-wise lr decay factor
+                norm_decay_mult=0.,
+                flat_decay_mult=0.,
+                custom_keys={
+                    '.cls_token': dict(decay_mult=0.0),
+                    '.pos_embed': dict(decay_mult=0.0)
+                }))
+    """
+    def add_params(self,
+                   params: List[dict],
+                   module: nn.Module,
+                   prefix: str = '',
+                   get_layer_depth: Optional[Callable] = None,
+                   **kwargs) -> None:
+        """Add all parameters of module to the params list.
+
+        The parameters of the given module will be added to the list of param
+        groups, with specific rules defined by paramwise_cfg.
+
+        Args:
+            params (List[dict]): A list of param groups; it will be modified
+                in place.
+            module (nn.Module): The module to be added.
+ optimizer_cfg (dict): The configuration of optimizer. + prefix (str): The prefix of the module. + """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + logger = MMLogger.get_current_instance() + + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None) + flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None) + decay_rate = self.paramwise_cfg.get('layer_decay_rate', 1.0) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + + # The model should have `get_layer_depth` method + if get_layer_depth is None and not hasattr(module, 'get_layer_depth'): + raise NotImplementedError('The layer-wise learning rate decay need' + f' the model {type(module)} has' + ' `get_layer_depth` method.') + else: + get_layer_depth = get_layer_depth or module.get_layer_depth + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + param_name = prefix + name + if not param.requires_grad: + continue + + if self.base_wd is not None: + base_wd = self.base_wd + custom_key = next( + filter(lambda k: k in param_name, sorted_keys), None) + # custom parameters decay + if custom_key is not None: + custom_cfg = custom_keys[custom_key].copy() + decay_mult = custom_cfg.pop('decay_mult', 1.) + + param_group['weight_decay'] = base_wd * decay_mult + # add custom settings to param_group + param_group.update(custom_cfg) + # norm decay + elif is_norm and norm_decay_mult is not None: + param_group['weight_decay'] = base_wd * norm_decay_mult + # bias decay + elif name == 'bias' and bias_decay_mult is not None: + param_group['weight_decay'] = base_wd * bias_decay_mult + # flatten parameters decay + elif param.ndim == 1 and flat_decay_mult is not None: + param_group['weight_decay'] = base_wd * flat_decay_mult + else: + param_group['weight_decay'] = base_wd + + layer_id, max_id = get_layer_depth(param_name) + scale = decay_rate**(max_id - layer_id - 1) + param_group['lr'] = self.base_lr * scale + param_group['lr_scale'] = scale + param_group['layer_id'] = layer_id + param_group['param_name'] = param_name + + params.append(param_group) + + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}{child_name}.' + self.add_params( + params, + child_mod, + prefix=child_prefix, + get_layer_depth=get_layer_depth, + ) + + if prefix == '': + layer_params = defaultdict(list) + for param in params: + layer_params[param['layer_id']].append(param) + for layer_id, layer_params in layer_params.items(): + lr_scale = layer_params[0]['lr_scale'] + lr = layer_params[0]['lr'] + msg = [ + f'layer {layer_id} params ' + f'(lr={lr:.3g}, lr_scale={lr_scale:.3g}):' + ] + for param in layer_params: + msg.append(f'\t{param["param_name"]}: ' + f'weight_decay={param["weight_decay"]:.3g}') + logger.debug('\n'.join(msg)) diff --git a/xtuner/engine/optimizers/utils.py b/xtuner/engine/optimizers/utils.py new file mode 100644 index 000000000..6cf2d0677 --- /dev/null +++ b/xtuner/engine/optimizers/utils.py @@ -0,0 +1,71 @@ + +def get_layer_depth_for_CLIPVisionModel(self, param_name: str, prefix: str = 'vision_model.'): + """Get the layer-wise depth of a parameter. + + Args: + param_name (str): The name of the parameter. 
+ prefix (str): The prefix for the parameter. + Defaults to an empty string. + + Returns: + Tuple[int, int]: The layer-wise depth and the num of layers. + + Note: + The first depth is the stem module (``layer_depth=0``), and the + last depth is the subsequent module (``layer_depth=num_layers-1``) + """ + num_layers = self.config.num_hidden_layers + 2 + + if not param_name.startswith(prefix): + # For subsequent module like head + return num_layers - 1, num_layers + + param_name = param_name[len(prefix):] + + if param_name.startswith('embeddings'): + layer_depth = 0 + elif param_name.startswith('pre_layrnorm'): + layer_depth = 0 + elif param_name.startswith('encoder.layers'): + layer_id = int(param_name.replace('encoder.', '').split('.')[1]) + layer_depth = layer_id + 1 + else: + layer_depth = num_layers - 1 + + return layer_depth, num_layers + + +def get_layer_depth_for_InternVisionModel(self, param_name: str, prefix: str = ''): + """Get the layer-wise depth of a parameter. + + Args: + param_name (str): The name of the parameter. + prefix (str): The prefix for the parameter. + Defaults to an empty string. + + Returns: + Tuple[int, int]: The layer-wise depth and the num of layers. + + Note: + The first depth is the stem module (``layer_depth=0``), and the + last depth is the subsequent module (``layer_depth=num_layers-1``) + """ + num_layers = self.config.num_hidden_layers + 2 + + if not param_name.startswith(prefix): + # For subsequent module like head + return num_layers - 1, num_layers + + param_name = param_name[len(prefix):] + + if param_name.startswith('embeddings'): + layer_depth = 0 + elif param_name.startswith('pre_layrnorm'): + layer_depth = 0 + elif param_name.startswith('encoder.layers'): + layer_id = int(param_name.replace('encoder.', '').split('.')[1]) + layer_depth = layer_id + 1 + else: + layer_depth = num_layers - 1 + + return layer_depth, num_layers \ No newline at end of file diff --git a/xtuner/engine/runner/__init__.py b/xtuner/engine/runner/__init__.py index d8d1c582b..c621e5f1b 100644 --- a/xtuner/engine/runner/__init__.py +++ b/xtuner/engine/runner/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. 
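# Illustrative sketch (not applied by this patch): how the constructor above turns
# a parameter's layer depth into its effective learning rate. The numbers are made
# up; `scale = layer_decay_rate ** (num_layers - layer_id - 1)` mirrors the rule
# used in `add_params`.
base_lr = 2e-5
layer_decay_rate = 0.75      # paramwise_cfg['layer_decay_rate']
num_layers = 24 + 2          # e.g. CLIP-L/14: 24 encoder layers + stem + head

for layer_id in range(num_layers):
    scale = layer_decay_rate ** (num_layers - layer_id - 1)
    print(f'layer {layer_id:2d}: lr_scale={scale:.3f}, lr={base_lr * scale:.2e}')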
-from .loops import TrainLoop +from .loops import TrainLoop, ValLoop, TestLoop -__all__ = ['TrainLoop'] +__all__ = ['TrainLoop', 'ValLoop', 'TestLoop'] diff --git a/xtuner/engine/runner/loops.py b/xtuner/engine/runner/loops.py index aeb6be31a..558d9d16d 100644 --- a/xtuner/engine/runner/loops.py +++ b/xtuner/engine/runner/loops.py @@ -2,7 +2,16 @@ from typing import Dict, Optional, Union from mmengine.runner import IterBasedTrainLoop +from mmengine.runner import ValLoop as MMENGINE_ValLoop from torch.utils.data import DataLoader +from typing import Sequence +from mmengine.dist import broadcast_object_list, is_main_process, get_world_size, get_rank, barrier, collect_results +import math +import torch +from mmengine.model import is_model_wrapper + +TORCH_DTYPE_MAP = dict( + fp16=torch.float16, bf16=torch.bfloat16, fp32=torch.float32, auto='auto') class TrainLoop(IterBasedTrainLoop): @@ -38,3 +47,193 @@ def __init__(self, raise NotImplementedError super().__init__( runner=runner, dataloader=dataloader, max_iters=iters, **kwargs) + + +class ValLoop(MMENGINE_ValLoop): + def __init__(self, runner, dataloader, evaluator=None, torch_dtype='fp16', select_metric='first') -> None: + # must be concatset + super(MMENGINE_ValLoop, self).__init__(runner, dataloader) + self.collate_fn = self.dataloader.collate_fn + self._runner = runner + self.torch_dtype = torch_dtype + if torch_dtype is not None: + self.torch_dtype = TORCH_DTYPE_MAP[torch_dtype] + self.select_metric = select_metric + + def run(self) -> dict: + """Launch validation.""" + self.runner.logger.info('==================== Start val loop ===================') + self.runner.call_hook('before_val') + self.runner.call_hook('before_val_epoch') + + if is_model_wrapper(self.runner.model): + model = self.runner.model.module + else: + model = self.runner.model + + model.gradient_checkpointing_disable() + model.eval() + + rank = get_rank() + metrics = [] + # Ensure that eta and log are displayed correctly. + current_run_total_ids = 0 + for _, dataset in enumerate(self.dataloader.dataset.datasets): + model.preparing_for_generation(dataset.metainfo) + + results = [] + n_samples = len(dataset) + per_rank_samples = math.ceil(n_samples / get_world_size()) + per_rank_ids = range(per_rank_samples * rank, + min(n_samples, per_rank_samples * (rank + 1))) + for idx in per_rank_ids: + data_batch = dataset[idx] + # TODO: Only bs=1 is currently supported temporarily + data_batch = self.collate_fn([data_batch]) + self.run_iter(current_run_total_ids, data_batch, results) + current_run_total_ids += 1 + + barrier() + self.runner.logger.info('==================== Start collect results ===================') + results = collect_results(results, len(dataset)) + self.runner.logger.info('========= Starting the evaluation of a data ===========') + if is_main_process(): + metric = dataset.evaluate(results, self.runner.work_dir) + objects = [metric] + else: + objects = [None] + broadcast_object_list(objects) + metric = objects[0] + metrics.append(metric) + + # select metrics + if self.select_metric == 'first': + metrics = metrics[0] + else: + raise NotImplementedError + + self.runner.logger.info('================ Ending val loop ================') + self.runner.call_hook('after_val_epoch', metrics=metrics) + self.runner.call_hook('after_val') + model.gradient_checkpointing_enable() + model.train() + return metrics + + @torch.no_grad() + def run_iter(self, idx, data_batch: Sequence[dict], results: list): + """Iterate one mini-batch. 
+ + Args: + data_batch (Sequence[dict]): Batch of data + from dataloader. + """ + assert 'img_id' in data_batch['data'], 'img_id is required in data_batch. ' \ + 'The __getitem__ function in the dataset must ' \ + 'return a dictionary with the img_id.' + prediction = {'img_id': data_batch['data']['img_id'][0]} + + self.runner.call_hook( + 'before_val_iter', batch_idx=idx, data_batch=data_batch) + + # outputs should be sequence of BaseDataElement + outputs = self.runner.model.val_step(data_batch) + prediction.update(outputs) + results.append(prediction) + + self.runner.call_hook( + 'after_val_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=outputs) + + +class TestLoop(ValLoop): + def run(self) -> dict: + """Launch validation.""" + self.runner.logger.info('==================== Start test loop ===================') + self.runner.call_hook('before_test') + self.runner.call_hook('before_test_epoch') + + if is_model_wrapper(self.runner.model): + model = self.runner.model.module + else: + model = self.runner.model + + model.gradient_checkpointing_disable() + model.eval() + + if self.torch_dtype is not None: + self.runner.logger.info(f'Convert model dtype to {self.torch_dtype}') + model.to(self.torch_dtype) + + rank = get_rank() + metrics = [] + # Ensure that eta and log are displayed correctly. + current_run_total_ids = 0 + for _, dataset in enumerate(self.dataloader.dataset.datasets): + model.preparing_for_generation(dataset.metainfo) + + results = [] + n_samples = len(dataset) + per_rank_samples = math.ceil(n_samples / get_world_size()) + per_rank_ids = range(per_rank_samples * rank, + min(n_samples, per_rank_samples * (rank + 1))) + for idx in per_rank_ids: + data_batch = dataset[idx] + # TODO: Only bs=1 is currently supported temporarily + data_batch = self.collate_fn([data_batch]) + self.run_iter(current_run_total_ids, data_batch, results) + current_run_total_ids += 1 + + barrier() + self.runner.logger.info('==================== Start collect results ===================') + results = collect_results(results, len(dataset)) + self.runner.logger.info('========= Starting the evaluation of a data ===========') + + if is_main_process(): + metric = dataset.evaluate(results, self.runner.work_dir) + objects = [metric] + else: + objects = [None] + broadcast_object_list(objects) + metric = objects[0] + metrics.append(metric) + + # select metrics + if self.select_metric == 'first': + metrics = metrics[0] + else: + raise NotImplementedError + self.runner.call_hook('after_test_epoch', metrics=metrics) + self.runner.call_hook('after_test') + self.runner.logger.info('================ Ending test loop ================') + # model.gradient_checkpointing_enable() + # model.train() + return metrics + + @torch.no_grad() + def run_iter(self, idx, data_batch: Sequence[dict], results: list): + """Iterate one mini-batch. + + Args: + data_batch (Sequence[dict]): Batch of data + from dataloader. + """ + assert 'img_id' in data_batch['data'], 'img_id is required in data_batch. ' \ + 'The __getitem__ function in the dataset must ' \ + 'return a dictionary with the img_id.' 
+ prediction = {'img_id': data_batch['data']['img_id'][0]} + + self.runner.call_hook( + 'before_test_iter', batch_idx=idx, data_batch=data_batch) + + # outputs should be sequence of BaseDataElement + outputs = self.runner.model.val_step(data_batch) + prediction.update(outputs) + results.append(prediction) + + self.runner.call_hook( + 'after_test_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=outputs) diff --git a/xtuner/model/__init__.py b/xtuner/model/__init__.py index 39547b2d7..241d86e24 100644 --- a/xtuner/model/__init__.py +++ b/xtuner/model/__init__.py @@ -1,5 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .llava import LLaVAModel from .sft import SupervisedFinetune +from .anyres_llava import AnyResLLaVAModel +from .mini_gemini import MiniGeminiModel +from .internvl_1_5_llava import InternVL_v1_5_LLaVAModel +from .openai import OpenAIModel -__all__ = ['SupervisedFinetune', 'LLaVAModel'] +__all__ = ['SupervisedFinetune', 'LLaVAModel', 'AnyResLLaVAModel', 'MiniGeminiModel', 'InternVL_v1_5_LLaVAModel', 'OpenAIModel'] diff --git a/xtuner/model/anyres_llava.py b/xtuner/model/anyres_llava.py new file mode 100644 index 000000000..f92b9bc55 --- /dev/null +++ b/xtuner/model/anyres_llava.py @@ -0,0 +1,298 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from ..dataset.utils import get_anyres_image_grid_shape, unpad_image +from .llava import LLaVAModel +from collections import OrderedDict +import torch + +from xtuner.registry import BUILDER +from .modules import ProjectorConfig, ProjectorModel, dispatch_modules +from .utils import (LoadWoInit, + get_peft_model_state_dict, guess_load_checkpoint, + make_inputs_require_grad, + prepare_inputs_labels_for_multimodal) + +import torch.nn as nn + + +class AnyResLLaVAModel(LLaVAModel): + + def __init__(self, llm, + visual_encoder, + freeze_llm=False, + freeze_visual_encoder=False, + visual_select_layer=-2, + pretrained_pth=None, + projector_depth=2, + llm_lora=None, + visual_encoder_lora=None, + use_activation_checkpointing=True, + max_position_embeddings=None, + image_processor=None, + tokenizer=None, + template=None, + image_grid_pinpoints=None, + merge_type='simple', # or pixel_shuffle + token_merge_ratio=4): + super(LLaVAModel, self).__init__() + self.freeze_llm = freeze_llm + self.freeze_visual_encoder = freeze_visual_encoder + self.merge_type = merge_type + with LoadWoInit(): + if isinstance(llm, dict): + llm = self._dispatch_lm_model_cfg(llm, max_position_embeddings) + + self.llm = self._build_from_cfg_or_module(llm) + self.visual_encoder = self._build_from_cfg_or_module( + visual_encoder) + self.llm.config.use_cache = False + dispatch_modules(self.llm) + + if token_merge_ratio == -1: + projector_config = ProjectorConfig( + visual_hidden_size=self.visual_encoder.config.hidden_size, + llm_hidden_size=self.llm.config.hidden_size, + depth=projector_depth) + else: + projector_config = ProjectorConfig( + visual_hidden_size=self.visual_encoder.config.hidden_size * token_merge_ratio, + llm_hidden_size=self.llm.config.hidden_size, + depth=projector_depth) + self.projector = ProjectorModel(projector_config).to( + self.visual_encoder.dtype) + + if self.freeze_llm: + self.llm.requires_grad_(False) + if self.freeze_visual_encoder: + self.visual_encoder.requires_grad_(False) + + self.use_activation_checkpointing = use_activation_checkpointing + if use_activation_checkpointing: + # For backward compatibility + if hasattr(self.llm, 'enable_input_require_grads'): + self.llm.enable_input_require_grads() + else: + 
self.llm.get_input_embeddings().register_forward_hook( + make_inputs_require_grad) + if hasattr(self.visual_encoder, 'enable_input_require_grads'): + self.visual_encoder.enable_input_require_grads() + else: + self.visual_encoder.get_input_embeddings( + ).register_forward_hook(make_inputs_require_grad) + self.projector.enable_input_require_grads() + + # enable gradient (activation) checkpointing for memory efficiency + self.gradient_checkpointing_enable() + + self.use_llm_lora = llm_lora is not None + self.use_visual_encoder_lora = visual_encoder_lora is not None + + if self.use_llm_lora: + self._prepare_llm_for_lora(llm_lora, use_activation_checkpointing) + if self.use_visual_encoder_lora: + self._prepare_visual_encoder_for_lora( + visual_encoder_lora, use_activation_checkpointing) + + if pretrained_pth is not None: + pretrained_state_dict = guess_load_checkpoint(pretrained_pth) + + self.load_state_dict(pretrained_state_dict, strict=False) + print(f'Load pretrained weight from {pretrained_pth}') + + self.visual_select_layer = visual_select_layer + + self._is_init = True + + self.tokenizer = tokenizer + if tokenizer is not None: + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = image_processor + if image_processor is not None: + self.image_processor = BUILDER.build(image_processor) + self.template = template + + self.token_merge_ratio = token_merge_ratio + self.image_newline = nn.Parameter( + torch.randn( + self.llm.config.hidden_size, dtype=self.visual_encoder.dtype)) + self.image_grid_pinpoints = image_grid_pinpoints + self.mm_patch_merge_type = 'spatial_unpad' + self.image_aspect_ratio = 'anyres' + + def state_dict(self, *args, **kwargs): + state_dict = super(LLaVAModel, self).state_dict(*args, **kwargs) + to_return = OrderedDict() + # Step 1. visual_encoder + if self.use_visual_encoder_lora: + to_return.update( + get_peft_model_state_dict( + self.visual_encoder, state_dict=state_dict)) + elif not self.freeze_visual_encoder: + to_return.update({ + k: v + for k, v in state_dict.items() if 'visual_encoder.' in k + }) + # Step 2. LLM + if self.use_llm_lora: + to_return.update( + get_peft_model_state_dict(self.llm, state_dict=state_dict)) + elif not self.freeze_llm: + to_return.update( + {k: v + for k, v in state_dict.items() if 'llm.' in k}) + # Step 3. Projector + to_return.update( + {k: v + for k, v in state_dict.items() if 'projector.' in k}) + # Step 4. Image Newline + to_return.update( + {k: v + for k, v in state_dict.items() if 'image_newline.' 
in k})
+        return to_return
+
+    def _prepare_data_for_llm(self, data):
+        if 'pixel_values' in data:
+            new_image_feature = self.__preprocess_for_pixel_values(data)
+            data['pixel_values'] = new_image_feature
+            data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)
+        return data
+
+    def __preprocess_for_pixel_values(self, data):
+        orig_sizes = data['orig_size']
+        pixel_values = data['pixel_values']
+
+        if type(pixel_values) is list or pixel_values.ndim == 5:
+            if type(pixel_values) is list:
+                pixel_values = [
+                    x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values
+                ]
+            # b*n, c, h, w
+            concat_images = torch.cat([image for image in pixel_values], dim=0)
+        else:
+            raise NotImplementedError()
+
+        # b*n, 27*27, d
+        visual_outputs = self.visual_encoder(
+            concat_images.to(self.visual_encoder.dtype), output_hidden_states=True)
+
+        if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel':
+            visual_outputs = visual_outputs.hidden_states[self.visual_select_layer][:, 1:]
+        elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel':
+            visual_outputs = visual_outputs.hidden_states[self.visual_select_layer]
+        else:
+            raise NotImplementedError
+
+        bs, pn, hs = visual_outputs.shape
+        # token merge
+        if self.token_merge_ratio != -1:
+            # a 27x27 grid is not evenly divisible by the merge ratio, so it is special-cased here
+            if pn == 27 * 27:
+                if self.merge_type == 'simple':
+                    # simply drop the last token so the count divides evenly (assumes a merge ratio of 4)
+                    visual_outputs = visual_outputs[:, :-1]
+                    visual_outputs = visual_outputs.reshape(bs, (pn - 1) // self.token_merge_ratio, int(hs * 4))
+                else:
+                    # otherwise we have to pad with extra tokens
+                    h_ratio = w_ratio = int(self.token_merge_ratio ** 0.5)
+                    visual_outputs = visual_outputs.reshape(bs, 27, 27, -1)
+                    # pad the grid to 28x28
+                    visual_outputs = torch.cat(
+                        (visual_outputs, torch.zeros(bs, 1, 27, hs, device=visual_outputs.device, dtype=visual_outputs.dtype)), dim=1)
+                    visual_outputs = torch.cat(
+                        (visual_outputs, torch.zeros(bs, 28, 1, hs, device=visual_outputs.device, dtype=visual_outputs.dtype)), dim=2)
+
+                    # B, H, W // w_r, C * w_r
+                    visual_outputs = visual_outputs.view(bs, 28, 28 // w_ratio, hs * w_ratio)
+                    # B, W // w_r, H, C * w_r
+                    visual_outputs = visual_outputs.permute(0, 2, 1, 3).contiguous()
+                    # B, W // w_r, H // h_r, C * w_r * h_r
+                    visual_outputs = visual_outputs.view(bs, 28 // w_ratio, 28 // h_ratio,
+                                                         hs * w_ratio * h_ratio)
+                    # B, W * H // w_r // h_r, C * w_r * h_r
+                    visual_outputs = visual_outputs.view(bs, 28 * 28 // w_ratio // h_ratio,
+                                                         hs * w_ratio * h_ratio).contiguous()
+
+        # b*n, 182, d
+        image_features = self.projector(visual_outputs)
+
+        split_sizes = [image.shape[0] for image in pixel_values]
+        image_features = torch.split(image_features, split_sizes, dim=0)
+
+        new_image_feature = []
+        if self.token_merge_ratio == -1:
+            for image_idx, image_feature in enumerate(image_features):
+                if image_feature.shape[0] > 1:
+                    base_image_feature = image_feature[0]
+                    image_feature = image_feature[1:]
+                    height = width = self.visual_encoder.config.image_size \
+                        // self.visual_encoder.config.patch_size
+                    assert height * width == base_image_feature.shape[0]
+                    if self.image_aspect_ratio == 'anyres':
+                        num_patch = get_anyres_image_grid_shape(
+                            orig_sizes[image_idx], self.image_grid_pinpoints,
+                            self.visual_encoder.config.image_size)
+                        num_patch_width, num_patch_height = num_patch
+                        image_feature = image_feature.view(num_patch_height,
+                                                           num_patch_width, height,
+                                                           width, -1)
+                    else:
+                        raise NotImplementedError
+
+                    if 'unpad' in self.mm_patch_merge_type:
+                        image_feature = image_feature.permute(4, 0, 2, 1,
+                                                              3).contiguous()
+                        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+                    image_feature = unpad_image(image_feature,
+                                                orig_sizes[image_idx])
+                    image_feature = torch.cat(
+                        (image_feature,
+                         self.image_newline[:, None, None].expand(
+                             *image_feature.shape[:-1], 1)),
+                        dim=-1)
+                    image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+                else:
+                    image_feature = image_feature.permute(0, 2, 1, 3,
+                                                          4).contiguous()
+                    image_feature = image_feature.flatten(0, 3)
+                image_feature = torch.cat((base_image_feature, image_feature),
+                                          dim=0)
+            else:
+                image_feature = image_feature[0]
+                if 'unpad' in self.mm_patch_merge_type:
+                    image_feature = torch.cat(
+                        (image_feature, self.image_newline[None]), dim=0)
+            new_image_feature.append(image_feature)
+        else:
+            # unpad is hard to apply after token merging, so it is not supported for now
+            new_image_feature = []
+            for image_idx, image_feature in enumerate(image_features):
+                if image_feature.shape[0] > 1:
+                    # 182, d
+                    base_image_feature = image_feature[0]
+                    # 183, d
+                    base_image_feature = torch.cat(
+                        (base_image_feature, self.image_newline[None]), dim=0)
+
+                    # n, 182, d
+                    image_feature = image_feature[1:]
+
+                    # n, 182+1, d
+                    image_feature = torch.cat(
+                        (image_feature,
+                         self.image_newline[None, None].expand(
+                             image_feature.shape[0], 1, image_feature.shape[-1])),
+                        dim=1)
+
+                    # n*183, d
+                    image_feature = image_feature.flatten(0, 1)
+                    image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+                    new_image_feature.append(image_feature)
+                else:
+                    # 182, d
+                    image_feature = image_feature[0]
+                    # 183, d
+                    image_feature = torch.cat(
+                        (image_feature, self.image_newline[None]), dim=0)
+
+                    new_image_feature.append(image_feature)
+        return new_image_feature
diff --git a/xtuner/model/internvl_1_5_llava.py b/xtuner/model/internvl_1_5_llava.py
new file mode 100644
index 000000000..173754648
--- /dev/null
+++ b/xtuner/model/internvl_1_5_llava.py
@@ -0,0 +1,259 @@
+# Copyright (c) OpenMMLab. All rights reserved.
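# Illustrative sketch (not part of the patch): the 2x2 token merge above is a pure
# reshape/permute over the ViT token grid, concatenating the channels of each 2x2
# block into one token. Shapes below assume an even 24x24 grid; the padded 27x27
# CLIP case is handled specially in the code above.
import torch

bs, h, w, c = 2, 24, 24, 1024
tokens = torch.randn(bs, h * w, c)

ratio = 4                                  # token_merge_ratio
r = int(ratio ** 0.5)                      # 2
x = tokens.view(bs, h, w, c)
x = x.view(bs, h, w // r, c * r)           # fold column pairs into channels
x = x.permute(0, 2, 1, 3).contiguous()     # swap H and W // r
x = x.view(bs, w // r, h // r, c * r * r)  # fold row pairs into channels
x = x.view(bs, h * w // ratio, c * ratio)
print(x.shape)                             # torch.Size([2, 144, 4096])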
+ +from .llava import LLaVAModel +import torch + +from xtuner.registry import BUILDER +from .modules import ProjectorConfig, ProjectorModel, dispatch_modules +from .utils import (LoadWoInit, guess_load_checkpoint, + make_inputs_require_grad, + prepare_inputs_labels_for_multimodal) + +from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel, get_layer_depth_for_InternVisionModel +import types +from mmengine.logging import print_log +import torch.nn as nn +from fairscale.nn.checkpoint import checkpoint_wrapper + + +class InternVL_v1_5_LLaVAModel(LLaVAModel): + def __init__(self, llm, + visual_encoder, + freeze_llm=False, + freeze_visual_encoder=False, + visual_select_layer=-2, + pretrained_pth=None, + projector_depth=2, + llm_lora=None, + visual_encoder_lora=None, + use_activation_checkpointing=True, + max_position_embeddings=None, + image_processor=None, + tokenizer=None, + template=None, + use_lldr=False, # LearningRateDecayOptimWrapperConstructor + merge_type='pixel_shuffle', # or pixel_shuffle + downsample_ratio=0.5, + custom_mlp=False): + super(LLaVAModel, self).__init__() + self.downsample_ratio = downsample_ratio + + self.freeze_llm = freeze_llm + self.freeze_visual_encoder = freeze_visual_encoder + self.merge_type = merge_type + with LoadWoInit(): + if isinstance(llm, dict): + llm = self._dispatch_lm_model_cfg(llm, max_position_embeddings) + + self.llm = self._build_from_cfg_or_module(llm) + self.visual_encoder = self._build_from_cfg_or_module( + visual_encoder) + + if use_lldr: + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, + self.visual_encoder) + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_InternVisionModel, + self.visual_encoder) + self.llm.config.use_cache = False + dispatch_modules(self.llm) + + self.custom_mlp = custom_mlp + if custom_mlp is True: + self.mlp1 = nn.Sequential( + nn.LayerNorm(self.visual_encoder.config.hidden_size * int(1 / self.downsample_ratio) ** 2), + nn.Linear(self.visual_encoder.config.hidden_size * int(1 / self.downsample_ratio) ** 2, + self.llm.config.hidden_size), + nn.GELU(), + nn.Linear(self.llm.config.hidden_size, self.llm.config.hidden_size) + ) + self.mlp1 = self.mlp1.to(self.visual_encoder.dtype) + self.mlp1 = checkpoint_wrapper(self.mlp1) + else: + projector_config = ProjectorConfig( + visual_hidden_size=self.visual_encoder.config.hidden_size * (int(1 / self.downsample_ratio) ** 2), + llm_hidden_size=self.llm.config.hidden_size, + depth=projector_depth) + self.projector = ProjectorModel(projector_config).to( + self.visual_encoder.dtype) + + if self.freeze_llm: + self.llm.requires_grad_(False) + if self.freeze_visual_encoder: + self.visual_encoder.requires_grad_(False) + + self.use_activation_checkpointing = use_activation_checkpointing + if use_activation_checkpointing: + # For backward compatibility + if hasattr(self.llm, 'enable_input_require_grads'): + self.llm.enable_input_require_grads() + else: + self.llm.get_input_embeddings().register_forward_hook( + make_inputs_require_grad) + + if self.visual_encoder.__class__.__name__ == 'InternVisionModel': + pass + else: + if hasattr(self.visual_encoder, 'enable_input_require_grads'): + 
self.visual_encoder.enable_input_require_grads() + else: + self.visual_encoder.get_input_embeddings( + ).register_forward_hook(make_inputs_require_grad) + if custom_mlp is False: + self.projector.enable_input_require_grads() + + # enable gradient (activation) checkpointing for memory efficiency + self.gradient_checkpointing_enable() + + self.use_llm_lora = llm_lora is not None + self.use_visual_encoder_lora = visual_encoder_lora is not None + + if self.use_llm_lora: + self._prepare_llm_for_lora(llm_lora, use_activation_checkpointing) + if self.use_visual_encoder_lora: + self._prepare_visual_encoder_for_lora( + visual_encoder_lora, use_activation_checkpointing) + + if pretrained_pth is not None: + pretrained_state_dict = guess_load_checkpoint(pretrained_pth) + self.load_state_dict(pretrained_state_dict, strict=False) + print(f'Load pretrained weight from {pretrained_pth}') + + self.visual_select_layer = visual_select_layer + + self._is_init = True + + self.tokenizer = tokenizer + if tokenizer is not None: + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = image_processor + if image_processor is not None: + self.image_processor = BUILDER.build(image_processor) + self.template = template + + print_log(self, logger='current') + + def activation_checkpointing_enable(self): + self.llm.gradient_checkpointing_enable() + if self.custom_mlp is False: + self.projector.gradient_checkpointing_enable() + if self.visual_encoder.__class__.__name__ == 'InternVisionModel': + pass + else: + self.visual_encoder.gradient_checkpointing_enable() + + def activation_checkpointing_disable(self): + self.llm.gradient_checkpointing_disable() + if self.custom_mlp is False: + self.projector.gradient_checkpointing_disable() + if self.visual_encoder.__class__.__name__ == 'InternVisionModel': + pass + else: + self.visual_encoder.gradient_checkpointing_disable() + + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. + def get_layer_depth(self, param_name: str, prefix: str = 'visual_encoder.vision_model.'): + assert hasattr(self.visual_encoder, 'get_layer_depth'), \ + 'The visual_encoder does not have `get_layer_depth` method.' + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + prefix = 'visual_encoder.vision_model.' + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + prefix = 'visual_encoder.' 
+        return self.visual_encoder.get_layer_depth(param_name, prefix)
+
+    def _prepare_data_for_llm(self, data):
+        if 'pixel_values' in data:
+            new_image_feature = self.__preprocess_for_pixel_values(data)
+            data['pixel_values'] = new_image_feature
+            data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)
+        return data
+
+    def __preprocess_for_pixel_values(self, data):
+        pixel_values = data['pixel_values']
+
+        if type(pixel_values) is list or pixel_values.ndim == 5:
+            if type(pixel_values) is list:
+                pixel_values = [
+                    x.unsqueeze(0) if x.ndim == 3 else x for x in pixel_values
+                ]
+            # b*n, c, h, w
+            concat_images = torch.cat([image.to(self.visual_encoder.dtype) for image in pixel_values], dim=0)
+        else:
+            raise NotImplementedError()
+
+        # b*n, hw, d
+        visual_outputs = self.visual_encoder(concat_images, output_hidden_states=True)
+
+        if self._get_model_class_name(self.visual_encoder) in ['CLIPVisionModel', 'InternVisionModel']:
+            vit_embeds = visual_outputs.hidden_states[self.visual_select_layer][:, 1:]
+        elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel':
+            vit_embeds = visual_outputs.hidden_states[self.visual_select_layer]
+        else:
+            raise NotImplementedError
+
+        # n, hw, c
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        # n,h'w',c'
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+
+        if self.custom_mlp is False:
+            vit_embeds = self.projector(vit_embeds)
+        else:
+            vit_embeds = self.mlp1(vit_embeds)
+
+        split_sizes = [image.shape[0] for image in pixel_values]
+        image_features = torch.split(vit_embeds, split_sizes, dim=0)
+
+        new_image_feature = []
+        for image_feature in image_features:
+            B, N, C = image_feature.shape
+            image_feature = image_feature.reshape(B * N, C)
+            new_image_feature.append(image_feature)
+
+        # TODO: this per-sample loop variant cannot be used with ZeRO + activation checkpointing
+        # if isinstance(pixel_values, torch.Tensor) and pixel_values.ndim == 5:
+        #     pixel_values = [x if x.ndim == 4 else x.unsqueeze(0) for x in pixel_values]
+        # assert isinstance(pixel_values, list)

+        # for bs in range(len(pixel_values)):
+        #     # this saves a bit of GPU memory, although it is slower
+        #     # n, c, h, w
+        #     visual_outputs = self.visual_encoder(
+        #         pixel_values[bs].to(self.visual_encoder.dtype), output_hidden_states=True)
+        #
+        #     if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel':
+        #         vit_embeds = visual_outputs.hidden_states[self.visual_select_layer][:, 1:]
+        #     elif self._get_model_class_name(self.visual_encoder) == 'SiglipVisionModel':
+        #         vit_embeds = visual_outputs.hidden_states[self.visual_select_layer]
+        #     else:
+        #         raise NotImplementedError
+        #     # n, hw, c
+        #     h = w = int(vit_embeds.shape[1] ** 0.5)
+        #     vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        #     vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        #     # n,h'w',c'
+        #     vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        #
+        #     vit_embeds = self.projector(vit_embeds)
+        #     B, N, C = vit_embeds.shape
+        #     vit_embeds = vit_embeds.reshape(B * N, C)
+        #     new_image_feature.append(vit_embeds)
+        return new_image_feature
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, h, w, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W,
C // scale --> N, H * scale, W * scale, C // (scale ** 2) + x = x.view(n, int(h * scale_factor), int(w * scale_factor), + int(c / (scale_factor * scale_factor))) + x = x.permute(0, 2, 1, 3).contiguous() + return x diff --git a/xtuner/model/llava.py b/xtuner/model/llava.py index 67955d02f..e437c0db2 100644 --- a/xtuner/model/llava.py +++ b/xtuner/model/llava.py @@ -7,7 +7,7 @@ from mmengine.config import Config, ConfigDict from mmengine.model import BaseModel from peft import get_peft_model, prepare_model_for_kbit_training -from transformers import AutoConfig +from transformers import AutoConfig, GenerationConfig from xtuner.registry import BUILDER from .modules import ProjectorConfig, ProjectorModel, dispatch_modules @@ -15,7 +15,15 @@ from .utils import (LoadWoInit, find_all_linear_names, get_peft_model_state_dict, guess_load_checkpoint, make_inputs_require_grad, - prepare_inputs_labels_for_multimodal, traverse_dict) + prepare_inputs_labels_for_multimodal, traverse_dict, s2_forward) +from xtuner.tools.utils import get_stop_criteria +from xtuner.dataset.utils import expand2square, load_image +from xtuner.utils import (DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, + StopWordStoppingCriteria) +from functools import reduce +from mmengine.logging import print_log +from xtuner.engine.optimizers import get_layer_depth_for_CLIPVisionModel, get_layer_depth_for_InternVisionModel +import types class LLaVAModel(BaseModel): @@ -26,13 +34,21 @@ def __init__(self, freeze_llm=False, freeze_visual_encoder=False, visual_select_layer=-2, + token_merge_ratio=1, + s2_scales=None, # [1, 2] or [1,2,3] pretrained_pth=None, projector_depth=2, llm_lora=None, visual_encoder_lora=None, use_activation_checkpointing=True, - max_position_embeddings=None): + max_position_embeddings=None, + image_processor=None, + tokenizer=None, + template=None, + use_lldr=False, # LearningRateDecayOptimWrapperConstructor + ): super().__init__() + self.s2_scales = s2_scales self.freeze_llm = freeze_llm self.freeze_visual_encoder = freeze_visual_encoder with LoadWoInit(): @@ -40,13 +56,34 @@ def __init__(self, llm = self._dispatch_lm_model_cfg(llm, max_position_embeddings) self.llm = self._build_from_cfg_or_module(llm) + self.visual_encoder = self._build_from_cfg_or_module( visual_encoder) + + if use_lldr: + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_CLIPVisionModel, self.visual_encoder) + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + self.visual_encoder.get_layer_depth = types.MethodType(get_layer_depth_for_InternVisionModel, self.visual_encoder) + self.llm.config.use_cache = False dispatch_modules(self.llm) + assert int(token_merge_ratio ** 0.5) ** 2 == token_merge_ratio, \ + '`token_merge_ratio` must be a square number.' + self.token_merge_ratio = int(token_merge_ratio) + + visual_hidden_size = self.visual_encoder.config.hidden_size * token_merge_ratio + self.s2_scales = s2_scales + if s2_scales is not None: + assert 1 in s2_scales, 'The scale of the original image must be included.' 
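# Illustrative sketch (not part of the patch): a quick shape check of the
# `pixel_shuffle` downsampling defined in `InternVL_v1_5_LLaVAModel` above,
# assuming a 24x24 CLIP token grid and `downsample_ratio=0.5`.
import torch

def pixel_shuffle(x, scale_factor=0.5):
    # Trade spatial resolution for channel depth, as in the model code above.
    n, h, w, c = x.size()
    x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(n, int(h * scale_factor), int(w * scale_factor),
               int(c / (scale_factor * scale_factor)))
    return x.permute(0, 2, 1, 3).contiguous()

vit_embeds = torch.randn(1, 24, 24, 1024)   # (n, h, w, c)
print(pixel_shuffle(vit_embeds).shape)      # torch.Size([1, 12, 12, 4096]) -> 144 tokens per tile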
+ total_scales = reduce(lambda x, y: x * y, s2_scales) + visual_hidden_size = visual_hidden_size * total_scales + projector_config = ProjectorConfig( - visual_hidden_size=self.visual_encoder.config.hidden_size, + visual_hidden_size=visual_hidden_size, llm_hidden_size=self.llm.config.hidden_size, depth=projector_depth) self.projector = ProjectorModel(projector_config).to( @@ -57,6 +94,7 @@ def __init__(self, if self.freeze_visual_encoder: self.visual_encoder.requires_grad_(False) + self.use_activation_checkpointing = use_activation_checkpointing if use_activation_checkpointing: # For backward compatibility if hasattr(self.llm, 'enable_input_require_grads'): @@ -93,6 +131,35 @@ def __init__(self, self._is_init = True + self.tokenizer = tokenizer + if tokenizer is not None: + self.tokenizer = BUILDER.build(tokenizer) + self.image_processor = image_processor + if image_processor is not None: + self.image_processor = BUILDER.build(image_processor) + + if s2_scales is not None: + if hasattr(self.image_processor, 'crop_size'): + orig_img_size = self.image_processor.crop_size['height'] + else: + orig_img_size = self.image_processor.size['height'] + self.orig_img_size = orig_img_size + self.s2_img_sizes = [int(orig_img_size * scale) for scale in s2_scales] + + self.template = template + print_log(self, logger='current') + + # The following code is only meaningful when the optim_wrapper configuration + # includes `LearningRateDecayOptimWrapperConstructor`. Otherwise, it will be ignored. + def get_layer_depth(self, param_name: str, prefix: str = 'visual_encoder.vision_model.'): + assert hasattr(self.visual_encoder, 'get_layer_depth'), \ + 'The visual_encoder does not have `get_layer_depth` method.' + if self._get_model_class_name(self.visual_encoder) == 'CLIPVisionModel': + prefix = 'visual_encoder.vision_model.' + elif self._get_model_class_name(self.visual_encoder) == 'InternVisionModel': + prefix = 'visual_encoder.' 
+ return self.visual_encoder.get_layer_depth(param_name, prefix) + def _parse_lora_config(self, lora_config): if isinstance(lora_config, dict) or isinstance( lora_config, Config) or isinstance(lora_config, ConfigDict): @@ -120,16 +187,18 @@ def _prepare_visual_encoder_for_lora(self, self.visual_encoder = get_peft_model(self.visual_encoder, lora_config) def gradient_checkpointing_enable(self): - self.activation_checkpointing_enable() + if self.use_activation_checkpointing: + self.activation_checkpointing_enable() + + def gradient_checkpointing_disable(self): + if self.use_activation_checkpointing: + self.activation_checkpointing_disable() def activation_checkpointing_enable(self): self.llm.gradient_checkpointing_enable() self.visual_encoder.gradient_checkpointing_enable() self.projector.gradient_checkpointing_enable() - def gradient_checkpointing_disable(self): - self.activation_checkpointing_disable() - def activation_checkpointing_disable(self): self.llm.gradient_checkpointing_disable() self.visual_encoder.gradient_checkpointing_disable() @@ -225,7 +294,8 @@ def _dispatch_lm_model_cfg(self, cfg, max_position_embeddings=None): pretrained_model_name_or_path = cfg.pretrained_model_name_or_path llm_cfg = AutoConfig.from_pretrained( pretrained_model_name_or_path, trust_remote_code=True) - cfg, llm_cfg = self._prepare_for_flash_attn(cfg, llm_cfg) + if not hasattr(cfg, 'attn_implementation'): + cfg, llm_cfg = self._prepare_for_flash_attn(cfg, llm_cfg) if max_position_embeddings is not None: cfg, llm_cfg = self._prepare_for_long_context_training( cfg, llm_cfg, max_position_embeddings) @@ -240,43 +310,154 @@ def _build_from_cfg_or_module(self, cfg_or_mod): else: raise NotImplementedError - def forward(self, data, data_samples=None, mode='loss'): + @staticmethod + def _merge_tokens(tokens, token_merge_ratio): + if token_merge_ratio > 1: + # B, N, C + b, n, c = tokens.shape + h = w = int(n ** 0.5) + h_ratio = w_ratio = int(token_merge_ratio ** 0.5) + assert h * w == n + assert n % token_merge_ratio == 0, 'The number of visual tokens is not divisible by `token_merge_ratio`.' 
+ # B, H, W, C + tokens = tokens.view(b, h, w, c) + # B, H, W // w_r, C * w_r + tokens = tokens.view(b, h, w // w_ratio, c * w_ratio) + # B, W // w_r, H, C * w_r + tokens = tokens.permute(0, 2, 1, 3).contiguous() + # B, W // w_r, H // h_r, C * w_r * h_r + tokens = tokens.view(b, w // w_ratio, h // h_ratio, + c * w_ratio * h_ratio) + # B, W * H // w_r // h_r, C * w_r * h_r + tokens = tokens.view(b, w * h // w_ratio // h_ratio, + c * w_ratio * h_ratio) + return tokens + + @staticmethod + def _get_model_class_name(model): + if model.__class__.__name__ == 'PeftModel': + base_model = model.base_model.model + else: + base_model = model + return base_model.__class__.__name__ + + def __forward_feature(self, images): + visual_outputs = self.visual_encoder(images.to(self.visual_encoder.dtype), output_hidden_states=True) + visual_outputs = visual_outputs.hidden_states[self.visual_select_layer] + if self._get_model_class_name(self.visual_encoder) != 'SiglipVisionModel': + visual_outputs = visual_outputs[:, 1:] + return visual_outputs + + def _prepare_data_for_llm(self, data): if 'pixel_values' in data: - visual_outputs = self.visual_encoder( - data['pixel_values'].to(self.visual_encoder.dtype), - output_hidden_states=True) - pixel_values = self.projector( - visual_outputs.hidden_states[self.visual_select_layer][:, 1:]) + if self.s2_scales is None: + visual_outputs = self.__forward_feature(data['pixel_values']) + visual_outputs = self._merge_tokens(visual_outputs, self.token_merge_ratio) + else: + visual_outputs = s2_forward(self.__forward_feature, data['pixel_values'], + img_sizes=self.s2_img_sizes, + max_split_size=self.orig_img_size) + + pixel_values = self.projector(visual_outputs) + data['pixel_values'] = pixel_values data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data) + return data + def forward(self, data, data_samples=None, mode='loss'): if mode == 'loss': + data = self._prepare_data_for_llm(data) return self.compute_loss(data, data_samples) - elif mode == 'predict': - return self.predict(data, data_samples) - elif mode == 'tensor': - return self._forward(data, data_samples) + elif mode == 'predict' or mode == 'generate': + data = self._prepare_data_for_llm(data) + return self.generate(data, data_samples) + elif mode == 'chat': + return self.chat(data) else: raise NotImplementedError - def _forward(self, data, data_samples=None): - - outputs = self.llm(**data) - - return outputs - - def predict(self, data, data_samples=None): - outputs = self.llm(**data) - logits_dict = [{'logits': logits} for logits in outputs.logits] - return logits_dict - def compute_loss(self, data, data_samples=None): outputs = self.llm(**data) loss_dict = {'loss': outputs.loss} return loss_dict - def __getattr__(self, name: str): - try: - return super().__getattr__(name) - except AttributeError: - return getattr(self.llm, name) + def preparing_for_generation(self, metainfo: dict = None): + default_generation_kwargs = dict( + max_new_tokens=100, + do_sample=False, + eos_token_id=self.tokenizer.eos_token_id, + pad_token_id=self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None else + self.tokenizer.eos_token_id) + default_generation_kwargs.update(metainfo.get('generation_kwargs', {})) + self.gen_config = GenerationConfig(**default_generation_kwargs) + + stop_words = [] + stop_words += self.template.get('STOP_WORDS', []) + stop_criteria = get_stop_criteria( + tokenizer=self.tokenizer, stop_words=stop_words) + self.stop_criteria = stop_criteria + + def generate(self, data, data_samples=None): + 
generate_output = self.llm.generate( + **data, + generation_config=self.gen_config, + streamer=None, + bos_token_id=self.tokenizer.bos_token_id, + stopping_criteria=self.stop_criteria) + + prediction = self.tokenizer.decode( + generate_output[0], skip_special_tokens=True).strip() + + return dict(prediction=prediction) + + def chat(self, data, system=''): + # single image and single text mode + instruction = self.template.get('INSTRUCTION', '{input}') + + sample_image = data['image'] + sample_input = data['text'] + + image = expand2square( + sample_image, + tuple(int(x * 255) for x in self.image_processor.image_mean)) + image = self.image_processor.preprocess( + image, return_tensors='pt')['pixel_values'][0] + image = image.to(self.visual_encoder.device) + sample_input = DEFAULT_IMAGE_TOKEN + '\n' + sample_input + if system != '': + system = self.template.get( + 'SYSTEM', '{system}\n').format(system=system) + + inputs = (system + instruction).format(input=sample_input, round=1) + chunk_encode = [] + for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)): + if idx == 0: + cur_encode = self.tokenizer.encode(chunk) + else: + cur_encode = self.tokenizer.encode( + chunk, add_special_tokens=False) + chunk_encode.append(cur_encode) + assert len(chunk_encode) == 2 + input_ids = [] + for idx, cur_chunk_encode in enumerate(chunk_encode): + input_ids.extend(cur_chunk_encode) + if idx != len(chunk_encode) - 1: + input_ids.append(IMAGE_TOKEN_INDEX) + input_ids = torch.tensor(input_ids).to(self.visual_encoder.device) + + data['input_ids'] = input_ids.unsqueeze(0) + data['pixel_values'] = image.unsqueeze(0) + + mm_inputs = self._prepare_data_for_llm(data) + generate_output = self.llm.generate( + **mm_inputs, + generation_config=self.gen_config, + streamer=None, + bos_token_id=self.tokenizer.bos_token_id, + stopping_criteria=self.stop_criteria) + + prediction = self.tokenizer.decode( + generate_output[0], skip_special_tokens=True).strip() + + return dict(prediction=prediction, inputs=inputs) diff --git a/xtuner/model/mini_gemini.py b/xtuner/model/mini_gemini.py new file mode 100644 index 000000000..14cb466ff --- /dev/null +++ b/xtuner/model/mini_gemini.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
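# Illustrative sketch (not part of the patch): the core of `chat()` above is to
# tokenize the text on either side of the image placeholder separately and splice
# an IMAGE_TOKEN_INDEX sentinel between them; the sentinel is later replaced by
# visual embeddings in `prepare_inputs_labels_for_multimodal`. The tokenizer here
# is a stand-in, not the real xtuner API.
IMAGE_TOKEN = '<image>'
IMAGE_TOKEN_INDEX = -200   # sentinel id used by LLaVA-style code

def toy_encode(text, add_special_tokens=True):
    ids = [hash(tok) % 1000 for tok in text.split()]
    return ([1] + ids) if add_special_tokens else ids   # 1 plays the role of BOS

prompt = IMAGE_TOKEN + '\nPlease describe this picture'
chunks = prompt.split(IMAGE_TOKEN)
input_ids = []
for idx, chunk in enumerate(chunks):
    input_ids.extend(toy_encode(chunk, add_special_tokens=(idx == 0)))
    if idx != len(chunks) - 1:
        input_ids.append(IMAGE_TOKEN_INDEX)
print(input_ids)   # [1, -200, <ids of the text after the image>]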
+from collections import OrderedDict + +import torch +import torch.nn as nn +from .utils import (get_peft_model_state_dict, guess_load_checkpoint, + prepare_inputs_labels_for_multimodal) +from .llava import LLaVAModel + + +class MiniGeminiModel(LLaVAModel): + def __init__(self, *args, visual_encoder_aux=None, pretrained_pth=None, **kwargs): + super().__init__(*args, pretrained_pth=None, **kwargs) + self.visual_encoder_aux = self._build_from_cfg_or_module(visual_encoder_aux) + + if self.freeze_visual_encoder: + self.visual_encoder_aux.requires_grad_(False) + + if self.use_activation_checkpointing: + self.visual_encoder_aux.activation_checkpointing_enable() + + mm_hidden_size = self.visual_encoder.config.hidden_size + mm_hidden_size_aux = self.visual_encoder_aux.hidden_size + self.vlm_uni_query_projector = nn.Sequential(nn.LayerNorm(mm_hidden_size), + nn.Linear(mm_hidden_size, mm_hidden_size)) + self.vlm_uni_aux_projector = nn.Sequential(nn.LayerNorm(mm_hidden_size_aux), + nn.Linear(mm_hidden_size_aux, + mm_hidden_size)) + self.vlm_uni_val_projector = nn.Sequential(nn.LayerNorm(mm_hidden_size_aux), + nn.Linear(mm_hidden_size_aux, + mm_hidden_size)) + + if pretrained_pth is not None: + pretrained_state_dict = guess_load_checkpoint(pretrained_pth) + + self.load_state_dict(pretrained_state_dict, strict=False) + print(f'Load pretrained weight from {pretrained_pth}') + + def activation_checkpointing_disable(self): + super().activation_checkpointing_disable() + if hasattr(self, 'visual_encoder_aux'): + self.visual_encoder_aux.activation_checkpointing_disable() + + def activation_checkpointing_enable(self): + super().activation_checkpointing_enable() + if hasattr(self, 'visual_encoder_aux'): + self.visual_encoder_aux.activation_checkpointing_enable() + + def state_dict(self, *args, **kwargs): + state_dict = super().state_dict(*args, **kwargs) + to_return = OrderedDict() + # Step 1. visual_encoder + if self.use_visual_encoder_lora: + to_return.update( + get_peft_model_state_dict( + self.visual_encoder, state_dict=state_dict)) + elif not self.freeze_visual_encoder: + to_return.update({ + k: v + for k, v in state_dict.items() if 'visual_encoder.' in k + }) + # Step 2. LLM + if self.use_llm_lora: + to_return.update( + get_peft_model_state_dict(self.llm, state_dict=state_dict)) + elif not self.freeze_llm: + to_return.update( + {k: v + for k, v in state_dict.items() if 'llm.' in k}) + # Step 3. Projector + to_return.update( + {k: v + for k, v in state_dict.items() if 'projector.' in k}) + + # Step 4. visual_encoder_aux + if not self.freeze_visual_encoder: + to_return.update({ + k: v + for k, v in state_dict.items() if 'visual_encoder_aux.' in k + }) + # Step 5. 
unified projector
+        to_return.update(
+            {k: v
+             for k, v in state_dict.items() if 'vlm_uni_' in k})
+        return to_return
+
+    def _prepare_data_for_llm(self, data):
+        if 'pixel_values' in data:
+            visual_outputs = self.visual_encoder(
+                data['pixel_values'].to(self.visual_encoder.dtype),
+                output_hidden_states=True)
+            visual_outputs = visual_outputs.hidden_states[self.visual_select_layer]
+
+            if self._get_model_class_name(self.visual_encoder) != 'SiglipVisionModel':
+                visual_outputs = visual_outputs[:, 1:]
+
+            visual_outputs_aux = torch.stack(data['pixel_values_aux'])
+            visual_outputs_aux = self.visual_encoder_aux(
+                visual_outputs_aux.to(self.visual_encoder_aux.dtype)
+            )
+            visual_outputs = self.unified_resampler(visual_outputs, visual_outputs_aux)
+
+            pixel_values = self.projector(visual_outputs)
+            data['pixel_values'] = pixel_values
+            data = prepare_inputs_labels_for_multimodal(llm=self.llm, **data)
+        return data
+
+    def unified_resampler(self, images, images_aux):
+        # patchwise with square images
+        patch_num = int(images.shape[1] ** 0.5)  # 27
+        # 216x216
+        patch_size = images_aux.shape[-1] // patch_num  # 8
+        # within patch attention
+        images_aux = images_aux.permute(0, 2, 3, 1)
+        images_aux = images_aux.reshape(len(images_aux), patch_num, patch_size, patch_num, patch_size,
+                                        images_aux.shape[-1])
+        images_aux = images_aux.permute(0, 1, 3, 2, 4, 5)
+        images_aux = images_aux.reshape(len(images_aux), patch_num ** 2, patch_size ** 2,
+                                        images_aux.shape[-1]).contiguous()
+
+        # token attention
+        embed_query = self.vlm_uni_query_projector(images)
+        embed_aux = self.vlm_uni_aux_projector(images_aux)
+        embed_value = self.vlm_uni_val_projector(images_aux)
+        # TODO with siglip+convnext the first forward is fine, but embed_att then contains nan
+        # TODO which makes embed_value nan in the second iteration, so training breaks
+        # TODO probably a feature mismatch; nan appears even with everything cast to fp32, needs further investigation
+        embed_att = embed_query[:, :, None] @ (embed_aux.transpose(-1, -2) / (embed_aux.shape[-1] ** 0.5))
+        # print('=xxxx=', torch.any(torch.isnan(embed_query)).item(),
+        #       torch.any(torch.isnan(embed_aux)).item(),
+        #       torch.any(torch.isnan(embed_value)).item(),
+        #       torch.any(torch.isnan(embed_att)).item())
+        embed_att = embed_att.nan_to_num()
+        embed_feat = (embed_att.softmax(-1) @ embed_value).mean(2)
+        # print('=xxcccxx=', torch.any(torch.isnan(embed_feat)).item())
+        image_features = images + embed_feat
+        return image_features
diff --git a/xtuner/model/modules/openclip_encoder.py b/xtuner/model/modules/openclip_encoder.py
new file mode 100644
index 000000000..66ad89c91
--- /dev/null
+++ b/xtuner/model/modules/openclip_encoder.py
@@ -0,0 +1,208 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import os
+import json
+import logging
+from pathlib import Path
+from typing import Dict, Optional
+from transformers.deepspeed import is_deepspeed_zero3_enabled
+
+
+try:
+    import deepspeed
+    from open_clip.factory import load_state_dict, get_model_config
+    from open_clip.model import CLIPVisionCfg, CLIPTextCfg, _build_vision_tower, convert_to_custom_text_state_dict, \
+        resize_pos_embed
+except ImportError:
+    pass
+
+
+class OpenCLIPVisionTower(nn.Module):
+    def __init__(self, vision_tower, vision_tower_path, optimize_vision_tower_aux=False, delay_load=False):
+        super().__init__()
+
+        self.is_loaded = False
+        self.vision_tower_name = vision_tower
+        self.vision_tower_path = vision_tower_path
+        self.vision_config = json.load(
+            open(os.path.join(self.vision_tower_path, 'open_clip_config.json'), 'r')
+        )
+        self.is_optimize = optimize_vision_tower_aux
+
+        if not delay_load:
+
self.load_model() + + def load_model(self): + ckpt_path = os.path.join(self.vision_tower_path, 'open_clip_pytorch_model.bin') + if 'convnext' in self.vision_tower_name: + if 'large' in self.vision_tower_name and 'd-320' in self.vision_tower_name: + self.model_type = 'convnext_large_d_320' + self.model_channel = [192, 384, 768, 1536] # stage 0-3 + elif 'base' in self.vision_tower_name and 'w-320' in self.vision_tower_name: + self.model_type = 'convnext_base_w_320' + self.model_channel = [128, 256, 512, 1024] + elif 'xxlarge' in self.vision_tower_name: + self.model_type = 'convnext_xxlarge' + self.model_channel = [384, 768, 1536, 3072] + + clip_model = CLIP(**get_model_config(self.model_type)) + clip_model.visual.trunk.norm_pre = None + clip_model.visual.trunk.head = None + clip_model.visual.head = None + print(f'Loading pretrained weights ({self.model_type}).') + load_checkpoint(clip_model, ckpt_path, strict=False) + + self.clip_model = clip_model + self.is_loaded = True + # decompose stem and stages blocks in vision tower + # self.vision_stem = clip_model.visual.trunk.stem + # self.vision_stages = clip_model.visual.trunk.stages + + self.clip_model.visual.trunk.stem.requires_grad_(False) + + # self.vision_stages.requires_grad_(False) + + def activation_checkpointing_enable(self): + self.clip_model.visual.set_grad_checkpointing(True) + + def activation_checkpointing_disable(self): + self.clip_model.visual.set_grad_checkpointing(False) + + def forward(self, images): + if type(images) is list: + image_features = [] + for image in images: + image_feature = self.backbone(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)) + image_features.append(image_feature) + else: + image_features = self.backbone(images.to(device=self.device, dtype=self.dtype)) + + return image_features + + def backbone(self, images): + if not self.is_optimize: + with torch.no_grad(): + results = self.basic_forward(images) + else: + results = self.basic_forward(images) + + target_size = (results['stage_0'].shape[-2], results['stage_0'].shape[-1]) + result_cat = [] + for _stage in results: + if _stage == 'stage_0': + result_cat.append(results[_stage].contiguous()) + else: + result_cat.append(F.interpolate(results[_stage].float().contiguous() , + size=target_size, + mode='bilinear', + align_corners=False).to(dtype=results[_stage].dtype)) + result_cat = torch.cat(result_cat, dim=1) + + return result_cat.contiguous() + + def basic_forward(self, images): + results = {} + x = self.clip_model.visual.trunk.stem(images) + for _idx in range(len(self.clip_model.visual.trunk.stages)): + x = self.clip_model.visual.trunk.stages[_idx](x) + results[f'stage_{_idx}'] = x + return results + + @property + def dummy_feature(self): + return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype) + + @property + def dtype(self): + return self.clip_model.visual.trunk.stem[0].weight.dtype + + @property + def device(self): + return self.clip_model.visual.trunk.stem[0].weight.device + + @property + def config(self): + return self.vision_config + + @property + def hidden_size(self): + return sum(self.model_channel) + + +# modified function from open_clip to support zero3 stage +def load_checkpoint(model, checkpoint_path, strict=True): + if Path(checkpoint_path).suffix in ('.npz', '.npy'): + from open_clip.big_vision import load_big_vision_weights + load_big_vision_weights(model, checkpoint_path) + return {} + + state_dict = load_state_dict(checkpoint_path) + # detect old format and make compatible with new format + if 
'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'): + state_dict = convert_to_custom_text_state_dict(state_dict) + # If loading a non-SigLIP model for SigLIP training. See https://github.com/mlfoundations/open_clip/issues/712 + # if 'logit_bias' not in state_dict and model.logit_bias is not None: + # state_dict["logit_bias"] = torch.zeros_like(state_dict["logit_scale"]) + # Certain text transformers no longer expect position_ids after transformers==4.31 + position_id_key = 'text.transformer.embeddings.position_ids' + if position_id_key in state_dict and not hasattr(model, position_id_key): + del state_dict[position_id_key] + resize_pos_embed(state_dict, model) + # resize_text_pos_embed(state_dict, model) + #incompatible_keys = model.load_state_dict(state_dict, strict=strict) + if is_deepspeed_zero3_enabled(): + + error_msgs = [] + + def load(module: nn.Module, state_dict, prefix=""): + metadata = None + + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) + # Parameters of module and children will start with prefix. We can exit early if there are none in this + # state_dict + if len([key for key in state_dict if key.startswith(prefix)]) > 0: + if is_deepspeed_zero3_enabled(): + # In sharded models, each shard has only part of the full state_dict, so only gather + # parameters that are in the current state_dict. + named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False)) + params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters] + if len(params_to_gather) > 0: + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(*args) + else: + module._load_from_state_dict(*args) + + for name, child in module._modules.items(): + if child is not None: + load(child, state_dict, prefix + name + ".") + + load(model, state_dict) + incompatible_keys = [] + else: + incompatible_keys = model.load_state_dict(state_dict, strict=strict) + logging.info(f"incompatible_keys.missing_keys: {incompatible_keys.missing_keys}") + return incompatible_keys + + +class CLIP(nn.Module): + output_dict: torch.jit.Final[bool] + + def __init__( + self, + embed_dim: int, + vision_cfg: CLIPVisionCfg, + text_cfg: CLIPTextCfg, + quick_gelu: bool = False, + cast_dtype: Optional[torch.dtype] = None, + output_dict: bool = False, + ): + super().__init__() + self.output_dict = output_dict + + self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype) diff --git a/xtuner/model/openai.py b/xtuner/model/openai.py new file mode 100644 index 000000000..1c261f534 --- /dev/null +++ b/xtuner/model/openai.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
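+# This module appears to provide an evaluation-only backend: `OpenaiBackend`
+# fans out chat-completion requests asynchronously to an OpenAI-compatible
+# server (the default `base_url` targets a local server on port 23333, e.g. one
+# started with an lmdeploy api_server), and `OpenAIModel` wraps it in
+# mmengine's `BaseModel` interface so test loops receive `dict(prediction=...)`
+# outputs instead of running a local forward pass.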
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+from .utils import (get_peft_model_state_dict, guess_load_checkpoint,
+                    prepare_inputs_labels_for_multimodal)
+from mmengine.model import BaseModel
+import asyncio
+from openai import AsyncOpenAI
+from typing import List
+
+
+class OpenaiBackend:
+
+    def __init__(self, api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1'):
+        self.client = AsyncOpenAI(api_key=api_key, base_url=base_url)
+
+    async def request_completion(self, model_id, messages):
+        response = await self.client.chat.completions.create(
+            model=model_id,
+            messages=messages,
+            temperature=0.8,
+            top_p=0.8)
+        return response  # .choices[0].message.content
+
+    async def _batch_infer(self, messages: List[List]):
+        model_cards = await self.client.models.list()._get_page()
+        model_id = model_cards.data[0].id
+
+        tasks = [self.request_completion(model_id, msg) for msg in messages]
+
+        responses = await asyncio.gather(*tasks)
+
+        return [res.choices[0].message.content for res in responses]
+
+    def batch_infer(self, messages: List[List]):
+        # Synchronous wrapper: issues one request per conversation concurrently.
+        return asyncio.run(self._batch_infer(messages))
+
+
+class OpenAIModel(BaseModel):
+    def __init__(self, base_url):
+        super().__init__()
+        self.model = OpenaiBackend(base_url=base_url)
+
+    def forward(self, data, data_samples=None, mode='loss'):
+        pixel_values = data['pixel_values'][0]
+        text = data['text'][0]
+
+        prompt = [
+            {
+                'role': 'user',
+                'content': [
+                    {'type': 'text', 'text': text},
+                    {'type': 'image_url', 'image_url': {'url': pixel_values}}
+                ]
+            }
+        ]
+        prediction = self.model.batch_infer([prompt])[0]
+        return dict(prediction=prediction)
+
+    def gradient_checkpointing_disable(self):
+        pass
+
+    def preparing_for_generation(self, metainfo: dict = None):
+        pass
diff --git a/xtuner/model/utils.py b/xtuner/model/utils.py
index dce86315d..2553a369b 100644
--- a/xtuner/model/utils.py
+++ b/xtuner/model/utils.py
@@ -134,7 +134,8 @@ def prepare_inputs_labels_for_multimodal(
         attention_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
         labels: Optional[torch.LongTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None):
+        pixel_values: Optional[torch.FloatTensor] = None,
+        **kwargs):
     if pixel_values is None:
         return {
             'input_ids': input_ids,
@@ -307,3 +308,88 @@ def guess_load_checkpoint(pth_model):
     else:
         raise FileNotFoundError(f'Cannot find {pth_model}')
     return state_dict
+
+
+
+# from https://github.com/bfshi/scaling_on_scales
+
+import math
+import torch.nn.functional as F
+from einops import rearrange
+
+
+def split_chessboard(x, num_split):
+    """
+    x: b * c * h * w
+    Divide x into num_split**2 sub-squares and concatenate all the sub-squares on the batch dimension
+    """
+    B, C, H, W = x.shape
+    assert H % num_split == 0 and W % num_split == 0
+    h, w = H // num_split, W // num_split
+    x_split = torch.cat([x[:, :, i*h:(i+1)*h, j*w:(j+1)*w] for i in range(num_split) for j in range(num_split)], dim=0)
+    return x_split
+
+
+def merge_chessboard(x, num_split):
+    """
+    x: b * c * h * w
+    Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square.
+ (inverse of split_chessboard) + """ + B, C, H, W = x.shape + assert B % (num_split**2) == 0 + b = B // (num_split**2) + x_merge = torch.cat([torch.cat([x[(i*num_split + j)*b:(i*num_split + j + 1)*b] for j in range(num_split)], dim=-1) + for i in range(num_split)], dim=-2) + return x_merge + + +def s2_forward(model, input, scales=None, img_sizes=None, max_split_size=None, resize_output_to_idx=0, num_prefix_token=0, + output_shape='bnc'): + + assert input.dim() == 4, "Input image must be in the shape of BxCxHxW." + assert input.shape[2] == input.shape[3], "Currently only square images are supported." + assert output_shape in ['bnc', 'bchw'], "Output shape should be either BxNxC (e.g., ViT) or BxCxHxW (e.g., ConvNet)." + assert output_shape == 'bnc' or num_prefix_token == 0, "For ConvNet there shouldn't be any prefix token." + + b, c, input_size, _ = input.shape + + # image size for each scale + assert scales is not None or img_sizes is not None, "Please assign either scales or img_sizes." + img_sizes = img_sizes or [int(input_size * scale) for scale in scales] + + # prepare multiscale inputs + max_split_size = max_split_size or input_size # The maximum size of each split of image. Set as the input size by default + num_splits = [math.ceil(size / max_split_size) for size in img_sizes] # number of splits each scale + input_multiscale = [] + for size, num_split in zip(img_sizes, num_splits): + x = F.interpolate(input.to(torch.float32), size=size, mode='bicubic').to(input.dtype) + x = split_chessboard(x, num_split=num_split) + input_multiscale.append(x) + + # run feedforward on each scale + outs_multiscale = [model(x) for x in input_multiscale] + if num_prefix_token > 0: + outs_prefix_multiscale = [out[:, :num_prefix_token] for out in outs_multiscale] + outs_multiscale = [out[:, num_prefix_token:] for out in outs_multiscale] + if output_shape == 'bnc': + outs_multiscale = [rearrange(out, 'b (h w) c -> b c h w', h=int(out.shape[1] ** 0.5), w=int(out.shape[1] ** 0.5)) + for out in outs_multiscale] + + # merge outputs of different splits for each scale separately + outs_multiscale = [merge_chessboard(out, num_split=num_split) for num_split, out in zip(num_splits, outs_multiscale)] + + # interpolate outputs from different scales and concat together + output_size = outs_multiscale[resize_output_to_idx].shape[-2] + out = torch.cat([F.interpolate(outs_multiscale[i].to(torch.float32), size=output_size, + mode='area').to(outs_multiscale[i].dtype) + for i in range(len(outs_multiscale))], dim=1) + if output_shape == 'bnc': + out = rearrange(out, 'b c h w -> b (h w) c') + if num_prefix_token > 0: + # take the mean of prefix tokens from different splits for each scale + outs_prefix_multiscale = [torch.stack(out.split(b, dim=0), dim=0).mean(dim=0) for out in outs_prefix_multiscale] + out_prefix_multiscale = torch.cat(outs_prefix_multiscale, dim=-1) + out = torch.cat([out_prefix_multiscale, out], dim=1) + + return out diff --git a/xtuner/tools/calc_image_size.py b/xtuner/tools/calc_image_size.py new file mode 100644 index 000000000..1a2879976 --- /dev/null +++ b/xtuner/tools/calc_image_size.py @@ -0,0 +1,56 @@ +from concurrent.futures import ThreadPoolExecutor +from tqdm import tqdm +import json +from PIL import Image +import os + +data_root = '/mnt/petrelfs/share_data/huanghaian/llava_data/' +data_path = data_root + 'LLaVA-Instruct-150K/llava_v1_5_mix665k.json' +image_folder = data_root + 'llava_images' + + +def load_jsonl(json_file): + with open(json_file) as f: + lines = f.readlines() + data = [] + for 
line in lines: + data.append(json.loads(line)) + return data + + +def calc_fn(data_dict): + size = {'width': 0, 'height': 0, 'image': 'None'} + if data_dict.get('image', None) is not None: + image_file = data_dict['image'] + image = Image.open(os.path.join(image_folder, + image_file)) + size['image'] = image_file + size['width'] = image.size[0] + size['height'] = image.size[1] + return size + + +if __name__ == '__main__': + print('start calculating modality length') + if data_path.endswith('.json'): + json_data = json.load(open(data_path)) + elif data_path.endswith('.jsonl'): + json_data = load_jsonl(data_path) + else: + raise NotImplementedError + + with ThreadPoolExecutor(max_workers=16) as executor: + length_list = list( + tqdm( + executor.map(calc_fn, json_data), + desc='Calculating modality length', + total=len(json_data))) + print('end calculating modality length') + + new_output_dict = {} + for i in range(len(length_list)): + if length_list[i]['image'] != 'None': + new_output_dict[length_list[i]['image']] = [length_list[i]['width'], length_list[i]['height']] + + with open('llava_v1_5_mix665k_image_size.json', 'w') as f: + json.dump(new_output_dict, f) diff --git a/xtuner/tools/process_untokenized_llava_concatdata.py b/xtuner/tools/process_untokenized_llava_concatdata.py new file mode 100644 index 000000000..124be9ac6 --- /dev/null +++ b/xtuner/tools/process_untokenized_llava_concatdata.py @@ -0,0 +1,74 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import warnings + +from mmengine import Config +import numpy as np + +from xtuner.registry import BUILDER +from tqdm import tqdm +from mmengine.logging import MMLogger + +# ignore FutureWarning in hf datasets +warnings.simplefilter(action='ignore', category=FutureWarning) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('config', help='config file name or path.') + args = parser.parse_args() + return args + + +def build_llava_dataset(config): + dataset = BUILDER.build(config) + return dataset + + +if __name__ == '__main__': + args = parse_args() + cfg = Config.fromfile(args.config) + + logger = MMLogger.get_instance( + name='xtuner', + log_file='benchmark_test.log') + + datasets = cfg.train_dataloader.dataset.datasets + for dataset_cfg in tqdm(datasets): + offline_processed_text_folder = dataset_cfg.pop('offline_processed_text_folder') + logger.info('=================================================================') + logger.info(f'offline_processed_text_folder: {offline_processed_text_folder}') + try: + llava_dataset = build_llava_dataset(dataset_cfg) + text_data = llava_dataset.text_data + + length_list = text_data['length'] + length_np = np.abs(length_list) + min_, max_, mid_ = np.min(length_np), np.max(length_np), np.median(length_np) + logger.info(f'token len({length_np.shape[0]}): max: {max_}, min: {min_}, mid: {mid_}') + try: + image_wh_list = text_data['image_wh'] + new_list = [] + for d in image_wh_list: + if d is not None: + if isinstance(d[0], list): + new_list.append(d[0]) + else: + new_list.append(d) + new_list = np.array(new_list).reshape(-1, 2) + row_sums = np.sum(new_list, axis=1) + max_idx = np.argmax(row_sums) + min_idx = np.argmin(row_sums) + mid_idx = np.argsort(row_sums)[len(row_sums) // 2] + max_value = new_list[max_idx] + min_value = new_list[min_idx] + mid_value = new_list[mid_idx] + logger.info(f'Image wh: max: {max_value}, min: {min_value}, mid: {mid_value}\n') + + except Exception as e: + logger.error(f'=======Error: {e}') + + 
text_data.save_to_disk(offline_processed_text_folder) + except Exception as e: + logger.error(f'--------Error: {e}') + raise NotImplementedError diff --git a/xtuner/tools/test.py b/xtuner/tools/test.py index 5eb3f6d9d..6d5d3820c 100644 --- a/xtuner/tools/test.py +++ b/xtuner/tools/test.py @@ -11,6 +11,7 @@ from xtuner.configs import cfgs_name_path from xtuner.model.utils import guess_load_checkpoint from xtuner.registry import MAP_FUNC +from mmengine.model import is_model_wrapper def parse_args(): @@ -96,7 +97,11 @@ def main(): runner = RUNNERS.build(cfg) state_dict = guess_load_checkpoint(args.checkpoint) - runner.model.load_state_dict(state_dict, strict=False) + + if is_model_wrapper(runner.model): + runner.model.module.load_state_dict(state_dict, strict=False) + else: + runner.model.load_state_dict(state_dict, strict=False) runner.logger.info(f'Load checkpoint from {args.checkpoint}') # start testing diff --git a/xtuner/tools/train.py b/xtuner/tools/train.py index 2696c9a4f..a211e937c 100644 --- a/xtuner/tools/train.py +++ b/xtuner/tools/train.py @@ -332,9 +332,17 @@ def main(): 'sequence_parallel_size', 1)) cfg.__setitem__('strategy', strategy) - optim_wrapper = dict( - type='DeepSpeedOptimWrapper', - optimizer=cfg.optim_wrapper.optimizer) + if 'constructor' in cfg.optim_wrapper: + optim_wrapper = dict( + type='DeepSpeedOptimWrapper', + optimizer=cfg.optim_wrapper.optimizer, + constructor=cfg.optim_wrapper.constructor, + paramwise_cfg=cfg.optim_wrapper.get('paramwise_cfg', None)) + else: + optim_wrapper = dict( + type='DeepSpeedOptimWrapper', + optimizer=cfg.optim_wrapper.optimizer, + paramwise_cfg=cfg.optim_wrapper.get('paramwise_cfg', None)) cfg.__setitem__('optim_wrapper', optim_wrapper) cfg.runner_type = 'FlexibleRunner' diff --git a/xtuner/utils/templates.py b/xtuner/utils/templates.py index fee1e9c1b..37d427b26 100644 --- a/xtuner/utils/templates.py +++ b/xtuner/utils/templates.py @@ -142,6 +142,11 @@ SUFFIX='<|END_OF_TURN_TOKEN|>', SUFFIX_AS_EOS=True, STOP_WORDS=['<|END_OF_TURN_TOKEN|>']), + # TODO: Delete + plain=dict( + SYSTEM='', + INSTRUCTION='USER: {input} ASSISTANT:', + SEP=''), llama3_chat=dict( SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n' '{system}<|eot_id|>'),