diff --git a/examples/vl/README.md b/examples/vl/README.md
new file mode 100644
index 0000000000..cd9abdb25a
--- /dev/null
+++ b/examples/vl/README.md
@@ -0,0 +1,34 @@
+# Vision-Language Web Demo
+
+A chatbot demo with image input.
+
+## Supported Models
+
+- [InternLM/InternLM-XComposer](https://github.com/InternLM/InternLM-XComposer/tree/main)
+- [Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)
+
+## Quick Start
+
+### internlm/internlm-xcomposer-7b
+
+- Extract the LLM part from the Hugging Face model
+  ```shell
+  python extract_xcomposer_llm.py
+  # the LLM part will be saved to the internlm_model folder.
+  ```
+- Launch the demo
+  ```shell
+  python app.py --model-name internlm-xcomposer-7b --llm-ckpt internlm_model
+  ```
+
+### Qwen-VL-Chat
+
+- Launch the demo
+  ```shell
+  python app.py --model-name qwen-vl-chat --hf-ckpt Qwen/Qwen-VL-Chat
+  ```
+
+## Limitations
+
+- This demo reuses the code in the models' original repos to extract image features, which might not be very efficient.
+- This demo only implements the chat function. If you want to use the localization ability of Qwen-VL-Chat or the article-generation function of InternLM-XComposer, you need to implement the corresponding pre/post-processing yourself. Compared with chat, the difference lies in how the prompts are built and how the model output is used.
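Besides the Gradio UI, both model wrappers added below (`qwen_model.py`, `xcomposer_model.py`) expose a `prepare_message` helper that accepts an OpenAI-style message list. A rough sketch of the structure that helper appears to expect, inferred from its implementation; the image path is a hypothetical placeholder:

```python
# Illustrative only: message list as consumed by
# QwenVLChat.prepare_message / InternLMXComposer.prepare_message below.
# '/path/to/cat.jpg' is a made-up local image path.
messages = [
    {
        'role': 'user',
        'content': [
            {'type': 'text', 'text': 'What is in this picture?'},
            {'type': 'image_url', 'image_url': {'url': '/path/to/cat.jpg'}},
        ],
    },
]
# `preprocessor` is whichever wrapper was constructed, e.g. QwenVLChat(...).
# It returns token ids, image features and the ranges the features occupy.
# input_ids, features, ranges = preprocessor.prepare_message(messages)
```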
diff --git a/examples/vl/app.py b/examples/vl/app.py
new file mode 100644
index 0000000000..39af89cf49
--- /dev/null
+++ b/examples/vl/app.py
@@ -0,0 +1,238 @@
+import argparse
+import os
+import random
+from dataclasses import dataclass, field
+from itertools import count
+from pathlib import Path
+from threading import Lock
+from typing import List, Tuple
+
+import gradio as gr
+from qwen_model import QwenVLChat
+from xcomposer_model import InternLMXComposer
+
+from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn
+from lmdeploy.turbomind import TurboMind
+from lmdeploy.turbomind.chat import valid_str
+
+BATCH_SIZE = 32
+DEFAULT_MODEL_NAME = 'internlm-xcomposer-7b'
+DEFAULT_HF_CKPT = 'internlm/internlm-xcomposer-7b'
+# should use extract_xcomposer_llm.py to extract the llm
+# when using internlm-xcomposer-7b
+DEFAULT_LLM_CKPT = None
+
+SUPPORTED_MODELS = {
+    'internlm-xcomposer-7b': InternLMXComposer,
+    'qwen-vl-chat': QwenVLChat
+}
+
+
+@dataclass
+class Session:
+    _lock = Lock()
+    _count = count()
+    _session_id: int = None
+    _message: List[Tuple[str, str]] = field(default_factory=list)
+    _step: int = 0
+
+    def __init__(self):
+        with Session._lock:
+            self._session_id = next(Session._count)
+        self._message = []
+        self._step = 0
+
+    @property
+    def session_id(self):
+        return self._session_id
+
+    @property
+    def message(self):
+        return self._message
+
+    @property
+    def step(self):
+        return self._step
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model-name',
+                        type=str,
+                        default=DEFAULT_MODEL_NAME,
+                        help='Model name, default to %(default)s')
+    parser.add_argument(
+        '--hf-ckpt',
+        type=str,
+        default=DEFAULT_HF_CKPT,
+        help='hf checkpoint name or path, default to %(default)s')
+    parser.add_argument(
+        '--llm-ckpt',
+        type=str,
+        default=DEFAULT_LLM_CKPT,
+        help='LLM checkpoint name or path, default to %(default)s')
+    parser.add_argument('--server-port',
+                        type=int,
+                        default=9006,
+                        help='Server port, default %(default)s')
+    parser.add_argument('--server-name',
+                        type=str,
+                        default='127.0.0.1',
+                        help='Server name, default %(default)s')
+    args = parser.parse_args()
+    return args
+
+
+def load_preprocessor_model(args):
+    assert args.model_name in SUPPORTED_MODELS
+    llm_ckpt = args.hf_ckpt if args.llm_ckpt is None else args.llm_ckpt
+    preprocessor = SUPPORTED_MODELS[args.model_name](args.hf_ckpt)
+    model = TurboMind.from_pretrained(llm_ckpt, model_name=args.model_name)
+    return preprocessor, model
+
+
+def launch_demo(args, preprocessor, model):
+
+    def add_image(chatbot, session, file):
+        chatbot = chatbot + [((file.name, ), None)]
+        # print('add_image', chatbot)
+        history = session._message
+        # [([user, url, url], assistant), ...]
+        if len(history) == 0 or history[-1][-1] is not None:
+            history.append([[file.name], None])
+        else:
+            history[-1][0].append(file.name)
+        return chatbot, session
+
+    def add_text(chatbot, session, text):
+        chatbot = chatbot + [(text, None)]
+        history = session._message
+        if len(history) == 0 or history[-1][-1] is not None:
+            history.append([text, None])
+        else:
+            history[-1][0].insert(0, text)
+        return chatbot, session, disable_btn, enable_btn
+
+    def chat(
+        chatbot,
+        session,
+    ):
+        yield chatbot, session, disable_btn, enable_btn, disable_btn
+
+        generator = model.create_instance()
+        history = session._message
+        sequence_start = len(history) == 1
+        seed = random.getrandbits(64) if sequence_start else None
+        input_ids, features, ranges = preprocessor.prepare_query(
+            history[-1][0], sequence_start)
+
+        if len(input_ids) + session.step > model.model.session_len:
+            gr.Warning('WARNING: exceed session max length.'
+                       ' Please restart the session by reset button.')
+
+        response_size = 0
+        step = session.step
+        for outputs in generator.stream_infer(session_id=session.session_id,
+                                              input_ids=input_ids,
+                                              input_embeddings=features,
+                                              input_embedding_ranges=ranges,
+                                              stream_output=True,
+                                              sequence_start=sequence_start,
+                                              random_seed=seed,
+                                              step=step):
+            res, tokens = outputs[0]
+            # decode res
+            response = model.tokenizer.decode(res.tolist(),
+                                              offset=response_size)
+            if response.endswith('�'):
+                continue
+            response = valid_str(response)
+            response_size = tokens
+            if chatbot[-1][1] is None:
+                chatbot[-1][1] = ''
+                history[-1][1] = ''
+            chatbot[-1][1] += response
+            history[-1][1] += response
+            session._step = step + len(input_ids) + tokens
+            yield chatbot, session, disable_btn, enable_btn, disable_btn
+
+        yield chatbot, session, enable_btn, disable_btn, enable_btn
+
+    def cancel(chatbot, session):
+        generator = model.create_instance()
+        for _ in generator.stream_infer(session_id=session.session_id,
+                                        input_ids=[0],
+                                        request_output_len=0,
+                                        sequence_start=False,
+                                        sequence_end=False,
+                                        stop=True):
+            pass
+        return chatbot, session, disable_btn, enable_btn
+
+    def reset(session):
+        generator = model.create_instance()
+        for _ in generator.stream_infer(session_id=session.session_id,
+                                        input_ids=[0],
+                                        request_output_len=0,
+                                        sequence_start=False,
+                                        sequence_end=False,
+                                        stop=True):
+            pass
+        return [], Session()
+
+    with gr.Blocks(css=CSS, theme=THEME) as demo:
+        with gr.Column(elem_id='container'):
+            gr.Markdown('## LMDeploy VL Playground')
+
+            chatbot = gr.Chatbot(elem_id='chatbot', label=model.model_name)
+            query = gr.Textbox(placeholder='Please input the instruction',
+                               label='Instruction')
+            session = gr.State()
+
+            with gr.Row():
+                addimg_btn = gr.UploadButton('Upload Image',
+                                             file_types=['image'])
+                cancel_btn = gr.Button(value='Cancel', interactive=False)
+                reset_btn = gr.Button(value='Reset')
+
+        addimg_btn.upload(add_image, [chatbot, session, addimg_btn],
+                          [chatbot, session],
+                          show_progress=True)
+
+        send_event = query.submit(
+            add_text, [chatbot, session, query], [chatbot, session]).then(
+                chat, [chatbot, session],
+                [chatbot, session, query, cancel_btn, reset_btn])
+        query.submit(lambda: gr.update(value=''), None, [query])
+
+        cancel_btn.click(cancel, [chatbot, session],
+                         [chatbot, session, cancel_btn, reset_btn],
+                         cancels=[send_event])
+
+        reset_btn.click(reset, [session], [chatbot, session],
+                        cancels=[send_event])
+
+        demo.load(lambda: Session(), inputs=None, outputs=[session])
+
+    demo.queue(api_open=True, concurrency_count=BATCH_SIZE, max_size=100)
+    demo.launch(
+        share=True,
+        server_port=args.server_port,
+        server_name=args.server_name,
+    )
+
+
+def main():
+    args = parse_args()
+
+    cur_folder = Path(__file__).parent.as_posix()
+    if cur_folder != os.getcwd():
+        os.chdir(cur_folder)
+        print(f'change working dir to {cur_folder}')
+
+    preprocessor, model = load_preprocessor_model(args)
+    launch_demo(args, preprocessor, model)
+
+
+if __name__ == '__main__':
+    main()
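`app.py` wires the preprocessor and the TurboMind engine into Gradio, but the same flow can be driven directly. A minimal sketch (not part of this patch), assuming the same lmdeploy APIs that `app.py` above relies on; the checkpoint choice, the prompt, and `demo.jpg` are placeholders:

```python
# Minimal sketch: run one image+text query without the Gradio UI.
import random

from qwen_model import QwenVLChat

from lmdeploy.turbomind import TurboMind

preprocessor = QwenVLChat('Qwen/Qwen-VL-Chat')
model = TurboMind.from_pretrained('Qwen/Qwen-VL-Chat',
                                  model_name='qwen-vl-chat')
generator = model.create_instance()

# Query format matches app.py's history entries: [text, image_path, ...]
query = ['Describe the image.', 'demo.jpg']
input_ids, features, ranges = preprocessor.prepare_query(query, True)

response_size = 0
for outputs in generator.stream_infer(session_id=0,
                                      input_ids=input_ids,
                                      input_embeddings=features,
                                      input_embedding_ranges=ranges,
                                      stream_output=True,
                                      sequence_start=True,
                                      random_seed=random.getrandbits(64)):
    res, tokens = outputs[0]
    # incremental decode, as in app.py's chat() loop
    print(model.tokenizer.decode(res.tolist(), offset=response_size), end='')
    response_size = tokens
```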
diff --git a/examples/vl/extract_xcomposer_llm.py b/examples/vl/extract_xcomposer_llm.py
new file mode 100644
index 0000000000..5da0bd4d14
--- /dev/null
+++ b/examples/vl/extract_xcomposer_llm.py
@@ -0,0 +1,41 @@
+import os
+from pathlib import Path
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+from xcomposer_model import InternLMXComposerTemplate  # noqa
+
+model = AutoModel.from_pretrained('internlm/internlm-xcomposer-7b',
+                                  trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained('internlm/internlm-xcomposer-7b',
+                                          trust_remote_code=True)
+
+internlm_model = model.internlm_model
+
+lora_layers = [
+    'self_attn.q_proj', 'self_attn.v_proj', 'mlp.down_proj', 'mlp.up_proj'
+]
+
+
+def get_attr(m, key):
+    keys = key.split('.')
+    for key in keys:
+        m = getattr(m, key)
+    return m
+
+
+# merge lora
+for i in range(len(internlm_model.model.layers)):
+    layer = internlm_model.model.layers[i]
+    for key in lora_layers:
+        lora_linear = get_attr(layer, key)
+        lora_b = lora_linear.lora_B
+        lora_a = lora_linear.lora_A
+        w_ba = torch.matmul(lora_b.weight, lora_a.weight)
+        lora_linear.weight.data += w_ba.data
+
+# save model
+cur_folder = Path(__file__).parent
+dst_path = os.path.join(cur_folder, 'internlm_model')
+internlm_model.save_pretrained(dst_path)
+tokenizer.save_pretrained(dst_path)
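The merge above relies on the identity `x @ (W + B @ A).T == x @ W.T + (x @ A.T) @ B.T`; any LoRA scaling factor is assumed to already be folded into the weights, as the script implies. A self-contained check with toy shapes, unrelated to the real model:

```python
# Sanity check of the LoRA merge identity used in extract_xcomposer_llm.py.
import torch

d_out, d_in, rank = 8, 16, 4
W = torch.randn(d_out, d_in)   # base linear weight
A = torch.randn(rank, d_in)    # lora_A.weight
B = torch.randn(d_out, rank)   # lora_B.weight
x = torch.randn(2, d_in)

y_lora = x @ W.t() + (x @ A.t()) @ B.t()   # unmerged forward pass
y_merged = x @ (W + B @ A).t()             # forward pass after W += B @ A
assert torch.allclose(y_lora, y_merged, atol=1e-5)
```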
diff --git a/examples/vl/qwen_model.py b/examples/vl/qwen_model.py
new file mode 100644
index 0000000000..02b5553eb5
--- /dev/null
+++ b/examples/vl/qwen_model.py
@@ -0,0 +1,145 @@
+import os
+from glob import glob
+
+import numpy as np
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+from lmdeploy.model import MODELS, Qwen7BChat
+
+
+@MODELS.register_module(name='qwen-vl-chat')
+class QwenVLChatTemplate(Qwen7BChat):
+
+    def __init__(self,
+                 session_len=8192,
+                 top_p=0.3,
+                 top_k=None,
+                 temperature=1.0,
+                 im_start='<|im_start|>',
+                 im_end='<|im_end|>',
+                 system='You are a helpful assistant.',
+                 stop_words=['<|im_end|>'],
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.session_len = session_len
+        self.top_p = top_p
+        self.top_k = top_k
+        self.temperature = temperature
+        self.im_start = im_start
+        self.im_end = im_end
+        self.system = system
+        self.stop_words = stop_words
+
+    def _concat_image_info(self, prompt):
+        if isinstance(prompt, str):
+            return prompt
+        prompt, nimg = prompt
+        res = ''
+        for i in range(nimg):
+            res += f'Picture {str(i)}:<img>placeholder</img>\n'
+        prompt = res + prompt
+        return prompt
+
+    def decorate_prompt(self, prompt, sequence_start=True):
+        prompt = self._concat_image_info(prompt)
+        return super().decorate_prompt(prompt, sequence_start)
+
+    def messages2prompt(self, messages, sequence_start=True):
+        if isinstance(messages, str) or isinstance(messages[0], str):
+            return self.decorate_prompt(messages, sequence_start)
+        system, users, assistants = self._translate_messages(messages)
+        ret = f'{self.im_start}system\n{system}{self.im_end}'
+        for user, assistant in zip(users, assistants):
+            if not isinstance(user, str):
+                user = [user[0]['text'], len(user) - 1]
+                user = self._concat_image_info(user)
+            if assistant:
+                ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
+                       f'\n{self.im_start}assistant\n{assistant}'
+            else:
+                ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
+                       f'\n{self.im_start}assistant\n'
+        return ret
+
+
+class QwenVLChat:
+
+    def __init__(self, pretrained_model_name_or_path, **kwargs):
+        self.pretrained_model_name_or_path = pretrained_model_name_or_path
+        self.decorator = QwenVLChatTemplate(**kwargs)
+        self._load_model()
+
+    def _load_model(self):
+        path = self.pretrained_model_name_or_path
+        if not os.path.exists(path):
+            path = snapshot_download(path)
+        self.tokenizer = AutoTokenizer.from_pretrained(path,
+                                                       trust_remote_code=True)
+        with init_empty_weights():
+            config = AutoConfig.from_pretrained(path, trust_remote_code=True)
+            model = AutoModelForCausalLM.from_config(config,
+                                                     trust_remote_code=True)
+            del model.lm_head
+            for key in ['wte', 'h', 'ln_f']:
+                setattr(model.transformer, key, None)
+        model.to_empty(device='cpu')
+        named_parameters = set()
+        for key, _ in model.named_parameters():
+            named_parameters.add(key)
+        # TODO: load bin according to index.json
+        bins = glob(os.path.join(path, '*.bin'))
+        for bin in bins:
+            dt = torch.load(bin, map_location='cpu')
+            missed, _ = model.load_state_dict(dt, strict=False)
+            named_parameters.difference_update(set(missed))
+        assert len(
+            named_parameters) == 0, f'missing keys: {named_parameters}'
+        self.model = model.to('cuda').eval()
+
+    @torch.no_grad()
+    def encode_img(self, paths):
+        if len(paths) == 0:
+            return None
+        features = []
+        # with torch.cuda.amp.autocast(dtype=torch.float16):
+        features = self.model.transformer.visual.encode(paths).float()
+        features = [x.cpu().numpy() for x in features]
+        return features
+
+    def _to_inputs(self, decorate_text, image_paths, sequence_start):
+        features = self.encode_img(image_paths)
+        input_ids = self.tokenizer.encode(decorate_text)
+        ranges = None
+        if features is not None:
+            input_ids_arr = np.array(input_ids)
+            begins = np.where(
+                input_ids_arr == self.tokenizer.img_start_id)[0] + 1
+            ends = np.where(input_ids_arr == self.tokenizer.img_end_id)[0]
+            ranges = np.stack([begins, ends], axis=1)
+            assert len(features) == len(ranges)
+        return input_ids, features, ranges
+
+    def prepare_query(self, query, sequence_start=True):
+        image_paths = []
+        if not isinstance(query, str):
+            query, image_paths = query[0], query[1:]
+        decorate_text = self.decorator.decorate_prompt(
+            (query, len(image_paths)), sequence_start)
+        return self._to_inputs(decorate_text, image_paths, sequence_start)
+
+    def prepare_message(self, messages):
+        decorate_text = self.decorator.messages2prompt(messages, True)
+        image_paths = []
+        for msg in messages:
+            if msg['role'] == 'user':
+                content = msg['content']
+                if isinstance(content, str):
+                    continue
+                for item in content:
+                    if item['type'] == 'image_url':
+                        url = item['image_url']['url']
+                        image_paths.append(url)
+        return self._to_inputs(decorate_text, image_paths, True)
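`QwenVLChat._to_inputs` locates the `<img>...</img>` spans by token id and pairs each span with one image feature. A toy illustration of the range computation; `314`/`315` are made-up stand-ins for `tokenizer.img_start_id`/`tokenizer.img_end_id`, and the real ids differ:

```python
# Toy illustration of the range computation in QwenVLChat._to_inputs.
import numpy as np

input_ids = [1, 2, 314, 7, 7, 7, 315, 3, 4]   # one image span of 3 tokens
ids = np.array(input_ids)
begins = np.where(ids == 314)[0] + 1           # first position inside the span
ends = np.where(ids == 315)[0]                 # position of the closing tag
ranges = np.stack([begins, ends], axis=1)
print(ranges)  # [[3 6]] -> the image embedding is injected over ids[3:6]
```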
diff --git a/examples/vl/xcomposer_model.py b/examples/vl/xcomposer_model.py
new file mode 100644
index 0000000000..a5fd350195
--- /dev/null
+++ b/examples/vl/xcomposer_model.py
@@ -0,0 +1,166 @@
+import os
+# from safetensors.torch import load_file
+from collections.abc import Sequence
+from glob import glob
+
+import numpy as np
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+from lmdeploy.model import MODELS, BaseModel
+
+meta_instruction = """meta instruction
+You are an AI assistant whose name is 浦语.
+- 浦语 is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
+- 浦语 can understand and communicate fluently in the language chosen by the user such as English and 中文.
+conversation
+"""  # noqa
+
+
+@MODELS.register_module(name='internlm-xcomposer-7b')
+class InternLMXComposerTemplate(BaseModel):
+
+    def __init__(self,
+                 system=meta_instruction,
+                 user='<|User|>:',
+                 assistant='<|Bot|>:',
+                 eoh='',
+                 eoa='',
+                 stop_words=['', ''],
+                 image_placeholder='',
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.system = system
+        self.user = user
+        self.assistant = assistant
+        self.eoh = eoh
+        self.eoa = eoa
+        self.stop_words = stop_words
+        self.image_placeholder = image_placeholder
+
+    def _concat_image_info(self, prompt):
+        if isinstance(prompt, str):
+            return prompt
+        prompt, nimg = prompt
+        assert nimg <= 1
+        if nimg == 1:
+            prompt = f'{self.image_placeholder}{prompt}'
+        return prompt
+
+    def decorate_prompt(self, prompt, sequence_start=True):
+        prompt = self._concat_image_info(prompt)
+        if sequence_start:
+            return f'{self.system} {self.user} {prompt}{self.eoh} {self.assistant}'  # noqa
+        else:
+            return f' {self.user} {prompt}{self.eoh} {self.assistant}'
+
+    def messages2prompt(self, messages, sequence_start=True):
+        if isinstance(messages, str) or isinstance(messages[0], str):
+            return self.decorate_prompt(messages, sequence_start)
+        system, users, assistants = self._translate_messages(messages)
+        system = self.system if not system else system
+        ret = system
+        for user, assistant in zip(users, assistants):
+            if not isinstance(user, str):
+                assert isinstance(user, Sequence)
+                assert all(isinstance(item, dict) for item in user)
+                user = [user[0]['text'], len(user) - 1]
+                user = self._concat_image_info(user)
+            if assistant:
+                ret += f' {self.user} {user}{self.eoh} {self.assistant} {assistant}{self.eoa}'  # noqa
+            else:
+                ret += f' {self.user} {user}{self.eoh} {self.assistant}'
+        return ret
+
+
+class InternLMXComposer:
+
+    def __init__(self, pretrained_model_name_or_path, **kwargs):
+        self.pretrained_model_name_or_path = pretrained_model_name_or_path
+        self.decorator = InternLMXComposerTemplate(**kwargs)
+        self._load_model()
+
+    def _load_model(self):
+        path = self.pretrained_model_name_or_path
+        if not os.path.exists(path):
+            path = snapshot_download(path)
+        self.tokenizer = AutoTokenizer.from_pretrained(path,
+                                                       trust_remote_code=True)
+        with init_empty_weights():
+            config = AutoConfig.from_pretrained(path, trust_remote_code=True)
+            config.num_hidden_layers = 0  # speedup
+            model = AutoModelForCausalLM.from_config(config,
+                                                     trust_remote_code=True)
+            model.internlm_model = None
+        model.to_empty(device='cpu')
+        named_parameters = set()
+        for key, _ in model.named_parameters():
+            named_parameters.add(key)
+        # TODO: load bin according to index.json
+        bins = glob(os.path.join(path, '*.bin'))
+        # bins = glob(os.path.join(path, '*.safetensors'))
+        for bin in bins:
+            dt = torch.load(bin, map_location='cpu')
+            # dt = load_file(bin)
+            missed, _ = model.load_state_dict(dt, strict=False)
+            named_parameters.difference_update(set(missed))
+        assert len(
+            named_parameters) == 0, f'missing keys: {named_parameters}'
+        self.model = model.to('cuda').eval()
+
+    @torch.no_grad()
+    def encode_img(self, paths):
+        if len(paths) == 0:
+            return None
+        features = []
+        with torch.cuda.amp.autocast(dtype=torch.float16):
+            for path in paths:
+                out = self.model.encode_img(path)
+                features.append(out.squeeze().cpu().numpy())
+        return features
+
+    def _to_inputs(self, decorate_text, image_paths, sequence_start):
+        features = self.encode_img(image_paths)
+        input_ids = []
+        ranges = None
+        begins = []
+        segs = decorate_text.split(self.decorator.image_placeholder)
+        image_dim = features[-1].shape[0] if features is not None else 0
+        for i, seg in enumerate(segs):
+            if i > 0:
+                begins.append(len(input_ids))
+                input_ids.extend([0] * image_dim)
+            seg_ids = self.tokenizer.encode(
+                seg, add_special_tokens=((i == 0) and sequence_start))
+            input_ids.extend(seg_ids)
+        if features is not None:
+            ends = np.array(begins) + image_dim
+            ranges = np.stack([begins, ends], axis=1).tolist()
+        return input_ids, features, ranges
+
+    def prepare_query(self, query, sequence_start=True):
+        image_paths = []
+        if not isinstance(query, str):
+            query, image_paths = query[0], query[1:]
+            if len(image_paths) > 1:
+                print('does not support multiple images, use last one.')
+                image_paths = image_paths[-1:]
+        decorate_text = self.decorator.decorate_prompt(
+            (query, len(image_paths)))
+        return self._to_inputs(decorate_text, image_paths, sequence_start)
+
+    def prepare_message(self, messages):
+        decorate_text = self.decorator.messages2prompt(messages, True)
+        image_paths = []
+        for msg in messages:
+            if msg['role'] == 'user':
+                content = msg['content']
+                if isinstance(content, str):
+                    continue
+                for item in content:
+                    if item['type'] == 'image_url':
+                        url = item['image_url']['url']
+                        image_paths.append(url)
+        return self._to_inputs(decorate_text, image_paths, True)
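`InternLMXComposer._to_inputs` splits the decorated prompt on the image placeholder and reserves `image_dim` dummy ids where the visual embedding will be injected. A toy, tokenizer-free sketch of that logic; the `<ImageHere>` string, `image_dim`, and the fake tokenizer are made-up stand-ins:

```python
# Toy version of the split-and-pad logic in InternLMXComposer._to_inputs.
import numpy as np

placeholder = '<ImageHere>'                    # stand-in placeholder
decorate_text = f'system text {placeholder}describe the image'
image_dim = 4                                  # rows of the (fake) image feature
fake_encode = lambda s: list(range(len(s.split())))  # stand-in tokenizer

input_ids, begins = [], []
for i, seg in enumerate(decorate_text.split(placeholder)):
    if i > 0:
        begins.append(len(input_ids))
        input_ids.extend([0] * image_dim)      # dummy ids, overwritten later
    input_ids.extend(fake_encode(seg))
ends = np.array(begins) + image_dim
print(np.stack([begins, ends], axis=1).tolist())   # [[2, 6]]
```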
diff --git a/lmdeploy/serve/gradio/constants.py b/lmdeploy/serve/gradio/constants.py
index 891c572e5a..2fea11b98b 100644
--- a/lmdeploy/serve/gradio/constants.py
+++ b/lmdeploy/serve/gradio/constants.py
@@ -24,5 +24,5 @@
     secondary_hue=gr.themes.colors.sky,
     font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif'])
 
-enable_btn = gr.Button.update(interactive=True)
-disable_btn = gr.Button.update(interactive=False)
+enable_btn = gr.update(interactive=True)
+disable_btn = gr.update(interactive=False)
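The `constants.py` change replaces the component-specific `gr.Button.update` with the generic `gr.update`, which newer Gradio releases expect. A minimal sketch of how such an update object is typically returned from a callback; the components and handler are illustrative, not part of this patch:

```python
# Minimal sketch: a callback returning gr.update toggles a component property.
import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button('Run')
    out = gr.Textbox()

    def run():
        # disable the button and report a result in one return
        return gr.update(interactive=False), 'done'

    btn.click(run, inputs=None, outputs=[btn, out])
```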