diff --git a/examples/vl/README.md b/examples/vl/README.md
new file mode 100644
index 0000000000..cd9abdb25a
--- /dev/null
+++ b/examples/vl/README.md
@@ -0,0 +1,34 @@
+# Vision-Language Web Demo
+
+A chatbot demo with image input.
+
+## Supported Models
+
+- [InternLM/InternLM-XComposer](https://github.com/InternLM/InternLM-XComposer/tree/main)
+- [Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)
+
+## Quick Start
+
+### internlm/internlm-xcomposer-7b
+
+- Extract the LLM part from the Hugging Face model
+  ```shell
+  python extract_xcomposer_llm.py
+  # the LLM part will be saved to the internlm_model folder
+  ```
+- Launch the demo
+  ```shell
+  python app.py --model-name internlm-xcomposer-7b --llm-ckpt internlm_model
+  ```
+
+### Qwen-VL-Chat
+
+- Launch the demo
+  ```shell
+  python app.py --model-name qwen-vl-chat --hf-ckpt Qwen/Qwen-VL-Chat
+  ```
+
+## Limitations
+
+- This demo reuses the code from each model's upstream repo to extract image features, which may not be very efficient.
+- This demo only implements the chat function. To use the localization ability of Qwen-VL-Chat or the article-generation function of InternLM-XComposer, you need to implement the corresponding pre/post-processing yourself. Compared with chat, the difference lies in how the prompts are built and how the model output is used (see the sketch below).
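+
+Both preprocessors also accept OpenAI-style messages through their `prepare_message` method. Below is a minimal sketch of the layout they expect, based on how `prepare_message` is implemented in `qwen_model.py` / `xcomposer_model.py`; the question and image path are hypothetical, and the text item must come first.
+
+```python
+# hypothetical example: one user turn with a text item followed by an image item
+messages = [{
+    'role': 'user',
+    'content': [
+        {'type': 'text', 'text': 'Describe this image.'},
+        {'type': 'image_url', 'image_url': {'url': '/path/to/example.jpg'}},
+    ],
+}]
+# input_ids, features, ranges = preprocessor.prepare_message(messages)
+```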
diff --git a/examples/vl/app.py b/examples/vl/app.py
new file mode 100644
index 0000000000..39af89cf49
--- /dev/null
+++ b/examples/vl/app.py
@@ -0,0 +1,238 @@
+import argparse
+import os
+import random
+from dataclasses import dataclass, field
+from itertools import count
+from pathlib import Path
+from threading import Lock
+from typing import List, Tuple
+
+import gradio as gr
+from qwen_model import QwenVLChat
+from xcomposer_model import InternLMXComposer
+
+from lmdeploy.serve.gradio.constants import CSS, THEME, disable_btn, enable_btn
+from lmdeploy.turbomind import TurboMind
+from lmdeploy.turbomind.chat import valid_str
+
+BATCH_SIZE = 32
+DEFAULT_MODEL_NAME = 'internlm-xcomposer-7b'
+DEFAULT_HF_CKPT = 'internlm/internlm-xcomposer-7b'
+# use extract_xcomposer_llm.py to extract the LLM part
+# when using internlm-xcomposer-7b
+DEFAULT_LLM_CKPT = None
+
+SUPPORTED_MODELS = {
+ 'internlm-xcomposer-7b': InternLMXComposer,
+ 'qwen-vl-chat': QwenVLChat
+}
+
+
+@dataclass
+class Session:
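+    """Per-user chat session state.
+
+    Each turn in `_message` is stored as `[query, assistant_text]`, where
+    `query` is either a plain string or a list `[text, img_path, ...]`.
+    `_step` tracks how many tokens the engine has already processed for this
+    session id.
+    """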
+ _lock = Lock()
+ _count = count()
+ _session_id: int = None
+ _message: List[Tuple[str, str]] = field(default_factory=list)
+ _step: int = 0
+
+ def __init__(self):
+ with Session._lock:
+ self._session_id = next(Session._count)
+ self._message = []
+ self._step = 0
+
+ @property
+ def session_id(self):
+ return self._session_id
+
+ @property
+ def message(self):
+ return self._message
+
+ @property
+ def step(self):
+ return self._step
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model-name',
+ type=str,
+ default=DEFAULT_MODEL_NAME,
+ help='Model name, default to %(default)s')
+ parser.add_argument(
+ '--hf-ckpt',
+ type=str,
+ default=DEFAULT_HF_CKPT,
+ help='hf checkpoint name or path, default to %(default)s')
+ parser.add_argument(
+ '--llm-ckpt',
+ type=str,
+ default=DEFAULT_LLM_CKPT,
+ help='LLM checkpoint name or path, default to %(default)s')
+ parser.add_argument('--server-port',
+ type=int,
+ default=9006,
+ help='Server port, default %(default)s')
+ parser.add_argument('--server-name',
+ type=str,
+ default='127.0.0.1',
+ help='Server name, default %(default)s')
+ args = parser.parse_args()
+ return args
+
+
+def load_preprocessor_model(args):
+ assert args.model_name in SUPPORTED_MODELS
+ llm_ckpt = args.hf_ckpt if args.llm_ckpt is None else args.llm_ckpt
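+    # the preprocessor (HF vision parts + chat template) runs in PyTorch,
+    # while TurboMind serves the language model part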
+ preprocessor = SUPPORTED_MODELS[args.model_name](args.hf_ckpt)
+ model = TurboMind.from_pretrained(llm_ckpt, model_name=args.model_name)
+ return preprocessor, model
+
+
+def launch_demo(args, preprocessor, model):
+
+ def add_image(chatbot, session, file):
+ chatbot = chatbot + [((file.name, ), None)]
+ # print('add_image', chatbot)
+ history = session._message
+ # [([user, url, url], assistant), ...]
+ if len(history) == 0 or history[-1][-1] is not None:
+ history.append([[file.name], None])
+ else:
+ history[-1][0].append(file.name)
+ return chatbot, session
+
+ def add_text(chatbot, session, text):
+ chatbot = chatbot + [(text, None)]
+ history = session._message
+ if len(history) == 0 or history[-1][-1] is not None:
+ history.append([text, None])
+ else:
+ history[-1][0].insert(0, text)
+ return chatbot, session, disable_btn, enable_btn
+
+ def chat(
+ chatbot,
+ session,
+ ):
+ yield chatbot, session, disable_btn, enable_btn, disable_btn
+
+ generator = model.create_instance()
+ history = session._message
+ sequence_start = len(history) == 1
+ seed = random.getrandbits(64) if sequence_start else None
+ input_ids, features, ranges = preprocessor.prepare_query(
+ history[-1][0], sequence_start)
+
+ if len(input_ids) + session.step > model.model.session_len:
+            gr.Warning('WARNING: exceeded session max length.'
+                       ' Please reset the session with the Reset button.')
+
+ response_size = 0
+ step = session.step
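+        # Stream tokens from TurboMind. The vision features are injected as
+        # input embeddings at the positions given by `ranges`, and `step`
+        # resumes the sequence from the tokens processed in earlier turns.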
+ for outputs in generator.stream_infer(session_id=session.session_id,
+ input_ids=input_ids,
+ input_embeddings=features,
+ input_embedding_ranges=ranges,
+ stream_output=True,
+ sequence_start=sequence_start,
+ random_seed=seed,
+ step=step):
+ res, tokens = outputs[0]
+ # decode res
+ response = model.tokenizer.decode(res.tolist(),
+ offset=response_size)
+ if response.endswith('�'):
+ continue
+ response = valid_str(response)
+ response_size = tokens
+ if chatbot[-1][1] is None:
+ chatbot[-1][1] = ''
+ history[-1][1] = ''
+ chatbot[-1][1] += response
+ history[-1][1] += response
+ session._step = step + len(input_ids) + tokens
+ yield chatbot, session, disable_btn, enable_btn, disable_btn
+
+ yield chatbot, session, enable_btn, disable_btn, enable_btn
+
+ def cancel(chatbot, session):
+ generator = model.create_instance()
+ for _ in generator.stream_infer(session_id=session.session_id,
+ input_ids=[0],
+ request_output_len=0,
+ sequence_start=False,
+ sequence_end=False,
+ stop=True):
+ pass
+ return chatbot, session, disable_btn, enable_btn
+
+ def reset(session):
+ generator = model.create_instance()
+ for _ in generator.stream_infer(session_id=session.session_id,
+ input_ids=[0],
+ request_output_len=0,
+ sequence_start=False,
+ sequence_end=False,
+ stop=True):
+ pass
+ return [], Session()
+
+ with gr.Blocks(css=CSS, theme=THEME) as demo:
+ with gr.Column(elem_id='container'):
+ gr.Markdown('## LMDeploy VL Playground')
+
+ chatbot = gr.Chatbot(elem_id='chatbot', label=model.model_name)
+ query = gr.Textbox(placeholder='Please input the instruction',
+ label='Instruction')
+ session = gr.State()
+
+ with gr.Row():
+ addimg_btn = gr.UploadButton('Upload Image',
+ file_types=['image'])
+ cancel_btn = gr.Button(value='Cancel', interactive=False)
+ reset_btn = gr.Button(value='Reset')
+
+ addimg_btn.upload(add_image, [chatbot, session, addimg_btn],
+ [chatbot, session],
+ show_progress=True)
+
+ send_event = query.submit(
+ add_text, [chatbot, session, query], [chatbot, session]).then(
+ chat, [chatbot, session],
+ [chatbot, session, query, cancel_btn, reset_btn])
+ query.submit(lambda: gr.update(value=''), None, [query])
+
+ cancel_btn.click(cancel, [chatbot, session],
+ [chatbot, session, cancel_btn, reset_btn],
+ cancels=[send_event])
+
+ reset_btn.click(reset, [session], [chatbot, session],
+ cancels=[send_event])
+
+ demo.load(lambda: Session(), inputs=None, outputs=[session])
+
+ demo.queue(api_open=True, concurrency_count=BATCH_SIZE, max_size=100)
+ demo.launch(
+ share=True,
+ server_port=args.server_port,
+ server_name=args.server_name,
+ )
+
+
+def main():
+ args = parse_args()
+
+ cur_folder = Path(__file__).parent.as_posix()
+ if cur_folder != os.getcwd():
+ os.chdir(cur_folder)
+ print(f'change working dir to {cur_folder}')
+
+ preprocessor, model = load_preprocessor_model(args)
+ launch_demo(args, preprocessor, model)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/vl/extract_xcomposer_llm.py b/examples/vl/extract_xcomposer_llm.py
new file mode 100644
index 0000000000..5da0bd4d14
--- /dev/null
+++ b/examples/vl/extract_xcomposer_llm.py
@@ -0,0 +1,41 @@
+import os
+from pathlib import Path
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+from xcomposer_model import InternLMXComposerTemplate # noqa
+
+model = AutoModel.from_pretrained('internlm/internlm-xcomposer-7b',
+ trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained('internlm/internlm-xcomposer-7b',
+ trust_remote_code=True)
+
+internlm_model = model.internlm_model
+
+lora_layers = [
+ 'self_attn.q_proj', 'self_attn.v_proj', 'mlp.down_proj', 'mlp.up_proj'
+]
+
+
+def get_attr(m, key):
+ keys = key.split('.')
+ for key in keys:
+ m = getattr(m, key)
+ return m
+
+
+# merge lora
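+# Fold each LoRA update into its base weight (W <- W + B @ A) so the exported
+# LLM can run without the LoRA modules at inference time.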
+for i in range(len(internlm_model.model.layers)):
+ layer = internlm_model.model.layers[i]
+ for key in lora_layers:
+ lora_linear = get_attr(layer, key)
+ lora_b = lora_linear.lora_B
+ lora_a = lora_linear.lora_A
+ w_ba = torch.matmul(lora_b.weight, lora_a.weight)
+ lora_linear.weight.data += w_ba.data
+
+# save model
+cur_folder = Path(__file__).parent
+dst_path = os.path.join(cur_folder, 'internlm_model')
+internlm_model.save_pretrained(dst_path)
+tokenizer.save_pretrained(dst_path)
diff --git a/examples/vl/qwen_model.py b/examples/vl/qwen_model.py
new file mode 100644
index 0000000000..02b5553eb5
--- /dev/null
+++ b/examples/vl/qwen_model.py
@@ -0,0 +1,145 @@
+import os
+from glob import glob
+
+import numpy as np
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+from lmdeploy.model import MODELS, Qwen7BChat
+
+
+@MODELS.register_module(name='qwen-vl-chat')
+class QwenVLChatTemplate(Qwen7BChat):
+
+ def __init__(self,
+ session_len=8192,
+ top_p=0.3,
+ top_k=None,
+ temperature=1.0,
+ im_start='<|im_start|>',
+ im_end='<|im_end|>',
+ system='You are a helpful assistant.',
+ stop_words=['<|im_end|>'],
+ **kwargs):
+ super().__init__(**kwargs)
+ self.session_len = session_len
+ self.top_p = top_p
+ self.top_k = top_k
+ self.temperature = temperature
+ self.im_start = im_start
+ self.im_end = im_end
+ self.system = system
+ self.stop_words = stop_words
+
+ def _concat_image_info(self, prompt):
+ if isinstance(prompt, str):
+ return prompt
+ prompt, nimg = prompt
+ res = ''
+ for i in range(nimg):
+            # wrap the placeholder in <img></img> so the Qwen-VL tokenizer
+            # emits img_start/img_pad/img_end tokens for this picture
+            res += f'Picture {str(i)}: <img>placeholder</img>\n'
+ prompt = res + prompt
+ return prompt
+
+ def decorate_prompt(self, prompt, sequence_start=True):
+ prompt = self._concat_image_info(prompt)
+ return super().decorate_prompt(prompt, sequence_start)
+
+ def messages2prompt(self, messages, sequence_start=True):
+ if isinstance(messages, str) or isinstance(messages[0], str):
+ return self.decorate_prompt(messages, sequence_start)
+ system, users, assistants = self._translate_messages(messages)
+ ret = f'{self.im_start}system\n{system}{self.im_end}'
+ for user, assistant in zip(users, assistants):
+            if not isinstance(user, str):
+ user = [user[0]['text'], len(user) - 1]
+ user = self._concat_image_info(user)
+ if assistant:
+ ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
+ f'\n{self.im_start}assistant\n{assistant}'
+ else:
+ ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
+ f'\n{self.im_start}assistant\n'
+ return ret
+
+
+class QwenVLChat:
+
+ def __init__(self, pretrained_model_name_or_path, **kwargs):
+ self.pretrained_model_name_or_path = pretrained_model_name_or_path
+ self.decorator = QwenVLChatTemplate(**kwargs)
+ self._load_model()
+
+ def _load_model(self):
+ path = self.pretrained_model_name_or_path
+ if not os.path.exists(path):
+ path = snapshot_download(path)
+ self.tokenizer = AutoTokenizer.from_pretrained(path,
+ trust_remote_code=True)
+ with init_empty_weights():
+ config = AutoConfig.from_pretrained(path, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_config(config,
+ trust_remote_code=True)
+ del model.lm_head
+ for key in ['wte', 'h', 'ln_f']:
+ setattr(model.transformer, key, None)
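+        # Only the visual encoder is needed for preprocessing: the LM head and
+        # the text-transformer weights were dropped above, so just the vision
+        # weights are loaded and moved to the GPU.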
+ model.to_empty(device='cpu')
+ named_parameters = set()
+ for key, _ in model.named_parameters():
+ named_parameters.add(key)
+ # TODO: load bin according to index.json
+ bins = glob(os.path.join(path, '*.bin'))
+ for bin in bins:
+ dt = torch.load(bin, map_location='cpu')
+ missed, _ = model.load_state_dict(dt, strict=False)
+ named_parameters.difference_update(set(missed))
+ assert len(
+ named_parameters) == 0, f'missing keys: {named_parameters}'
+ self.model = model.to('cuda').eval()
+
+ @torch.no_grad()
+ def encode_img(self, paths):
+ if len(paths) == 0:
+ return None
+ features = []
+ # with torch.cuda.amp.autocast(dtype=torch.float16):
+ features = self.model.transformer.visual.encode(paths).float()
+ features = [x.cpu().numpy() for x in features]
+ return features
+
+ def _to_inputs(self, decorate_text, image_paths, sequence_start):
+ features = self.encode_img(image_paths)
+ input_ids = self.tokenizer.encode(decorate_text)
+ ranges = None
+ if features is not None:
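+            # Each image occupies the token span between its img_start and
+            # img_end markers; record these spans so the engine knows where to
+            # substitute the visual embeddings.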
+ input_ids_arr = np.array(input_ids)
+ begins = np.where(
+ input_ids_arr == self.tokenizer.img_start_id)[0] + 1
+ ends = np.where(input_ids_arr == self.tokenizer.img_end_id)[0]
+ ranges = np.stack([begins, ends], axis=1)
+ assert len(features) == len(ranges)
+ return input_ids, features, ranges
+
+ def prepare_query(self, query, sequence_start=True):
+ image_paths = []
+ if not isinstance(query, str):
+ query, image_paths = query[0], query[1:]
+ decorate_text = self.decorator.decorate_prompt(
+ (query, len(image_paths)), sequence_start)
+ return self._to_inputs(decorate_text, image_paths, sequence_start)
+
+ def prepare_message(self, messages):
+ decorate_text = self.decorator.messages2prompt(messages, True)
+ image_paths = []
+ for msg in messages:
+ if msg['role'] == 'user':
+ content = msg['content']
+ if isinstance(content, str):
+ continue
+ for item in content:
+ if item['type'] == 'image_url':
+ url = item['image_url']['url']
+ image_paths.append(url)
+ return self._to_inputs(decorate_text, image_paths, True)
diff --git a/examples/vl/xcomposer_model.py b/examples/vl/xcomposer_model.py
new file mode 100644
index 0000000000..a5fd350195
--- /dev/null
+++ b/examples/vl/xcomposer_model.py
@@ -0,0 +1,166 @@
+import os
+# from safetensors.torch import load_file
+from collections.abc import Sequence
+from glob import glob
+
+import numpy as np
+import torch
+from accelerate import init_empty_weights
+from huggingface_hub import snapshot_download
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+
+from lmdeploy.model import MODELS, BaseModel
+
+meta_instruction = """meta instruction
+You are an AI assistant whose name is 浦语.
+- 浦语 is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
+- 浦语 can understand and communicate fluently in the language chosen by the user such as English and 中文.
+conversation
+""" # noqa
+
+
+@MODELS.register_module(name='internlm-xcomposer-7b')
+class InternLMXComposerTemplate(BaseModel):
+
+ def __init__(self,
+ system=meta_instruction,
+ user='<|User|>:',
+ assistant='<|Bot|>:',
+ eoh='',
+ eoa='',
+ stop_words=['', ''],
+                 # NOTE: assumed marker; any unique string works because it is
+                 # split out before tokenization (see _to_inputs)
+                 image_placeholder='<ImageHere>',
+ **kwargs):
+ super().__init__(**kwargs)
+ self.system = system
+ self.user = user
+ self.assistant = assistant
+ self.eoh = eoh
+ self.eoa = eoa
+ self.stop_words = stop_words
+ self.image_placeholder = image_placeholder
+
+ def _concat_image_info(self, prompt):
+ if isinstance(prompt, str):
+ return prompt
+ prompt, nimg = prompt
+ assert nimg <= 1
+ if nimg == 1:
+ prompt = f'{self.image_placeholder}{prompt}'
+ return prompt
+
+ def decorate_prompt(self, prompt, sequence_start=True):
+ prompt = self._concat_image_info(prompt)
+ if sequence_start:
+ return f'{self.system} {self.user} {prompt}{self.eoh} {self.assistant}' # noqa
+ else:
+ return f' {self.user} {prompt}{self.eoh} {self.assistant}'
+
+ def messages2prompt(self, messages, sequence_start=True):
+ if isinstance(messages, str) or isinstance(messages[0], str):
+ return self.decorate_prompt(messages, sequence_start)
+ system, users, assistants = self._translate_messages(messages)
+ system = self.system if not system else system
+ ret = system
+ for user, assistant in zip(users, assistants):
+ if not isinstance(user, str):
+ assert isinstance(user, Sequence)
+ assert all(isinstance(item, dict) for item in user)
+ user = [user[0]['text'], len(user) - 1]
+ user = self._concat_image_info(user)
+ if assistant:
+ ret += f' {self.user} {user}{self.eoh} {self.assistant} {assistant}{self.eoa}' # noqa
+ else:
+ ret += f' {self.user} {user}{self.eoh} {self.assistant}'
+ return ret
+
+
+class InternLMXComposer:
+
+ def __init__(self, pretrained_model_name_or_path, **kwargs):
+ self.pretrained_model_name_or_path = pretrained_model_name_or_path
+ self.decorator = InternLMXComposerTemplate(**kwargs)
+ self._load_model()
+
+ def _load_model(self):
+ path = self.pretrained_model_name_or_path
+ if not os.path.exists(path):
+ path = snapshot_download(path)
+ self.tokenizer = AutoTokenizer.from_pretrained(path,
+ trust_remote_code=True)
+ with init_empty_weights():
+ config = AutoConfig.from_pretrained(path, trust_remote_code=True)
+ config.num_hidden_layers = 0 # speedup
+ model = AutoModelForCausalLM.from_config(config,
+ trust_remote_code=True)
+ model.internlm_model = None
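+        # The InternLM weights are exported separately by
+        # extract_xcomposer_llm.py; here only the vision components used for
+        # image feature extraction are loaded.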
+ model.to_empty(device='cpu')
+ named_parameters = set()
+ for key, _ in model.named_parameters():
+ named_parameters.add(key)
+ # TODO: load bin according to index.json
+ bins = glob(os.path.join(path, '*.bin'))
+ # bins = glob(os.path.join(path, '*.safetensors'))
+ for bin in bins:
+ dt = torch.load(bin, map_location='cpu')
+ # dt = load_file(bin)
+ missed, _ = model.load_state_dict(dt, strict=False)
+ named_parameters.difference_update(set(missed))
+ assert len(
+ named_parameters) == 0, f'missing keys: {named_parameters}'
+ self.model = model.to('cuda').eval()
+
+ @torch.no_grad()
+ def encode_img(self, paths):
+ if len(paths) == 0:
+ return None
+ features = []
+ with torch.cuda.amp.autocast(dtype=torch.float16):
+ for path in paths:
+ out = self.model.encode_img(path)
+ features.append(out.squeeze().cpu().numpy())
+ return features
+
+ def _to_inputs(self, decorate_text, image_paths, sequence_start):
+ features = self.encode_img(image_paths)
+ input_ids = []
+ ranges = None
+ begins = []
+ segs = decorate_text.split(self.decorator.image_placeholder)
+ image_dim = features[-1].shape[0] if features is not None else 0
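+        # Replace every image placeholder with `image_dim` dummy token ids and
+        # record their positions; the engine substitutes the visual features
+        # at these ranges via input_embedding_ranges.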
+ for i, seg in enumerate(segs):
+ if i > 0:
+ begins.append(len(input_ids))
+ input_ids.extend([0] * image_dim)
+ seg_ids = self.tokenizer.encode(
+ seg, add_special_tokens=((i == 0) and sequence_start))
+ input_ids.extend(seg_ids)
+ if features is not None:
+ ends = np.array(begins) + image_dim
+ ranges = np.stack([begins, ends], axis=1).tolist()
+ return input_ids, features, ranges
+
+ def prepare_query(self, query, sequence_start=True):
+ image_paths = []
+ if not isinstance(query, str):
+ query, image_paths = query[0], query[1:]
+ if len(image_paths) > 1:
+            print('Multiple images are not supported; using the last one.')
+ image_paths = image_paths[-1:]
+ decorate_text = self.decorator.decorate_prompt(
+ (query, len(image_paths)))
+ return self._to_inputs(decorate_text, image_paths, sequence_start)
+
+ def prepare_message(self, messages):
+ decorate_text = self.decorator.messages2prompt(messages, True)
+ image_paths = []
+ for msg in messages:
+ if msg['role'] == 'user':
+ content = msg['content']
+ if isinstance(content, str):
+ continue
+ for item in content:
+ if item['type'] == 'image_url':
+ url = item['image_url']['url']
+ image_paths.append(url)
+ return self._to_inputs(decorate_text, image_paths, True)
diff --git a/lmdeploy/serve/gradio/constants.py b/lmdeploy/serve/gradio/constants.py
index 891c572e5a..2fea11b98b 100644
--- a/lmdeploy/serve/gradio/constants.py
+++ b/lmdeploy/serve/gradio/constants.py
@@ -24,5 +24,5 @@
secondary_hue=gr.themes.colors.sky,
font=[gr.themes.GoogleFont('Inconsolata'), 'Arial', 'sans-serif'])
-enable_btn = gr.Button.update(interactive=True)
-disable_btn = gr.Button.update(interactive=False)
+enable_btn = gr.update(interactive=True)
+disable_btn = gr.update(interactive=False)