diff --git a/docs/en/serving/gradio.md b/docs/en/serving/gradio.md
new file mode 100644
index 0000000000..803dff50f5
--- /dev/null
+++ b/docs/en/serving/gradio.md
@@ -0,0 +1,34 @@
+# Steps to create a Hugging Face online demo
+
+## Create a space
+
+First, register for a Hugging Face account. After registration, click on your profile picture in the upper right corner and select “New Space” to create one. Follow the Hugging Face guide to choose the necessary configurations, and you will get a blank demo space.
+
+## A demo for LMDeploy
+
+Taking the `internlm/internlm2-chat-7b` model as an example, replace the content of `app.py` in your space with the following code:
+
+```python
+from lmdeploy.serve.gradio.turbomind_coupled import run_local
+from lmdeploy.messages import TurbomindEngineConfig
+
+backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)
+model_path = 'internlm/internlm2-chat-7b'
+run_local(model_path, backend_config=backend_config, server_name="huggingface-space")
+```
+
+Create a `requirements.txt` file with the following content:
+
+```
+lmdeploy
+```
+
+## FAQs
+
+- ZeroGPU compatibility issue. ZeroGPU suits PyTorch-style inference better than TurboMind, so you can switch to the PyTorch backend or use a standard GPU instead.
+- Gradio version issue. Versions above 4.0.0 are currently not supported; you can pin an older version in `app.py`, for example:
+  ```python
+  import os
+  os.system("pip uninstall -y gradio")
+  os.system("pip install gradio==3.43.0")
+  ```
diff --git a/docs/zh_cn/serving/gradio.md b/docs/zh_cn/serving/gradio.md
new file mode 100644
index 0000000000..fe1e01af3f
--- /dev/null
+++ b/docs/zh_cn/serving/gradio.md
@@ -0,0 +1,35 @@
+# 从 LMDeploy 创建一个 huggingface 的在线 demo
+
+## 创建 space
+
+首先，注册一个 huggingface 的账号。注册成功后，点击右上角头像，选择 New Space 创建，
+根据 huggingface 的引导选择需要的配置，完成后即可得到一个空白的 demo。
+
+## 使用 LMDeploy 的 demo
+
+以 `internlm/internlm2-chat-7b` 模型为例，将 space 空间中的 `app.py` 内容填写为：
+
+```python
+from lmdeploy.serve.gradio.turbomind_coupled import run_local
+from lmdeploy.messages import TurbomindEngineConfig
+
+backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)
+model_path = 'internlm/internlm2-chat-7b'
+run_local(model_path, backend_config=backend_config, server_name="huggingface-space")
+```
+
+创建 `requirements.txt` 文本文件，填写如下安装包：
+
+```
+lmdeploy
+```
+
+## FAQs
+
+- ZeroGPU 适配问题。ZeroGPU 更适合类似 PyTorch 这样的推理方式，而非 TurboMind。可以改用 PyTorch 后端，或者启用普通 GPU。
+- gradio 版本问题，目前不支持 4.0.0 以上版本，可以在 `app.py` 中修改，类似：
+  ```python
+  import os
+  os.system("pip uninstall -y gradio")
+  os.system("pip install gradio==3.43.0")
+  ```
diff --git a/lmdeploy/serve/gradio/turbomind_coupled.py b/lmdeploy/serve/gradio/turbomind_coupled.py
index 4de13e9baa..8bc9b2e04f 100644
--- a/lmdeploy/serve/gradio/turbomind_coupled.py
+++ b/lmdeploy/serve/gradio/turbomind_coupled.py
@@ -121,7 +121,7 @@ def run_local(model_path: str,
               backend_config: Optional[Union[PytorchEngineConfig,
                                              TurbomindEngineConfig]] = None,
               chat_template_config: Optional[ChatTemplateConfig] = None,
-              server_name: str = 'localhost',
+              server_name: str = '0.0.0.0',
               server_port: int = 6006,
               tp: int = 1,
               **kwargs):
@@ -150,7 +150,9 @@ def run_local(model_path: str,
             config instance. Default to none.
         chat_template_config (ChatTemplateConfig): chat template
             configuration. Default to None.
-        server_name (str): the ip address of gradio server
+        server_name (str): the ip address of gradio server. Default to
+            "0.0.0.0". For huggingface space demo, it should be
+            "huggingface-space".
         server_port (int): the port of gradio server
         tp (int): tensor parallel for Turbomind
     """
@@ -220,15 +222,19 @@ def init():
 
     demo.load(init, inputs=None, outputs=[state_session_id])
 
-    print(f'server is gonna mount on: http://{server_name}:{server_port}')
-    demo.queue(concurrency_count=InterFace.async_engine.instance_num,
-               max_size=100,
-               api_open=True).launch(
-                   max_threads=10,
-                   share=True,
-                   server_port=server_port,
-                   server_name=server_name,
-               )
+    if server_name == 'huggingface-space':
+        demo.queue(concurrency_count=InterFace.async_engine.instance_num,
+                   max_size=100).launch()
+    else:
+        print(f'server is gonna mount on: http://{server_name}:{server_port}')
+        demo.queue(concurrency_count=InterFace.async_engine.instance_num,
+                   max_size=100,
+                   api_open=True).launch(
+                       max_threads=10,
+                       share=True,
+                       server_port=server_port,
+                       server_name=server_name,
+                   )
 
 
 if __name__ == '__main__':
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index 4a4dc91577..1030df512f 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -522,7 +522,7 @@ def _update_generation_config(self, config: EngineGenerationConfig,
             if k in config.__dict__:
                 config.__dict__[k] = v
                 deprecated_kwargs.append(k)
-        if kwargs.get('request_output_len'):
+        if 'request_output_len' in kwargs:
            config.max_new_tokens = kwargs['request_output_len']
            deprecated_kwargs.append('request_output_len')
        for k in deprecated_kwargs:
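The FAQ in the added docs suggests switching to the PyTorch backend on ZeroGPU hardware. Since `run_local` accepts either engine config (see the `Optional[Union[PytorchEngineConfig, TurbomindEngineConfig]]` annotation in the diff), a PyTorch-backend `app.py` could look like the following sketch. The `PytorchEngineConfig` fields used here simply mirror the TurboMind example and are assumptions, not settings taken from this patch:

```python
# Sketch of an app.py for a ZeroGPU / PyTorch-backend space.
# Assumes PytorchEngineConfig accepts the same two fields as the TurboMind
# example in the docs; adjust to your lmdeploy version if needed.
from lmdeploy.serve.gradio.turbomind_coupled import run_local
from lmdeploy.messages import PytorchEngineConfig

backend_config = PytorchEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)
model_path = 'internlm/internlm2-chat-7b'
run_local(model_path, backend_config=backend_config, server_name='huggingface-space')
```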
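For a local (non-Space) run, the new `server_name` default of `'0.0.0.0'` takes the `else` branch in `turbomind_coupled.py`, binding the demo to all interfaces on port 6006; only the literal value `'huggingface-space'` triggers the plain `launch()` path. A minimal local-launch sketch, assuming the same example model:

```python
# Minimal local launch with the new defaults: serves on 0.0.0.0:6006.
from lmdeploy.serve.gradio.turbomind_coupled import run_local
from lmdeploy.messages import TurbomindEngineConfig

run_local('internlm/internlm2-chat-7b',
          backend_config=TurbomindEngineConfig(max_batch_size=1),
          server_port=6006)
```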
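The last hunk replaces a truthiness check with a membership check so that an explicit `request_output_len=0` is still propagated to `config.max_new_tokens`. A small, self-contained illustration of the difference (plain Python, not lmdeploy code):

```python
# Why 'in' replaces .get(): a caller passing request_output_len=0 should
# still have it applied, but 0 is falsy and the old check dropped it.
kwargs = {'request_output_len': 0}

old_check = bool(kwargs.get('request_output_len'))  # False: value 0 is ignored
new_check = 'request_output_len' in kwargs          # True: value 0 is honored

print(old_check, new_check)  # False True
```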