From 2c04a7209bab01882f6d6b6c627c7450596cd92f Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Mon, 22 Jan 2024 15:23:19 +0800
Subject: [PATCH 1/4] Fix turbomind end session bug. Add huggingface demo document

---
 docs/en/serving/gradio.md                  | 24 +++++++++++++++++++++
 docs/zh_cn/serving/gradio.md               | 25 ++++++++++++++++++++++
 lmdeploy/serve/gradio/turbomind_coupled.py | 25 ++++++++++++++--------
 lmdeploy/turbomind/turbomind.py            |  2 +-
 4 files changed, 66 insertions(+), 10 deletions(-)
 create mode 100644 docs/en/serving/gradio.md
 create mode 100644 docs/zh_cn/serving/gradio.md

diff --git a/docs/en/serving/gradio.md b/docs/en/serving/gradio.md
new file mode 100644
index 0000000000..4aeccf5c71
--- /dev/null
+++ b/docs/en/serving/gradio.md
@@ -0,0 +1,24 @@
+# Steps to create a Hugging Face online demo
+
+## Create space
+
+First, register for a Hugging Face account. After successful registration, click on your profile picture in the upper right corner and select “New Space” to create one. Follow the Hugging Face guide to choose the necessary configurations, and you will have a blank demo space ready.
+
+## 使用 LMDeploy 的 demo
+
+Replace the content of `app.py` in your space with the following code:
+
+```python
+from lmdeploy.serve.gradio.turbomind_coupled import run_local
+from lmdeploy.messages import TurbomindEngineConfig
+
+backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)
+model_path = 'internlm/internlm2-chat-7b'
+run_local(model_path, backend_config=backend_config, huggingface_demo=True)
+```
+
+Create a `requirements.txt` file with the following content:
+
+```
+lmdeploy
+```
diff --git a/docs/zh_cn/serving/gradio.md b/docs/zh_cn/serving/gradio.md
new file mode 100644
index 0000000000..5f6a5b08db
--- /dev/null
+++ b/docs/zh_cn/serving/gradio.md
@@ -0,0 +1,25 @@
+# 从 LMDeploy 创建一个 huggingface 的在线 demo
+
+## 创建 space
+
+首先，注册一个 huggingface 的账号，注册成功后，可以点击右上角头像，选择 New Space 创建。
+根据 huggingface 的引导选择需要的配置，完成后即可得到一个空白的 demo。
+
+## 使用 LMDeploy 的 demo
+
+以 `internlm/internlm2-chat-7b` 模型为例，将 space 空间中的`app.py`内容填写为：
+
+```python
+from lmdeploy.serve.gradio.turbomind_coupled import run_local
+from lmdeploy.messages import TurbomindEngineConfig
+
+backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)
+model_path = 'internlm/internlm2-chat-7b'
+run_local(model_path, backend_config=backend_config, huggingface_demo=True)
+```
+
+创建`requirements.txt`文本文件，填写如下安装包：
+
+```
+lmdeploy
+```
diff --git a/lmdeploy/serve/gradio/turbomind_coupled.py b/lmdeploy/serve/gradio/turbomind_coupled.py
index 4de13e9baa..14351c8e75 100644
--- a/lmdeploy/serve/gradio/turbomind_coupled.py
+++ b/lmdeploy/serve/gradio/turbomind_coupled.py
@@ -124,6 +124,7 @@ def run_local(model_path: str,
               server_name: str = 'localhost',
               server_port: int = 6006,
               tp: int = 1,
+              huggingface_demo: bool = False,
               **kwargs):
     """chat with AI assistant through web ui.
 
@@ -153,6 +154,8 @@ def run_local(model_path: str,
         server_name (str): the ip address of gradio server
         server_port (int): the port of gradio server
         tp (int): tensor parallel for Turbomind
+        huggingface_demo (bool): whether for huggingface space demo. Running
+            on huggingface space requires no specified host name or port.
""" InterFace.async_engine = AsyncEngine( model_path=model_path, @@ -220,15 +223,19 @@ def init(): demo.load(init, inputs=None, outputs=[state_session_id]) - print(f'server is gonna mount on: http://{server_name}:{server_port}') - demo.queue(concurrency_count=InterFace.async_engine.instance_num, - max_size=100, - api_open=True).launch( - max_threads=10, - share=True, - server_port=server_port, - server_name=server_name, - ) + if huggingface_demo is True: + demo.queue(concurrency_count=InterFace.async_engine.instance_num, + max_size=100).launch() + else: + print(f'server is gonna mount on: http://{server_name}:{server_port}') + demo.queue(concurrency_count=InterFace.async_engine.instance_num, + max_size=100, + api_open=True).launch( + max_threads=10, + share=True, + server_port=server_port, + server_name=server_name, + ) if __name__ == '__main__': diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 4a4dc91577..1030df512f 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -522,7 +522,7 @@ def _update_generation_config(self, config: EngineGenerationConfig, if k in config.__dict__: config.__dict__[k] = v deprecated_kwargs.append(k) - if kwargs.get('request_output_len'): + if 'request_output_len' in kwargs: config.max_new_tokens = kwargs['request_output_len'] deprecated_kwargs.append('request_output_len') for k in deprecated_kwargs: From 35a9bb1417f1c1e3bad525a79fa6e6e236b8ba62 Mon Sep 17 00:00:00 2001 From: AllentDan Date: Tue, 23 Jan 2024 10:13:52 +0800 Subject: [PATCH 2/4] update documents --- docs/en/serving/gradio.md | 12 +++++++++++- docs/zh_cn/serving/gradio.md | 10 ++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/en/serving/gradio.md b/docs/en/serving/gradio.md index 4aeccf5c71..644706833f 100644 --- a/docs/en/serving/gradio.md +++ b/docs/en/serving/gradio.md @@ -4,7 +4,7 @@ First, register for a Hugging Face account. After successful registration, click on your profile picture in the upper right corner and select “New Space” to create one. Follow the Hugging Face guide to choose the necessary configurations, and you will have a blank demo space ready. -## 使用 LMDeploy 的 demo +## A demo for LMDeploy Replace the content of `app.py` in your space with the following code: @@ -22,3 +22,13 @@ Create a `requirements.txt` file with the following content: ``` lmdeploy ``` + +## FAQs + +- ZeroGPU compatibility issue. ZeroGPU is more suitable for inference methods similar to PyTorch, rather than Turbomind. You can switch to the PyTorch backend or enable standard GPUs. +- Gradio version issue, versions above 4.0.0 are currently not supported. 
+  ```python
+  import os
+  os.system("pip uninstall -y gradio")
+  os.system("pip install gradio==3.43.0")
+  ```
diff --git a/docs/zh_cn/serving/gradio.md b/docs/zh_cn/serving/gradio.md
index af6b6e0bdc..fe1e01af3f 100644
--- a/docs/zh_cn/serving/gradio.md
+++ b/docs/zh_cn/serving/gradio.md
@@ -23,3 +23,13 @@ run_local(model_path, backend_config=backend_config, huggingface_demo=True)
 ```
 lmdeploy
 ```
+
+## FAQs
+
+- ZeroGPU 适配问题。ZeroGPU 更适合类似 PyTorch 这样的推理方式，而非 Turbomind。可以改用 pytorch 后端，或者启用普通 GPU。
+- gradio 版本问题，目前不支持 4.0.0 以上版本，可以在 `app.py` 中修改，类似：
+  ```python
+  import os
+  os.system("pip uninstall -y gradio")
+  os.system("pip install gradio==3.43.0")
+  ```

From cc0f0c7da53afed112187c80dcf0f6109774d326 Mon Sep 17 00:00:00 2001
From: AllentDan
Date: Thu, 25 Jan 2024 15:19:00 +0800
Subject: [PATCH 3/4] set server_name for huggingface space

---
 docs/en/serving/gradio.md                  |  2 +-
 docs/zh_cn/serving/gradio.md               |  2 +-
 lmdeploy/serve/gradio/turbomind_coupled.py | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/en/serving/gradio.md b/docs/en/serving/gradio.md
index 644706833f..803dff50f5 100644
--- a/docs/en/serving/gradio.md
+++ b/docs/en/serving/gradio.md
@@ -14,7 +14,7 @@ from lmdeploy.messages import TurbomindEngineConfig
 
 backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)
 model_path = 'internlm/internlm2-chat-7b'
-run_local(model_path, backend_config=backend_config, huggingface_demo=True)
+run_local(model_path, backend_config=backend_config, server_name="huggingface-space")
 ```
 
 Create a `requirements.txt` file with the following content:
diff --git a/docs/zh_cn/serving/gradio.md b/docs/zh_cn/serving/gradio.md
index af6b6e0bdc..fe1e01af3f 100644
--- a/docs/zh_cn/serving/gradio.md
+++ b/docs/zh_cn/serving/gradio.md
@@ -15,7 +15,7 @@ from lmdeploy.messages import TurbomindEngineConfig
 
 backend_config = TurbomindEngineConfig(max_batch_size=1, cache_max_entry_count=0.05)
 model_path = 'internlm/internlm2-chat-7b'
-run_local(model_path, backend_config=backend_config, huggingface_demo=True)
+run_local(model_path, backend_config=backend_config, server_name="huggingface-space")
 ```
 
 创建`requirements.txt`文本文件，填写如下安装包：
diff --git a/lmdeploy/serve/gradio/turbomind_coupled.py b/lmdeploy/serve/gradio/turbomind_coupled.py
index 14351c8e75..2f703d9739 100644
--- a/lmdeploy/serve/gradio/turbomind_coupled.py
+++ b/lmdeploy/serve/gradio/turbomind_coupled.py
@@ -121,7 +121,7 @@ def run_local(model_path: str,
               backend_config: Optional[Union[PytorchEngineConfig,
                                              TurbomindEngineConfig]] = None,
               chat_template_config: Optional[ChatTemplateConfig] = None,
-              server_name: str = 'localhost',
+              server_name: str = '0.0.0.0',
               server_port: int = 6006,
               tp: int = 1,
               huggingface_demo: bool = False,
@@ -151,11 +151,11 @@ def run_local(model_path: str,
             config instance. Default to none.
         chat_template_config (ChatTemplateConfig): chat template
             configuration. Default to None.
-        server_name (str): the ip address of gradio server
+        server_name (str): the ip address of gradio server. Default to
+            "0.0.0.0". For huggingface space demo, it should be
+            "huggingface-space".
         server_port (int): the port of gradio server
         tp (int): tensor parallel for Turbomind
-        huggingface_demo (bool): whether for huggingface space demo. Running
-            on huggingface space requires no specified host name or port.
""" InterFace.async_engine = AsyncEngine( model_path=model_path, @@ -223,7 +223,7 @@ def init(): demo.load(init, inputs=None, outputs=[state_session_id]) - if huggingface_demo is True: + if server_name == 'huggingface-space': demo.queue(concurrency_count=InterFace.async_engine.instance_num, max_size=100).launch() else: From 8660898e33a0dc0690e9967f718ae388a7fa3018 Mon Sep 17 00:00:00 2001 From: AllentDan Date: Thu, 25 Jan 2024 15:20:51 +0800 Subject: [PATCH 4/4] remove huggingface_demo arg --- lmdeploy/serve/gradio/turbomind_coupled.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lmdeploy/serve/gradio/turbomind_coupled.py b/lmdeploy/serve/gradio/turbomind_coupled.py index 2f703d9739..8bc9b2e04f 100644 --- a/lmdeploy/serve/gradio/turbomind_coupled.py +++ b/lmdeploy/serve/gradio/turbomind_coupled.py @@ -124,7 +124,6 @@ def run_local(model_path: str, server_name: str = '0.0.0.0', server_port: int = 6006, tp: int = 1, - huggingface_demo: bool = False, **kwargs): """chat with AI assistant through web ui.