test prtest image update #2192

Merged: 33 commits, Aug 7, 2024
Changes from 28 commits
36 changes: 22 additions & 14 deletions .github/workflows/pr_ete_test.yml
@@ -24,6 +24,7 @@ env:
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA


jobs:
@@ -33,58 +34,65 @@ jobs:
env:
REPORT_DIR: /nvme/qa_test_models/test-reports
container:
image: nvcr.io/nvidia/tritonserver:24.03-py3
image: nvidia/cuda:12.4.1-devel-ubuntu22.04
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never"
volumes:
- /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
- /nvme/share_data/github-actions/packages:/root/packages
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/187:/mnt/187
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Setup systems
run: |
rm /etc/apt/sources.list.d/cuda*.list
apt-get update && apt-get install -y --no-install-recommends rapidjson-dev \
libgoogle-glog-dev libgl1 openjdk-8-jre-headless
rm -rf /var/lib/apt/lists/*
rm /etc/apt/sources.list.d/cuda*.list && apt-get update -y && apt-get install -y software-properties-common wget vim &&\
add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \
rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \
&& apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3
echo "PATH=/opt/py3/bin:$PATH" >> "$GITHUB_ENV"
- name: Clone repository
uses: actions/checkout@v2
- name: Install pytorch
run: |
python3 -m pip cache dir
python3 -m pip install torch==2.1.0 torchvision==0.16.0
python3 -m pip install torch==2.3.0 torchvision==0.18.0
# install the package from https://github.com/Dao-AILab/flash-attention/releases
python3 -m pip install /root/packages/flash_attn-2.5.8+cu122torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
python3 -m pip install /root/packages/flash_attn-2.6.3+cu123torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
- name: Build lmdeploy
run: |
python3 -m pip install cmake
python3 -m pip install -r requirements/build.txt
cp /nvme/qa_test_models/offline_pkg/openmpi-4.1.5.tar.gz .
tar xf openmpi-4.1.5.tar.gz && cd openmpi-4.1.5 && ./configure --prefix=/usr/local/openmpi
make -j$(nproc) && make install && cd .. && rm -rf openmpi-4.1.5*
export PATH=$PATH:/usr/local/openmpi/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib
mkdir build
cd build
cp -r /nvme/qa_test_models/offline_pkg/_deps .
cmake .. \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_INSTALL_PREFIX=/opt/tritonserver \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DBUILD_CUTLASS_MOE=OFF \
-DBUILD_CUTLASS_MIXED_GEMM=OFF \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON \
-DSM=80 \
-DCMAKE_CUDA_ARCHITECTURES=80 \
-DBUILD_TEST=OFF
-DUSE_NVTX=ON &&\
make -j$(nproc) && make install
- name: Install lmdeploy
run: |
python3 -m pip install packaging transformers_stream_generator transformers datasets openai einops
python3 -m pip install packaging transformers_stream_generator transformers datasets openai einops timm decord
python3 -m pip install -r requirements.txt -r requirements/test.txt
python3 -m pip install .
- name: Check env
run: |
python3 -m pip list
lmdeploy check_env
- name: Test lmdeploy
run: CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m pr_test -x --alluredir=allure-results --clean-alluredir
run: |
CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_2' -x --alluredir=allure-results --clean-alluredir
CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_1' -n 2 -x --alluredir=allure-results
- name: Generate reports
if: always()
run: |
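
The split pytest invocations above rely on the gpu_num_1 / gpu_num_2 markers that the test files below now carry. As a minimal sketch (not part of this diff; the project may already declare these markers in pytest.ini or pyproject.toml), custom markers can be registered in conftest.py so that -m 'pr_test and gpu_num_2' selects tests without unknown-mark warnings:

# conftest.py -- hypothetical registration sketch, assuming the markers are not
# already declared elsewhere in the project's pytest configuration.
def pytest_configure(config):
    config.addinivalue_line('markers', 'pr_test: cases selected for pull-request CI')
    config.addinivalue_line('markers', 'gpu_num_1: cases that need a single GPU')
    config.addinivalue_line('markers', 'gpu_num_2: cases that need two GPUs (tp=2)')

With the markers in place, the two-GPU cases run serially first, and the single-GPU cases then run across two xdist workers via -n 2.
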
1 change: 1 addition & 0 deletions autotest/tools/chat/test_command_chat_hf_pytorch.py
@@ -54,6 +54,7 @@ def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id):
@pytest.mark.order(10)
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_pytorch_chat
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_hf_pytorch_chat_pr(config, model, cli_case_config):
1 change: 1 addition & 0 deletions autotest/tools/chat/test_command_chat_hf_turbomind.py
@@ -104,6 +104,7 @@ def test_hf_turbomind_base_tp2(config, model, cli_case_config, worker_id):
@pytest.mark.order(10)
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.hf_turbomind_chat
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize(
'model',
1 change: 1 addition & 0 deletions autotest/tools/chat/test_command_chat_workspace.py
@@ -95,6 +95,7 @@ def test_workspace_base_tp2(config, cli_case_config, model, worker_id):
@pytest.mark.order(10)
@pytest.mark.usefixtures('cli_case_config')
@pytest.mark.command_chat
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize(
'model',
1 change: 1 addition & 0 deletions autotest/tools/convert/test_convert.py
@@ -20,6 +20,7 @@ def test_convert(config, model, worker_id):

@pytest.mark.order(5)
@pytest.mark.convert
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize(
'model',
1 change: 1 addition & 0 deletions autotest/tools/pipeline/test_pipeline_chat_pytorch.py
@@ -59,6 +59,7 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model,
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat_pytorch
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize('model', ['internlm/internlm2-chat-20b'])
def test_pipeline_chat_pytorch_pr(config, common_case_config, model):
1 change: 1 addition & 0 deletions autotest/tools/pipeline/test_pipeline_chat_turbomind.py
@@ -98,6 +98,7 @@ def test_pipeline_chat_kvint_tp2(config, common_case_config, model,
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.pipeline_chat
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize(
'model',
17 changes: 17 additions & 0 deletions autotest/tools/pipeline/test_pipeline_chat_turbomind_vl.py
@@ -32,3 +32,20 @@ def test_pipeline_chat_tp2(config, model, worker_id):
p.start()
p.join()
assert_pipeline_vl_chat_log(config, model)


@pytest.mark.pipeline_chat
@pytest.mark.gpu_num_1
@pytest.mark.pr_test
@pytest.mark.parametrize('model', [
'liuhaotian/llava-v1.6-vicuna-7b', 'OpenGVLab/InternVL2-4B',
'OpenGVLab/InternVL2-8B', 'internlm/internlm-xcomposer2d5-7b'
])
def test_pipeline_pr_test(config, model, worker_id):
if 'gw' in worker_id:
os.environ['CUDA_VISIBLE_DEVICES'] = str(
int(get_cuda_id_by_workerid(worker_id)) + 5)
p = Process(target=run_pipeline_vl_chat_test, args=(config, model))
p.start()
p.join()
assert_pipeline_vl_chat_log(config, model)
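
The new single-GPU PR test above offsets the device id derived from the xdist worker id by 5, which matches the CUDA_VISIBLE_DEVICES=5,6 and -n 2 settings in the workflow. A rough, hypothetical illustration of that mapping (the real logic lives in get_cuda_id_by_workerid, which is not shown in this diff):

# Hypothetical sketch only -- not the project's implementation.
def cuda_id_for_worker(worker_id: str, offset: int = 5) -> str:
    """Map an xdist worker id such as 'gw0' or 'gw1' to an absolute GPU index."""
    if worker_id.startswith('gw'):
        return str(int(worker_id[2:]) + offset)
    return str(offset)  # non-parallel run: fall back to the first allowed GPU

Under -n 2 this would pin worker gw0 to GPU 5 and gw1 to GPU 6.
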
1 change: 1 addition & 0 deletions autotest/tools/quantization/test_quantization_w4a16.py
@@ -19,6 +19,7 @@ def test_quantization_w4a16(config, model, worker_id):
@pytest.mark.order(3)
@pytest.mark.quantization_w4a16
@pytest.mark.pr_test
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.timeout(900)
@pytest.mark.parametrize(
8 changes: 7 additions & 1 deletion autotest/tools/restful/test_restful_chat_hf_turbomind.py
@@ -114,6 +114,7 @@ def test_restful_chat_kvint_tp2(config, common_case_config, worker_id):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize('prepare_environment', [{
'model': 'internlm/internlm2-chat-20b',
@@ -126,7 +127,12 @@ def test_restful_chat_kvint_tp2(config, common_case_config, worker_id):
}],
indirect=True)
def test_restful_chat_pr(config, common_case_config):
run_all_step(config, common_case_config)
run_all_step(
config, {
key: value
for key, value in common_case_config.items()
if key == 'memory_test'
})


@pytest.mark.order(7)
8 changes: 7 additions & 1 deletion autotest/tools/restful/test_restful_chat_workspace.py
@@ -65,6 +65,7 @@ def test_restful_chat_tp2(config, common_case_config, worker_id):
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_2
@pytest.mark.pr_test
@pytest.mark.parametrize('prepare_environment', [{
'model': 'internlm/internlm2-chat-20b',
@@ -77,4 +78,9 @@ def test_restful_chat_tp2(config, common_case_config, worker_id):
}],
indirect=True)
def test_restful_chat_pr(config, common_case_config):
run_all_step(config, common_case_config)
run_all_step(
config, {
key: value
for key, value in common_case_config.items()
if key == 'memory_test'
})
6 changes: 5 additions & 1 deletion autotest/utils/run_restful_chat.py
@@ -5,6 +5,7 @@
from time import sleep, time

import allure
import psutil
from pytest import assume
from utils.config_utils import get_cuda_prefix_by_workerid, get_workerid
from utils.get_run_config import get_command_with_extra
@@ -92,7 +93,10 @@ def start_restful_api(config, param, model, model_path, backend_tpye,

def stop_restful_api(pid, startRes, param):
if pid > 0:
startRes.terminate()
parent = psutil.Process(pid)
for child in parent.children(recursive=True):
child.terminate()
parent.terminate()
if 'modelscope' in param.keys():
modelscope = param['modelscope']
if modelscope:
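
The stop_restful_api change above terminates the server's child processes before the parent, so worker processes do not outlive the api_server. A slightly hardened variant (a sketch under the same psutil assumption, not part of this PR) would also wait for the processes and kill any that ignore SIGTERM:

import psutil

def stop_process_tree(pid: int, timeout: float = 10.0) -> None:
    """Terminate a process and all of its descendants, killing stragglers."""
    parent = psutil.Process(pid)
    procs = parent.children(recursive=True) + [parent]
    for p in procs:
        p.terminate()  # send SIGTERM first
    _, alive = psutil.wait_procs(procs, timeout=timeout)
    for p in alive:  # escalate for anything still running after the grace period
        p.kill()
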