Skip to content

Support QoS in api_server #183

Support QoS in api_server

Support QoS in api_server #183

Workflow file for this run

name: test
on:
pull_request:
paths:
- ".github/scripts/test_triton_server.py"
- ".github/workflows/test.yml"
- "cmake/**"
- "src/**"
- "3rdparty/**"
- "lmdeploy/**"
- "requirements/**"
- "requirements.txt"
- "CMakeLists.txt"
- "setup.py"
push:
branches:
- main
paths:
- "lmdeploy/version.py"
tags:
- "v*.*.*"
workflow_dispatch:
inputs:
markers:
required: false
description: 'Tested markers. eg: "-m internlm_chat_7b"'
type: string
default: ''
env:
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
jobs:
test_functions:
runs-on: [self-hosted, linux-a100]
timeout-minutes: 4320 # 72hours
environment: 'prod'
env:
REPORT_DIR: /nvme/qa_test_models/test-reports
container:
image: nvcr.io/nvidia/tritonserver:22.12-py3
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
volumes:
- /nvme/github-actions/pip-cache:/root/.cache/pip
- /nvme/github-actions/packages:/root/packages
- /nvme/qa_test_models:/nvme/qa_test_models
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Setup systems
run: |
rm /etc/apt/sources.list.d/cuda*.list
apt-get update && apt-get install -y --no-install-recommends rapidjson-dev \
libgoogle-glog-dev libgl1 openjdk-8-jre-headless
dpkg -i /root/packages/allure_2.24.1-1_all.deb
rm -rf /var/lib/apt/lists/*
- name: Clone repository
uses: actions/checkout@v2
- name: Install pytorch
run: |
python3 -m pip cache dir
python3 -m pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
- name: Build lmdeploy
run: |
python3 -m pip install cmake
python3 -m pip install -r requirements/build.txt
# use cached build
cp -r ../../build build
cd build
cmake .. \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_INSTALL_PREFIX=./install \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON \
-DSM=80 \
-DCMAKE_CUDA_ARCHITECTURES=80 \
-DBUILD_TEST=OFF
make -j$(nproc) && make install
- name: Install lmdeploy
run: |
python3 -m pip install packaging protobuf transformers_stream_generator
python3 -m pip install -r requirements.txt -r requirements/test.txt
python3 -m pip install .
- name: Check env
run: |
python3 -m pip list
lmdeploy check_env
- name: Test lmdeploy
run: |
echo "TODO: awaiting PR of adding autotest"
# pytest autotest ${{github.event.inputs.markers}} --alluredir=allure-results --clean-alluredir
- name: Generate reports
if: always()
run: |
if test -D "allure-results"; then
export date_today="$(date +'%Y%m%d-%H%M%S')"
export report_dir="$REPORT_DIR/$date_today"
echo "Save report to $ALLURE_DIR"
allure generate -c -o $report_dir
fi
- name: Clear workfile
if: always()
run: |
export workdir=$(pwd)
cd ..
rm -rf $workdir
mkdir $workdir
chmod -R 777 $workdir
test_triton:
runs-on: [self-hosted, linux-a100]
timeout-minutes: 4320 # 72hours
environment: 'prod'
env:
HF_MODEL: /nvme/qa_test_models/internlm-chat-20b
WORKDIR: /nvme/qa_test_models/triton_workspace
TB_MODEL: internlm-chat-20b-fp16-tp2
GRPC_PORT: 33337
steps:
- name: Clone repository
uses: actions/checkout@v2
- name: Create test container
run: |
export CONTAINER_ID=$(docker create \
--rm \
--gpus='"device=0,1"' \
--shm-size 16g \
--cap-add=SYS_PTRACE \
--cap-add=SYS_ADMIN \
--security-opt seccomp=unconfined \
--name lmdeploy-ci-triton \
--workdir /__w/lmdeploy/lmdeploy \
--env PIP_CACHE_DIR=/root/.cache/pip \
--env NCCL_LAUNCH_MODE=GROUP \
-v $(pwd)/../../:/__w \
-v ${HF_MODEL}:/root/workspace/hf_model \
-v ${WORKDIR}:/root/workspace/workdir \
-v ${HOST_PIP_CACHE_DIR}:/root/.cache/pip \
-v ${HOST_LOCALTIME}:/etc/localtime:ro \
openmmlab/lmdeploy:latest tail -f /dev/null \
)
docker start $CONTAINER_ID
echo "CONTAINER_ID=$CONTAINER_ID"
echo "CONTAINER_ID=$CONTAINER_ID" >> $GITHUB_ENV
- name: Build lmdeploy from source
run: |
docker exec $CONTAINER_ID cp -r ../../build build
docker exec --workdir /__w/lmdeploy/lmdeploy/build \
--env http_proxy=${{secrets.PROXY}} \
--env https_proxy=${{secrets.PROXY}} \
$CONTAINER_ID cmake .. \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
-DCMAKE_INSTALL_PREFIX=./install \
-DBUILD_PY_FFI=ON \
-DBUILD_MULTI_GPU=ON \
-DCMAKE_CUDA_FLAGS="-lineinfo" \
-DUSE_NVTX=ON \
-DSM=80 \
-DCMAKE_CUDA_ARCHITECTURES=80 \
-DBUILD_TEST=OFF
docker exec --workdir /__w/lmdeploy/lmdeploy/build $CONTAINER_ID make -j$(nproc)
docker exec --workdir /__w/lmdeploy/lmdeploy/build $CONTAINER_ID make install
- name: Install lmdeploy
run: |
docker exec \
--env http_proxy=${{secrets.PROXY}} \
--env https_proxy=${{secrets.PROXY}} \
$CONTAINER_ID python3 -m pip install tritonclient[grpc]
docker exec \
--env http_proxy=${{secrets.PROXY}} \
--env https_proxy=${{secrets.PROXY}} \
$CONTAINER_ID python3 -m pip install -r requirements/test.txt
docker exec $CONTAINER_ID python3 -m pip install .
# docker exec $CONTAINER_ID check_env
- name: Convert to turbomind model
run: |
docker exec $CONTAINER_ID \
lmdeploy convert \
--model-name internlm-chat-20b \
--model-path /root/workspace/hf_model \
--tp 2 \
--dst-path /root/workspace/workdir/${TB_MODEL}
- name: Start triton server service
run: |
docker exec --detach $CONTAINER_ID \
tritonserver \
--model-repository=/root/workspace/workdir/${TB_MODEL}/model_repository \
--allow-http=0 \
--allow-grpc=1 \
--grpc-port=${GRPC_PORT} \
--log-verbose=0 \
--allow-metrics=1
# wait for triton server to fully start up
sleep 180s
- name: Test triton server
run: |
docker exec \
--env no_proxy=localhost,127.0.0.1 \
$CONTAINER_ID python3 .github/scripts/test_triton_server.py --port ${GRPC_PORT}
- name: Clear workfile
if: always()
run: |
export workdir=$(pwd)
docker exec --workdir /__w/lmdeploy $CONTAINER_ID rm -rf lmdeploy
mkdir $workdir
chmod -R 777 $workdir
docker exec --workdir /__w/lmdeploy $CONTAINER_ID rm -rf /root/workspace/workdir/${TB_MODEL}
docker stop $CONTAINER_ID