# Add test case for function regression #145
# Workflow file for this run
# NOTE: This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below.
# To review, open the file in an editor that reveals hidden Unicode characters.
# Workflow "test": builds lmdeploy from source on a self-hosted A100 runner and
# runs two jobs:
#   * test_functions - pytest suite under autotest/, with an Allure report.
#   * test_triton    - converts a model, starts tritonserver in a Docker
#                      container and runs .github/scripts/test_triton_server.py.
name: test

on:
  # Run on pull requests that touch the build, the sources or the tests.
  pull_request:
    paths:
      - ".github/scripts/test_triton_server.py"
      - ".github/workflows/test.yml"
      - "cmake/**"
      - "src/**"
      - "autotest/**"
      - "3rdparty/**"
      - "lmdeploy/**"
      - "requirements/**"
      - "requirements.txt"
      - "CMakeLists.txt"
      - "setup.py"
  # Run on version bumps pushed to main and on release tags.
  push:
    branches:
      - main
    paths:
      - "lmdeploy/version.py"
    tags:
      - "v*.*.*"
  # Manual trigger; `markers` is appended verbatim to the pytest command line.
  workflow_dispatch:
    inputs:
      markers:
        required: false
        description: 'Tested markers. eg: "-m internlm_chat_7b"'
        type: string
        default: ''

# Host paths that the test_triton job bind-mounts into its container.
env:
  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai

jobs:
  test_functions:
    runs-on: [self-hosted, linux-a100]
    timeout-minutes: 4320 # 72hours
    environment: 'prod'
    env:
      # Parent directory for generated Allure reports (one sub-directory per run).
      REPORT_DIR: /nvme/qa_test_models/test-reports
    container:
      image: nvcr.io/nvidia/tritonserver:22.12-py3
      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip"
      volumes:
        - /nvme/github-actions/pip-cache:/root/.cache/pip
        - /nvme/github-actions/packages:/root/packages
        - /nvme/qa_test_models:/nvme/qa_test_models
        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    steps:
      - name: Setup systems
        run: |
          # -f: do not fail the step when the image ships no cuda*.list files.
          rm -f /etc/apt/sources.list.d/cuda*.list
          apt-get update && apt-get install -y --no-install-recommends rapidjson-dev \
            libgoogle-glog-dev libgl1 openjdk-8-jre-headless
          # Allure is installed from a package pre-staged on the host volume.
          dpkg -i /root/packages/allure_2.24.1-1_all.deb
          rm -rf /var/lib/apt/lists/*
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Install pytorch
        run: |
          python3 -m pip cache dir
          python3 -m pip install torch==2.0.1 torchvision==0.15.2 --extra-index-url https://download.pytorch.org/whl/cu117
      - name: Build lmdeploy
        run: |
          python3 -m pip install cmake
          python3 -m pip install -r requirements/build.txt
          # -p: reuse the build directory if it is already present.
          mkdir -p build
          cd build
          # SM / CMAKE_CUDA_ARCHITECTURES = 80 targets compute capability 8.0
          # (the runner label is linux-a100).
          cmake .. \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
            -DCMAKE_INSTALL_PREFIX=./install \
            -DBUILD_PY_FFI=ON \
            -DBUILD_MULTI_GPU=ON \
            -DCMAKE_CUDA_FLAGS="-lineinfo" \
            -DUSE_NVTX=ON \
            -DSM=80 \
            -DCMAKE_CUDA_ARCHITECTURES=80 \
            -DBUILD_TEST=OFF
          make -j"$(nproc)" && make install
      - name: Install lmdeploy
        run: |
          python3 -m pip install packaging protobuf transformers_stream_generator transformers==4.33.0
          # Manually install flash-attn from a wheel pre-staged on the host volume.
          # The wheel was downloaded from https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
          python3 -m pip install /root/packages/flash_attn-2.3.6+cu118torch2.0cxx11abiFALSE-cp38-cp38-linux_x86_64.whl
          python3 -m pip install -r requirements.txt -r requirements/test.txt
          python3 -m pip install .
      - name: Check env
        run: |
          python3 -m pip list
          lmdeploy check_env
      - name: Test lmdeploy
        run: |
          # The markers input is expanded into the script text before the shell
          # runs; it is empty for pull_request and push events.
          pytest autotest ${{github.event.inputs.markers}} --alluredir=allure-results --clean-alluredir
      - name: Generate reports
        if: always()
        run: |
          export date_today="$(date +'%Y%m%d-%H%M%S')"
          export report_dir="$REPORT_DIR/$date_today"
          # Log the directory the report is actually written to.
          echo "Save report to $report_dir"
          # No results path is passed, so allure uses its default input directory
          # (expected to be the allure-results written by pytest above - confirm).
          allure generate -c -o "$report_dir"
      - name: Clear workfile
        if: always()
        run: |
          # Recreate an empty, world-writable workspace for the next run.
          workdir="$(pwd)"
          cd ..
          rm -rf "$workdir"
          mkdir -p "$workdir"
          chmod -R 777 "$workdir"

  test_triton:
    runs-on: [self-hosted, linux-a100]
    timeout-minutes: 4320 # 72hours
    environment: 'prod'
    env:
      # HF_MODEL and WORKDIR are host paths mounted into the test container.
      HF_MODEL: /nvme/qa_test_models/internlm-chat-20b
      WORKDIR: /nvme/qa_test_models/triton_workspace
      # Name of the converted model directory created under WORKDIR.
      TB_MODEL: internlm-chat-20b-fp16-tp2
      GRPC_PORT: 33337
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Create test container
        run: |
          # Plain assignment (not `export VAR=$(...)`) so that a failing
          # `docker create` fails the step instead of being masked.
          # The checkout's grandparent directory is mounted at /__w so that the
          # repository appears at /__w/lmdeploy/lmdeploy inside the container.
          CONTAINER_ID=$(docker create \
            --rm \
            --gpus='"device=0,1"' \
            --shm-size 16g \
            --cap-add=SYS_PTRACE \
            --cap-add=SYS_ADMIN \
            --security-opt seccomp=unconfined \
            --name lmdeploy-ci-triton \
            --workdir /__w/lmdeploy/lmdeploy \
            --env PIP_CACHE_DIR=/root/.cache/pip \
            --env NCCL_LAUNCH_MODE=GROUP \
            --env http_proxy=${{secrets.PROXY}} \
            --env https_proxy=${{secrets.PROXY}} \
            --env no_proxy="localhost,127.0.0.1,0.0.0.0" \
            --env HTTP_PROXY=${{secrets.PROXY}} \
            --env HTTPS_PROXY=${{secrets.PROXY}} \
            --env NO_PROXY="localhost,127.0.0.1,0.0.0.0" \
            -v "$(pwd)/../../:/__w" \
            -v "${HF_MODEL}:/root/workspace/hf_model" \
            -v "${WORKDIR}:/root/workspace/workdir" \
            -v "${HOST_PIP_CACHE_DIR}:/root/.cache/pip" \
            -v "${HOST_LOCALTIME}:/etc/localtime:ro" \
            openmmlab/lmdeploy:latest tail -f /dev/null \
          )
          docker start "$CONTAINER_ID"
          echo "CONTAINER_ID=$CONTAINER_ID"
          # Make the container id available to the following steps.
          echo "CONTAINER_ID=$CONTAINER_ID" >> "$GITHUB_ENV"
      - name: Build lmdeploy from source
        run: |
          docker exec "$CONTAINER_ID" mkdir -p build
          docker exec --workdir /__w/lmdeploy/lmdeploy/build \
            "$CONTAINER_ID" cmake .. \
              -DCMAKE_BUILD_TYPE=RelWithDebInfo \
              -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
              -DCMAKE_INSTALL_PREFIX=./install \
              -DBUILD_PY_FFI=ON \
              -DBUILD_MULTI_GPU=ON \
              -DCMAKE_CUDA_FLAGS="-lineinfo" \
              -DUSE_NVTX=ON \
              -DSM=80 \
              -DCMAKE_CUDA_ARCHITECTURES=80 \
              -DBUILD_TEST=OFF
          # $(nproc) is expanded by the runner's shell, not inside the container.
          docker exec --workdir /__w/lmdeploy/lmdeploy/build "$CONTAINER_ID" make -j"$(nproc)"
          docker exec --workdir /__w/lmdeploy/lmdeploy/build "$CONTAINER_ID" make install
      - name: Install lmdeploy
        run: |
          # Quoted so the shell never treats [grpc] as a glob pattern.
          docker exec "$CONTAINER_ID" python3 -m pip install 'tritonclient[grpc]'
          docker exec "$CONTAINER_ID" python3 -m pip install -r requirements/test.txt
          docker exec "$CONTAINER_ID" python3 -m pip install .
          # docker exec $CONTAINER_ID check_env
      - name: Convert to turbomind model
        run: |
          docker exec "$CONTAINER_ID" \
            lmdeploy convert \
              --model-name internlm-chat-20b \
              --model-path /root/workspace/hf_model \
              --tp 2 \
              --dst-path "/root/workspace/workdir/${TB_MODEL}"
      - name: Start triton server service
        run: |
          docker exec --detach "$CONTAINER_ID" \
            tritonserver \
              --model-repository="/root/workspace/workdir/${TB_MODEL}/model_repository" \
              --allow-http=0 \
              --allow-grpc=1 \
              --grpc-port="${GRPC_PORT}" \
              --log-verbose=0 \
              --allow-metrics=1
          # wait for triton server to fully start up
          sleep 180s
      - name: Test triton server
        run: |
          docker exec \
            "$CONTAINER_ID" python3 .github/scripts/test_triton_server.py --port "${GRPC_PORT}"
      - name: Clear workfile
        if: always()
        run: |
          workdir="$(pwd)"
          # Run every cleanup command even if an earlier one fails, so the named
          # container is always stopped; the first failure is still reported
          # through the step's exit status.
          status=0
          # The checkout is deleted from inside the container (presumably because
          # the build left files there that the runner user cannot remove).
          docker exec --workdir /__w/lmdeploy "$CONTAINER_ID" rm -rf lmdeploy || status=$?
          mkdir -p "$workdir" || status=$?
          chmod -R 777 "$workdir" || status=$?
          # Drop the converted model so the next run starts from a clean workdir.
          docker exec --workdir /__w/lmdeploy "$CONTAINER_ID" rm -rf "/root/workspace/workdir/${TB_MODEL}" || status=$?
          # The container was created with --rm, so stopping it also removes it
          # and frees the fixed name lmdeploy-ci-triton for the next run.
          docker stop "$CONTAINER_ID" || status=$?
          exit "$status"