.github/workflows/evaluate.yml

name: evaluate

on:
  workflow_dispatch:
    inputs:
      repo_org:
        required: false
        description: 'Tested repository organization name. Default is InternLM/lmdeploy'
        type: string
        default: 'InternLM/lmdeploy'
      repo_ref:
        required: false
        description: 'Set branch or tag or commit id. Default is "main"'
        type: string
        default: 'main'
      chat_models:
        required: true
        description: 'Tested TurboMind models list. eg. [internlm_chat_7b,internlm_chat_7b_w8a16]'
        type: string
        default: '[turbomind_internlm2_chat_7b, pytorch_internlm2_chat_7b, turbomind_internlm2_5_7b_chat, pytorch_internlm2_5_7b_chat, turbomind_internlm2_5_7b_chat_batch1, turbomind_internlm2_5_7b_chat_batch1_4bits, turbomind_internlm2_5_20b_chat, pytorch_internlm2_5_20b_chat, turbomind_qwen1_5_7b_chat, pytorch_qwen1_5_7b_chat, turbomind_llama2_7b_chat, turbomind_llama3_8b_instruct, pytorch_llama3_8b_instruct, turbomind_llama3_1_8b_instruct, pytorch_llama3_1_8b_instruct, turbomind_qwen2_7b_instruct, pytorch_qwen2_7b_instruct, pytorch_qwen1_5_moe_2_7b_chat, pytorch_gemma_2_9b_it, turbomind_internlm2_chat_7b_4bits, turbomind_internlm2_chat_7b_kvint4, turbomind_internlm2_chat_7b_kvint8, turbomind_internlm2_5_7b_chat_4bits, turbomind_internlm2_5_7b_chat_kvint4, turbomind_internlm2_5_7b_chat_kvint8, turbomind_internlm2_5_20b_chat_4bits, turbomind_internlm2_5_20b_chat_kvint4, turbomind_internlm2_5_20b_chat_kvint8, turbomind_qwen1_5_7b_chat_4bits, turbomind_qwen1_5_7b_chat_kvint4, turbomind_qwen1_5_7b_chat_kvint8, turbomind_llama2_7b_chat_4bits, turbomind_llama2_7b_chat_kvint4, turbomind_llama2_7b_chat_kvint8, turbomind_llama3_8b_instruct_4bits, turbomind_llama3_8b_instruct_kvint4, turbomind_llama3_8b_instruct_kvint8, turbomind_llama3_1_8b_instruct_4bits, turbomind_llama3_1_8b_instruct_kvint4, turbomind_llama3_1_8b_instruct_kvint8, turbomind_qwen2_7b_instruct_4bits, turbomind_qwen2_7b_instruct_kvint8]'
      chat_datasets:
        required: true
        description: 'Tested datasets list. eg. [*bbh_datasets,*ceval_datasets,*cmmlu_datasets,*GaokaoBench_datasets,*gpqa_datasets,*gsm8k_datasets,*hellaswag_datasets,*humaneval_datasets,*ifeval_datasets,*math_datasets,*sanitized_mbpp_datasets,*mmlu_datasets,*nq_datasets,*race_datasets,*TheoremQA_datasets,*triviaqa_datasets,*winogrande_datasets,*crowspairs_datasets]'
        type: string
        default: '[*mmlu_datasets, *gsm8k_datasets, *ifeval_datasets]'
      base_models:
        required: true
        description: 'Tested TurboMind models list. eg. [turbomind_internlm2_5_7b, turbomind_qwen2_7b, turbomind_internlm2_5_7b_batch1]'
        type: string
        default: '[turbomind_internlm2_5_7b, turbomind_internlm2_5_7b_4bits, turbomind_internlm2_5_7b_batch1, turbomind_internlm2_5_7b_batch1_4bits, turbomind_qwen2_7b, turbomind_qwen2_5_7b, turbomind_qwen2_5_14b]'
      baes_datasets:
        required: true
        description: 'Tested datasets list. eg. [*mmlu_datasets, *gsm8k_datasets]'
        type: string
        default: '[*mmlu_datasets, *gsm8k_datasets, *gpqa_datasets, *winogrande_datasets]'
      oc_repo_org:
        required: false
        description: 'Tested repository organization name. Default is open-compass/opencompass'
        type: string
        default: 'open-compass/opencompass'
      oc_repo_ref:
        required: false
        description: 'Set branch or tag or commit id. Default is "main"'
        type: string
        default: 'main'
      offline_mode:
        required: true
        description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
        type: boolean
        default: false

env:
  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

jobs:
  linux-build:
    if: ${{github.event_name == 'schedule' || (!cancelled() && !inputs.offline_mode)}}
    strategy:
      matrix:
        pyver: [py310]
    runs-on: ubuntu-latest
    env:
      PYTHON_VERSION: ${{ matrix.pyver }}
      PLAT_NAME: manylinux2014_x86_64
      DOCKER_TAG: cuda11.8
      OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
        with:
          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: Build
        run: |
          echo ${PYTHON_VERSION}
          echo ${PLAT_NAME}
          echo ${DOCKER_TAG}
          echo ${OUTPUT_FOLDER}
          echo ${GITHUB_RUN_ID}
          # remove -it
          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        with:
          if-no-files-found: error
          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
          retention-days: 1
          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}


  evaluate:
    needs: linux-build
    if: ${{github.event_name == 'schedule' || !cancelled()}}
    runs-on: [self-hosted, linux-a100]
    timeout-minutes: 4320 # 72hours
    strategy:
      fail-fast: false
      matrix:
        evaluate_type: ['chat', 'base']
    container:
      image: openmmlab/lmdeploy:latest-cu11
      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
      volumes:
        - /nvme/github-actions/pip-cache:/root/.cache/pip
        - /nvme/github-actions/packages:/root/packages
        - /nvme/github-actions/resources:/root/resources
        - /nvme/github-actions/opencompass-data:/root/opencompass-data
        - /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports
        - /nvme/qa_test_models:/root/models
        - /mnt/shared:/mnt/shared
        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    steps:
      - name: Setup systems
        run: |
          export TIME_STAMP="$(date +'%Y%m%d-%H%M%S')"
          echo "TIME_STAMP=$TIME_STAMP" >> $GITHUB_ENV
      - name: Clone repository
        uses: actions/checkout@v2
        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
        with:
          repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
          ref: ${{github.event.inputs.repo_ref || 'main'}}
      - name: Copy repository - offline
        if: ${{inputs.offline_mode}}
        run: cp -r /root/models/offline_pkg/lmdeploy/. .
      - name: Download Artifacts
        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
        uses: actions/download-artifact@v4
        with:
          name: my-artifact-${{ github.run_id }}-py310
      - name: Install lmdeploy - dependency
        run: |
          # manually install flash attn
          # the install packeage from. https://github.com/Dao-AILab/flash-attention/releases
          python3 -m pip install /root/packages/flash_attn-2.6.3+cu118torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
          python3 -m pip install /root/packages/xformers-0.0.27+cu118-cp310-cp310-manylinux2014_x86_64.whl --no-deps
          python3 -m pip install -r /root/models/offline_pkg/requirements.txt
      - name: Install lmdeploy
        if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
        run: |
          python3 -m pip install lmdeploy-*.whl --no-deps
          python3 -m pip install -r requirements/test.txt
      - name: Install lmdeploy - offline
        if: ${{inputs.offline_mode}}
        run: |
          python3 -m pip install /root/models/offline_pkg/py310/lmdeploy-*.whl --no-deps
          python3 -m pip install -r requirements/test.txt
      - name: Install opencompass
        run: |
          git clone https://github.com/${{ github.event.inputs.oc_repo_org}}.git
          cd opencompass
          git checkout ${{ github.event.inputs.oc_repo_ref}}
          python3 -m pip install -e .
          echo "OPENCOMPASS_DIR=$(pwd)" >> $GITHUB_ENV
      - name: Check env
        run: |
          pip uninstall -y nvidia-nccl-cu11
          python3 -m pip list
          lmdeploy check_env
      - name: Setup paths for evaluation
        run: |
          ln -s /root/opencompass-data ./data
          python3 .github/scripts/action_tools.py create_model_links /root/models .
      - name: Evaluate chat models
        if: matrix.evaluate_type == 'chat'
        run: |
          echo ${{github.event.inputs.chat_models}}
          echo ${{github.event.inputs.chat_datasets}}
          export LMDEPLOY_DIR=$(pwd)

          python3 .github/scripts/action_tools.py evaluate "${{github.event.inputs.chat_models}}" "${{github.event.inputs.chat_datasets}}" /root/evaluation-reports/${{ github.run_id }} chat
      - name: Evaluate base models
        if: matrix.evaluate_type == 'base'
        run: |
          echo ${{github.event.inputs.base_models}}
          echo ${{github.event.inputs.baes_datasets}}
          export LMDEPLOY_DIR=$(pwd)

          python3 .github/scripts/action_tools.py evaluate "${{github.event.inputs.base_models}}" "${{github.event.inputs.baes_datasets}}" /root/evaluation-reports/${{ github.run_id }} base
      - name: Clear workspace
        if: always()
        run: |
          export workdir=$(pwd)
          cd ..
          rm -rf $workdir
          mkdir $workdir
          chmod -R 777 $workdir