Move unit-test CI from Azure Pipelines to a GitHub Actions workflow on
self-hosted A100 runners.

Summary of the changes below:
  * .azure-pipelines/ut.yml: comment out the PR trigger and set `pr: none`.
  * .github/workflows/codeql.yml: trigger on pull requests to main instead of
    every push, drop the separate cancel job in favour of a job-level
    `concurrency` group, bump actions/checkout v3 -> v4.
  * .github/workflows/lint.yml: restrict to pull requests targeting main, bump
    actions/checkout v3 -> v4.
  * .github/workflows/ut-cuda.yml (new): build, run unit tests, report coverage
    and build the Python package in the base-dev CUDA 11.8 / 12.1 containers.
  * README.md: add the new workflow badge and a note on the Azure pipelines.
  * docker/base-dev-cuda{11.8,12.1}.dockerfile (new): base image + lcov + cmake.

NOTE(review): this copy was rebuilt from a version whose line breaks and
indentation had been collapsed. Hunk line counts were checked against the
hunk headers, but indentation widths are inferred -- confirm against the
upstream commit before applying.
NOTE(review): in both new dockerfiles, `apt-get autoremove` is run without
`-y` and may abort a non-interactive build if it has anything to remove.
NOTE(review): in both new dockerfiles, `rm -rf ... /tmp/*` runs while
ARK_SRC_DIR is /tmp/ark, so the tree just ADDed is deleted in that same step.
NOTE(review): `LABEL org.opencontainers.image.source <url>` uses the legacy
space-separated form; `key=value` is the documented form.

diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index cee1cb1fe..4196adc5d 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -1,11 +1,12 @@
 trigger:
 - main
 
-pr:
-  branches:
-    include:
-    - main
-  drafts: false
+# pr:
+#   branches:
+#     include:
+#     - main
+#   drafts: false
+pr: none
 
 jobs:
 - job: UnitTest
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4bcf13c92..c404f96aa 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -1,29 +1,22 @@
-name: "CodeQL"
+name: CodeQL
 
 on:
-  push:
+  pull_request:
     branches:
-      - "**"
+      - main
   schedule:
     - cron: '42 20 * * 4'
 
 jobs:
-  cancel:
-    name: Cancel previous runs
-    runs-on: ubuntu-latest
-    steps:
-      - name: Cancel previous runs
-        uses: styfle/cancel-workflow-action@0.8.0
-        with:
-          access_token: ${{ secrets.GITHUB_TOKEN }}
-
   analyze:
     name: Analyze
     strategy:
       fail-fast: false
       matrix:
         language: [ 'cpp' ]
-
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}
+      cancel-in-progress: true
     runs-on: ubuntu-latest
     container:
       image: ghcr.io/microsoft/ark/ark:base-cuda12.1
@@ -34,7 +27,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Check disk space
       run: |
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 303a4e65f..758eaf564 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -3,7 +3,7 @@ name: Lint
 on:
   pull_request:
     branches:
-      - "**"
+      - main
 
 jobs:
   linters:
@@ -11,7 +11,7 @@ jobs:
 
     steps:
       - name: Check out Git repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Install ClangFormat
         run: sudo apt-get install -y clang-format
@@ -35,7 +35,7 @@ jobs:
 
     steps:
       - name: Check out Git repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Download misspell
         run: |
diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml
new file mode 100644
index 000000000..66012cf97
--- /dev/null
+++ b/.github/workflows/ut-cuda.yml
@@ -0,0 +1,78 @@
+name: "Unit Tests (CUDA)"
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  UnitTest:
+    runs-on: [ self-hosted, A100 ]
+    defaults:
+      run:
+        shell: bash
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        cuda: [ cuda11.8, cuda12.1 ]
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.cuda }}
+      cancel-in-progress: true
+    container:
+      image: "ghcr.io/microsoft/ark/ark:base-dev-${{ matrix.cuda }}"
+      options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: LockGPUClock
+        run: |
+          sudo nvidia-smi -pm 1
+          for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
+            sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
+          done
+
+      - name: UpdateSubmodules
+        run: |
+          git config --global --add safe.directory /__w/ark/ark
+          git submodule update --init --recursive
+
+      - name: InstallGPUDMA
+        run: |
+          if ! lsmod | grep gpumem; then
+            cd third_party && make gpudma
+            sudo insmod ./gpudma/module/gpumem.ko
+            sudo chmod 666 /dev/gpumem
+          fi
+
+      - name: Build
+        run: |
+          mkdir build && cd build
+          cmake -DCMAKE_BUILD_TYPE=Debug ..
+          make -j ut
+
+      - name: RunUT
+        run: |
+          cd build && ARK_ROOT=$PWD ctest --verbose
+
+      - name: ReportCoverage
+        run: |
+          cd build
+          lcov --capture --directory . --output-file coverage.info
+          lcov --remove coverage.info \
+              '/usr/*' \
+              '/tmp/*' \
+              '*/third_party/*' \
+              '*/ark/*_test.*' \
+              '*/examples/*' \
+              '*/python/*' \
+              '*/ark/unittest/unittest_utils.cc' \
+              --output-file coverage.info
+          lcov --list coverage.info
+          bash <(curl -s https://codecov.io/bash) -f coverage.info || echo "Codecov did not collect coverage reports"
+
+      - name: BuildPython
+        run: |
+          python3 -m pip install -r requirements.txt
+          python3 -m pip install .
diff --git a/README.md b/README.md
index 0b4b2c950..ccb6c2f28 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,12 @@ A GPU-driven system framework for scalable AI applications.
 
 [![License](https://img.shields.io/github/license/microsoft/ark.svg)](LICENSE)
 [![CodeQL](https://github.com/microsoft/ark/actions/workflows/codeql.yml/badge.svg)](https://github.com/microsoft/ark/actions/workflows/codeql.yml)
+[![Unit Tests (CUDA)](https://github.com/microsoft/ark/actions/workflows/ut-cuda.yml/badge.svg)](https://github.com/microsoft/ark/actions/workflows/ut-cuda.yml)
 [![Build Status](https://dev.azure.com/binyli/HPC/_apis/build/status%2Fark-test?branchName=main)](https://dev.azure.com/binyli/HPC/_build/latest?definitionId=6&branchName=main)
 [![codecov](https://codecov.io/gh/microsoft/ark/graph/badge.svg?token=XmMOK85GOB)](https://codecov.io/gh/microsoft/ark)
 
+*NOTE (Oct 2023): Azure pipelines are currently down due to resource migration.*
+
 See [Quick Start](docs/quickstart.md) to quickly get started.
 
 ## Overview
diff --git a/docker/base-dev-cuda11.8.dockerfile b/docker/base-dev-cuda11.8.dockerfile
new file mode 100644
index 000000000..a7a45a7f9
--- /dev/null
+++ b/docker/base-dev-cuda11.8.dockerfile
@@ -0,0 +1,34 @@
+FROM ghcr.io/microsoft/ark/ark:base-cuda11.8
+
+LABEL maintainer="ARK"
+LABEL org.opencontainers.image.source https://github.com/microsoft/ark
+
+ENV ARK_SRC_DIR="/tmp/ark" \
+    CMAKE_VERSION="3.26.4"
+
+ADD . ${ARK_SRC_DIR}
+WORKDIR ${ARK_SRC_DIR}
+
+# Install Lcov
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        lcov \
+        && \
+    apt-get autoremove && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
+# Install cmake 3.26.4
+ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
+    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
+RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
+    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
+    rm -rf ${CMAKE_HOME}.tar.gz
+ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
+
+# Set PATH
+RUN echo PATH="${PATH}" > /etc/environment
+
+# Cleanup
+WORKDIR /
+RUN rm -rf ${ARK_SRC_DIR}
diff --git a/docker/base-dev-cuda12.1.dockerfile b/docker/base-dev-cuda12.1.dockerfile
new file mode 100644
index 000000000..5f6006008
--- /dev/null
+++ b/docker/base-dev-cuda12.1.dockerfile
@@ -0,0 +1,34 @@
+FROM ghcr.io/microsoft/ark/ark:base-cuda12.1
+
+LABEL maintainer="ARK"
+LABEL org.opencontainers.image.source https://github.com/microsoft/ark
+
+ENV ARK_SRC_DIR="/tmp/ark" \
+    CMAKE_VERSION="3.26.4"
+
+ADD . ${ARK_SRC_DIR}
+WORKDIR ${ARK_SRC_DIR}
+
+# Install Lcov
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        lcov \
+        && \
+    apt-get autoremove && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
+# Install cmake 3.26.4
+ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
+    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
+RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
+    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
+    rm -rf ${CMAKE_HOME}.tar.gz
+ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
+
+# Set PATH
+RUN echo PATH="${PATH}" > /etc/environment
+
+# Cleanup
+WORKDIR /
+RUN rm -rf ${ARK_SRC_DIR}