diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index 712b13b55..0aede0826 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -113,9 +113,12 @@ jax-triton: mode: git-clone maxtext: url: https://github.com/google/maxtext.git + mirror_url: https://github.com/nvjax-svc-0/maxtext.git tracking_ref: main latest_verified_commit: 78daad198544def8274dbd656d122fbe6a0e1129 mode: git-clone + patches: + mirror/patch/test_rosetta_maxtext: file://patches/maxtext/mirror-patch-rosetta-maxtext.patch levanter: url: https://github.com/stanford-crfm/levanter.git tracking_ref: main diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 589a42d3b..203a58163 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -22,12 +22,11 @@ on: value: ${{ jobs.collect-docker-tags.outputs.TAGS }} permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows + contents: read # to fetch code + actions: write # to cancel previous workflows packages: write # to upload container jobs: - build-base: uses: ./.github/workflows/_build_base.yaml with: @@ -77,7 +76,7 @@ jobs: DOCKERFILE: .github/container/Dockerfile.equinox secrets: inherit - build-maxtext: + build-upstream-maxtext: needs: build-jax if: inputs.ARCHITECTURE == 'amd64' # Triton does not seem to support arm64 uses: ./.github/workflows/_build.yaml @@ -87,17 +86,27 @@ jobs: BADGE_FILENAME: badge-maxtext-build BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext + CONTAINER_NAME: upstream-maxtext DOCKERFILE: .github/container/Dockerfile.maxtext.amd64 secrets: inherit + build-rosetta-maxtext: + needs: build-upstream-maxtext + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: maxtext + secrets: inherit + build-levanter: needs: [build-jax] uses: ./.github/workflows/_build.yaml with: ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-levanter-build" - BADGE_FILENAME: "badge-levanter-build" + ARTIFACT_NAME: 'artifact-levanter-build' + BADGE_FILENAME: 'badge-levanter-build' BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: levanter @@ -109,8 +118,8 @@ jobs: uses: ./.github/workflows/_build.yaml with: ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-t5x-build" - BADGE_FILENAME: "badge-t5x-build" + ARTIFACT_NAME: 'artifact-t5x-build' + BADGE_FILENAME: 'badge-t5x-build' BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: upstream-t5x @@ -155,28 +164,30 @@ jobs: uses: ./.github/workflows/_build.yaml with: ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: "artifact-grok-build" - BADGE_FILENAME: "badge-grok-build" + ARTIFACT_NAME: 'artifact-grok-build' + BADGE_FILENAME: 'badge-grok-build' BUILD_DATE: ${{ inputs.BUILD_DATE }} BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: grok DOCKERFILE: .github/container/Dockerfile.grok secrets: inherit - + collect-docker-tags: runs-on: ubuntu-22.04 - if: "!cancelled()" + if: '!cancelled()' needs: - build-base - build-jax - build-triton - build-equinox - - build-maxtext + - build-upstream-maxtext - build-levanter - build-upstream-t5x - build-upstream-pax + - build-upstream-maxtext - build-rosetta-t5x - build-rosetta-pax + - build-rosetta-maxtext - build-grok outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} @@ -190,20 +201,22 @@ jobs: {"flavor": "jax", "stage": "final", "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "triton", "stage": "final", "priority": 900, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "equinox", "stage": "final", "priority": 900, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\ - {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "upstream-maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "levanter", "stage": "final", "priority": 900, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "upstream-pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "maxtext", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-maxtext.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "pax", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "grok", "stage": "final", "priority": 900, "tag": "${{ needs.build-grok.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ - {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "upstream-maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "levanter", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-levanter.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "upstream-pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "maxtext", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "pax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "grok", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-grok.outputs.DOCKER_TAG_MEALKIT }}"},\ @@ -214,58 +227,58 @@ jobs: echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT - test-distribution: - runs-on: ubuntu-22.04 - strategy: - matrix: - TEST_SCRIPT: - - extra-only-distribution.sh - - mirror-only-distribution.sh - - upstream-only-distribution.sh - - local-patch-distribution.sh - fail-fast: false - steps: - - name: Print environment variables - run: env - - name: Set git login for tests - run: | - git config --global user.email "jax@nvidia.com" - git config --global user.name "JAX-Toolbox CI" - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - name: Run integration test ${{ matrix.TEST_SCRIPT }} - run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} + # test-distribution: + # runs-on: ubuntu-22.04 + # strategy: + # matrix: + # TEST_SCRIPT: + # - extra-only-distribution.sh + # - mirror-only-distribution.sh + # - upstream-only-distribution.sh + # - local-patch-distribution.sh + # fail-fast: false + # steps: + # - name: Print environment variables + # run: env + # - name: Set git login for tests + # run: | + # git config --global user.email "jax@nvidia.com" + # git config --global user.name "JAX-Toolbox CI" + # - name: Check out the repository under ${GITHUB_WORKSPACE} + # uses: actions/checkout@v4 + # - name: Run integration test ${{ matrix.TEST_SCRIPT }} + # run: bash rosetta/tests/${{ matrix.TEST_SCRIPT }} - test-jax: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee tee test-gpu.log - test-jax.sh -b gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-gpu.log - secrets: inherit + # test-jax: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: jax + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee tee test-gpu.log + # test-jax.sh -b gpu + # EOF + # STATISTICS_SCRIPT: | + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # test-gpu.log + # secrets: inherit # test-equinox: # needs: build-equinox @@ -291,157 +304,157 @@ jobs: # test-equinox.log # secrets: inherit - test-te-multigpu: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_te.yaml - with: - TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-te-multigpu: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_te.yaml + # with: + # TE_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-t5x: + # needs: build-upstream-t5x + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_upstream_t5x.yaml + # with: + # T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-t5x: - needs: build-rosetta-t5x - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml - with: - T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-t5x: + # needs: build-rosetta-t5x + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_t5x_rosetta.yaml + # with: + # T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-pallas: - needs: build-jax - if: inputs.ARCHITECTURE == 'amd64' # triton doesn't support arm64(?) - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: pallas - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-pallas.log - python /opt/jax/tests/pallas/pallas_test.py --xml_output_file /output/pallas_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' pallas_test.xml) - errors=$(./yq '.testsuites."+@errors"' pallas_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' pallas_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-pallas.log - secrets: inherit + # test-pallas: + # needs: build-jax + # if: inputs.ARCHITECTURE == 'amd64' # triton doesn't support arm64(?) + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: pallas + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-pallas.log + # python /opt/jax/tests/pallas/pallas_test.py --xml_output_file /output/pallas_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' pallas_test.xml) + # errors=$(./yq '.testsuites."+@errors"' pallas_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' pallas_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-pallas.log + # secrets: inherit - test-triton: - needs: build-triton - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: triton - EXECUTE: | - docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ - ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-triton.log - python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml - EOF - STATISTICS_SCRIPT: | - curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; - total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) - errors=$(./yq '.testsuites."+@errors"' triton_test.xml) - failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) - passed_tests=$((total_tests - errors - failed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-triton.log - secrets: inherit + # test-triton: + # needs: build-triton + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: triton + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all --volume $PWD:/output \ + # ${{ needs.build-triton.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-triton.log + # python /opt/jax-triton/tests/triton_call_test.py --xml_output_file /output/triton_test.xml + # EOF + # STATISTICS_SCRIPT: | + # curl -L -o yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_$(dpkg --print-architecture) && chmod 777 yq; + # total_tests=$(./yq '.testsuites."+@tests"' triton_test.xml) + # errors=$(./yq '.testsuites."+@errors"' triton_test.xml) + # failed_tests=$(./yq '.testsuites."+@failures"' triton_test.xml) + # passed_tests=$((total_tests - errors - failed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-triton.log + # secrets: inherit - test-levanter: - needs: build-levanter - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: levanter - EXECUTE: | - docker run -i --gpus all --shm-size=1g \ - ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-levanter.log - pip install pytest - PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-levanter.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-levanter.log - secrets: inherit + # test-levanter: + # needs: build-levanter + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: levanter + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g \ + # ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-levanter.log + # pip install pytest + # PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-levanter.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # failed_tests=$(echo $summary_line | grep -oE '[0-9]+ failed' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(echo $summary_line | grep -oE '[0-9]+ passed' | awk '{print $1} END { if (!NR) print 0}') + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-levanter.log + # secrets: inherit - test-te: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - pip install pytest-reportlog - pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TE}/tests/jax - EOF - STATISTICS_SCRIPT: | - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-te.log - pytest-report.jsonl - secrets: inherit + # test-te: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # pip install pytest-reportlog + # pytest --report-log=log/pytest-report.jsonl ${SRC_PATH_TE}/tests/jax + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat pytest-report.jsonl | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # secrets: inherit - test-upstream-pax: - needs: build-upstream-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_upstream_pax.yaml - with: - PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-pax: + # needs: build-upstream-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_upstream_pax.yaml + # with: + # PAX_IMAGE: ${{ needs.build-upstream-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-rosetta-pax: - needs: build-rosetta-pax - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_pax_rosetta.yaml - with: - PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-rosetta-pax: + # needs: build-rosetta-pax + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_pax_rosetta.yaml + # with: + # PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit - test-maxtext: - needs: build-maxtext - if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit + # test-upstream-maxtext: + # needs: build-upstream-maxtext + # if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 + # uses: ./.github/workflows/_test_upstream_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit diff --git a/.github/workflows/_test_maxtext.yaml b/.github/workflows/_test_upstream_maxtext.yaml similarity index 97% rename from .github/workflows/_test_maxtext.yaml rename to .github/workflows/_test_upstream_maxtext.yaml index 61bee91d6..77589d479 100644 --- a/.github/workflows/_test_maxtext.yaml +++ b/.github/workflows/_test_upstream_maxtext.yaml @@ -11,13 +11,13 @@ on: EXTRA_TEST_ARGS: type: string description: Extra command line args to pass to test-maxtext.sh - default: "" + default: '' required: false BADGE_FILENAME: type: string description: 'Name of the endpoint JSON file for shields.io badge' required: false - default: 'badge-maxtext-test.json' + default: 'badge-upstream-maxtext-test.json' ARTIFACT_NAME: type: string description: 'Name of the artifact zip file' @@ -34,12 +34,11 @@ on: value: ${{ jobs.sitrep.outputs.STATUS }} jobs: - single-process-multi-device: strategy: matrix: PARALLEL_CONFIG: - - [1, 1, 2, 4] + - [1, 1, 2, 4] # - [1, 1, 1, 8] # PP, DP, FSDP, TP fail-fast: false @@ -183,12 +182,12 @@ jobs: strategy: matrix: PARALLEL_CONFIG: - - [1, 1, 1, 1] - - [1, 1, 8, 1] - - [1, 1, 1, 8] - - [1, 1, 4, 2] - - [1, 2, 2, 2] - - [1, 4, 2, 2] + - [1, 1, 1, 1] + - [1, 1, 8, 1] + - [1, 1, 1, 8] + - [1, 1, 4, 2] + - [1, 2, 2, 2] + - [1, 4, 2, 2] fail-fast: false runs-on: ubuntu-22.04 @@ -366,7 +365,7 @@ jobs: sitrep: needs: [single-process-multi-device, maxtext-multinode, metrics] - if: "!cancelled()" + if: '!cancelled()' uses: ./.github/workflows/_sitrep_mgmn.yaml secrets: inherit with: @@ -377,7 +376,7 @@ jobs: summary: runs-on: ubuntu-22.04 needs: [single-process-multi-device, maxtext-multinode] - if: "!cancelled()" + if: '!cancelled()' steps: - name: Generate TensorBoard query URL run: | @@ -394,7 +393,7 @@ jobs: outcome: needs: sitrep runs-on: ubuntu-22.04 - if: "!cancelled()" + if: '!cancelled()' steps: - name: Sets workflow status based on test outputs run: | diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0098b83bf..fcaf495a6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -2,7 +2,7 @@ name: CI on: schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC + - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC pull_request: types: - opened @@ -25,7 +25,7 @@ on: required: false MERGE_BUMPED_MANIFEST: type: boolean - description: "(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch" + description: '(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch' default: false required: false @@ -34,16 +34,15 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} permissions: - contents: write # to fetch code and push branch - actions: write # to cancel previous workflows - packages: write # to upload container - pull-requests: write # to make pull request for manifest bump + contents: write # to fetch code and push branch + actions: write # to cancel previous workflows + packages: write # to upload container + pull-requests: write # to make pull request for manifest bump env: DEFAULT_MANIFEST_ARTIFACT_NAME: bumped-manifest jobs: - metadata: runs-on: ubuntu-22.04 outputs: @@ -81,7 +80,7 @@ jobs: id: manifest-branch shell: bash -x -e {0} run: | - BUMP_MANIFEST=${{ github.event_name == 'schedule' || inputs.BUMP_MANIFEST || 'false' }} + BUMP_MANIFEST=${{ github.event_name == 'schedule' || inputs.BUMP_MANIFEST || 'true' }} MERGE_BUMPED_MANIFEST=${{ github.event_name == 'schedule' || inputs.MERGE_BUMPED_MANIFEST || 'false' }} # Prepend nightly manifest branch with "z" to make it appear at the end if [[ "$BUMP_MANIFEST" == "true" ]]; then @@ -115,7 +114,7 @@ jobs: shell: bash -x -e {0} run: | bash bump.sh --input-manifest manifest.yaml --output-manifest manifest.yaml.new --base-patch-dir ./patches-new - + - name: Maybe replace current manifest/patches with the new one and show diff working-directory: .github/container shell: bash -x -e {0} @@ -168,12 +167,11 @@ jobs: steps: - name: "Tests Succeeded: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" id: test_result - run: - echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT + run: echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v4 - + - name: Delete checked-out manifest and patches run: | rm .github/container/manifest.yaml @@ -185,7 +183,7 @@ jobs: name: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} path: .github/container/ - - name: "Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}" + - name: 'Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}' id: local_branch shell: bash -x -e {0} run: | @@ -213,7 +211,7 @@ jobs: git merge --ff-only ${{ needs.metadata.outputs.MANIFEST_BRANCH }} # Push the new change git push origin ${{ github.ref_name }} - + # We will create a Draft PR & remote branch if: # 1. The tests failed # 2. The merge failed @@ -244,12 +242,12 @@ jobs: draft: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: "Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}" + + - name: 'Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}' if: steps.create_pr.outcome == 'success' run: | echo "https://github.com/NVIDIA/JAX-Toolbox/pull/${{ fromJson(steps.create_pr.outputs.data).number }}" | tee -a $GITHUB_STEP_SUMMARY - + # Guard delete in simple check to protect other branches - name: Check that the branch matches znightly- prefix run: | @@ -271,7 +269,7 @@ jobs: make-publish-configs: runs-on: ubuntu-22.04 - if: ${{ !cancelled() }} + if: ${{ !cancelled() }} env: MEALKIT_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax-mealkit' || 'mock-jax-mealkit' }} FINAL_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax' || 'mock-jax' }} @@ -294,6 +292,7 @@ jobs: levanter upstream-t5x upstream-pax + upstream-maxtext t5x pax grok @@ -365,7 +364,7 @@ jobs: needs: - metadata - make-publish-configs - if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }} + if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }} strategy: fail-fast: false matrix: ${{ fromJson(needs.make-publish-configs.outputs.PUBLISH_CONFIGS) }} @@ -381,7 +380,7 @@ jobs: finalize: needs: [metadata, amd64, arm64, publish-containers] - if: "!cancelled()" + if: '!cancelled()' uses: ./.github/workflows/_finalize.yaml with: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..51b4cfda1 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +local/ diff --git a/README.md b/README.md index e8d598626..fe7a960ca 100644 --- a/README.md +++ b/README.md @@ -215,19 +215,38 @@ - + + + + + ghcr.io/nvidia/jax:upstream-maxtext + + + + + + + + + + + + + + + ghcr.io/nvidia/jax:maxtext - - + + - + diff --git a/rosetta/Dockerfile.maxtext b/rosetta/Dockerfile.maxtext new file mode 100644 index 000000000..6c69ed93a --- /dev/null +++ b/rosetta/Dockerfile.maxtext @@ -0,0 +1,75 @@ +# syntax=docker/dockerfile:1-labs +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:upstream-maxtext +ARG GIT_USER_EMAIL=jax@nvidia.com +ARG GIT_USER_NAME=NVIDIA +# If set to "true", then will pull new local patches, the manifest.yaml and create-distribution.sh (in case it was updated). +# This is useful for development if you run `./bump.sh -i manifest.yaml` manually and do not want to trigger a full rebuild all +# the way up to the jax build. +ARG UPDATE_PATCHES=false +# It is common for TE developers to test a different TE against the LLM application. This is a knob to override what's in the manifest +# Accepts git-ref's from NVIDIA/TransformerEngine or pull requests (pull/$number/head) +ARG UPDATED_TE_REF="" + +# Rosetta and optionally patches are pulled from this +FROM scratch AS jax-toolbox + +############################################################################### +### Download source and add auxiliary scripts +################################################################################ + +FROM ${BASE_IMAGE} AS mealkit +ARG GIT_USER_EMAIL +ARG GIT_USER_NAME +ARG UPDATE_PATCHES +ARG UPDATED_TE_REF + +ENV ENABLE_TE=1 + +RUN --mount=target=/mnt/jax-toolbox,from=jax-toolbox <<"EOF" bash -exu +MANIFEST_DIR=$(dirname ${MANIFEST_FILE}) +if [[ "${UPDATE_PATCHES}" != "true" && "${UPDATE_PATCHES}" != "false" ]]; then + echo "UPDATE_PATCHES can only be true or false" + exit 1 +fi +if [[ "${UPDATE_PATCHES}" == "true" ]]; then + cp -r /mnt/jax-toolbox/.github/container/patches ${MANIFEST_DIR}/ + cp /mnt/jax-toolbox/.github/container/manifest.yaml ${MANIFEST_DIR}/manifest.yaml + cp /mnt/jax-toolbox/.github/container/create-distribution.sh ${MANIFEST_DIR}/create-distribution.sh +fi +cp -r /mnt/jax-toolbox/rosetta /opt/rosetta + +if [[ -n "${UPDATED_TE_REF}" ]]; then + TE_INSTALL_DIR=/opt/transformer-engine + yq e ".transformer-engine.latest_verified_commit = \"${UPDATED_TE_REF}\"" -i $MANIFEST_FILE + # Install from source instead of pre-built wheel + sed -i -E 's@( file:///opt/transformer-engine)/dist/[^ ]*@\1@' /opt/pip-tools.d/requirements-te.in + git -C $TE_INSTALL_DIR fetch -a + if [[ "${UPDATED_TE_REF}" =~ ^pull/ ]]; then + PR_ID=$(cut -d/ -f2 <<<"${UPDATED_TE_REF}") + git -C $TE_INSTALL_DIR fetch origin ${UPDATED_TE_REF}:PR-${PR_ID} + git -C $TE_INSTALL_DIR checkout PR-${PR_ID} + else + git -C $TE_INSTALL_DIR checkout ${UPDATED_TE_REF} + fi +fi + +# Setting the username/email is required to author commits from patches +git config --global user.email "${GIT_USER_EMAIL}" +git config --global user.name "${GIT_USER_NAME}" + +bash ${MANIFEST_DIR}/create-distribution.sh \ + --manifest ${MANIFEST_FILE} \ + --package maxtext +# Remove .gitconfig to avoid end-user authoring commits as the "build user" +rm -f ~/.gitconfig +EOF + +WORKDIR /opt/rosetta + +############################################################################### +### Install accumulated packages from the base image and the previous stage +################################################################################ + +FROM mealkit as final + +RUN pip-finalize.sh