Skip to content

fix: pynvml is pinned when using TRTLLM v13 due to breaking change in… #542

fix: pynvml is pinned when using TRTLLM v13 due to breaking change in…

fix: pynvml is pinned when using TRTLLM v13 due to breaking change in… #542

Workflow file for this run

# Copyright (c) 2020-2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "CICD NeMo Aligner"
on:
pull_request:
branches:
- 'main'
- 'r**'
- 'dev'
types: [labeled]
workflow_dispatch:
inputs:
test_to_run:
required: false
default: all
type: string
description: Comma-separated list of tests to run. Use "all" to run the full test suite.
push:
branches:
- 'main'
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
pre-flight:
runs-on: ubuntu-latest
outputs:
test_to_run: ${{ steps.test_to_run.outputs.main }}
all: ${{ steps.all.outputs.main }}
run_ci: ${{ steps.evaluate.outputs.run_ci }}
steps:
- name: Parse test_to_run
id: test_to_run
run: |
parsed_string=$(echo ${{ inputs.test_to_run || 'all' }} | jq -c --raw-input 'split(",")')
echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"
- name: Parse all
id: all
run: |
echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT"
- name: Get changed files
id: changed-files
if: github.event_name == 'pull_request'
uses: tj-actions/changed-files@v44
with:
files_yaml: |
doc:
- '**.md'
- docs/**
src:
- '!**.md'
- '!docs/**'
- name: Evaluate conditions
id: evaluate
env:
DOCS_ONLY: ${{ steps.changed-files.outputs.doc_any_changed == 'true' && steps.changed-files.outputs.src_any_changed == 'false' }}
CHANGED_DOCS: ${{ steps.changed-files.outputs.doc_all_changed_files }}
CHANGED_SRC: ${{ steps.changed-files.outputs.src_all_changed_files }}
IS_PULLREQUEST: ${{ github.event_name == 'pull_request' }}
LABEL: ${{ github.event.label.name == 'Run CICD' }}
run: |
# Some output that's helpful for debugging
echo "Docs changed: $CHANGED_DOCS"
echo "Src changed: $CHANGED_SRC"
echo "DOCS_ONLY: $DOCS_ONLY"
echo "LABEL: $LABEL"
echo "IS_PULLREQUEST: $IS_PULLREQUEST"
# Run CI only (on main or if label is attached) and if it's not only docs
echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT"
build-container:
if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
needs: [pre-flight]
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
with:
image-name: nemo_aligner_container
dockerfile: Dockerfile
image-label: nemo-aligner
build-args: |
MAX_JOBS=32
ALIGNER_COMMIT=${{ github.sha }}
Unit_Tests:
name: ${{ matrix.test_case }}
needs: [build-container, pre-flight]
uses: ./.github/workflows/_run_test.yml
if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
strategy:
matrix:
test_case:
- run_unit.sh
- run_mpi_unit.sh
with:
RUNNER: self-hosted-azure
TIMEOUT: 10
SCRIPT: |
nvidia-smi
cd ${ALIGNER_REPO_DIR}
bash tests/${{ matrix.test_case }}
Functional_Tests:
name: ${{ matrix.test_case }}
needs: [build-container, pre-flight]
uses: ./.github/workflows/_run_test.yml
if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
strategy:
matrix:
test_case:
- ppo-llama3-pp2-reshard
- reinforce-llama3-pp2-reshard
- dpo-llama3
- dpo-llama3-pack
- kd-llama3
- sft-llama3
- rm-llama3
with:
RUNNER: self-hosted-azure
# Fairly aggresive timeout that all functional tests should try to adhere to
TIMEOUT: 8
SCRIPT: |
bash /opt/NeMo-Aligner/tests/functional/test_cases/${{ matrix.test_case }}
CI_QA_Gate:
name: CI quality check
if: always()
runs-on: ubuntu-latest
needs:
- Unit_Tests
- Functional_Tests
steps:
- name: main
env:
JOB_RESULTS: ${{ toJSON(needs) }}
ALL_SUCCESS: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && !contains(needs.*.result, 'skipped') }}
CI_SKIP: ${{ github.event.label.name == 'Skip CICD' }}
run: |
SUMMARY=$(echo $JOB_RESULTS | jq 'to_entries[] | .key + ": " + .value.result' | tr -d '"')
echo '🤖: CICD Result' >> $GITHUB_STEP_SUMMARY
echo "$SUMMARY" >> $GITHUB_STEP_SUMMARY
test "$ALL_SUCCESS" = "true" || test "$CI_SKIP" = "true"