Skip to content

Commit

Permalink
[develop] Feature/cicd metrics adds methods to collect resource usage…
Browse files Browse the repository at this point in the history
… data from major stages of the SRW pipeline build job (ufs-community#1058)

Updated SRW Jenkinsfile with some run-time stats collection, and adds a final stage that triggers ufs-srw-metrics stats collection job for reporting metrics.

The SRW pipeline job that uses this Jenkinsfile will now use the 'time' command when executing major stages: init, build, test. This will collect CPU, Memory, and DiskUsage measurements that can be later used in trend plots on a metrics dashboard.

Additionally, it adds options to the pipeline job that allow the operator to select just a single test, or no test suite (default is still 'coverage' suite), and allows an option to select the depth of wrapper script tasks to execute during functional testing (default is still all 9 scripts).
  • Loading branch information
BruceKropp-Raytheon authored Apr 15, 2024
1 parent aa1678b commit a609196
Show file tree
Hide file tree
Showing 7 changed files with 153 additions and 13 deletions.
60 changes: 50 additions & 10 deletions .cicd/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ pipeline {
choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'derecho', 'gaea', 'hera', 'jet', 'orion', 'hercules'], description: 'Specify the platform(s) to use')
// Allow job runner to filter based on compiler
choice(name: 'SRW_COMPILER_FILTER', choices: ['all', 'gnu', 'intel'], description: 'Specify the compiler(s) to use to build')
// Workflow Wrapper test depth {0..9}, 0=none, 1=simple, 9=all [default]
choice(name: 'SRW_WRAPPER_TASK_DEPTH', choices: ['9', '1', '0'], description: '0=none, 1=simple, 9=all [default]')
// WE2E Tests ?
choice(name: 'SRW_WE2E_SINGLE_TEST', choices: ['coverage', 'none', 'skill-score', 'grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0'], description: 'Specify the WE2E test to use')
booleanParam name: 'SRW_WE2E_COMPREHENSIVE_TESTS', defaultValue: false, description: 'Whether to execute the comprehensive end-to-end tests'
}

Expand Down Expand Up @@ -126,10 +130,17 @@ pipeline {
stage('Initialize') {
steps {
dir ("${env.SRW_PLATFORM}") {
echo "Initializing SRW (${env.SRW_COMPILER}) build environment on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
echo "${env.STAGE_NAME} SRW (${env.SRW_COMPILER}) build environment on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
cleanWs()
checkout scm
sh '"${WORKSPACE}/${SRW_PLATFORM}/manage_externals/checkout_externals"'
sh '"${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/srw_init.sh"'
sh "STAGE_NAME=${env.STAGE_NAME} " + 'bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/disk_usage.sh"'
}
}
post {
always {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-time-srw_init.json", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-disk-usage${env.STAGE_NAME}.csv", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
}
}
}
Expand All @@ -138,8 +149,9 @@ pipeline {
stage('Build') {
steps {
dir ("${env.SRW_PLATFORM}") {
echo "Building SRW (${env.SRW_COMPILER}) on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
echo "${env.STAGE_NAME} SRW (${env.SRW_COMPILER}) on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
sh 'bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/srw_build.sh"'
sh "STAGE_NAME=${env.STAGE_NAME} " + 'bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/disk_usage.sh"'
}
}

Expand All @@ -148,6 +160,11 @@ pipeline {
sh 'cd "${WORKSPACE}/${SRW_PLATFORM}/${INSTALL_NAME}" && tar --create --gzip --verbose --file "${WORKSPACE}/${SRW_PLATFORM}/${BUILD_NAME}.tgz" *'
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: true, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/${env.BUILD_NAME}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: true, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/build_${env.SRW_COMPILER}/srw_build-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
}
always {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-env.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-time-srw_build.json", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-disk-usage${env.STAGE_NAME}.csv", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
}
}
}

Expand All @@ -163,9 +180,12 @@ pipeline {

// Try a few Workflow Task scripts to make sure E2E tests can be launched in a follow-on 'Test' stage
stage('Functional WorkflowTaskTests') {
environment {
TASK_DEPTH = "${env.SRW_WRAPPER_TASK_DEPTH}"
}
steps {
dir ("${env.SRW_PLATFORM}") {
echo "Running simple workflow script task tests on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
echo "Running ${TASK_DEPTH} simple workflow script task tests on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
sh 'bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/wrapper_srw_ftest.sh"'
}
}
Expand All @@ -179,11 +199,12 @@ pipeline {

steps {
dir ("${env.SRW_PLATFORM}") {
echo "Testing SRW (${env.SRW_COMPILER}) on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"
echo "${env.STAGE_NAME} SRW (${env.SRW_COMPILER}) on ${env.SRW_PLATFORM} (using ${env.WORKSPACE}/${env.SRW_PLATFORM})"

// If executing for a Pull Request, check for the run_we2e_comprehensive_tests. If set,
// override the value of the SRW_WE2E_COMPREHENSIVE_TESTS parameter
script {
def single_test = params.SRW_WE2E_SINGLE_TEST
def run_we2e_comprehensive_tests = params.SRW_WE2E_COMPREHENSIVE_TESTS
def run_we2e_comprehensive_tests_label = 'run_we2e_comprehensive_tests'

Expand All @@ -195,18 +216,37 @@ pipeline {
}
}

sh "SRW_WE2E_COMPREHENSIVE_TESTS=${run_we2e_comprehensive_tests}" + ' bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/srw_test.sh"'
}
sh "SRW_WE2E_COMPREHENSIVE_TESTS=${run_we2e_comprehensive_tests} SRW_WE2E_SINGLE_TEST=${single_test}" + ' bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/srw_test.sh"'

// Archive the test log files
sh "[[ -d ${SRW_WE2E_EXPERIMENT_BASE_DIR} ]] && cd ${SRW_WE2E_EXPERIMENT_BASE_DIR} && tar --create --gzip --verbose --dereference --file ${WORKSPACE}/${SRW_PLATFORM}/we2e_test_logs-${SRW_PLATFORM}-${SRW_COMPILER}.tgz */log.generate_FV3LAM_wflow */log/* ${WORKSPACE}/${SRW_PLATFORM}/tests/WE2E/WE2E_tests_*yaml WE2E_summary*txt ${WORKSPACE}/${SRW_PLATFORM}/tests/WE2E/log.* || cat /dev/null > ${WORKSPACE}/${SRW_PLATFORM}/we2e_test_logs-${SRW_PLATFORM}-${SRW_COMPILER}.tgz"
}
sh "STAGE_NAME=${env.STAGE_NAME} " + 'bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/disk_usage.sh"'
}
}

post {
success {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*_test_results-*-*.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/we2e_test_logs-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
}
always {
// Archive the test log files
sh 'cd "${SRW_WE2E_EXPERIMENT_BASE_DIR}" && tar --create --gzip --verbose --dereference --file "${WORKSPACE}/${SRW_PLATFORM}/we2e_test_logs-${SRW_PLATFORM}-${SRW_COMPILER}.tgz" */log.generate_FV3LAM_wflow */log/* ${WORKSPACE}/${SRW_PLATFORM}/tests/WE2E/WE2E_tests_*yaml WE2E_summary*txt ${WORKSPACE}/${SRW_PLATFORM}/tests/WE2E/log.*'
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-time-srw_test.json", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-disk-usage${env.STAGE_NAME}.csv", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
// Remove the data sets from the experiments directory to conserve disk space
sh 'find "${SRW_WE2E_EXPERIMENT_BASE_DIR}" -regextype posix-extended -regex "^.*(orog|[0-9]{10})$" -type d | xargs rm -rf'
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*_test_results-*-*.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/we2e_test_logs-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
}
}
}

stage('Metrics') {
steps {
script {
CI_BRANCH_NAME=env.JOB_BASE_NAME.replace("%2F","%252F")
echo "Triggering job for branch ${CI_BRANCH_NAME}/${env.BUILD_NUMBER} ..."
build job: '/ufs-srweather-app/ufs-srw-metrics', parameters: [
string(name: 'CI_JOB_NAME', value: "ufs-srweather-app/metrics"),
string(name: 'CI_BUILD_NUMBER', value: "${CI_BRANCH_NAME}/${env.BUILD_NUMBER}")
], wait: false
}
}
}
Expand Down
48 changes: 48 additions & 0 deletions .cicd/scripts/disk_usage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env bash

# Output a CSV report of disk usage on subdirs of some path
# Usage:
# [JOB_NAME=<ci_job>] [BUILD_NUMBER=<n>] [SRW_COMPILER=<intel>] [SRW_PLATFORM=<machine>] disk_usage path depth size outfile.csv
#
# args:
# directory=$1
# depth=$2
# size=$3
# outfile=$4

[[ -n ${WORKSPACE} ]] || WORKSPACE=$(pwd)
[[ -n ${SRW_PLATFORM} ]] || SRW_PLATFORM=$(hostname -s 2>/dev/null) || SRW_PLATFORM=$(hostname 2>/dev/null)
[[ -n ${SRW_COMPILER} ]] || SRW_COMPILER=compiler

script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"

# Get repository root from Jenkins WORKSPACE variable if set, otherwise, set
# relative to script directory.
declare workspace
if [[ -n "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
workspace="${WORKSPACE}/${SRW_PLATFORM}"
else
workspace="$(cd -- "${script_dir}/../.." && pwd)"
fi

echo "STAGE_NAME=${STAGE_NAME}" # from pipeline
outfile="${4:-${workspace}-${SRW_COMPILER}-disk-usage${STAGE_NAME}.csv}"

function disk_usage() {
local directory=${1:-${PWD}}
local depth=${2:-1}
local size=${3:-k}
echo "Disk usage: ${JOB_NAME:-ci}/${SRW_PLATFORM}/$(basename $directory)"
(
cd $directory || exit 1
echo "Platform,Build,Owner,Group,Inodes,${size:-k}bytes,Access Time,Filename"
du -Px -d ${depth:-1} --inode --exclude='./workspace' | \
while read line ; do
arr=($line); inode=${arr[0]}; filename=${arr[1]};
echo "${SRW_PLATFORM}-${SRW_COMPILER:-compiler},${JOB_NAME:-ci}/${BUILD_NUMBER:-0},$(stat -c '%U,%G' $filename),${inode:-0},$(du -Px -s -${size:-k} --time $filename)" | tr '\t' ',' ;
done | sort -t, -k5 -n #-r
)
echo ""
}

disk_usage $1 $2 $3 | tee ${outfile}
3 changes: 2 additions & 1 deletion .cicd/scripts/srw_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ fi
# Build and install
cd ${workspace}/tests
set +e
./build.sh ${platform} ${SRW_COMPILER}
/usr/bin/time -p -f '{\n "cpu": "%P"\n, "memMax": "%M"\n, "mem": {"text": "%X", "data": "%D", "swaps": "%W", "context": "%c", "waits": "%w"}\n, "pagefaults": {"major": "%F", "minor": "%R"}\n, "filesystem": {"inputs": "%I", "outputs": "%O"}\n, "time": {"real": "%e", "user": "%U", "sys": "%S"}\n}' -o ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-time-srw_build.json \
./build.sh ${platform} ${SRW_COMPILER}
build_exit=$?
set -e
cd -
Expand Down
38 changes: 38 additions & 0 deletions .cicd/scripts/srw_init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
#
# A unified init script for the SRW application. This script is expected to
# fetch initial source for the SRW application for all supported platforms.
#
set -e -u -x

script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"

# Get repository root from Jenkins WORKSPACE variable if set, otherwise, set
# relative to script directory.
declare workspace
if [[ -n "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
workspace="${WORKSPACE}/${SRW_PLATFORM}"
else
workspace="$(cd -- "${script_dir}/../.." && pwd)"
fi

# Normalize Parallel Works cluster platform value.
declare platform
if [[ "${SRW_PLATFORM}" =~ ^(az|g|p)clusternoaa ]]; then
platform='noaacloud'
else
platform="${SRW_PLATFORM}"
fi

# Build and install
cd ${workspace}
set +e
/usr/bin/time -p -f '{\n "cpu": "%P"\n, "memMax": "%M"\n, "mem": {"text": "%X", "data": "%D", "swaps": "%W", "context": "%c", "waits": "%w"}\n, "pagefaults": {"major": "%F", "minor": "%R"}\n, "filesystem": {"inputs": "%I", "outputs": "%O"}\n, "time": {"real": "%e", "user": "%U", "sys": "%S"}\n}' -o ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-time-srw_init.json \
./manage_externals/checkout_externals
init_exit=$?
echo "STAGE_NAME=${STAGE_NAME}"
env | grep = | sort > ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-env.txt
set -e
cd -

exit $init_exit
Loading

0 comments on commit a609196

Please sign in to comment.