From 162bcf7e898eed842af23ef48673fc14218c1753 Mon Sep 17 00:00:00 2001
From: DavidBurrows-NCO
Date: Tue, 5 Nov 2024 09:28:19 -0500
Subject: [PATCH] C6 updates to run GSI reg tests

---
Reviewer notes (text between the "---" line and the diffstat is not part of
the commit message):

  * Line structure restored. The copy under review had every newline
    collapsed, which no patch tool can parse. Hunk line counts were
    re-derived and agree with every @@ header.
  * Leading whitespace could not be recovered from the collapsed copy, so
    indentation (context lines included) is reconstructed. Apply with
    whitespace-insensitive matching: "git apply --ignore-whitespace" or
    "patch -l".
  * ush/sub_gaeac6 differs from the original submission in three places:
      - nodes defaults to 1 when -p is given without /nodes; previously the
        ntasks arithmetic was a syntax error and --nodes= was empty;
      - -a account is honored (falls back to $accnt); previously the option
        was parsed and then ignored;
      - the premature copy of the -n/-v dump block is removed, so the dump
        includes stdin appended by -i and -v prints the file once.
    The diffstat and the new-file hunk header reflect the edited file; the
    blob id on its index line predates these edits.
  * Not changed, for awareness: regression_var.sh tests /gpfs/f5 before
    /gpfs/f6, so a host that mounts both is reported as gaeac5 (TODO confirm
    the mounts on C6 login and compute nodes).

 modulefiles/gsi_gaeac6.intel.lua |   5 +-
 regression/regression_param.sh   |  52 ++++++++--
 regression/regression_var.sh     |  18 +++-
 ush/{sub_gaea => sub_gaeac5}     |   6 +-
 ush/sub_gaeac6                   | 171 +++++++++++++++++++++++++++++++
 5 files changed, 234 insertions(+), 18 deletions(-)
 rename ush/{sub_gaea => sub_gaeac5} (97%)
 create mode 100755 ush/sub_gaeac6

diff --git a/modulefiles/gsi_gaeac6.intel.lua b/modulefiles/gsi_gaeac6.intel.lua
index 68e8c1ff39..883bd02a72 100644
--- a/modulefiles/gsi_gaeac6.intel.lua
+++ b/modulefiles/gsi_gaeac6.intel.lua
@@ -1,11 +1,12 @@
 help([[
 ]])
 
-prepend_path("MODULEPATH", "/ncrc/proj/epic/spack-stack/spack-stack-1.6.0/envs/gsi-addon-dev/install/modulefiles/Core")
+--prepend_path("MODULEPATH", "/ncrc/proj/epic/spack-stack/spack-stack-1.6.0/envs/gsi-addon-dev/install/modulefiles/Core")
+prepend_path("MODULEPATH", "/ncrc/proj/epic/spack-stack/c6/spack-stack-1.6.0/envs/gsi-addon/install/modulefiles/Core")
 
 local stack_python_ver=os.getenv("stack_python_ver") or "3.11.6"
 local stack_intel_ver=os.getenv("stack_intel_ver") or "2023.2.0"
-local stack_cray_mpich_ver=os.getenv("stack_cray_mpich_ver") or "8.1.28"
+local stack_cray_mpich_ver=os.getenv("stack_cray_mpich_ver") or "8.1.29"
 local cmake_ver=os.getenv("cmake_ver") or "3.23.1"
 
 local prod_util_ver=os.getenv("prod_util_ver") or "2.1.1"
diff --git a/regression/regression_param.sh b/regression/regression_param.sh
index 209762569b..2fddb36589 100755
--- a/regression/regression_param.sh
+++ b/regression/regression_param.sh
@@ -24,10 +24,15 @@ case $machine in
       memnode=96
       numcore=40
    ;;
-   Gaea)
-      sub_cmd="sub_gaea"
+   gaeac5)
+      sub_cmd="sub_gaeac5"
       memnode=251
       numcore=128
+   ;;
+   gaeac6)
+      sub_cmd="sub_gaeac6"
+      memnode=384
+      numcore=192
    ;;
    wcoss2)
       sub_cmd="sub_wcoss2"
@@ -68,7 +73,10 @@ case $regtest in
         elif [[ "$machine" = "Discover" ]]; then
            topts[1]="0:30:00" ; popts[1]="48/2" ; ropts[1]="/1"
            topts[2]="0:30:00" ; popts[2]="60/3" ; ropts[2]="/2"
-        elif [[ "$machine" = "Gaea" ]]; then
+        elif [[ "$machine" = "gaeac5" ]]; then
+           topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
+           topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
+        elif [[ "$machine" = "gaeac6" ]]; then
            topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
            topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
         elif [[ "$machine" = "wcoss2" ]]; then
@@ -98,9 +106,12 @@ case $regtest in
         elif [[ "$machine" = "Jet" ]]; then
            topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
-        elif [[ "$machine" = "Gaea" ]]; then
+        elif [[ "$machine" = "gaeac5" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
+        elif [[ "$machine" = "gaeac6" ]]; then
+           topts[1]="0:60:00" ; popts[1]="64/1/" ; ropts[1]="/1"
+           topts[2]="0:60:00" ; popts[2]="128/2/" ; ropts[2]="/1"
         elif [[ "$machine" = "wcoss2" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
@@ -128,7 +139,10 @@ case $regtest in
         elif [[ "$machine" = "Jet" ]]; then
            topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
-        elif [[ "$machine" = "Gaea" ]]; then
+        elif [[ "$machine" = "gaeac5" ]]; then
+           topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
+           topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
+        elif [[ "$machine" = "gaeac6" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
         elif [[ "$machine" = "wcoss2" ]]; then
@@ -157,7 +171,10 @@ case $regtest in
         elif [[ "$machine" = "Jet" ]]; then
            topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
-        elif [[ "$machine" = "Gaea" ]]; then
+        elif [[ "$machine" = "gaeac5" ]]; then
+           topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
+           topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
+        elif [[ "$machine" = "gaeac6" ]]; then
            topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
         elif [[ "$machine" = "wcoss2" ]]; then
@@ -187,7 +204,10 @@ case $regtest in
         elif [[ "$machine" = "Jet" ]]; then
            topts[1]="0:15:00" ; popts[1]="4/4/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="6/6/" ; ropts[2]="/1"
-        elif [[ "$machine" = "Gaea" ]]; then
+        elif [[ "$machine" = "gaeac5" ]]; then
+           topts[1]="0:15:00" ; popts[1]="28/1/" ; ropts[1]="/1"
+           topts[2]="0:15:00" ; popts[2]="28/2/" ; ropts[2]="/1"
+        elif [[ "$machine" = "gaeac6" ]]; then
            topts[1]="0:15:00" ; popts[1]="28/1/" ; ropts[1]="/1"
            topts[2]="0:15:00" ; popts[2]="28/2/" ; ropts[2]="/1"
         elif [[ "$machine" = "wcoss2" ]]; then
@@ -217,7 +237,10 @@ case $regtest in
         elif [[ "$machine" = "Jet" ]]; then
            topts[1]="0:30:00" ; popts[1]="6/12/" ; ropts[1]="/1"
            topts[2]="0:30:00" ; popts[2]="8/12/" ; ropts[2]="/1"
-        elif [[ "$machine" = "Gaea" ]]; then
+        elif [[ "$machine" = "gaeac5" ]]; then
+           topts[1]="0:30:00" ; popts[1]="14/8/" ; ropts[1]="/1"
+           topts[2]="0:30:00" ; popts[2]="14/14/" ; ropts[2]="/1"
+        elif [[ "$machine" = "gaeac6" ]]; then
            topts[1]="0:30:00" ; popts[1]="14/8/" ; ropts[1]="/1"
            topts[2]="0:30:00" ; popts[2]="14/14/" ; ropts[2]="/1"
         elif [[ "$machine" = "wcoss2" ]]; then
@@ -247,7 +270,10 @@ case $regtest in
         elif [[ "$machine" = "Jet" ]]; then
            topts[1]="0:10:00" ; popts[1]="12/3/" ; ropts[1]="/1"
            topts[2]="0:10:00" ; popts[2]="12/5/" ; ropts[2]="/2"
-        elif [[ "$machine" = "Gaea" ]]; then
+        elif [[ "$machine" = "gaeac5" ]]; then
+           topts[1]="0:10:00" ; popts[1]="16/2/" ; ropts[1]="/1"
+           topts[2]="0:10:00" ; popts[2]="16/4/" ; ropts[2]="/2"
+        elif [[ "$machine" = "gaeac6" ]]; then
            topts[1]="0:10:00" ; popts[1]="16/2/" ; ropts[1]="/1"
            topts[2]="0:10:00" ; popts[2]="16/4/" ; ropts[2]="/2"
         elif [[ "$machine" = "wcoss2" ]]; then
@@ -310,7 +336,13 @@ elif [[ "$machine" = "Jet" ]]; then
    export MPI_BUFS_PER_HOST=256
    export MPI_GROUP_MAX=256
    export APRUN="srun -n \$ntasks --cpus-per-task=\$threads"
-elif [[ "$machine" = "Gaea" ]]; then
+elif [[ "$machine" = "gaeac5" ]]; then
+   export OMP_STACKSIZE=1024M
+   export MPI_BUFS_PER_PROC=256
+   export MPI_BUFS_PER_HOST=256
+   export MPI_GROUP_MAX=256
+   export APRUN="srun --export=ALL -n \$ntasks"
+elif [[ "$machine" = "gaeac6" ]]; then
    export OMP_STACKSIZE=1024M
    export MPI_BUFS_PER_PROC=256
    export MPI_BUFS_PER_HOST=256
diff --git a/regression/regression_var.sh b/regression/regression_var.sh
index 4a2bc85874..a58bcd56ff 100755
--- a/regression/regression_var.sh
+++ b/regression/regression_var.sh
@@ -36,8 +36,10 @@ elif [[ -d /mnt/lfs4 || -d /jetmon || -d /mnt/lfs5 ]]; then # Jet
   export machine="Jet"
 elif [[ -d /discover ]]; then # NCCS Discover
   export machine="Discover"
-elif [[ -d /ncrc ]]; then # Gaea
-  export machine="Gaea"
+elif [[ -d /gpfs/f5 ]]; then # GaeaC5
+  export machine="gaeac5"
+elif [[ -d /gpfs/f6 ]]; then # GaeaC6
+  export machine="gaeac6"
 elif [[ -d /data/prod ]]; then # S4
   export machine="S4"
 elif [[ -d /work ]]; then # Orion or Hercules
@@ -53,7 +55,7 @@ fi
 echo "Running Regression Tests on '$machine'";
 
 case $machine in
-  Gaea)
+  gaeac5)
     export queue="normal"
     export group="ufs-ard"
     export noscrub="/gpfs/f5/${group}/scratch/${USER}/$LOGNAME/gsi_tmp/noscrub"
@@ -63,6 +65,16 @@ case $machine in
     export check_resource="no"
     export accnt="ufs-ard"
   ;;
+  gaeac6)
+    export queue="normal"
+    export group="bil-fire8"
+    export noscrub="/gpfs/f6/${group}/scratch/${USER}/${LOGNAME}/gsi_tmp/noscrub"
+    export ptmp="/gpfs/f6/${group}/scratch/${USER}/${LOGNAME}/gsi_tmp/ptmp"
+    export casesdir="/gpfs/f6/bil-fire8/world-shared/GSI_data/CASES/regtest"
+
+    export check_resource="no"
+    export accnt="bil-fire8"
+  ;;
   wcoss2)
     export local_or_default="${local_or_default:-/lfs/h2/emc/da/noscrub/$LOGNAME}"
     if [ -d $local_or_default ]; then
diff --git a/ush/sub_gaea b/ush/sub_gaeac5
similarity index 97%
rename from ush/sub_gaea
rename to ush/sub_gaeac5
index 9c4e253c93..2ed9affe5e 100755
--- a/ush/sub_gaea
+++ b/ush/sub_gaeac5
@@ -88,8 +88,8 @@ output=${output:-$jobname.out}
 myuser=$LOGNAME
 myhost=$(hostname)
 
-if [ -d /gpfs/f5/epic/scratch/${USER}/$LOGNAME ]; then
-  DATA=/gpfs/f5/epic/scratch/${USER}/$LOGNAME/tmp
+if [ -d /gpfs/f5/ufs-ard/scratch/${USER}/$LOGNAME ]; then
+  DATA=/gpfs/f5/ufs-ard/scratch/${USER}/$LOGNAME/tmp
 fi
 DATA=${DATA:-$ptmp/tmp}
 
@@ -129,7 +129,7 @@ echo "" >>$cfile
 
 echo "module reset" >> $cfile
 echo "module use $modulefiles" >> $cfile
-echo "module load gsi_gaea.intel" >> $cfile
+echo "module load gsi_gaeac5.intel" >> $cfile
 echo "module list" >> $cfile
 echo "" >>$cfile
 
diff --git a/ush/sub_gaeac6 b/ush/sub_gaeac6
new file mode 100755
index 0000000000..3617c36f49
--- /dev/null
+++ b/ush/sub_gaeac6
@@ -0,0 +1,171 @@
+#!/bin/sh --login
+# Batch submission wrapper for Gaea C6: assembles a Slurm job script from the
+# options below plus the contents of the executable and submits it with sbatch.
+set -x
+usage="\
+Usage:  $0 [options] executable [args]
+      where the options are:
+      -a account      account (default: accnt from the environment)
+      -b binding      run smt binding or not (default:NO)
+      -d dirin        initial directory (default: cwd)
+      -e envars       copy comma-separated environment variables
+      -g group        group name
+      -i              append standard input to command file
+      -j jobname      specify jobname (default: executable basename)
+      -m machine      machine on which to run (default: current)
+      -n              write command file to stdout rather than submitting it
+      -o output       specify output file (default: jobname.out)
+      -p procs[/nodes[/ppreq]
+                      number of MPI tasks and optional nodes or Bblocking and
+                      ppreq option (N or S) (defaults: serial, Bunlimited, S)
+      -q queue[/qpreq] queue name and optional requirement, e.g. dev/P
+                      (defaults: 1 if serial or dev if parallel and none)
+                      (queue 3 or 4 is dev or prod with twice tasks over ip)
+                      (options: P=parallel, B=bigmem, b=batch)
+      -r rmem[/rcpu]  resources memory and cpus/task (default: '1024 mb', 1)
+      -t timew        wall time limit in [[hh:]mm:]ss format (default: 900)
+      -u userid       userid to run under (default: self)
+      -v              verbose mode
+      -w when         when to run, in yyyymmddhh[mm], +hh[mm], thh[mm], or
+                      Thh[mm] (full, incremental, today or tomorrow) format
+                      (default: now)
+Function:  This command submits a job to the batch queue."
+subcmd="$*"
+stdin=NO
+nosub=NO
+account=""
+binding="NO"
+dirin=""
+envars=""
+group=""
+jobname=""
+machine=""
+output=""
+procs=0
+nodes=""
+ppreq=""
+queue=""
+qpreq=""
+rmem="1024"
+rcpu="1"
+timew="900"
+userid=""
+verbose=NO
+when=""
+while getopts a:b:d:e:g:ij:m:no:p:q:r:t:u:vw: opt;do
+  case $opt in
+    a) account="$OPTARG";;
+    b) binding="$OPTARG";;
+    d) dirin="$OPTARG";;
+    e) envars="$OPTARG";;
+    g) group="$OPTARG";;
+    i) stdin=YES;;
+    j) jobname=$OPTARG;;
+    m) machine="$OPTARG";;
+    n) nosub=YES;;
+    o) output=$OPTARG;;
+    p) procs=$(echo $OPTARG/|cut -d/ -f1);nodes=$(echo $OPTARG/|cut -d/ -f2);ppreq=$(echo $OPTARG/|cut -d/ -f3);;
+    q) queue=$(echo $OPTARG/|cut -d/ -f1);qpreq=$(echo $OPTARG/|cut -d/ -f2);;
+    r) rmem=$(echo $OPTARG/|cut -d/ -f1);rcpu=$(echo $OPTARG/|cut -d/ -f2);;
+    t) timew=$OPTARG;;
+    u) userid=$OPTARG;;
+    v) verbose=YES;;
+    w) when=$OPTARG;;
+    \?) echo $0: invalid option >&2;echo "$usage" >&2;exit 1;;
+  esac
+done
+shift $(($OPTIND-1))
+if [[ $# -eq 0 ]];then
+  echo $0: missing executable name >&2;echo "$usage" >&2;exit 1
+fi
+exec=$1
+if [[ ! -s $exec ]]&&which $exec >/dev/null 2>&1;then
+  exec=$(which $exec)
+fi
+shift
+args="$*"
+bn=$(basename $exec)
+export jobname=${jobname:-$bn}
+output=${output:-$jobname.out}
+myuser=$LOGNAME
+myhost=$(hostname)
+
+if [ -d /gpfs/f6/bil-fire8/scratch/${USER}/$LOGNAME ]; then
+  DATA=/gpfs/f6/bil-fire8/scratch/${USER}/$LOGNAME/tmp
+fi
+DATA=${DATA:-$ptmp/tmp}
+
+mkdir -p $DATA
+
+queue=${queue:-batch}
+timew=${timew:-01:20:00}
+# -p documents the node count as optional; default it so the ntasks arithmetic
+# and the --nodes directive below stay well formed when only procs is given.
+nodes=${nodes:-1}
+task_node=${task_node:-$procs}
+size=$((nodes*task_node))
+envars=$envars
+threads=${rcpu:-1}
+
+export TZ=GMT
+cfile=$DATA/sub$$
+> $cfile
+# NOTE: Slurm reads a bare integer --time as minutes, so the built-in default
+# of 900 requests 15 hours; pass -t as h:mm:ss to get the intended limit.
+echo "#!/bin/bash -l" >> $cfile
+echo "" >> $cfile
+echo "#SBATCH --output=$output" >> $cfile
+echo "#SBATCH --job-name=$jobname" >> $cfile
+echo "#SBATCH --qos=$queue" >> $cfile
+echo "#SBATCH --clusters=c6" >> $cfile
+echo "#SBATCH --time=$timew" >> $cfile
+echo "#SBATCH --nodes=$nodes --ntasks-per-node=$procs --cpus-per-task=$threads" >> $cfile
+# Honor -a when given; otherwise fall back to accnt from the environment.
+echo "#SBATCH --account=${account:-$accnt}" >> $cfile
+echo "#SBATCH --mem=0" >> $cfile
+
+echo "" >>$cfile
+echo "export ntasks=$(( $nodes * $procs ))" >> $cfile
+echo "export ppn=$procs" >> $cfile
+echo "export threads=$threads" >> $cfile
+echo "export OMP_NUM_THREADS=$threads" >> $cfile
+echo "ulimit -s unlimited" >> $cfile
+
+echo "" >>$cfile
+echo ". "$(awk '{ print $1, $2, $3, $4, $5, $6, $7, $8, $9 }' $regdir/regression_var.out) >>$cfile
+echo "" >>$cfile
+
+echo "module reset" >> $cfile
+echo "module use $modulefiles" >> $cfile
+echo "module load gsi_gaeac6.intel" >> $cfile
+echo "module list" >> $cfile
+echo "" >>$cfile
+
+cat $exec >> $cfile
+
+# Append stdin (-i) first so the -n / -v dump below shows the final job script.
+if [[ $stdin = YES ]];then
+  cat
+fi >>$cfile
+if [[ $nosub = YES ]];then
+  cat $cfile
+  exit
+elif [[ $verbose = YES ]];then
+  set -x
+  cat $cfile
+fi
+sbatch=${sbatch:-sbatch}
+
+ofile=$DATA/subout$$
+>$ofile
+chmod 777 $ofile
+$sbatch $cfile >$ofile
+rc=$?
+cat $ofile
+if [[ -w $SUBLOG ]];then
+  jobn=$(grep -i submitted $ofile|head -n1|cut -d\" -f2)
+  date -u +"%Y%m%d%H%M%S : $subcmd : $jobn" >>$SUBLOG
+fi
+rm $cfile $ofile
+[[ $MKDATA = YES ]] && rmdir $DATA
+exit $rc