Skip to content

Commit

Permalink
C6 updates to run GSI reg tests
Browse files Browse the repository at this point in the history
  • Loading branch information
DavidBurrows-NCO committed Nov 5, 2024
1 parent 351781b commit 162bcf7
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 18 deletions.
5 changes: 3 additions & 2 deletions modulefiles/gsi_gaeac6.intel.lua
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
help([[
]])

prepend_path("MODULEPATH", "/ncrc/proj/epic/spack-stack/spack-stack-1.6.0/envs/gsi-addon-dev/install/modulefiles/Core")
--prepend_path("MODULEPATH", "/ncrc/proj/epic/spack-stack/spack-stack-1.6.0/envs/gsi-addon-dev/install/modulefiles/Core")
prepend_path("MODULEPATH", "/ncrc/proj/epic/spack-stack/c6/spack-stack-1.6.0/envs/gsi-addon/install/modulefiles/Core")

local stack_python_ver=os.getenv("stack_python_ver") or "3.11.6"
local stack_intel_ver=os.getenv("stack_intel_ver") or "2023.2.0"
local stack_cray_mpich_ver=os.getenv("stack_cray_mpich_ver") or "8.1.28"
local stack_cray_mpich_ver=os.getenv("stack_cray_mpich_ver") or "8.1.29"
local cmake_ver=os.getenv("cmake_ver") or "3.23.1"
local prod_util_ver=os.getenv("prod_util_ver") or "2.1.1"

Expand Down
52 changes: 42 additions & 10 deletions regression/regression_param.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,15 @@ case $machine in
memnode=96
numcore=40
;;
Gaea)
sub_cmd="sub_gaea"
gaeac5)
sub_cmd="sub_gaeac5"
memnode=251
numcore=128
;;
gaeac6)
sub_cmd="sub_gaeac6"
memnode=384
numcore=192
;;
wcoss2)
sub_cmd="sub_wcoss2"
Expand Down Expand Up @@ -68,7 +73,10 @@ case $regtest in
elif [[ "$machine" = "Discover" ]]; then
topts[1]="0:30:00" ; popts[1]="48/2" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="60/3" ; ropts[2]="/2"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:10:00" ; popts[1]="12/8/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/10/" ; ropts[2]="/2"
elif [[ "$machine" = "wcoss2" ]]; then
Expand Down Expand Up @@ -98,9 +106,12 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:60:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:60:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
Expand Down Expand Up @@ -128,7 +139,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" ]]; then
Expand Down Expand Up @@ -157,7 +171,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="5/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="10/4/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:15:00" ; popts[1]="64/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="128/2/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" ]]; then
Expand Down Expand Up @@ -187,7 +204,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:15:00" ; popts[1]="4/4/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="6/6/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:15:00" ; popts[1]="28/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="28/2/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:15:00" ; popts[1]="28/1/" ; ropts[1]="/1"
topts[2]="0:15:00" ; popts[2]="28/2/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" ]]; then
Expand Down Expand Up @@ -217,7 +237,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:30:00" ; popts[1]="6/12/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="8/12/" ; ropts[2]="/1"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:30:00" ; popts[1]="14/8/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="14/14/" ; ropts[2]="/1"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:30:00" ; popts[1]="14/8/" ; ropts[1]="/1"
topts[2]="0:30:00" ; popts[2]="14/14/" ; ropts[2]="/1"
elif [[ "$machine" = "wcoss2" ]]; then
Expand Down Expand Up @@ -247,7 +270,10 @@ case $regtest in
elif [[ "$machine" = "Jet" ]]; then
topts[1]="0:10:00" ; popts[1]="12/3/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="12/5/" ; ropts[2]="/2"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
topts[1]="0:10:00" ; popts[1]="16/2/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="16/4/" ; ropts[2]="/2"
elif [[ "$machine" = "gaeac6" ]]; then
topts[1]="0:10:00" ; popts[1]="16/2/" ; ropts[1]="/1"
topts[2]="0:10:00" ; popts[2]="16/4/" ; ropts[2]="/2"
elif [[ "$machine" = "wcoss2" ]]; then
Expand Down Expand Up @@ -310,7 +336,13 @@ elif [[ "$machine" = "Jet" ]]; then
export MPI_BUFS_PER_HOST=256
export MPI_GROUP_MAX=256
export APRUN="srun -n \$ntasks --cpus-per-task=\$threads"
elif [[ "$machine" = "Gaea" ]]; then
elif [[ "$machine" = "gaeac5" ]]; then
export OMP_STACKSIZE=1024M
export MPI_BUFS_PER_PROC=256
export MPI_BUFS_PER_HOST=256
export MPI_GROUP_MAX=256
export APRUN="srun --export=ALL -n \$ntasks"
elif [[ "$machine" = "gaeac6" ]]; then
export OMP_STACKSIZE=1024M
export MPI_BUFS_PER_PROC=256
export MPI_BUFS_PER_HOST=256
Expand Down
18 changes: 15 additions & 3 deletions regression/regression_var.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ elif [[ -d /mnt/lfs4 || -d /jetmon || -d /mnt/lfs5 ]]; then # Jet
export machine="Jet"
elif [[ -d /discover ]]; then # NCCS Discover
export machine="Discover"
elif [[ -d /ncrc ]]; then # Gaea
export machine="Gaea"
elif [[ -d /gpfs/f5 ]]; then # GaeaC5
export machine="gaeac5"
elif [[ -d /gpfs/f6 ]]; then # GaeaC6
export machine="gaeac6"
elif [[ -d /data/prod ]]; then # S4
export machine="S4"
elif [[ -d /work ]]; then # Orion or Hercules
Expand All @@ -53,7 +55,7 @@ fi
echo "Running Regression Tests on '$machine'";

case $machine in
Gaea)
gaeac5)
export queue="normal"
export group="ufs-ard"
export noscrub="/gpfs/f5/${group}/scratch/${USER}/$LOGNAME/gsi_tmp/noscrub"
Expand All @@ -63,6 +65,16 @@ case $machine in
export check_resource="no"
export accnt="ufs-ard"
;;
gaeac6)
export queue="normal"
export group="bil-fire8"
export noscrub="/gpfs/f6/${group}/scratch/${USER}/${LOGNAME}/gsi_tmp/noscrub"
export ptmp="/gpfs/f6/${group}/scratch/${USER}/${LOGNAME}/gsi_tmp/ptmp"
export casesdir="/gpfs/f6/bil-fire8/world-shared/GSI_data/CASES/regtest"

export check_resource="no"
export accnt="bil-fire8"
;;
wcoss2)
export local_or_default="${local_or_default:-/lfs/h2/emc/da/noscrub/$LOGNAME}"
if [ -d $local_or_default ]; then
Expand Down
6 changes: 3 additions & 3 deletions ush/sub_gaea → ush/sub_gaeac5
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ output=${output:-$jobname.out}
myuser=$LOGNAME
myhost=$(hostname)

if [ -d /gpfs/f5/epic/scratch/${USER}/$LOGNAME ]; then
DATA=/gpfs/f5/epic/scratch/${USER}/$LOGNAME/tmp
if [ -d /gpfs/f5/ufs-ard/scratch/${USER}/$LOGNAME ]; then
DATA=/gpfs/f5/ufs-ard/scratch/${USER}/$LOGNAME/tmp
fi
DATA=${DATA:-$ptmp/tmp}

Expand Down Expand Up @@ -129,7 +129,7 @@ echo "" >>$cfile

echo "module reset" >> $cfile
echo "module use $modulefiles" >> $cfile
echo "module load gsi_gaea.intel" >> $cfile
echo "module load gsi_gaeac5.intel" >> $cfile
echo "module list" >> $cfile
echo "" >>$cfile

Expand Down
170 changes: 170 additions & 0 deletions ush/sub_gaeac6
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#!/bin/sh --login
# sub_gaeac6: wrap an executable script in a Slurm batch file (cluster c6) and
# submit it with sbatch, or print the generated batch file when -n is given.
#
# NOTE(review): the script relies on [[ ... ]] tests, so /bin/sh presumably
# resolves to bash on the target system -- confirm.
# NOTE(review): several options described in the help text below (-b, -d, -e,
# -g, -m, -u, -w) are parsed into variables, but those variables are not read
# again anywhere else in this script.
# NOTE(review): the -t value is written verbatim into "#SBATCH --time"; Slurm
# reads a bare integer as minutes, so "default: 900" may not mean 900 seconds
# as the [[hh:]mm:]ss wording suggests -- confirm the intended default.

# Trace every command to stderr.
set -x
# Help text printed to stderr when an option is invalid or the executable
# argument is missing.
usage="\
Usage: $0 [options] executable [args]
where the options are:
-a account account (default: none)
-b binding run smt binding or not (default:NO)
-d dirin initial directory (default: cwd)
-e envars copy comma-separated environment variables
-g group group name
-i append standard input to command file
-j jobname specify jobname (default: executable basename)
-m machine machine on which to run (default: current)
-n write command file to stdout rather than submitting it
-o output specify output file (default: jobname.out)
-p procs[/nodes[/ppreq]
number of MPI tasks and optional nodes or Bblocking and
ppreq option (N or S) (defaults: serial, Bunlimited, S)
-q queue[/qpreq] queue name and optional requirement, e.g. dev/P
(defaults: 1 if serial or dev if parallel and none)
(queue 3 or 4 is dev or prod with twice tasks over ip)
(options: P=parallel, B=bigmem, b=batch)
-r rmem[/rcpu] resources memory and cpus/task (default: '1024 mb', 1)
-t timew wall time limit in [[hh:]mm:]ss format (default: 900)
-u userid userid to run under (default: self)
-v verbose mode
-w when when to run, in yyyymmddhh[mm], +hh[mm], thh[mm], or
Thh[mm] (full, incremental, today or tomorrow) format
(default: now)
Function: This command submits a job to the batch queue."
# Keep the complete original argument list; it is written to the submission
# log after the job has been submitted.
subcmd="$*"

# Yes/no switches toggled by -i, -n and -v.
stdin=NO nosub=NO verbose=NO

# Options that start out empty and are only filled in from the command line.
account="" dirin="" envars="" group="" jobname="" machine="" output=""
nodes="" ppreq="" queue="" qpreq="" userid="" when=""

# Options that carry a non-empty default.
binding="NO"
procs=0
rmem="1024"
rcpu="1"
# NOTE(review): this value reaches "#SBATCH --time" unchanged; Slurm treats a
# bare integer as minutes -- confirm 900 is the intended default.
timew="900"
# Print field number $2 of the slash-separated option value $1; prints an
# empty line when that field is absent. A trailing "/" is appended first so
# that a value without any slash still yields itself as field 1 and empty
# strings for the later fields. The value is passed quoted so it is never
# word-split or glob-expanded on its way to cut.
_optfield() {
  printf '%s/\n' "$1" | cut -d/ -f"$2"
}

# Parse the command-line options into the variables initialised above.
# Compound values (-p, -q, -r) are split on "/" into their components.
# An unknown option prints the usage text to stderr and exits with status 1.
while getopts a:b:d:e:g:ij:m:no:p:q:r:t:u:vw: opt; do
  case $opt in
    a) account="$OPTARG";;
    b) binding="$OPTARG";;
    d) dirin="$OPTARG";;
    e) envars="$OPTARG";;
    g) group="$OPTARG";;
    i) stdin=YES;;
    j) jobname=$OPTARG;;
    m) machine="$OPTARG";;
    n) nosub=YES;;
    o) output=$OPTARG;;
    p) procs=$(_optfield "$OPTARG" 1)
       nodes=$(_optfield "$OPTARG" 2)
       ppreq=$(_optfield "$OPTARG" 3);;
    q) queue=$(_optfield "$OPTARG" 1)
       qpreq=$(_optfield "$OPTARG" 2);;
    r) rmem=$(_optfield "$OPTARG" 1)
       rcpu=$(_optfield "$OPTARG" 2);;
    t) timew=$OPTARG;;
    u) userid=$OPTARG;;
    v) verbose=YES;;
    w) when=$OPTARG;;
    \?) echo "$0: invalid option" >&2; echo "$usage" >&2; exit 1;;
  esac
done
# Discard the options consumed by getopts; what remains is the executable
# followed by its arguments.
shift $((OPTIND - 1))
if [[ $# -lt 1 ]]; then
  echo $0: missing executable name >&2
  echo "$usage" >&2
  exit 1
fi

# First remaining word is the executable, the rest are its arguments.
exec=$1
shift
args="$*"

# When the name is not a non-empty file at the given path, fall back to
# whatever "which" finds for it, if anything.
if [[ ! -s $exec ]]; then
  if which $exec >/dev/null 2>&1; then
    exec=$(which $exec)
  fi
fi

# The job name defaults to the executable's basename and the output file to
# "<jobname>.out".
exec_base=$(basename $exec)
export jobname=${jobname:-$exec_base}
output=${output:-$jobname.out}
myuser=$LOGNAME
myhost=$(hostname)

# Working directory for the generated batch file and the sbatch output:
# use the user's bil-fire8 scratch area when that directory exists, otherwise
# keep a DATA value that is already set, otherwise fall back to $ptmp/tmp.
if [ -d "/gpfs/f6/bil-fire8/scratch/${USER}/${LOGNAME}" ]; then
  DATA="/gpfs/f6/bil-fire8/scratch/${USER}/${LOGNAME}/tmp"
fi
DATA=${DATA:-$ptmp/tmp}

mkdir -p "$DATA"

# Fill in whatever the command line left empty.
queue=${queue:-batch}
timew=${timew:-01:20:00}
threads=${rcpu:-1}

# -p may be omitted entirely (procs=0, nodes empty) or given without a node
# count. Either case would later produce an empty "--nodes=", a zero
# "--ntasks-per-node" and an arithmetic syntax error when the task count is
# computed, so treat a missing value as a single-node / single-task job.
nodes=${nodes:-1}
case $procs in
  ''|0) procs=1;;
esac

# Export TZ=GMT for this script and the commands it runs.
export TZ=GMT

# Build the batch script in a per-process file under $DATA, starting empty.
cfile=$DATA/sub$$
> $cfile

# Slurm header.
echo "#!/bin/bash -l" >> $cfile
echo "" >> $cfile
echo "#SBATCH --output=$output" >> $cfile
echo "#SBATCH --job-name=$jobname" >> $cfile
echo "#SBATCH --qos=$queue" >> $cfile
echo "#SBATCH --clusters=c6" >> $cfile
echo "#SBATCH --time=$timew" >> $cfile
echo "#SBATCH --nodes=$nodes --ntasks-per-node=$procs --cpus-per-task=$threads" >> $cfile
# Honour the documented -a option; when it was not given, fall back to the
# $accnt value inherited from the environment.
echo "#SBATCH --account=${account:-$accnt}" >> $cfile
echo "#SBATCH --mem=0" >> $cfile

# Job geometry made available to the wrapped script. The task count is
# computed here, at generation time, and written as a literal number.
echo "" >>$cfile
echo "export ntasks=$(( $nodes * $procs ))" >> $cfile
echo "export ppn=$procs" >> $cfile
echo "export threads=$threads" >> $cfile
echo "export OMP_NUM_THREADS=$threads" >> $cfile
echo "ulimit -s unlimited" >> $cfile

# Emit a "." (source) command whose operands are the first nine fields of
# each line of $regdir/regression_var.out. The command substitution is left
# unquoted on purpose so that all of awk's output lands on one line.
echo "" >>$cfile
echo ". "$(awk '{ print $1, $2, $3, $4, $5, $6, $7, $8, $9 }' $regdir/regression_var.out) >>$cfile
echo "" >>$cfile

# Module environment for the job.
echo "module reset" >> $cfile
echo "module use $modulefiles" >> $cfile
echo "module load gsi_gaeac6.intel" >> $cfile
echo "module list" >> $cfile
echo "" >>$cfile

# Finally append the body of the executable script itself.
cat $exec >> $cfile

# -i: append the caller's standard input to the batch script. This has to
# happen before the script is printed, so that -n and -v show the file exactly
# as it would be submitted.
if [[ $stdin = YES ]]; then
  cat >>$cfile
fi

# -n: print the finished batch script and stop without submitting it.
# -v: print the batch script once, then carry on to submission.
if [[ $nosub = YES ]]; then
  cat $cfile
  exit
elif [[ $verbose = YES ]]; then
  set -x
  cat $cfile
fi
# Submission command; a value already present in $sbatch takes precedence.
sbatch=${sbatch:-sbatch}

# Run the submission with its stdout captured in a per-process file, remember
# its exit status, then echo what it printed.
ofile=$DATA/subout$$
: >$ofile
chmod 777 $ofile
$sbatch $cfile >$ofile
rc=$?
cat $ofile

# When $SUBLOG names a writable file, append a UTC-stamped record holding the
# original command line and the first "submitted" line of the sbatch output.
if [[ -w $SUBLOG ]]; then
  submit_line=$(grep -i submitted $ofile | head -n1 | cut -d'"' -f2)
  date -u +"%Y%m%d%H%M%S : $subcmd : $submit_line" >>$SUBLOG
fi

# Remove the temporary files and, if requested through MKDATA, the work
# directory; the script's exit status is that of the submission command.
rm $cfile $ofile
if [[ $MKDATA = YES ]]; then
  rmdir $DATA
fi
exit $rc

0 comments on commit 162bcf7

Please sign in to comment.