# slurm-gcp-v5-high-io.yaml
# Source: fork of GoogleCloudPlatform/cluster-toolkit
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
# Blueprint for a Slurm (slurm-gcp v5) cluster with three shared file systems:
# /home and /projects from the filestore module and /scratch from the
# DDN-EXAScaler module. A single deployment group holds the network, the file
# systems, two partitions (lowcost and compute) with two node groups each, the
# controller, the login module and a monitoring dashboard.
blueprint_name: hpc-cluster-high-io-v5

vars:
  # Required: intentionally left empty (null) until a project ID is filled in.
  project_id:  ## Set GCP Project ID Here ##
  deployment_name: high-io-slurm-gcp-v5
  region: us-central1
  zone: us-central1-c
  # By default, public IPs are set in the login and controller to allow easier
  # SSH access. To turn this behavior off, set this to true.
  disable_public_ips: false
  # Set to true for active cluster reconfiguration.
  # Note that setting this option requires additional dependencies to be
  # installed locally.
  # NOTE(review): not referenced through $(vars...) anywhere in this file;
  # presumably consumed by modules declaring a same-named input - confirm.
  enable_reconfigure: true
  # When set, active compute nodes will be cleaned up on destroy.
  # Note that setting this option requires additional dependencies to be
  # installed locally.
  # NOTE(review): same as above - no explicit reference in this file; confirm
  # which modules read it.
  enable_cleanup_compute: true

# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md

deployment_groups:
  - group: primary
    modules:
      # Source is an embedded module, denoted by "modules/*" without ./, ../,
      # or / as a prefix. To refer to a local or community module, prefix with
      # ./, ../ or /
      # Example - ./modules/network/pre-existing-vpc
      - id: network1
        source: modules/network/vpc

      # Shared file system mounted at /home.
      - id: homefs
        source: modules/file-system/filestore
        use: [network1]
        settings:
          local_mount: /home

      # Shared file system mounted at /projects (HIGH_SCALE_SSD tier,
      # size_gb 10240).
      - id: projectsfs
        source: modules/file-system/filestore
        use: [network1]
        settings:
          filestore_tier: HIGH_SCALE_SSD
          size_gb: 10240
          local_mount: /projects

      # Shared file system mounted at /scratch.
      - id: scratchfs
        source: community/modules/file-system/DDN-EXAScaler
        use: [network1]
        settings:
          local_mount: /scratch

      # The lowcost partition is designed to run at a lower cost and without
      # additional quota.
      # Use:
      # `srun -N 4 <<Command>>` for any node in the partition.
      # `srun -N 4 --mincpus 2 <<Command>>` for node group n2s4.
      - id: low_cost_node_group_n2s2
        source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
        settings:
          name: n2s2
          machine_type: n2-standard-2
          node_count_dynamic_max: 10

      - id: low_cost_node_group_n2s4
        source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
        settings:
          name: n2s4
          machine_type: n2-standard-4
          node_count_dynamic_max: 10

      # Default partition; combines both low-cost node groups and mounts all
      # three file systems.
      - id: low_cost_partition
        source: community/modules/compute/schedmd-slurm-gcp-v5-partition
        use:
          - network1
          - homefs
          - scratchfs
          - projectsfs
          - low_cost_node_group_n2s2
          - low_cost_node_group_n2s4
        settings:
          is_default: true
          partition_name: lowcost
          enable_placement: false
          exclusive: false

      # The compute partition is designed for performance.
      # Use:
      # `srun -N 4 -p compute <<Command>>` for any node in the partition.
      # `srun -N 4 -p compute --mincpus 30 <<Command>>` for node group c2s60.
      # NOTE(review): machine_type is not set for c2s60, so the node-group
      # module's default applies - presumably c2-standard-60 given the group
      # name; confirm against the module's documentation.
      - id: compute_node_group_c2s60
        source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
        settings:
          name: c2s60
          node_count_dynamic_max: 200

      - id: compute_node_group_c2s30
        source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
        settings:
          name: c2s30
          node_count_dynamic_max: 200
          machine_type: c2-standard-30

      - id: compute_partition
        source: community/modules/compute/schedmd-slurm-gcp-v5-partition
        use:
          - network1
          - homefs
          - scratchfs
          - projectsfs
          - compute_node_group_c2s60
          - compute_node_group_c2s30
        settings:
          partition_name: compute

      # Controller uses the network, all three file systems and both
      # partitions.
      - id: slurm_controller
        source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
        use:
          - network1
          - homefs
          - scratchfs
          - projectsfs
          - low_cost_partition
          - compute_partition
        settings:
          machine_type: c2-standard-8
          disable_controller_public_ips: $(vars.disable_public_ips)

      # Login module uses only the network and the controller.
      - id: slurm_login
        source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
        use:
          - network1
          - slurm_controller
        settings:
          machine_type: n2-standard-4
          disable_login_public_ips: $(vars.disable_public_ips)

      # Exposes the dashboard module's `instructions` output.
      - id: hpc_dashboard
        source: modules/monitoring/dashboard
        outputs: [instructions]