Skip to content

Commit

Permalink
setting structure to put an upper limit on the ray cluster, using ray.init,…
Browse files Browse the repository at this point in the history
… still missing gpu info
  • Loading branch information
alessiovignoli committed Jun 6, 2024
1 parent 8c5f03d commit 8b2d211
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 8 deletions.
15 changes: 13 additions & 2 deletions bin/launch_check_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ def get_args():
parser.add_argument("-m", "--model", type=str, required=True, metavar="FILE", help="Path to model file")
parser.add_argument("-e", "--experiment", type=str, required=True, metavar="FILE", help="Experiment config file. From this the experiment class name is extracted.")
parser.add_argument("-c", "--config", type=str, required=True, metavar="FILE", help="Path to yaml config training file")
parser.add_argument("--cpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_CPU", help="ray can have a limiter on the number of CPUs it can use. This might be useful in many occasions, especially on a cluster system. The default value is None, meaning ray will use all CPUs available. It can be set to 0 to use only GPUs.")
parser.add_argument("--gpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_GPU", help="ray can have a limiter on the number of GPUs it can use. This might be useful in many occasions, especially on a cluster system. The default value is None, meaning ray will use all GPUs available. It can be set to 0 to use only CPUs.")
parser.add_argument("-n", "--num_samples", type=int, required=False, nargs='?', const=3, default=3, metavar="TUNE_PARAM", help="the config given for the tuning will have the field tune.tune_params.num_samples overwritten by this value. This means a more or less extensive representation of all possible combinations of choices for the tuning. For each run inside tune a snapshot of the config is taken and some params are chosen, like loss function, gradient descent, batch size etc. Some of these combinations may not be compatible with either the data or the model. So the higher this value is, the more likely it is that every value for a given param is tested. But if there are not that many choices in the tune config there is no point in putting a high value. Default is 3.")
parser.add_argument("--ray_results_dirpath", type=str, required=False, nargs='?', const=None, default=None, metavar="DIR_PATH", help="the location where the ray_results output dir should be written. If set to None (default) ray will place it in ~/ray_results.")

Expand All @@ -28,7 +30,7 @@ def get_args():



def main(data_path: str, model_path: str, experiment_config: str, config_path: str, num_samples: int, ray_results_dirpath: str = None) -> None:
def main(data_path: str, model_path: str, experiment_config: str, config_path: str, cpus: int = None, gpus: int = None, num_samples: int = 3, ray_results_dirpath: str = None) -> None:

# TODO update the experiment config to yaml
# load json into dictionary
Expand Down Expand Up @@ -82,6 +84,8 @@ def main(data_path: str, model_path: str, experiment_config: str, config_path: s
model_class,
downsampled_csv,
initialized_experiment_class,
max_cpus=cpus,
max_gpus=gpus,
ray_results_dir=os.path.abspath(ray_results_dirpath)) # TODO this version of pytorch does not support relative paths, in future maybe good to remove abspath

# Tune the model and get the tuning results
Expand All @@ -100,4 +104,11 @@ def main(data_path: str, model_path: str, experiment_config: str, config_path: s

if __name__ == "__main__":
args = get_args()
main(args.data, args.model, args.experiment, args.config, args.num_samples, args.ray_results_dirpath)
main(args.data,
args.model,
args.experiment,
args.config,
args.cpus,
args.gpus,
args.num_samples,
args.ray_results_dirpath)
18 changes: 16 additions & 2 deletions bin/launch_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ def get_args():
parser.add_argument("-bc", "--best_config", type=str, required=False, nargs='?', const='best_config.json', default='best_config.json', metavar="FILE", help='The path to write the best config to')
parser.add_argument("-bm", "--best_metrics", type=str, required=False, nargs='?', const='best_metrics.csv', default='best_metrics.csv', metavar="FILE", help='The path to write the best metrics to')
parser.add_argument("-bo", "--best_optimizer", type=str, required=False, nargs='?', const='best_optimizer.pt', default='best_optimizer.pt', metavar="FILE", help='The path to write the best optimizer to')
parser.add_argument("--cpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_CPU", help="ray can have a limiter on the number of CPUs it can use. This might be useful in many occasions, especially on a cluster system. The default value is None, meaning ray will use all CPUs available. It can be set to 0 to use only GPUs.")
parser.add_argument("--gpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_GPU", help="ray can have a limiter on the number of GPUs it can use. This might be useful in many occasions, especially on a cluster system. The default value is None, meaning ray will use all GPUs available. It can be set to 0 to use only CPUs.")
parser.add_argument("--ray_results_dirpath", type=str, required=False, nargs='?', const=None, default=None, metavar="DIR_PATH", help="the location where the ray_results output dir should be written. If set to None (default) ray will place it in ~/ray_results.")

args = parser.parse_args()
return args

def main(config_path: str, model_path: str, data_path: str, experiment_config: str, output: str, best_config_path: str, best_metrics_path: str, best_optimizer_path: str, ray_results_dirpath: str = None) -> None:
def main(config_path: str, model_path: str, data_path: str, experiment_config: str, output: str, best_config_path: str, best_metrics_path: str, best_optimizer_path: str, cpus: int = None, gpus: int = None, ray_results_dirpath: str = None) -> None:
"""
This launcher use ray tune to find the best hyperparameters for a given model.
"""
Expand All @@ -49,6 +51,8 @@ def main(config_path: str, model_path: str, data_path: str, experiment_config: s
model_class,
data_path,
initialized_experiment_class,
max_cpus=cpus,
max_gpus=gpus,
ray_results_dir=os.path.abspath(ray_results_dirpath)) # TODO this version of pytorch does not support relative paths, in future maybe good to remove abspath

# Tune the model and get the tuning results
Expand All @@ -64,4 +68,14 @@ def main(config_path: str, model_path: str, data_path: str, experiment_config: s

if __name__ == "__main__":
args = get_args()
main(args.config, args.model, args.data, args.experiment_config, args.output, args.best_config, args.best_metrics, args.best_optimizer, args.ray_results_dirpath)
main(args.config,
args.model,
args.data,
args.experiment_config,
args.output,
args.best_config,
args.best_metrics,
args.best_optimizer,
args.cpus,
args.gpus,
args.ray_results_dirpath)
10 changes: 8 additions & 2 deletions bin/src/learner/raytune_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import torch
import torch.nn as nn
import torch.optim as optim
from ray import train, tune
from ray import train, tune, init
from ray.tune import Trainable
from torch.utils.data import DataLoader

Expand All @@ -12,7 +12,7 @@
from .predict import PredictWrapper

class TuneWrapper():
def __init__(self, config_path: str, model_class: nn.Module, data_path: str, experiment_object: object, ray_results_dir: str = None) -> None:
def __init__(self, config_path: str, model_class: nn.Module, data_path: str, experiment_object: object, max_cpus: int = None, max_gpus: int = None, ray_results_dir: str = None) -> None:
"""
Initialize the TuneWrapper with the paths to the config, model, and data.
"""
Expand All @@ -34,12 +34,18 @@ def __init__(self, config_path: str, model_class: nn.Module, data_path: str, exp
storage_path=ray_results_dir
) #TODO implement run_config (in tune/run_params for the yaml file)

self.max_cpus = max_cpus
self.max_gpus = max_gpus
self.tuner = self.tuner_initialization()

def tuner_initialization(self) -> tune.Tuner:
"""
Prepare the tuner with the configs.
"""

# initialize the ray cluster with the CPU and/or GPU limits if given, otherwise use everything available. None is the value ray interprets as "use all available resources" for either CPUs or GPUs.
init(num_cpus=self.max_cpus, num_gpus=self.max_gpus)

return tune.Tuner(TuneModel,
tune_config= self.tune_config,
param_space=self.config,
Expand Down
2 changes: 1 addition & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ process {
time = { check_max( 2.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 4 , 'cpus' ) }
cpus = { check_max( 6 , 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 2.h * task.attempt, 'time' ) }
}
Expand Down
4 changes: 4 additions & 0 deletions modules/local/check_torch_model.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ process CHECK_TORCH_MODEL {
def suffix = task.ext.suffix
def args = task.ext.args ?: ''
"""
# TODO --gpus should point to a variable that controls the number of GPUs
launch_check_model.py \
-d ${original_csv} \
-m ${model} \
-e ${experiment_config} \
-c ${ray_tune_config} \
--cpus ${task.cpus} \
--gpus 0 \
$args
"""

Expand Down
4 changes: 4 additions & 0 deletions modules/local/torch_tune.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ process TORCH_TUNE {
def suffix = task.ext.suffix
def args = task.ext.args ?: ''
"""
# TODO --gpus should point to a variable that controls the number of GPUs
launch_tuning.py \
-c ${ray_tune_config} \
-m ${model} \
Expand All @@ -33,6 +35,8 @@ process TORCH_TUNE {
-bo ${prefix}-optimizer.pt \
-bm ${prefix}-metrics.csv \
-bc ${prefix}-config.json \
--cpus ${task.cpus} \
--gpus 0 \
$args
"""

Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ params {
max_retries = 0
err_start = 'finish'
max_cpus = 6 // this flasg and the following are for regulating resources, profiles can overwrite these.
max_memory = 8.GB
max_memory = 16.GB
max_time = 24.h

// General
Expand Down

0 comments on commit 8b2d211

Please sign in to comment.