Skip to content

Commit

Permalink
setting structure to put an upper limit on the ray cluster, using ray.init,…
Browse files Browse the repository at this point in the history
… still missing gpu info
  • Loading branch information
alessiovignoli committed Jun 6, 2024
1 parent 8c5f03d commit 8b2d211
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 8 deletions.
15 changes: 13 additions & 2 deletions bin/launch_check_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ def get_args():
parser.add_argument("-m", "--model", type=str, required=True, metavar="FILE", help="Path to model file")
parser.add_argument("-e", "--experiment", type=str, required=True, metavar="FILE", help="Experiment config file. From this the experiment class name is extracted.")
parser.add_argument("-c", "--config", type=str, required=True, metavar="FILE", help="Path to yaml config training file")
parser.add_argument("--cpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_CPU", help="ray can have a limiter on the number of CPUs it can use. This might be useful in many occasions, especially on a cluster system. The default value is None, meaning ray will use all CPUs available. It can be set to 0 to use only GPUs.")
parser.add_argument("--gpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_GPU", help="ray can have a limiter on the number of GPUs it can use. This might be useful in many occasions, especially on a cluster system. The default value is None, meaning ray will use all GPUs available. It can be set to 0 to use only CPUs.")
parser.add_argument("-n", "--num_samples", type=int, required=False, nargs='?', const=3, default=3, metavar="TUNE_PARAM", help="the config given for the tuning will have the field tune.tune_params.num_samples overwritten by this value. This means a more or less extensive representation of all possible combinations of choices for the tuning. For each run inside tune a snapshot of the config is taken and some params are chosen, like loss function, gradient descent, batch size etc. Some of these combinations may not be compatible with either the data or the model. So the higher this value is, the more likely it is that every value for a given param is tested. But if there are not that many choices in the tune config there is no point in putting a high value. Default is 3.")
parser.add_argument("--ray_results_dirpath", type=str, required=False, nargs='?', const=None, default=None, metavar="DIR_PATH", help="the location where the ray_results output dir should be written. If set to None (default) ray will place it in ~/ray_results.")

Expand All @@ -28,7 +30,7 @@ def get_args():



def main(data_path: str, model_path: str, experiment_config: str, config_path: str, num_samples: int, ray_results_dirpath: str = None) -> None:
def main(data_path: str, model_path: str, experiment_config: str, config_path: str, cpus: int = None, gpus: int = None, num_samples: int = 3, ray_results_dirpath: str = None) -> None:

# TODO update the experiment config to yaml
# load json into dictionary
Expand Down Expand Up @@ -82,6 +84,8 @@ def main(data_path: str, model_path: str, experiment_config: str, config_path: s
model_class,
downsampled_csv,
initialized_experiment_class,
max_cpus=cpus,
max_gpus=gpus,
ray_results_dir=os.path.abspath(ray_results_dirpath)) # TODO this version of pytorch does not support relative paths, in future maybe good to remove abspath

# Tune the model and get the tuning results
Expand All @@ -100,4 +104,11 @@ def main(data_path: str, model_path: str, experiment_config: str, config_path: s

if __name__ == "__main__":
args = get_args()
main(args.data, args.model, args.experiment, args.config, args.num_samples, args.ray_results_dirpath)
main(args.data,
args.model,
args.experiment,
args.config,
args.cpus,
args.gpus,
args.num_samples,
args.ray_results_dirpath)
18 changes: 16 additions & 2 deletions bin/launch_tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ def get_args():
parser.add_argument("-bc", "--best_config", type=str, required=False, nargs='?', const='best_config.json', default='best_config.json', metavar="FILE", help='The path to write the best config to')
parser.add_argument("-bm", "--best_metrics", type=str, required=False, nargs='?', const='best_metrics.csv', default='best_metrics.csv', metavar="FILE", help='The path to write the best metrics to')
parser.add_argument("-bo", "--best_optimizer", type=str, required=False, nargs='?', const='best_optimizer.pt', default='best_optimizer.pt', metavar="FILE", help='The path to write the best optimizer to')
parser.add_argument("--cpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_CPU", help="ray can have a limiter on the number of CPUs it can use. This might be useful in many occasions, especially on a cluster system. The default value is None, meaning ray will use all CPUs available. It can be set to 0 to use only GPUs.")
parser.add_argument("--gpus", type=int, required=False, nargs='?', const=None, default=None, metavar="NUM_OF_MAX_GPU", help="ray can have a limiter on the number of GPUs it can use. This might be useful in many occasions, especially on a cluster system. The default value is None, meaning ray will use all GPUs available. It can be set to 0 to use only CPUs.")
parser.add_argument("--ray_results_dirpath", type=str, required=False, nargs='?', const=None, default=None, metavar="DIR_PATH", help="the location where the ray_results output dir should be written. If set to None (default) ray will place it in ~/ray_results.")

args = parser.parse_args()
return args

def main(config_path: str, model_path: str, data_path: str, experiment_config: str, output: str, best_config_path: str, best_metrics_path: str, best_optimizer_path: str, ray_results_dirpath: str = None) -> None:
def main(config_path: str, model_path: str, data_path: str, experiment_config: str, output: str, best_config_path: str, best_metrics_path: str, best_optimizer_path: str, cpus: int = None, gpus: int = None, ray_results_dirpath: str = None) -> None:
"""
This launcher use ray tune to find the best hyperparameters for a given model.
"""
Expand All @@ -49,6 +51,8 @@ def main(config_path: str, model_path: str, data_path: str, experiment_config: s
model_class,
data_path,
initialized_experiment_class,
max_cpus=cpus,
max_gpus=gpus,
ray_results_dir=os.path.abspath(ray_results_dirpath)) # TODO this version of pytorch does not support relative paths, in future maybe good to remove abspath

# Tune the model and get the tuning results
Expand All @@ -64,4 +68,14 @@ def main(config_path: str, model_path: str, data_path: str, experiment_config: s

if __name__ == "__main__":
args = get_args()
main(args.config, args.model, args.data, args.experiment_config, args.output, args.best_config, args.best_metrics, args.best_optimizer, args.ray_results_dirpath)
main(args.config,
args.model,
args.data,
args.experiment_config,
args.output,
args.best_config,
args.best_metrics,
args.best_optimizer,
args.cpus,
args.gpus,
args.ray_results_dirpath)
10 changes: 8 additions & 2 deletions bin/src/learner/raytune_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import torch
import torch.nn as nn
import torch.optim as optim
from ray import train, tune
from ray import train, tune, init
from ray.tune import Trainable
from torch.utils.data import DataLoader

Expand All @@ -12,7 +12,7 @@
from .predict import PredictWrapper

class TuneWrapper():
def __init__(self, config_path: str, model_class: nn.Module, data_path: str, experiment_object: object, ray_results_dir: str = None) -> None:
def __init__(self, config_path: str, model_class: nn.Module, data_path: str, experiment_object: object, max_cpus: int = None, max_gpus: int = None, ray_results_dir: str = None) -> None:
"""
Initialize the TuneWrapper with the paths to the config, model, and data.
"""
Expand All @@ -34,12 +34,18 @@ def __init__(self, config_path: str, model_class: nn.Module, data_path: str, exp
storage_path=ray_results_dir
) #TODO implement run_config (in tune/run_params for the yaml file)

self.max_cpus = max_cpus
self.max_gpus = max_gpus
self.tuner = self.tuner_initialization()

def tuner_initialization(self) -> tune.Tuner:
"""
Prepare the tuner with the configs.
"""

# initialize the ray cluster with the CPU and/or GPU limits if given, otherwise use everything available. None is the value ray interprets as "use all available resources" for either CPUs or GPUs.
init(num_cpus=self.max_cpus, num_gpus=self.max_gpus)

return tune.Tuner(TuneModel,
tune_config= self.tune_config,
param_space=self.config,
Expand Down
2 changes: 1 addition & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ process {
time = { check_max( 2.h * task.attempt, 'time' ) }
}
withLabel:process_high {
cpus = { check_max( 4 , 'cpus' ) }
cpus = { check_max( 6 , 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 2.h * task.attempt, 'time' ) }
}
Expand Down
4 changes: 4 additions & 0 deletions modules/local/check_torch_model.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@ process CHECK_TORCH_MODEL {
def suffix = task.ext.suffix
def args = task.ext.args ?: ''
"""
# TODO --gpus should point to a variable that controls the number of GPUs
launch_check_model.py \
-d ${original_csv} \
-m ${model} \
-e ${experiment_config} \
-c ${ray_tune_config} \
--cpus ${task.cpus} \
--gpus 0 \
$args
"""

Expand Down
4 changes: 4 additions & 0 deletions modules/local/torch_tune.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ process TORCH_TUNE {
def suffix = task.ext.suffix
def args = task.ext.args ?: ''
"""
# TODO --gpus should point to a variable that controls the number of GPUs
launch_tuning.py \
-c ${ray_tune_config} \
-m ${model} \
Expand All @@ -33,6 +35,8 @@ process TORCH_TUNE {
-bo ${prefix}-optimizer.pt \
-bm ${prefix}-metrics.csv \
-bc ${prefix}-config.json \
--cpus ${task.cpus} \
--gpus 0 \
$args
"""

Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ params {
max_retries = 0
err_start = 'finish'
max_cpus = 6 // this flasg and the following are for regulating resources, profiles can overwrite these.
max_memory = 8.GB
max_memory = 16.GB
max_time = 24.h

// General
Expand Down

0 comments on commit 8b2d211

Please sign in to comment.