diff --git a/examples/.gitkeep b/examples/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/examples/example.py b/examples/example.py
new file mode 100644
index 0000000..901ac24
--- /dev/null
+++ b/examples/example.py
@@ -0,0 +1,54 @@
+from gpu_bartender import (
+    DataArgs,
+    FinetuningArgs,
+    ModelArgs,
+    OptimizerArgs,
+    VRAMCalculator,
+)
+
+# Example usage: a BERT-base-sized model (~0.11B parameters).
+# Note: num_params is given in billions; VRAMCalculator scales it by 1e9.
+model_args = ModelArgs(
+    num_params=0.11,
+    vocab_size=30522,
+    hidden_size=768,
+    num_attention_heads=12,
+    num_key_value_heads=12,
+    intermediate_size=3072,
+    num_layers=12
+)
+
+data_args = DataArgs(
+    batch_size=32,
+    sequence_length=128
+)
+
+optimizer_args = OptimizerArgs(
+    optimizer='Adam',
+    optimizer_sgd_momentum=0.9
+)
+
+finetuning_args = FinetuningArgs(
+    training_precision='mixed',
+    is_fsdp=True
+)
+
+calculator = VRAMCalculator(
+    model_args=model_args,
+    finetuning_args=finetuning_args,
+    optimizer_args=optimizer_args,
+    data_args=data_args,
+    num_gpus=4,
+    unit="MiB"
+)
+
+result_estimation = calculator.estimate_result()
+total_usage_per_gpu = calculator.get_total_usage_per_gpu(result_estimation, is_first=True)
+
+print("Result Estimation:", result_estimation)
+print("Total VRAM Usage per GPU:", total_usage_per_gpu, "MiB")
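+
+# A sketch of the expected output shape (values in the chosen unit, "MiB" here);
+# the keys come from VRAMCalculator.estimate_result():
+# {'cudaKernels': ..., 'parameters': ..., 'outputs': ..., 'activations': ...,
+#  'gradients': ..., 'firstMoments': ..., 'secondMoments': ...}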
diff --git a/gpu_bartender/__init__.py b/gpu_bartender/__init__.py
new file mode 100644
index 0000000..4916993
--- /dev/null
+++ b/gpu_bartender/__init__.py
@@ -0,0 +1,13 @@
+from .calculator import VRAMCalculator
+from .data_args import DataArgs
+from .finetuning_args import FinetuningArgs
+from .model_args import ModelArgs
+from .optimizer_args import OptimizerArgs
+
+__all__ = [
+    'VRAMCalculator',
+    'ModelArgs',
+    'FinetuningArgs',
+    'OptimizerArgs',
+    'DataArgs'
+]
diff --git a/gpu_bartender/calculator.py b/gpu_bartender/calculator.py
new file mode 100644
index 0000000..d6ebc98
--- /dev/null
+++ b/gpu_bartender/calculator.py
@@ -0,0 +1,195 @@
+from typing import Optional
+
+from .data_args import DataArgs
+from .finetuning_args import FinetuningArgs
+from .model_args import ModelArgs
+from .optimizer_args import OptimizerArgs
+
+
+class VRAMCalculator:
+    def __init__(
+        self,
+        model_args: ModelArgs,
+        finetuning_args: FinetuningArgs,
+        optimizer_args: OptimizerArgs,
+        data_args: DataArgs,
+        num_gpus: int = 1,
+        unit: str = "MiB"
+    ):
+        self.model_args = model_args
+        self.finetuning_args = finetuning_args
+        self.optimizer_args = optimizer_args
+        self.data_args = data_args
+        self.num_gpus = num_gpus
+        self.unit = unit
+        self.divisor = 2 ** 20 if unit == "MiB" else 2 ** 30  # bytes -> MiB or GiB
+        self.precision = 0 if unit == "MiB" else 3
+
+        self.bytes_per_param = self.compute_bytes_per_param()
+        self.gpu_divisor = self.compute_gpu_divisor()
+
+    def compute_bytes_per_param(self) -> int:
+        # Mixed precision keeps fp16 weights plus an fp32 master copy (2 + 4 bytes);
+        # full precision keeps fp32 weights only.
+        if self.finetuning_args.training_precision == 'mixed':
+            return 6
+        return 4
+
+    def calculate_bytes_per_param(self) -> int:
+        return self.bytes_per_param
+
+    def compute_gpu_divisor(self) -> int:
+        # Parameters, gradients, and optimizer states are sharded only under FSDP.
+        is_parallel_mode = self.finetuning_args.is_fsdp
+        return self.num_gpus if self.num_gpus > 1 and is_parallel_mode else 1
+
+    def calculate_gpu_divisor(self) -> int:
+        return self.gpu_divisor
+
+    def compute_cuda_kernels(self) -> float:
+        # Fixed overhead for the CUDA context and kernels (~1000 MiB).
+        return 1000 * 2 ** 20
+
+    def calculate_cuda_kernels(self) -> float:
+        cuda_kernels = self.compute_cuda_kernels()
+        return self.round_num(cuda_kernels / self.divisor, self.precision)
+
+    def compute_parameters(self) -> float:
+        num_params = self.model_args.num_params  # in billions
+        parameters = (self.bytes_per_param * num_params * 10 ** 9) / self.gpu_divisor
+        return parameters
+
+    def calculate_parameters(self) -> float:
+        parameters = self.compute_parameters()
+        return self.round_num(parameters / self.divisor, self.precision)
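+
+    # Worked sketch: a 7B-parameter model in mixed precision needs roughly
+    # 6 bytes/param * 7e9 B ≈ 39 GiB for weights plus fp32 master copies,
+    # sharded across GPUs when FSDP is enabled (gpu_divisor > 1).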
+
+    def compute_activations(self) -> int:
+        hidden_size = self.model_args.hidden_size
+        num_attention_heads = self.model_args.num_attention_heads
+        num_key_value_heads = self.model_args.num_key_value_heads
+        intermediate_size = self.model_args.intermediate_size
+        num_layers = self.model_args.num_layers
+
+        batch_size = self.data_args.batch_size
+        sequence_length = self.data_args.sequence_length
+
+        bytes_per_param = self.bytes_per_param
+        head_dim = hidden_size // num_attention_heads
+
+        # Per-layer attention buffers; dropout masks cost one byte per element.
+        attention_input = bytes_per_param * batch_size * sequence_length * hidden_size
+        q = bytes_per_param * batch_size * sequence_length * head_dim * num_attention_heads
+        k = bytes_per_param * batch_size * sequence_length * head_dim * num_key_value_heads
+        softmax_output = bytes_per_param * batch_size * num_attention_heads * sequence_length ** 2
+        softmax_dropout_mask = batch_size * num_attention_heads * sequence_length ** 2
+        dropout_output = bytes_per_param * batch_size * num_attention_heads * sequence_length ** 2
+        v = bytes_per_param * batch_size * sequence_length * head_dim * num_key_value_heads
+        out_proj_input = bytes_per_param * batch_size * sequence_length * num_attention_heads * head_dim
+        attention_dropout = batch_size * sequence_length * hidden_size
+
+        attention_block = (
+            attention_input + q + k + softmax_output + v + out_proj_input + softmax_dropout_mask + dropout_output + attention_dropout
+        )
+
+        # Per-layer MLP buffers.
+        mlp_input = bytes_per_param * batch_size * sequence_length * hidden_size
+        activation_input = bytes_per_param * batch_size * sequence_length * intermediate_size
+        down_proj_input = bytes_per_param * batch_size * sequence_length * intermediate_size
+        dropout_mask = batch_size * sequence_length * hidden_size
+
+        mlp_block = mlp_input + activation_input + down_proj_input + dropout_mask
+
+        layer_norms = bytes_per_param * batch_size * sequence_length * hidden_size * 2
+
+        layer = attention_block + mlp_block + layer_norms
+
+        activations = layer * num_layers
+
+        return activations
+
+    def calculate_activations(self) -> float:
+        activations = self.compute_activations()
+        return self.round_num(activations / self.divisor, self.precision)
+
+    def compute_outputs(self) -> float:
+        batch_size = self.data_args.batch_size
+        sequence_length = self.data_args.sequence_length
+        vocab_size = self.model_args.vocab_size
+        outputs = 4 * batch_size * sequence_length * vocab_size * 2  # two float32 logit-sized buffers
+        return outputs
+
+    def calculate_outputs(self) -> float:
+        outputs = self.compute_outputs()
+        return self.round_num(outputs / self.divisor, self.precision)
+
+    def compute_gradients(self) -> float:
+        num_params = self.model_args.num_params
+        gradients = (4 * num_params * 10 ** 9) / self.gpu_divisor  # fp32 grads, FSDP-sharded
+        return gradients
+
+    def calculate_gradients(self) -> float:
+        gradients = self.compute_gradients()
+        return self.round_num(gradients / self.divisor, self.precision)
+
+    def compute_first_moments(self) -> Optional[float]:
+        optimizer = self.optimizer_args.optimizer
+        optimizer_sgd_momentum = self.optimizer_args.optimizer_sgd_momentum
+        if not ((optimizer == 'SGD' and optimizer_sgd_momentum) or optimizer == 'Adam'):
+            return None
+        num_params = self.model_args.num_params
+        first_moments = (4 * num_params * 10 ** 9) / self.gpu_divisor
+        return first_moments
+
+    def calculate_first_moments(self) -> Optional[float]:
+        first_moments = self.compute_first_moments()
+        if first_moments is None:
+            return None
+        return self.round_num(first_moments / self.divisor, self.precision)
+
+    def compute_second_moments(self) -> Optional[float]:
+        optimizer = self.optimizer_args.optimizer
+        if optimizer != 'Adam':
+            return None
+        num_params = self.model_args.num_params
+        second_moments = (4 * num_params * 10 ** 9) / self.gpu_divisor
+        return second_moments
+
+    def calculate_second_moments(self) -> Optional[float]:
+        second_moments = self.compute_second_moments()
+        if second_moments is None:
+            return None
+        return self.round_num(second_moments / self.divisor, self.precision)
+
+    def estimate_result(self) -> dict:
+        result_estimation = {
+            'cudaKernels': self.calculate_cuda_kernels(),
+            'parameters': self.calculate_parameters(),
+            'outputs': self.calculate_outputs(),
+            'activations': self.calculate_activations(),
+            'gradients': self.calculate_gradients(),
+            'firstMoments': self.calculate_first_moments(),
+            'secondMoments': self.calculate_second_moments(),
+        }
+
+        return result_estimation
+
+    def get_total_usage_per_gpu(self, result_estimation: dict, is_first: bool) -> float:
+        # Output buffers are counted only for the first GPU (is_first).
+        total_usage = (
+            result_estimation['cudaKernels']
+            + result_estimation['parameters']
+            + result_estimation['outputs'] * int(is_first)
+            + result_estimation['activations']
+            + result_estimation['gradients']
+            + (result_estimation['firstMoments'] or 0)
+            + (result_estimation['secondMoments'] or 0)
+        )
+        return self.round_num(total_usage, self.precision)
+
+    @staticmethod
+    def round_num(num: float, fraction_digits: int = 3) -> float:
+        return round(num, fraction_digits)
diff --git a/gpu_bartender/data_args.py b/gpu_bartender/data_args.py
new file mode 100644
index 0000000..a40e912
--- /dev/null
+++ b/gpu_bartender/data_args.py
@@ -0,0 +1,7 @@
+from dataclasses import dataclass, field
+
+
+@dataclass
+class DataArgs:
+    batch_size: int = field(default=4)
+    sequence_length: int = field(default=512)
diff --git a/gpu_bartender/finetuning_args.py b/gpu_bartender/finetuning_args.py
new file mode 100644
index 0000000..88572fc
--- /dev/null
+++ b/gpu_bartender/finetuning_args.py
@@ -0,0 +1,23 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class LoraArgs:
+    lora_alpha: Optional[int] = field(default=None)
+    lora_dropout: Optional[float] = field(default=None)
+    lora_rank: Optional[int] = field(default=8)
+    lora_target: Optional[str] = field(default=None)
+
+@dataclass
+class QLoraArgs:
+    qlora_alpha: Optional[int] = field(default=None)
+    qlora_dropout: Optional[float] = field(default=None)
+
+@dataclass
+class FinetuningArgs(LoraArgs, QLoraArgs):
+    training_precision: str = field(default='mixed')
+    is_fsdp: bool = field(default=True)
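+
+# Note: the LoRA/QLoRA fields above are carried for completeness but are not yet
+# read by VRAMCalculator, which uses only training_precision and is_fsdp.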
diff --git a/gpu_bartender/model_args.py b/gpu_bartender/model_args.py
new file mode 100644
index 0000000..908266d
--- /dev/null
+++ b/gpu_bartender/model_args.py
@@ -0,0 +1,12 @@
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ModelArgs:
+    num_params: float = field(default=1.0, metadata={"help": "Number of parameters, in billions."})
+    vocab_size: int = field(default=1)
+    hidden_size: int = field(default=1)
+    num_attention_heads: int = field(default=1)
+    num_key_value_heads: int = field(default=1)
+    intermediate_size: int = field(default=1)
+    num_layers: int = field(default=1)
diff --git a/gpu_bartender/optimizer_args.py b/gpu_bartender/optimizer_args.py
new file mode 100644
index 0000000..d9bb4cc
--- /dev/null
+++ b/gpu_bartender/optimizer_args.py
@@ -0,0 +1,8 @@
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class OptimizerArgs:
+    optimizer: str = field(default="Adam", metadata={"help": "Matched case-sensitively: 'Adam' or 'SGD'."})
+    optimizer_sgd_momentum: Optional[float] = field(default=None, metadata={"help": "Momentum for SGD optimizer, if used."})
diff --git a/pyproject.toml b/pyproject.toml
index 0a5fcc3..ecebf39 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "python-project-template"
+name = "gpu_bartender"
 version = "0.0.1"
 description = "A template for python-based research project"
 authors = ["Haofei Yu "]
diff --git a/src/.gitkeep b/src/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/src/__init__.py b/src/__init__.py
deleted file mode 100644
index e69de29..0000000