generated from lwaekfjlk/python-project-template
Commit
Showing 11 changed files with 287 additions and 1 deletion.
Empty file.
@@ -0,0 +1,42 @@
from gpu_bartender import VRAMCalculator, ModelArgs, DataArgs, OptimizerArgs, FinetuningArgs

# Example usage
model_args = ModelArgs(
    num_params=123456789,
    vocab_size=30522,
    hidden_size=768,
    num_attention_heads=12,
    num_key_value_heads=12,
    intermediate_size=3072,
    num_layers=12
)

data_args = DataArgs(
    batch_size=32,
    sequence_length=128
)

optimizer_args = OptimizerArgs(
    optimizer='Adam',
    optimizer_sgd_momentum=0.9
)

finetuning_args = FinetuningArgs(
    training_precision='mixed',
    is_fsdp=True
)

calculator = VRAMCalculator(
    model_args=model_args,
    finetuning_args=finetuning_args,
    optimizer_args=optimizer_args,
    data_args=data_args,
    num_gpus=4,
    unit="MiB"
)

result_estimation = calculator.estimate_result()
total_usage_per_gpu = calculator.get_total_usage_per_gpu(result_estimation, is_first=True)

print("Result Estimation:", result_estimation)
print("Total VRAM Usage per GPU:", total_usage_per_gpu, "MiB")
@@ -0,0 +1,13 @@
from .calculator import VRAMCalculator
from .model_args import ModelArgs
from .finetuning_args import FinetuningArgs
from .optimizer_args import OptimizerArgs
from .data_args import DataArgs

__all__ = [
    'VRAMCalculator',
    'ModelArgs',
    'FinetuningArgs',
    'OptimizerArgs',
    'DataArgs'
]
@@ -0,0 +1,188 @@
from dataclasses import dataclass, field
from typing import Optional
from .data_args import DataArgs
from .finetuning_args import FinetuningArgs
from .model_args import ModelArgs
from .optimizer_args import OptimizerArgs

class VRAMCalculator:
    def __init__(
        self,
        model_args: ModelArgs,
        finetuning_args: FinetuningArgs,
        optimizer_args: OptimizerArgs,
        data_args: DataArgs,
        num_gpus: int = 1,
        unit: str = "MiB"
    ):
        self.model_args = model_args
        self.finetuning_args = finetuning_args
        self.optimizer_args = optimizer_args
        self.data_args = data_args
        self.num_gpus = num_gpus
        self.unit = unit
        self.divisor = 2 ** 20 if unit == "MiB" else 2 ** 30
        self.precision = 0 if unit == "MiB" else 3

        self.bytes_per_param = self.compute_bytes_per_param()
        self.gpu_divisor = self.compute_gpu_divisor()

    def compute_bytes_per_param(self) -> int:
        training_precision = self.finetuning_args.training_precision
        if training_precision == 'mixed':
            return 6
        else:
            return 4

    def calculate_bytes_per_param(self) -> int:
        return self.bytes_per_param

    def compute_gpu_divisor(self) -> int:
        is_fsdp = self.finetuning_args.is_fsdp
        is_parallel_mode = is_fsdp
        return self.num_gpus if self.num_gpus > 1 and is_parallel_mode else 1

    def calculate_gpu_divisor(self) -> int:
        return self.gpu_divisor

    def compute_cuda_kernels(self) -> float:
        cuda_kernels = 1000 * 2 ** 20
        return cuda_kernels

    def calculate_cuda_kernels(self) -> float:
        cuda_kernels = self.compute_cuda_kernels()
        return self.round_num(cuda_kernels / self.divisor)

    def compute_parameters(self) -> float:
        num_params = self.model_args.num_params
        parameters = (self.bytes_per_param * num_params * 10 ** 9) / self.gpu_divisor
        return parameters

    def calculate_parameters(self) -> float:
        parameters = self.compute_parameters()
        return self.round_num(parameters / self.divisor)

    def compute_activations(self) -> int:
        hidden_size = self.model_args.hidden_size
        num_attention_heads = self.model_args.num_attention_heads
        num_key_value_heads = self.model_args.num_key_value_heads
        intermediate_size = self.model_args.intermediate_size
        num_layers = self.model_args.num_layers

        batch_size = self.data_args.batch_size
        sequence_length = self.data_args.sequence_length

        bytes_per_param = self.bytes_per_param
        head_dim = hidden_size // num_attention_heads

        attention_input = bytes_per_param * batch_size * sequence_length * hidden_size
        q = bytes_per_param * batch_size * sequence_length * head_dim * num_attention_heads
        k = bytes_per_param * batch_size * sequence_length * head_dim * num_key_value_heads
        softmax_output = bytes_per_param * batch_size * num_attention_heads * sequence_length ** 2
        softmax_dropout_mask = batch_size * num_attention_heads * sequence_length ** 2
        dropout_output = bytes_per_param * batch_size * num_attention_heads * sequence_length ** 2
        v = bytes_per_param * batch_size * sequence_length * head_dim * num_key_value_heads
        out_proj_input = bytes_per_param * batch_size * sequence_length * num_attention_heads * head_dim
        attention_dropout = batch_size * sequence_length * hidden_size

        attention_block = (
            attention_input + q + k + softmax_output + v + out_proj_input + softmax_dropout_mask + dropout_output + attention_dropout
        )

        mlp_input = bytes_per_param * batch_size * sequence_length * hidden_size
        activation_input = bytes_per_param * batch_size * sequence_length * intermediate_size
        down_proj_input = bytes_per_param * batch_size * sequence_length * intermediate_size
        dropout_mask = batch_size * sequence_length * hidden_size

        mlp_block = mlp_input + activation_input + down_proj_input + dropout_mask

        layer_norms = bytes_per_param * batch_size * sequence_length * hidden_size * 2

        layer = attention_block + mlp_block + layer_norms

        activations = layer * num_layers

        return activations

    def calculate_activations(self) -> float:
        activations = self.compute_activations()
        return self.round_num(activations / self.divisor)

    def compute_outputs(self) -> float:
        batch_size = self.data_args.batch_size
        sequence_length = self.data_args.sequence_length
        vocab_size = self.model_args.vocab_size
        outputs = 4 * batch_size * sequence_length * vocab_size * 2
        return outputs

    def calculate_outputs(self) -> float:
        outputs = self.compute_outputs()
        return self.round_num(outputs / self.divisor)

    def compute_gradients(self) -> Optional[float]:
        num_params = self.model_args.num_params
        gradients = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return gradients

    def calculate_gradients(self) -> Optional[float]:
        gradients = self.compute_gradients()
        return self.round_num(gradients / self.divisor)

    def compute_first_moments(self) -> Optional[float]:
        optimizer = self.optimizer_args.optimizer
        optimizer_sgd_momentum = self.optimizer_args.optimizer_sgd_momentum
        if not ((optimizer == 'SGD' and optimizer_sgd_momentum) or optimizer == 'Adam'):
            return None
        num_params = self.model_args.num_params
        first_moments = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return first_moments

    def calculate_first_moments(self) -> Optional[float]:
        first_moments = self.compute_first_moments()
        if first_moments is None:
            return None
        return self.round_num(first_moments / self.divisor)

    def compute_second_moments(self) -> Optional[float]:
        optimizer = self.optimizer_args.optimizer
        if optimizer != 'Adam':
            return None
        num_params = self.model_args.num_params
        second_moments = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return second_moments

    def calculate_second_moments(self) -> Optional[float]:
        second_moments = self.compute_second_moments()
        if second_moments is None:
            return None
        return self.round_num(second_moments / self.divisor)

    def estimate_result(self) -> dict:
        result_estimation = {
            'cudaKernels': self.calculate_cuda_kernels(),
            'parameters': self.calculate_parameters(),
            'outputs': self.calculate_outputs(),
            'activations': self.calculate_activations(),
            'gradients': self.calculate_gradients(),
            'firstMoments': self.calculate_first_moments(),
            'secondMoments': self.calculate_second_moments(),
        }

        return result_estimation

    def get_total_usage_per_gpu(self, result_estimation: dict, is_first: bool) -> float:
        total_usage = (
            result_estimation['cudaKernels'] +
            result_estimation['parameters'] +
            (result_estimation['outputs'] if result_estimation['outputs'] is not None else 0) * int(is_first) +
            (result_estimation['activations'] if result_estimation['activations'] is not None else 0) +
            (result_estimation['gradients'] if result_estimation['gradients'] is not None else 0) +
            (result_estimation['firstMoments'] if result_estimation['firstMoments'] is not None else 0) +
            (result_estimation['secondMoments'] if result_estimation['secondMoments'] is not None else 0)
        )

        return self.round_num(total_usage)

    @staticmethod
    def round_num(num: float, fraction_digits: int = 3) -> float:
        return round(num, fraction_digits)
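For orientation (an editorial note, not part of the commit): each compute_* method above returns a size in bytes, and the matching calculate_* method divides by the unit divisor and rounds. As a rough sanity check of compute_parameters, here is the same arithmetic done by hand, assuming num_params is expressed in billions (implied by the 10 ** 9 factor); the 7B model, 4 GPUs, and mixed precision below are hypothetical values, not defaults from this commit.

# Hypothetical numbers, mirroring compute_parameters() by hand
num_params_billion = 7   # assumed 7B-parameter model
bytes_per_param = 6      # 'mixed' training precision
gpu_divisor = 4          # FSDP sharding across 4 GPUs

param_bytes = bytes_per_param * num_params_billion * 10 ** 9 / gpu_divisor
print(round(param_bytes / 2 ** 30, 3))  # ~9.779 GiB of parameter memory per GPU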
@@ -0,0 +1,6 @@
from dataclasses import dataclass, field

@dataclass
class DataArgs:
    batch_size: int = field(default=4)
    sequence_length: int = field(default=512)
@@ -0,0 +1,19 @@
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class LoraArgs:
    lora_alpha: Optional[int] = field(default=None)
    lora_dropout: Optional[float] = field(default=None)
    lora_rank: Optional[int] = field(default=8)
    lora_target: Optional[str] = field(default=None)

@dataclass
class QLoraArgs:
    qlora_alpha: Optional[int] = field(default=None)
    qlora_dropout: Optional[float] = field(default=None)

@dataclass
class FinetuningArgs(LoraArgs, QLoraArgs):
    training_precision: str = field(default='mixed')
    is_fsdp: bool = field(default=True)
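A quick sketch (not from the commit): because FinetuningArgs inherits the LoRA and QLoRA fields, a LoRA configuration can be passed alongside the precision and FSDP flags, although the VRAMCalculator above only reads training_precision and is_fsdp. The concrete values here are illustrative, not defaults from the source.

from gpu_bartender import FinetuningArgs

finetuning_args_lora = FinetuningArgs(
    training_precision='mixed',
    is_fsdp=True,
    lora_rank=8,          # illustrative LoRA settings
    lora_alpha=16,
    lora_dropout=0.05,
    lora_target='q_proj,v_proj'
)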
@@ -0,0 +1,11 @@
from dataclasses import dataclass, field

@dataclass
class ModelArgs:
    num_params: int = field(default=1)
    vocab_size: int = field(default=1)
    hidden_size: int = field(default=1)
    num_attention_heads: int = field(default=1)
    num_key_value_heads: int = field(default=1)
    intermediate_size: int = field(default=1)
    num_layers: int = field(default=1)
@@ -0,0 +1,7 @@
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class OptimizerArgs:
    optimizer: str = field(default="adam")
    optimizer_sgd_momentum: Optional[float] = field(default=None, metadata={"help": "Momentum for SGD optimizer, if used."})
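One thing to watch, based on the code in this commit: the calculator compares the optimizer string case-sensitively against 'Adam' and 'SGD', while the default here is the lowercase "adam", so relying on the default yields no first- or second-moment estimates. The example at the top passes optimizer='Adam' explicitly.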
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "python-project-template"
+name = "gpu_bartender"
 version = "0.0.1"
 description = "A template for python-based research project"
 authors = ["Haofei Yu <[email protected]>"]
Empty file.
Empty file.