add init version
lwaekfjlk committed Jul 21, 2024
1 parent 80f2e90 commit 06c6307
Showing 11 changed files with 287 additions and 1 deletion.
Empty file removed examples/.gitkeep
42 changes: 42 additions & 0 deletions examples/example.py
@@ -0,0 +1,42 @@
from gpu_bartender import VRAMCalculator, ModelArgs, DataArgs, OptimizerArgs, FinetuningArgs

# Example usage
model_args = ModelArgs(
    num_params=7,  # parameter count in billions (the calculator scales num_params by 10 ** 9)
    vocab_size=30522,
    hidden_size=768,
    num_attention_heads=12,
    num_key_value_heads=12,
    intermediate_size=3072,
    num_layers=12
)

data_args = DataArgs(
    batch_size=32,
    sequence_length=128
)

optimizer_args = OptimizerArgs(
    optimizer='Adam',
    optimizer_sgd_momentum=0.9
)

finetuning_args = FinetuningArgs(
    training_precision='mixed',
    is_fsdp=True
)

calculator = VRAMCalculator(
    model_args=model_args,
    finetuning_args=finetuning_args,
    optimizer_args=optimizer_args,
    data_args=data_args,
    num_gpus=4,
    unit="MiB"
)

result_estimation = calculator.estimate_result()
total_usage_per_gpu = calculator.get_total_usage_per_gpu(result_estimation, is_first=True)

print("Result Estimation:", result_estimation)
print("Total VRAM Usage per GPU:", total_usage_per_gpu, "MiB")
13 changes: 13 additions & 0 deletions gpu_bartender/__init__.py
@@ -0,0 +1,13 @@
from .calculator import VRAMCalculator
from .model_args import ModelArgs
from .finetuning_args import FinetuningArgs
from .optimizer_args import OptimizerArgs
from .data_args import DataArgs

__all__ = [
    'VRAMCalculator',
    'ModelArgs',
    'FinetuningArgs',
    'OptimizerArgs',
    'DataArgs'
]
188 changes: 188 additions & 0 deletions gpu_bartender/calculator.py
@@ -0,0 +1,188 @@
from typing import Optional

from .data_args import DataArgs
from .finetuning_args import FinetuningArgs
from .model_args import ModelArgs
from .optimizer_args import OptimizerArgs

class VRAMCalculator:
    def __init__(
        self,
        model_args: ModelArgs,
        finetuning_args: FinetuningArgs,
        optimizer_args: OptimizerArgs,
        data_args: DataArgs,
        num_gpus: int = 1,
        unit: str = "MiB"
    ):
        self.model_args = model_args
        self.finetuning_args = finetuning_args
        self.optimizer_args = optimizer_args
        self.data_args = data_args
        self.num_gpus = num_gpus
        self.unit = unit
        self.divisor = 2 ** 20 if unit == "MiB" else 2 ** 30
        # Rounding precision for reported values: whole MiB, or GiB to 3 decimals.
        self.precision = 0 if unit == "MiB" else 3

        self.bytes_per_param = self.compute_bytes_per_param()
        self.gpu_divisor = self.compute_gpu_divisor()

    def compute_bytes_per_param(self) -> int:
        # Mixed precision keeps an fp16 working copy plus an fp32 master copy (2 + 4 bytes).
        training_precision = self.finetuning_args.training_precision
        if training_precision == 'mixed':
            return 6
        else:
            return 4

    def calculate_bytes_per_param(self) -> int:
        return self.bytes_per_param

    def compute_gpu_divisor(self) -> int:
        # Memory is only sharded across GPUs when FSDP is enabled.
        is_fsdp = self.finetuning_args.is_fsdp
        is_parallel_mode = is_fsdp
        return self.num_gpus if self.num_gpus > 1 and is_parallel_mode else 1

    def calculate_gpu_divisor(self) -> int:
        return self.gpu_divisor

    def compute_cuda_kernels(self) -> float:
        # Fixed overhead for the CUDA context and kernels (~1000 MiB).
        cuda_kernels = 1000 * 2 ** 20
        return cuda_kernels

    def calculate_cuda_kernels(self) -> float:
        cuda_kernels = self.compute_cuda_kernels()
        return self.round_num(cuda_kernels / self.divisor, self.precision)

    def compute_parameters(self) -> float:
        # num_params is expressed in billions, hence the 10 ** 9 factor.
        num_params = self.model_args.num_params
        parameters = (self.bytes_per_param * num_params * 10 ** 9) / self.gpu_divisor
        return parameters

    def calculate_parameters(self) -> float:
        parameters = self.compute_parameters()
        return self.round_num(parameters / self.divisor, self.precision)

    def compute_activations(self) -> int:
        hidden_size = self.model_args.hidden_size
        num_attention_heads = self.model_args.num_attention_heads
        num_key_value_heads = self.model_args.num_key_value_heads
        intermediate_size = self.model_args.intermediate_size
        num_layers = self.model_args.num_layers

        batch_size = self.data_args.batch_size
        sequence_length = self.data_args.sequence_length

        bytes_per_param = self.bytes_per_param
        head_dim = hidden_size // num_attention_heads

        # Attention block: inputs, projections, softmax buffers, and dropout masks.
        attention_input = bytes_per_param * batch_size * sequence_length * hidden_size
        q = bytes_per_param * batch_size * sequence_length * head_dim * num_attention_heads
        k = bytes_per_param * batch_size * sequence_length * head_dim * num_key_value_heads
        softmax_output = bytes_per_param * batch_size * num_attention_heads * sequence_length ** 2
        softmax_dropout_mask = batch_size * num_attention_heads * sequence_length ** 2
        dropout_output = bytes_per_param * batch_size * num_attention_heads * sequence_length ** 2
        v = bytes_per_param * batch_size * sequence_length * head_dim * num_key_value_heads
        out_proj_input = bytes_per_param * batch_size * sequence_length * num_attention_heads * head_dim
        attention_dropout = batch_size * sequence_length * hidden_size

        attention_block = (
            attention_input + q + k + softmax_output + v + out_proj_input +
            softmax_dropout_mask + dropout_output + attention_dropout
        )

        # MLP block: up projection, activation, down projection, and dropout mask.
        mlp_input = bytes_per_param * batch_size * sequence_length * hidden_size
        activation_input = bytes_per_param * batch_size * sequence_length * intermediate_size
        down_proj_input = bytes_per_param * batch_size * sequence_length * intermediate_size
        dropout_mask = batch_size * sequence_length * hidden_size

        mlp_block = mlp_input + activation_input + down_proj_input + dropout_mask

        layer_norms = bytes_per_param * batch_size * sequence_length * hidden_size * 2

        layer = attention_block + mlp_block + layer_norms

        activations = layer * num_layers

        return activations

    def calculate_activations(self) -> float:
        activations = self.compute_activations()
        return self.round_num(activations / self.divisor, self.precision)

    def compute_outputs(self) -> float:
        # fp32 logits over the full vocabulary; the factor of 2 budgets for an extra copy.
        batch_size = self.data_args.batch_size
        sequence_length = self.data_args.sequence_length
        vocab_size = self.model_args.vocab_size
        outputs = 4 * batch_size * sequence_length * vocab_size * 2
        return outputs

    def calculate_outputs(self) -> float:
        outputs = self.compute_outputs()
        return self.round_num(outputs / self.divisor, self.precision)

    def compute_gradients(self) -> float:
        # Gradients are stored in fp32 (4 bytes per parameter).
        num_params = self.model_args.num_params
        gradients = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return gradients

    def calculate_gradients(self) -> float:
        gradients = self.compute_gradients()
        return self.round_num(gradients / self.divisor, self.precision)

    def compute_first_moments(self) -> Optional[float]:
        # First moments exist for Adam, or for SGD when momentum is enabled.
        optimizer = self.optimizer_args.optimizer
        optimizer_sgd_momentum = self.optimizer_args.optimizer_sgd_momentum
        if not ((optimizer == 'SGD' and optimizer_sgd_momentum) or optimizer == 'Adam'):
            return None
        num_params = self.model_args.num_params
        first_moments = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return first_moments

    def calculate_first_moments(self) -> Optional[float]:
        first_moments = self.compute_first_moments()
        if first_moments is None:
            return None
        return self.round_num(first_moments / self.divisor, self.precision)

    def compute_second_moments(self) -> Optional[float]:
        # Second moments are only kept by Adam.
        optimizer = self.optimizer_args.optimizer
        if optimizer != 'Adam':
            return None
        num_params = self.model_args.num_params
        second_moments = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return second_moments

    def calculate_second_moments(self) -> Optional[float]:
        second_moments = self.compute_second_moments()
        if second_moments is None:
            return None
        return self.round_num(second_moments / self.divisor, self.precision)

    def estimate_result(self) -> dict:
        result_estimation = {
            'cudaKernels': self.calculate_cuda_kernels(),
            'parameters': self.calculate_parameters(),
            'outputs': self.calculate_outputs(),
            'activations': self.calculate_activations(),
            'gradients': self.calculate_gradients(),
            'firstMoments': self.calculate_first_moments(),
            'secondMoments': self.calculate_second_moments(),
        }

        return result_estimation

    def get_total_usage_per_gpu(self, result_estimation: dict, is_first: bool) -> float:
        # Output logits are only counted on the first GPU (is_first=True).
        total_usage = (
            result_estimation['cudaKernels'] +
            result_estimation['parameters'] +
            (result_estimation['outputs'] if result_estimation['outputs'] is not None else 0) * int(is_first) +
            (result_estimation['activations'] if result_estimation['activations'] is not None else 0) +
            (result_estimation['gradients'] if result_estimation['gradients'] is not None else 0) +
            (result_estimation['firstMoments'] if result_estimation['firstMoments'] is not None else 0) +
            (result_estimation['secondMoments'] if result_estimation['secondMoments'] is not None else 0)
        )

        return self.round_num(total_usage, self.precision)

    @staticmethod
    def round_num(num: float, fraction_digits: int = 3) -> float:
        return round(num, fraction_digits)
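To make the parameter-side formulas concrete, here is a back-of-envelope sketch with illustrative values (a 7B-parameter model, mixed precision, FSDP across 4 GPUs) that simply re-applies compute_parameters and compute_gradients by hand:

# Hand-computed check mirroring compute_parameters / compute_gradients above.
# Illustrative values: num_params=7 (i.e. 7B), mixed precision, FSDP over 4 GPUs.
bytes_per_param = 6          # fp16 working copy + fp32 master copy
num_params_billion = 7
gpu_divisor = 4              # num_gpus > 1 and is_fsdp
MiB = 2 ** 20

parameters = bytes_per_param * num_params_billion * 10 ** 9 / gpu_divisor / MiB
gradients = 4 * num_params_billion * 10 ** 9 / gpu_divisor / MiB
print(round(parameters), "MiB for parameters")  # ~10014 MiB
print(round(gradients), "MiB for gradients")    # ~6676 MiB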
6 changes: 6 additions & 0 deletions gpu_bartender/data_args.py
@@ -0,0 +1,6 @@
from dataclasses import dataclass, field

@dataclass
class DataArgs:
    batch_size: int = field(default=4)
    sequence_length: int = field(default=512)
19 changes: 19 additions & 0 deletions gpu_bartender/finetuning_args.py
@@ -0,0 +1,19 @@
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class LoraArgs:
    lora_alpha: Optional[int] = field(default=None)
    lora_dropout: Optional[float] = field(default=None)
    lora_rank: Optional[int] = field(default=8)
    lora_target: Optional[str] = field(default=None)

@dataclass
class QLoraArgs:
    qlora_alpha: Optional[int] = field(default=None)
    qlora_dropout: Optional[float] = field(default=None)

@dataclass
class FinetuningArgs(LoraArgs, QLoraArgs):
    # VRAMCalculator currently reads only training_precision and is_fsdp;
    # the inherited LoRA/QLoRA fields are not yet used in the estimate.
    training_precision: str = field(default='mixed')
    is_fsdp: bool = field(default=True)
11 changes: 11 additions & 0 deletions gpu_bartender/model_args.py
@@ -0,0 +1,11 @@
from dataclasses import dataclass, field

@dataclass
class ModelArgs:
    # num_params is the parameter count in billions (VRAMCalculator multiplies it by 10 ** 9).
    num_params: int = field(default=1)
    vocab_size: int = field(default=1)
    hidden_size: int = field(default=1)
    num_attention_heads: int = field(default=1)
    num_key_value_heads: int = field(default=1)
    intermediate_size: int = field(default=1)
    num_layers: int = field(default=1)
7 changes: 7 additions & 0 deletions gpu_bartender/optimizer_args.py
@@ -0,0 +1,7 @@
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class OptimizerArgs:
    # VRAMCalculator compares against the exact strings 'Adam' and 'SGD'.
    optimizer: str = field(default="Adam")
    optimizer_sgd_momentum: Optional[float] = field(default=None, metadata={"help": "Momentum for SGD optimizer, if used."})
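For reference, the optimizer string decides which moment buffers the calculator budgets for (see compute_first_moments and compute_second_moments in calculator.py). A minimal sketch of the three cases, with illustrative variable names:

from gpu_bartender import OptimizerArgs

# Which optimizer-state buffers each configuration implies in the estimate:
adam = OptimizerArgs(optimizer='Adam')                                           # first + second moments
sgd_with_momentum = OptimizerArgs(optimizer='SGD', optimizer_sgd_momentum=0.9)   # first moments only
plain_sgd = OptimizerArgs(optimizer='SGD')                                       # neither buffer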
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,5 +1,5 @@
[tool.poetry]
name = "python-project-template"
name = "gpu_bartender"
version = "0.0.1"
description = "A template for python-based research project"
authors = ["Haofei Yu <[email protected]>"]
Empty file removed src/.gitkeep
Empty file removed src/__init__.py
