feat(init): support init framework #3

Merged 2 commits on Jul 21, 2024
Empty file removed examples/.gitkeep
Empty file.
48 changes: 48 additions & 0 deletions examples/example.py
@@ -0,0 +1,48 @@
from gpu_bartender import (
    DataArgs,
    FinetuningArgs,
    ModelArgs,
    OptimizerArgs,
    VRAMCalculator,
)

# Example usage (num_params is given in billions: VRAMCalculator scales it by 10 ** 9)
model_args = ModelArgs(
    num_params=0.123456789,  # ≈123M parameters
    vocab_size=30522,
    hidden_size=768,
    num_attention_heads=12,
    num_key_value_heads=12,
    intermediate_size=3072,
    num_layers=12
)

data_args = DataArgs(
    batch_size=32,
    sequence_length=128
)

optimizer_args = OptimizerArgs(
    optimizer='Adam',
    optimizer_sgd_momentum=0.9  # only consulted when optimizer='SGD'
)

finetuning_args = FinetuningArgs(
    training_precision='mixed',
    is_fsdp=True
)

calculator = VRAMCalculator(
    model_args=model_args,
    finetuning_args=finetuning_args,
    optimizer_args=optimizer_args,
    data_args=data_args,
    num_gpus=4,
    unit="MiB"
)

result_estimation = calculator.estimate_result()
total_usage_per_gpu = calculator.get_total_usage_per_gpu(result_estimation, is_first=True)

print("Result Estimation:", result_estimation)
print("Total VRAM Usage per GPU:", total_usage_per_gpu, "MiB")
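
# Usage sketch: the same arguments reported in GiB (in VRAMCalculator, any
# unit other than "MiB" selects the 2 ** 30 divisor).
calculator_gib = VRAMCalculator(
    model_args=model_args,
    finetuning_args=finetuning_args,
    optimizer_args=optimizer_args,
    data_args=data_args,
    num_gpus=4,
    unit="GiB"
)
print(
    "Total VRAM Usage per GPU:",
    calculator_gib.get_total_usage_per_gpu(calculator_gib.estimate_result(), is_first=True),
    "GiB",
)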
13 changes: 13 additions & 0 deletions gpu_bartender/__init__.py
@@ -0,0 +1,13 @@
from .calculator import VRAMCalculator
from .data_args import DataArgs
from .finetuning_args import FinetuningArgs
from .model_args import ModelArgs
from .optimizer_args import OptimizerArgs

__all__ = [
    'VRAMCalculator',
    'ModelArgs',
    'FinetuningArgs',
    'OptimizerArgs',
    'DataArgs'
]
189 changes: 189 additions & 0 deletions gpu_bartender/calculator.py
@@ -0,0 +1,189 @@
from typing import Optional

from .data_args import DataArgs
from .finetuning_args import FinetuningArgs
from .model_args import ModelArgs
from .optimizer_args import OptimizerArgs


class VRAMCalculator:
    def __init__(
        self,
        model_args: ModelArgs,
        finetuning_args: FinetuningArgs,
        optimizer_args: OptimizerArgs,
        data_args: DataArgs,
        num_gpus: int = 1,
        unit: str = "MiB"
    ):
        self.model_args = model_args
        self.finetuning_args = finetuning_args
        self.optimizer_args = optimizer_args
        self.data_args = data_args
        self.num_gpus = num_gpus
        self.unit = unit
        self.divisor = 2 ** 20 if unit == "MiB" else 2 ** 30
        self.precision = 0 if unit == "MiB" else 3

        self.bytes_per_param = self.compute_bytes_per_param()
        self.gpu_divisor = self.compute_gpu_divisor()

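    # Bytes to store one parameter: mixed precision keeps an fp16 working copy
    # (2 bytes) plus an fp32 master copy (4 bytes); full precision uses fp32 only.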
    def compute_bytes_per_param(self) -> int:
        training_precision = self.finetuning_args.training_precision
        if training_precision == 'mixed':
            return 6
        else:
            return 4

    def calculate_bytes_per_param(self) -> int:
        return self.bytes_per_param

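    # With FSDP enabled on more than one GPU, parameters, gradients, and
    # optimizer states are sharded, so those figures are divided by the GPU count.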
    def compute_gpu_divisor(self) -> int:
        is_fsdp = self.finetuning_args.is_fsdp
        is_parallel_mode = is_fsdp
        return self.num_gpus if self.num_gpus > 1 and is_parallel_mode else 1

    def calculate_gpu_divisor(self) -> int:
        return self.gpu_divisor

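    # Fixed overhead of the CUDA context and kernels, taken here as 1000 MiB.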
    def compute_cuda_kernels(self) -> float:
        cuda_kernels = 1000 * 2 ** 20
        return cuda_kernels

    def calculate_cuda_kernels(self) -> float:
        cuda_kernels = self.compute_cuda_kernels()
        return self.round_num(cuda_kernels / self.divisor)

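    # Parameter memory: bytes_per_param per weight; num_params is expressed in
    # billions, hence the 10 ** 9 factor.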
    def compute_parameters(self) -> float:
        num_params = self.model_args.num_params
        parameters = (self.bytes_per_param * num_params * 10 ** 9) / self.gpu_divisor
        return parameters

    def calculate_parameters(self) -> float:
        parameters = self.compute_parameters()
        return self.round_num(parameters / self.divisor)

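    # Activation memory saved for the backward pass, per transformer layer:
    # attention block, MLP block, and two layer norms. Dropout masks are counted
    # at 1 byte per element; other saved tensors at bytes_per_param bytes.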
    def compute_activations(self) -> int:
        hidden_size = self.model_args.hidden_size
        num_attention_heads = self.model_args.num_attention_heads
        num_key_value_heads = self.model_args.num_key_value_heads
        intermediate_size = self.model_args.intermediate_size
        num_layers = self.model_args.num_layers

        batch_size = self.data_args.batch_size
        sequence_length = self.data_args.sequence_length

        bytes_per_param = self.bytes_per_param
        head_dim = hidden_size // num_attention_heads

        attention_input = bytes_per_param * batch_size * sequence_length * hidden_size
        q = bytes_per_param * batch_size * sequence_length * head_dim * num_attention_heads
        k = bytes_per_param * batch_size * sequence_length * head_dim * num_key_value_heads
        softmax_output = bytes_per_param * batch_size * num_attention_heads * sequence_length ** 2
        softmax_dropout_mask = batch_size * num_attention_heads * sequence_length ** 2
        dropout_output = bytes_per_param * batch_size * num_attention_heads * sequence_length ** 2
        v = bytes_per_param * batch_size * sequence_length * head_dim * num_key_value_heads
        out_proj_input = bytes_per_param * batch_size * sequence_length * num_attention_heads * head_dim
        attention_dropout = batch_size * sequence_length * hidden_size

        attention_block = (
            attention_input + q + k + softmax_output + v + out_proj_input
            + softmax_dropout_mask + dropout_output + attention_dropout
        )

        mlp_input = bytes_per_param * batch_size * sequence_length * hidden_size
        activation_input = bytes_per_param * batch_size * sequence_length * intermediate_size
        down_proj_input = bytes_per_param * batch_size * sequence_length * intermediate_size
        dropout_mask = batch_size * sequence_length * hidden_size

        mlp_block = mlp_input + activation_input + down_proj_input + dropout_mask

        layer_norms = bytes_per_param * batch_size * sequence_length * hidden_size * 2

        layer = attention_block + mlp_block + layer_norms

        activations = layer * num_layers

        return activations

    def calculate_activations(self) -> float:
        activations = self.compute_activations()
        return self.round_num(activations / self.divisor)

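    # Output logits are held in fp32 (4 bytes); the factor of 2 presumably
    # covers the logits and the post-softmax probabilities.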
    def compute_outputs(self) -> float:
        batch_size = self.data_args.batch_size
        sequence_length = self.data_args.sequence_length
        vocab_size = self.model_args.vocab_size
        outputs = 4 * batch_size * sequence_length * vocab_size * 2
        return outputs

    def calculate_outputs(self) -> float:
        outputs = self.compute_outputs()
        return self.round_num(outputs / self.divisor)

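    # Gradients are kept in fp32: 4 bytes per parameter, sharded like the weights.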
    def compute_gradients(self) -> Optional[float]:
        num_params = self.model_args.num_params
        gradients = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return gradients

    def calculate_gradients(self) -> Optional[float]:
        gradients = self.compute_gradients()
        return self.round_num(gradients / self.divisor)

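    # First-moment buffers (4 bytes per parameter) exist for Adam and for SGD
    # with momentum; other configurations return None.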
    def compute_first_moments(self) -> Optional[float]:
        optimizer = self.optimizer_args.optimizer
        optimizer_sgd_momentum = self.optimizer_args.optimizer_sgd_momentum
        if not ((optimizer == 'SGD' and optimizer_sgd_momentum) or optimizer == 'Adam'):
            return None
        num_params = self.model_args.num_params
        first_moments = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return first_moments

    def calculate_first_moments(self) -> Optional[float]:
        first_moments = self.compute_first_moments()
        if first_moments is None:
            return None
        return self.round_num(first_moments / self.divisor)

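    # Second-moment buffers (4 bytes per parameter) exist for Adam only.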
    def compute_second_moments(self) -> Optional[float]:
        optimizer = self.optimizer_args.optimizer
        if optimizer != 'Adam':
            return None
        num_params = self.model_args.num_params
        second_moments = (4 * num_params * 10 ** 9) / self.gpu_divisor
        return second_moments

    def calculate_second_moments(self) -> Optional[float]:
        second_moments = self.compute_second_moments()
        if second_moments is None:
            return None
        return self.round_num(second_moments / self.divisor)

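    # Collects every component in the configured unit; optimizer-state entries
    # may be None when the optimizer does not use them.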
    def estimate_result(self) -> dict:
        result_estimation = {
            'cudaKernels': self.calculate_cuda_kernels(),
            'parameters': self.calculate_parameters(),
            'outputs': self.calculate_outputs(),
            'activations': self.calculate_activations(),
            'gradients': self.calculate_gradients(),
            'firstMoments': self.calculate_first_moments(),
            'secondMoments': self.calculate_second_moments(),
        }

        return result_estimation

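    # Sums the per-GPU components; output logits count only on the first GPU
    # (is_first), and None entries contribute zero.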
    def get_total_usage_per_gpu(self, result_estimation: dict, is_first: bool) -> float:
        total_usage = (
            result_estimation['cudaKernels'] +
            result_estimation['parameters'] +
            (result_estimation['outputs'] if result_estimation['outputs'] is not None else 0) * int(is_first) +
            (result_estimation['activations'] if result_estimation['activations'] is not None else 0) +
            (result_estimation['gradients'] if result_estimation['gradients'] is not None else 0) +
            (result_estimation['firstMoments'] if result_estimation['firstMoments'] is not None else 0) +
            (result_estimation['secondMoments'] if result_estimation['secondMoments'] is not None else 0)
        )

        return self.round_num(total_usage)

    @staticmethod
    def round_num(num: float, fraction_digits: int = 3) -> float:
        return round(num, fraction_digits)
7 changes: 7 additions & 0 deletions gpu_bartender/data_args.py
@@ -0,0 +1,7 @@
from dataclasses import dataclass, field


@dataclass
class DataArgs:
    batch_size: int = field(default=4)
    sequence_length: int = field(default=512)
20 changes: 20 additions & 0 deletions gpu_bartender/finetuning_args.py
@@ -0,0 +1,20 @@
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class LoraArgs:
    lora_alpha: Optional[int] = field(default=None)
    lora_dropout: Optional[float] = field(default=None)
    lora_rank: Optional[int] = field(default=8)
    lora_target: Optional[str] = field(default=None)

@dataclass
class QLoraArgs:
    qlora_alpha: Optional[int] = field(default=None)
    qlora_dropout: Optional[float] = field(default=None)

@dataclass
class FinetuningArgs(LoraArgs, QLoraArgs):
    training_precision: str = field(default='mixed')
    is_fsdp: bool = field(default=True)
12 changes: 12 additions & 0 deletions gpu_bartender/model_args.py
@@ -0,0 +1,12 @@
from dataclasses import dataclass, field


@dataclass
class ModelArgs:
    num_params: float = field(default=1)  # parameter count in billions
    vocab_size: int = field(default=1)
    hidden_size: int = field(default=1)
    num_attention_heads: int = field(default=1)
    num_key_value_heads: int = field(default=1)
    intermediate_size: int = field(default=1)
    num_layers: int = field(default=1)
8 changes: 8 additions & 0 deletions gpu_bartender/optimizer_args.py
@@ -0,0 +1,8 @@
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class OptimizerArgs:
    optimizer: str = field(default="Adam")  # case must match VRAMCalculator's 'Adam' / 'SGD' checks
    optimizer_sgd_momentum: Optional[float] = field(default=None, metadata={"help": "Momentum for SGD optimizer, if used."})
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,5 +1,5 @@
[tool.poetry]
name = "python-project-template"
name = "gpu_bartender"
version = "0.0.1"
description = "A template for python-based research project"
authors = ["Haofei Yu <[email protected]>"]
Empty file removed src/.gitkeep
Empty file.
Empty file removed src/__init__.py
Empty file.