chore: doc strings and clean up
sumny committed Oct 12, 2023
1 parent 24a8afa commit 7b8951b
Showing 19 changed files with 803 additions and 307 deletions.
86 changes: 46 additions & 40 deletions yahpo_train/attic/tabular_transformer.py
@@ -1,4 +1,3 @@

import typing as ty

import torch
@@ -25,7 +24,7 @@ def __init__(
else:
d_bias = d_numerical + len(categories)
category_offsets = torch.tensor([0] + categories[:-1]).cumsum(0)
self.register_buffer('category_offsets', category_offsets)
self.register_buffer("category_offsets", category_offsets)
self.category_embeddings = nn.Embedding(sum(categories), d_token)
nn_init.kaiming_uniform_(self.category_embeddings.weight, a=math.sqrt(5))

@@ -67,13 +66,14 @@ def forward(self, x_cat: ty.Optional[Tensor], x_cont: Tensor) -> Tensor:
x = x + bias[None]
return x


class MultiheadAttention(nn.Module):
def __init__(
self, d: int, n_heads: int, dropout: float, initialization: str
) -> None:
if n_heads > 1:
assert d % n_heads == 0
assert initialization in ['xavier', 'kaiming']
assert initialization in ["xavier", "kaiming"]

super().__init__()
self.W_q = nn.Linear(d, d)
@@ -84,7 +84,7 @@ def __init__(
self.dropout = nn.Dropout(dropout) if dropout else None

for m in [self.W_q, self.W_k, self.W_v]:
if initialization == 'xavier' and (n_heads > 1 or m is not self.W_v):
if initialization == "xavier" and (n_heads > 1 or m is not self.W_v):
# gain is needed since W_qkv is represented with 3 separate layers
nn_init.xavier_uniform_(m.weight, gain=1 / math.sqrt(2))
nn_init.zeros_(m.bias)
@@ -150,24 +150,22 @@ class Transformer(AbstractSurrogate):
def __init__(
self,
dls,
embds_dbl:typing.List=None,
embds_tgt:typing.List=None,
d_token:int = 192,
final_act = nn.Sigmoid(),
embds_dbl: typing.List = None,
embds_tgt: typing.List = None,
d_token: int = 192,
final_act=nn.Sigmoid(),
# Tokenizer
token_bias: bool = True,
# transformer
n_layers: int = 3,

n_heads: int = 8,
d_ffn_factor: float = 4/3,
d_ffn_factor: float = 4 / 3,
attention_dropout: float = 0.2,
ffn_dropout: float = 0.1,
residual_dropout: float = 0.0,
activation: str = 'reglu',
activation: str = "reglu",
prenormalization: bool = True,
initialization: str = 'kaiming',

initialization: str = "kaiming",
# linformer
kv_compression: ty.Optional[float] = None,
kv_compression_sharing: ty.Optional[str] = None,
@@ -177,7 +175,12 @@ def __init__(
super().__init__()

self._build_embeddings_xcont(dls, embds_dbl)
self.tokenizer = Tokenizer(len(dls.cont_names), [len(dls.train.classes[n]) for n in dls.train.cat_names], d_token, token_bias)
self.tokenizer = Tokenizer(
len(dls.cont_names),
[len(dls.train.classes[n]) for n in dls.train.cat_names],
d_token,
token_bias,
)
n_tokens = self.tokenizer.n_tokens

self.final_act = final_act
@@ -188,13 +191,13 @@ def make_kv_compression():
compression = nn.Linear(
n_tokens, int(n_tokens * kv_compression), bias=False
)
if initialization == 'xavier':
if initialization == "xavier":
nn_init.xavier_uniform_(compression.weight)
return compression

self.shared_kv_compression = (
make_kv_compression()
if kv_compression and kv_compression_sharing == 'layerwise'
if kv_compression and kv_compression_sharing == "layerwise"
else None
)

@@ -206,24 +209,24 @@ def make_normalization():
for layer_idx in range(n_layers):
layer = nn.ModuleDict(
{
'attention': MultiheadAttention(
"attention": MultiheadAttention(
d_token, n_heads, attention_dropout, initialization
),
'linear0': nn.Linear(
d_token, d_hidden * (2 if activation.endswith('glu') else 1)
"linear0": nn.Linear(
d_token, d_hidden * (2 if activation.endswith("glu") else 1)
),
'linear1': nn.Linear(d_hidden, d_token),
'norm1': make_normalization(),
"linear1": nn.Linear(d_hidden, d_token),
"norm1": make_normalization(),
}
)
if not prenormalization or layer_idx:
layer['norm0'] = make_normalization()
layer["norm0"] = make_normalization()
if kv_compression and self.shared_kv_compression is None:
layer['key_compression'] = make_kv_compression()
if kv_compression_sharing == 'headwise':
layer['value_compression'] = make_kv_compression()
layer["key_compression"] = make_kv_compression()
if kv_compression_sharing == "headwise":
layer["value_compression"] = make_kv_compression()
else:
assert kv_compression_sharing == 'key-value'
assert kv_compression_sharing == "key-value"
self.layers.append(layer)

self.activation = get_activation_fn(activation)
@@ -238,17 +241,17 @@ def _get_kv_compressions(self, layer):
return (
(self.shared_kv_compression, self.shared_kv_compression)
if self.shared_kv_compression is not None
else (layer['key_compression'], layer['value_compression'])
if 'key_compression' in layer and 'value_compression' in layer
else (layer['key_compression'], layer['key_compression'])
if 'key_compression' in layer
else (layer["key_compression"], layer["value_compression"])
if "key_compression" in layer and "value_compression" in layer
else (layer["key_compression"], layer["key_compression"])
if "key_compression" in layer
else (None, None)
)

def _start_residual(self, x, layer, norm_idx):
x_residual = x
if self.prenormalization:
norm_key = f'norm{norm_idx}'
norm_key = f"norm{norm_idx}"
if norm_key in layer:
x_residual = layer[norm_key](x_residual)
return x_residual
@@ -258,14 +261,18 @@ def _end_residual(self, x, x_residual, layer, norm_idx):
x_residual = F.dropout(x_residual, self.residual_dropout, self.training)
x = x + x_residual
if not self.prenormalization:
x = layer[f'norm{norm_idx}'](x)
x = layer[f"norm{norm_idx}"](x)
return x

def forward(self, x_cat: ty.Optional[Tensor], x_cont: Tensor =None, invert_ytrafo: bool = True) -> Tensor:

def forward(
self,
x_cat: ty.Optional[Tensor],
x_cont: Tensor = None,
invert_ytrafo: bool = True,
) -> Tensor:
# Transform continuous features
if self.n_cont != 0:
xcont = [e(x_cont[:,i]).unsqueeze(1) for i,e in enumerate(self.embds_dbl)]
xcont = [e(x_cont[:, i]).unsqueeze(1) for i, e in enumerate(self.embds_dbl)]
xcont = torch.cat(xcont, 1)

x = self.tokenizer(x_cat, x_cont)
@@ -275,7 +282,7 @@ def forward(self, x_cat: ty.Optional[Tensor], x_cont: Tensor =None, invert_ytraf
layer = ty.cast(ty.Dict[str, nn.Module], layer)

x_residual = self._start_residual(x, layer, 0)
x_residual = layer['attention'](
x_residual = layer["attention"](
# for the last attention, it is enough to process only [CLS]
(x_residual[:, :1] if is_last_layer else x_residual),
x_residual,
@@ -286,11 +293,11 @@ def forward(self, x_cat: ty.Optional[Tensor], x_cont: Tensor =None, invert_ytraf
x = self._end_residual(x, x_residual, layer, 0)

x_residual = self._start_residual(x, layer, 1)
x_residual = layer['linear0'](x_residual)
x_residual = layer["linear0"](x_residual)
x_residual = self.activation(x_residual)
if self.ffn_dropout:
x_residual = F.dropout(x_residual, self.ffn_dropout, self.training)
x_residual = layer['linear1'](x_residual)
x_residual = layer["linear1"](x_residual)
x = self._end_residual(x, x_residual, layer, 1)

assert x.shape[1] == 1
@@ -299,10 +306,9 @@ def forward(self, x_cat: ty.Optional[Tensor], x_cont: Tensor =None, invert_ytraf
x = self.last_normalization(x)
x = self.last_activation(x)
x = self.head(x)

y = self.final_act(x)
if invert_ytrafo:
return self.inv_trafo_ys(y)
else:
return y

Empty file.
@@ -11,13 +11,19 @@ def test_cont_norm():
xss = [
torch.rand(100, 1),
(torch.rand(1000, 1) * 3) ** 2,
- 5 * (torch.rand(500, 1) - .5),
-5 * (torch.rand(500, 1) - 0.5),
(torch.rand(500, 1) + 2) ** 3,
torch.cat((torch.rand(500, 1), torch.rand(1, 1) + torch.Tensor([1000.]))),
torch.Tensor([1., 2., -1., 1000.]),
torch.cat((torch.rand(800, 1), torch.rand(1, 1) - torch.Tensor([1000.]))),
torch.cat((torch.rand(500, 1), torch.rand(1, 1) + torch.Tensor([1000.0]))),
torch.Tensor([1.0, 2.0, -1.0, 1000.0]),
torch.cat((torch.rand(800, 1), torch.rand(1, 1) - torch.Tensor([1000.0]))),
]
xs2 = torch.cat((torch.rand(50, 1), torch.rand(1, 1) + torch.Tensor([10000.]), torch.Tensor([-1000.]).unsqueeze(1)))
xs2 = torch.cat(
(
torch.rand(50, 1),
torch.rand(1, 1) + torch.Tensor([10000.0]),
torch.Tensor([-1000.0]).unsqueeze(1),
)
)
for xs in xss:
for normalize in ["scale", "range", None]:
lim = 1e-3
@@ -39,8 +45,8 @@ def test_cont_norm():
def test_cont_with_nan():
xss = [
torch.cat((torch.rand(50, 1), torch.Tensor(np.array([np.nan]).reshape(1, 1)))),
torch.Tensor([1., 2., -1., 1000, np.nan]),
torch.Tensor([1., np.nan, -1., 1000, np.nan])
torch.Tensor([1.0, 2.0, -1.0, 1000, np.nan]),
torch.Tensor([1.0, np.nan, -1.0, 1000, np.nan]),
]
for xs in xss:
for normalize in ["scale", "range", None]:
@@ -56,8 +62,8 @@ def test_cont_with_nan():

def test_cont_with_log():
xss = [
- torch.log(torch.rand(150, 1)),
torch.float_power(torch.rand(150, 1) * 2, 10.)
-torch.log(torch.rand(150, 1)),
torch.float_power(torch.rand(150, 1) * 2, 10.0),
]
for xs in xss:
for normalize in ["scale", "range", None]:
@@ -74,8 +80,8 @@ def test_cont_with_log():
def test_cont_norm_pd():
nrows = 1000000
file = cfg("lcbench").get_path("dataset")
df2 = pd.read_csv(file, nrows=nrows).sample(frac=.01)
df = pd.read_csv(file, nrows=nrows).sample(frac=.3)
df2 = pd.read_csv(file, nrows=nrows).sample(frac=0.01)
df = pd.read_csv(file, nrows=nrows).sample(frac=0.3)
for nm in df.columns[1:]:
lim = 1e-3
for normalize in ["scale", "range", None]:
@@ -93,7 +99,7 @@ def test_cont_norm_pd():
assert xsn.shape == xs.shape


if __name__ == '__main__':
if __name__ == "__main__":
test_cont_norm()
test_cont_with_nan()
test_cont_with_log()
36 changes: 36 additions & 0 deletions yahpo_train/attic/tests/test_cont_scalers.py
@@ -0,0 +1,36 @@
import numpy as np
import pytest
import torch

from yahpo_train.cont_scalers import *


def test_cont_scaler():
def cont_scaler_helper(x, transformer, eps=1e-5):
cr = transformer(x)
x_tf = cr(x)
x_rec = cr.invert(x_tf)
assert torch.max(torch.abs(x_rec - x)).numpy() < eps

x = torch.rand(100)

for transformer in [
ContTransformerNone,
ContTransformerRange,
ContTransformerNegExp,
ContTransformerLog,
ContTransformerMultScalar,
]:
cont_scaler_helper(x, transformer)

for transformer in [
ContTransformerNone,
ContTransformerRange,
ContTransformerNegExp,
ContTransformerMultScalar,
]:
cont_scaler_helper(-x, transformer)


if __name__ == "__main__":
test_cont_scaler()
@@ -1,6 +1,6 @@
import onnxruntime as rt
from onnxruntime.datasets import get_example

if __name__ == '__main__':
if __name__ == "__main__":
file = "/home/flo/lrz_synchshare/multifidelity_data/lcbench/model.onnx"
rt.InferenceSession(file)
rt.InferenceSession(file)
