diff --git a/glasses/models/classification/deit/__init__.py b/glasses/models/classification/deit/__init__.py
index e0d2feb9..238f2a20 100755
--- a/glasses/models/classification/deit/__init__.py
+++ b/glasses/models/classification/deit/__init__.py
@@ -69,7 +69,7 @@ def __init__(
         *args,
         head: nn.Module = DeiTClassificationHead,
         tokens: nn.Module = DeiTTokens,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(*args, head=head, tokens=tokens, **kwargs)
 
@@ -105,6 +105,6 @@ def deit_base_patch16_384(cls, **kwargs):
             depth=12,
             num_heads=12,
             qkv_bias=True,
-            **kwargs
+            **kwargs,
         )
         return model
diff --git a/glasses/models/classification/densenet/__init__.py b/glasses/models/classification/densenet/__init__.py
index 426b4427..3a37c4c2 100755
--- a/glasses/models/classification/densenet/__init__.py
+++ b/glasses/models/classification/densenet/__init__.py
@@ -29,7 +29,7 @@ def __init__(
         out_features: int,
         activation: nn.Module = ReLUInPlace,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         self.block = nn.Sequential(
@@ -72,7 +72,7 @@ def __init__(
         out_features: int,
         activation: nn.Module = ReLUInPlace,
         expansion: int = 4,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(in_features, out_features, activation, **kwargs)
         self.expansion = expansion
@@ -88,7 +88,7 @@ def __init__(
                         self.expanded_features,
                         kernel_size=1,
                         bias=False,
-                        **kwargs
+                        **kwargs,
                     ),
                     "bn2": nn.BatchNorm2d(self.expanded_features),
                     "act2": activation(),
@@ -97,7 +97,7 @@ def __init__(
                         out_features,
                         kernel_size=3,
                         bias=False,
-                        **kwargs
+                        **kwargs,
                     ),
                 }
             )
@@ -154,7 +154,7 @@ def __init__(
         block: nn.Module = DenseBottleNeckBlock,
         transition_block: nn.Module = TransitionBlock,
         *args,
-        **kwargs
+        **kwargs,
     ):
         self.out_features = grow_rate * n + in_features
         super().__init__(
@@ -166,7 +166,7 @@ def __init__(
             # reduce the output features by a factor of 2
             transition_block(self.out_features, *args, **kwargs)
             if transition_block
-            else nn.Identity()
+            else nn.Identity(),
         )
 
 
@@ -191,7 +191,7 @@ def __init__(
         activation: nn.Module = ReLUInPlace,
         block: nn.Module = DenseBottleNeckBlock,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(in_channels)
         self.layers = nn.ModuleList([])
@@ -223,7 +223,7 @@ def __init__(
                 transition_block=lambda x: nn.Sequential(
                     nn.BatchNorm2d(self.widths[-1]), activation()
                 ),
-                **kwargs
+                **kwargs,
             )
         )
 
@@ -275,7 +275,7 @@ def __init__(
         encoder: nn.Module = DenseNetEncoder,
         head: nn.Module = ResNetHead,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(encoder, head, *args, **kwargs)
 
diff --git a/glasses/models/classification/efficientnet/__init__.py b/glasses/models/classification/efficientnet/__init__.py
index c671c204..bc178d4a 100755
--- a/glasses/models/classification/efficientnet/__init__.py
+++ b/glasses/models/classification/efficientnet/__init__.py
@@ -44,7 +44,7 @@ def __init__(
         drop_rate: float = 0.2,
         se: bool = True,
         kernel_size: int = 3,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -138,7 +138,7 @@ def __init__(
         drop_rate: float = 0.2,
         stem: nn.Module = EfficientNetStem,
         activation: nn.Module = partial(nn.SiLU, inplace=True),
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -314,7 +314,7 @@ def __init__(
         encoder: nn.Module = EfficientNetEncoder,
         head: nn.Module = EfficientNetHead,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             encoder, partial(head, drop_rate=kwargs["drop_rate"]), *args, **kwargs
diff --git a/glasses/models/classification/fishnet/__init__.py b/glasses/models/classification/fishnet/__init__.py
index aff4ac00..6993f186 100755
--- a/glasses/models/classification/fishnet/__init__.py
+++ b/glasses/models/classification/fishnet/__init__.py
@@ -65,7 +65,7 @@ def __init__(
         depth: int = 1,
         trans_depth: int = 1,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -79,13 +79,13 @@ def __init__(
                 out_features,
                 shortcut=FishNetChannelReductionShortcut,
                 *args,
-                **kwargs
+                **kwargs,
             ),
             *[
                 block(out_features, out_features, *args, **kwargs)
                 for _ in range(depth - 1)
             ],
-            nn.Upsample(scale_factor=2)
+            nn.Upsample(scale_factor=2),
         )
 
     def forward(self, x: Tensor, res: Tensor) -> Tensor:
@@ -115,7 +115,7 @@ def __init__(
         block: nn.Module = FishNetBottleNeck,
         depth: int = 1,
         trans_depth: int = 1,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             in_features,
@@ -124,13 +124,13 @@ def __init__(
             block,
             depth,
             trans_depth,
-            **kwargs
+            **kwargs,
         )
 
         self.block = nn.Sequential(
             block(in_features, out_features, shortcut=ResNetShorcut, **kwargs),
             *[block(out_features, out_features, **kwargs) for _ in range(depth - 1)],
-            nn.MaxPool2d(kernel_size=2, stride=2)
+            nn.MaxPool2d(kernel_size=2, stride=2),
         )
 
 
@@ -170,7 +170,7 @@ def __init__(
             *[
                 FishNetBottleNeck(out_features, out_features, activation=activation)
                 for _ in range(depth - 1)
-            ]
+            ],
         )
         # very wrong SE implementation and application -> I have contacted the authors and he confirmed they got it wrong.
         self.att = nn.Sequential(
@@ -208,12 +208,12 @@ def __init__(
         depth: int = 1,
         block: nn.Module = FishNetBottleNeck,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             block(in_features, out_features, **kwargs),
             *[block(out_features, out_features, **kwargs) for _ in range(depth - 1)],
-            nn.MaxPool2d(kernel_size=2, stride=2)
+            nn.MaxPool2d(kernel_size=2, stride=2),
         )
 
 
@@ -251,7 +251,7 @@ def __init__(
         block: nn.Module = FishNetBottleNeck,
         stem: nn.Module = ResNetStemC,
         activation: nn.Module = ReLUInPlace,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -269,7 +269,7 @@ def __init__(
                     depth=depth,
                     block=block,
                     activation=activation,
-                    **kwargs
+                    **kwargs,
                 )
                 for (in_features, out_features), depth in zip(
                     self.tail_widths, tail_depths
@@ -301,8 +301,8 @@ def __init__(
                     trans_depth=trans_depth,
                     block=block,
                     activation=activation,
-                    dilation=2 ** i,
-                    padding=2 ** i,
+                    dilation=2**i,
+                    padding=2**i,
                 )
             )
 
@@ -444,7 +444,7 @@ def __init__(
         encoder: nn.Module = FishNetEncoder,
         head: nn.Module = FishNetHead,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(encoder, head, *args, **kwargs)
     @classmethod
@@ -483,7 +483,7 @@ def fishnet99(self, *args, **kwargs) -> FishNet:
             body_trans_depths=body_trans_depths,
             head_depths=head_depths,
             head_trans_depths=head_trans_depths,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -513,5 +513,5 @@ def fishnet150(self, *args, **kwargs) -> FishNet:
             body_trans_depths=body_trans_depths,
             head_depths=head_depths,
             head_trans_depths=head_trans_depths,
-            **kwargs
+            **kwargs,
         )
diff --git a/glasses/models/classification/mobilenet/__init__.py b/glasses/models/classification/mobilenet/__init__.py
index bd06c1fd..01a3eaba 100755
--- a/glasses/models/classification/mobilenet/__init__.py
+++ b/glasses/models/classification/mobilenet/__init__.py
@@ -29,5 +29,5 @@ def mobilenet_v2(cls, *args, **kwargs) -> EfficientNet:
             drop_rate=0,
             activation=nn.ReLU6,
             *args,
-            **kwargs
+            **kwargs,
         )
diff --git a/glasses/models/classification/regnet/__init__.py b/glasses/models/classification/regnet/__init__.py
index 24c9ac13..a576db21 100755
--- a/glasses/models/classification/regnet/__init__.py
+++ b/glasses/models/classification/regnet/__init__.py
@@ -61,7 +61,7 @@ def __init__(
             out_features,
             reduction=1,
             groups=out_features // groups_width,
-            **kwargs
+            **kwargs,
         )
 
 
@@ -291,7 +291,7 @@ def regnety_002(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -303,7 +303,7 @@ def regnety_004(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -315,7 +315,7 @@ def regnety_006(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -327,7 +327,7 @@ def regnety_008(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -339,7 +339,7 @@ def regnety_016(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -351,7 +351,7 @@ def regnety_032(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -363,7 +363,7 @@ def regnety_040(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -375,7 +375,7 @@ def regnety_064(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -387,7 +387,7 @@ def regnety_080(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -399,7 +399,7 @@ def regnety_120(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -411,7 +411,7 @@ def regnety_160(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -423,5 +423,5 @@ def regnety_320(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
diff --git a/glasses/models/classification/resnest/__init__.py b/glasses/models/classification/resnest/__init__.py
index 890433e7..4dae2de7 100755
--- a/glasses/models/classification/resnest/__init__.py
+++ b/glasses/models/classification/resnest/__init__.py
@@ -72,7 +72,7 @@ def __init__(
         reduction: int = 4,
         activation: nn.Module = ReLUInPlace,
         drop_block_p: float = 0,
-        **kwargs
+        **kwargs,
     ):
         """Implementation of ResNeSt Bottleneck Block proposed in proposed in `"ResNeSt: Split-Attention Networks" <https://arxiv.org/abs/2004.08955>`_.
         It subclasses `ResNetXtBottleNeckBlock` to use the inner features calculation based on the reduction and groups widths.
@@ -100,7 +100,7 @@ def __init__(
             activation=activation,
             stride=stride,
             groups=groups,
-            **kwargs
+            **kwargs,
         )
         att_features = max(self.features * radix // reduction, 32)
         pool = (
@@ -149,7 +149,7 @@ def __init__(
         block: nn.Module = ResNeStBottleneckBlock,
         downsample_first: bool = False,
         drop_block_p: float = 0.2,
-        **kwargs
+        **kwargs,
     ):
 
         super().__init__(
@@ -161,7 +161,7 @@ def __init__(
             activation=activation,
             block=block,
             downsample_first=downsample_first,
-            **kwargs
+            **kwargs,
         )
 
         self.layers = nn.ModuleList(
@@ -173,7 +173,7 @@ def __init__(
                     activation=activation,
                     block=block,
                     stride=2 if downsample_first else 1,
-                    **kwargs
+                    **kwargs,
                 ),
                 *[
                     ResNetLayer(
@@ -184,7 +184,7 @@ def __init__(
                         block=block,
                         # add drop block in the last two stages
                         drop_block_p=0 if i < 1 else drop_block_p,
-                        **kwargs
+                        **kwargs,
                     )
                     for i, ((in_features, out_features), n) in enumerate(
                         zip(self.in_out_widths, depths[1:])
@@ -252,7 +252,7 @@ def resnest14d(cls, *args, **kwargs) -> ResNeSt:
             widths=[256, 512, 1024, 2048],
             depths=[1, 1, 1, 1],
             base_width=64,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -281,7 +281,7 @@ def resnest50d_1s4x24d(cls, *args, **kwargs) -> ResNeSt:
             radix=1,
             groups=4,
             base_width=24,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -292,7 +292,7 @@ def resnest50d_4s2x40d(cls, *args, **kwargs) -> ResNeSt:
             radix=4,
             groups=2,
             base_width=40,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -303,7 +303,7 @@ def resnest101e(cls, *args, **kwargs) -> ResNeSt:
             start_features=128,
             block=ResNeStBottleneckBlock,
             base_width=64,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -314,7 +314,7 @@ def resnest200e(cls, *args, **kwargs) -> ResNeSt:
             start_features=128,
             block=ResNeStBottleneckBlock,
             base_width=64,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -326,5 +326,5 @@ def resnest269e(cls, *args, **kwargs) -> ResNeSt:
             block=ResNeStBottleneckBlock,
             depths=[3, 30, 48, 8],
             base_width=64,
-            **kwargs
+            **kwargs,
         )
diff --git a/glasses/models/classification/resnetxt/__init__.py b/glasses/models/classification/resnetxt/__init__.py
index 8abb6dc4..1f36526d 100755
--- a/glasses/models/classification/resnetxt/__init__.py
+++ b/glasses/models/classification/resnetxt/__init__.py
@@ -15,7 +15,7 @@ def __init__(
         groups: int = 32,
         base_width: int = 4,
         reduction: int = 4,
-        **kwargs
+        **kwargs,
    ):
         """Basic ResNetXt block build on top of ResNetBottleneckBlock.
         It uses `base_width` to compute the inner features of the 3x3 conv.
@@ -33,7 +33,7 @@ def __init__(
             features=self.features,
             groups=groups,
             reduction=reduction,
-            **kwargs
+            **kwargs,
         )
 
 
diff --git a/glasses/models/classification/vgg/__init__.py b/glasses/models/classification/vgg/__init__.py
index a226f6f2..a4125841 100755
--- a/glasses/models/classification/vgg/__init__.py
+++ b/glasses/models/classification/vgg/__init__.py
@@ -34,7 +34,7 @@ def __init__(
         block: nn.Module = VGGBasicBlock,
         pool: nn.Module = nn.MaxPool2d,
         depth: int = 1,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             block(in_features, out_features, **kwargs),
@@ -60,7 +60,7 @@ def __init__(
         widths: List[int] = [64, 128, 256, 512, 512],
         depths: List[int] = [1, 1, 2, 2, 2],
         block: nn.Module = VGGBasicBlock,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
diff --git a/glasses/models/classification/wide_resnet/__init__.py b/glasses/models/classification/wide_resnet/__init__.py
index 2aae4b2e..a7766c21 100755
--- a/glasses/models/classification/wide_resnet/__init__.py
+++ b/glasses/models/classification/wide_resnet/__init__.py
@@ -21,7 +21,7 @@ def __init__(
         out_features: int,
         width_factor: int = 2,
         reduction: int = 4,
-        **kwargs
+        **kwargs,
     ):
 
         features = int(out_features * width_factor // reduction)
diff --git a/glasses/models/segmentation/unet/__init__.py b/glasses/models/segmentation/unet/__init__.py
index ecfcd101..6d017350 100755
--- a/glasses/models/segmentation/unet/__init__.py
+++ b/glasses/models/segmentation/unet/__init__.py
@@ -18,7 +18,7 @@ def __init__(
         out_features: int,
         activation: nn.Module = partial(nn.ReLU, inplace=True),
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             ConvBnAct(
@@ -27,7 +27,7 @@ def __init__(
                 kernel_size=3,
                 activation=activation,
                 *args,
-                **kwargs
+                **kwargs,
             ),
             ConvBnAct(
                 out_features,
@@ -35,7 +35,7 @@ def __init__(
                 kernel_size=3,
                 activation=activation,
                 *args,
-                **kwargs
+                **kwargs,
             ),
         )
 
@@ -62,7 +62,7 @@ def __init__(
         donwsample: bool = True,
         block: nn.Module = DownBlock,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -93,7 +93,7 @@ def __init__(
         lateral_features: int = None,
         block: nn.Module = UpBlock,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         lateral_features = (
@@ -122,7 +122,7 @@ def __init__(
         in_channels: int,
         widths: List[int] = [64, 128, 256, 512, 1024],
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         self.in_out_widths = list(zip(widths, widths[1:]))
@@ -157,7 +157,7 @@ def __init__(
         widths: List[int] = [256, 128, 64, 32],
         lateral_widths: List[int] = None,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         widths = [start_features, *widths]
@@ -223,7 +223,7 @@ def __init__(
         n_classes: int = 2,
         encoder: Encoder = UNetEncoder,
         decoder: nn.Module = UNetDecoder,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(in_channels, n_classes, encoder, decoder, **kwargs)
 
diff --git a/glasses/nn/__init__.py b/glasses/nn/__init__.py
index 2915af79..ec2231e9 100755
--- a/glasses/nn/__init__.py
+++ b/glasses/nn/__init__.py
@@ -1,12 +1,12 @@
 from .att import ChannelSE, ECA, SpatialChannelSE, SpatialSE, CBAM
 from .blocks import BnActConv, Conv2dPad, ConvAct, ConvBn, ConvBnAct, Lambda
 from .pool import SpatialPyramidPool
-from .regularization import DropBlock, StochasticDepth
+from .regularization import DropBlock, DropPath
 
 __all__ = [
     "ConvBnAct",
     "Conv2dPad",
     "SpatialPyramidPool",
     "DropBlock",
-    "StochasticDepth",
+    "DropPath",
 ]
diff --git a/glasses/nn/regularization/__init__.py b/glasses/nn/regularization/__init__.py
index 4d605849..25187b00 100755
--- a/glasses/nn/regularization/__init__.py
+++ b/glasses/nn/regularization/__init__.py
@@ -1,3 +1,4 @@
+from typing import Tuple
 import torch
 import torch.nn.functional as F
 from torch import nn
@@ -23,7 +24,7 @@ def __init__(self, block_size: int = 7, p: float = 0.5):
         decreasing `keep_prob` over time from 1 to the target value is more robust and adds improvement for the most values of `keep_prob`. In
         our experiments, we use a linear scheme of decreasing the value of `keep_prob`, which tends to work well across many hyperparameter
         settings. This linear scheme is
-        similar to ScheduledDropPath.
+        similar to ScheduledStochasticDepth.
 
         `keep_prob` is `p` in our implementation.
 
@@ -47,7 +48,7 @@ def calculate_gamma(self, x: Tensor) -> float:
         return (
             self.p
             * x.shape[-1] ** 2
-            / (self.block_size ** 2 * (x.shape[-1] - self.block_size + 1) ** 2)
+            / (self.block_size**2 * (x.shape[-1] - self.block_size + 1) ** 2)
         )
 
     def forward(self, x: Tensor) -> Tensor:
@@ -69,8 +70,22 @@ def __repr__(self):
         return f"{self.__class__.__name__}(p={self.p})"
 
 
-class StochasticDepth(nn.Module):
-    """Implementation of Stochastic Depth proposed in `Deep Networks with Stochastic Depth <https://arxiv.org/abs/1603.09382>`_.
+def drop_path(x: Tensor, keep_prob: float = 1.0, inplace: bool = False) -> Tensor:
+    mask_shape: Tuple[int, ...] = (x.shape[0],) + (1,) * (x.ndim - 1)
+    # remember tuples have the * operator -> (1,) * 3 = (1,1,1)
+    mask: Tensor = x.new_empty(mask_shape).bernoulli_(keep_prob)
+    # we divide to scale the input activations
+    # https://wandb.ai/wandb_fc/pytorch-image-models/reports/Revisiting-ResNets-Improved-Training-and-Scaling-Strategies--Vmlldzo2NDE3NTM
+    mask.div_(keep_prob)
+    if inplace:
+        x.mul_(mask)
+    else:
+        x = x * mask
+    return x
+
+
+class DropPath(nn.Module):
+    """Implementation of Drop Path / Stochastic Depth proposed in `Deep Networks with Stochastic Depth <https://arxiv.org/abs/1603.09382>`_.
 
     The main idea is to skip one layer completely.
 
@@ -79,16 +94,14 @@ class StochasticDepth(nn.Module):
 
     """
 
-    def __init__(self, p: float = 0.5):
+    def __init__(self, p: float = 0.5, inplace: bool = False):
         super().__init__()
         self.p = p
+        self.inplace = inplace
 
     def forward(self, x: Tensor) -> Tensor:
         if self.training and self.p > 0:
-            probs = torch.rand(x.shape[0], 1, 1, 1, device=x.device) < self.p
-            # we divide to scale the input activations
-            # https://wandb.ai/wandb_fc/pytorch-image-models/reports/Revisiting-ResNets-Improved-Training-and-Scaling-Strategies--Vmlldzo2NDE3NTM
-            x = x.div_(self.p).mul_(probs)
+            x = drop_path(x, self.p, self.inplace)
         return x
 
     def __repr__(self):
diff --git a/test/test_auto.py b/test/test_auto.py
index adb5fcff..0bcbae72 100755
--- a/test/test_auto.py
+++ b/test/test_auto.py
@@ -49,6 +49,13 @@ def test_AutoModel_from_pretrained(caplog):
     assert "Error(s) in loading state_dict for ResNet:" in caplog.records[1].msg
 
 
+def test_AutoModel_from_pretrained_n_classes(caplog):
+    AutoModel.from_pretrained("resnet18")
+    AutoModel.from_pretrained("resnet18", n_classes=2)
+
+    assert "Error(s) in loading state_dict for ResNet:" in caplog.records[1].msg
+
+
 def test_AutoTransform():
     cfg = AutoTransform.from_name("resnet18")
     assert isinstance(cfg, Transform)
diff --git a/test/test_regularization.py b/test/test_regularization.py
index e239b2ed..0a03865b 100755
--- a/test/test_regularization.py
+++ b/test/test_regularization.py
@@ -1,5 +1,5 @@
 import torch
-from glasses.nn.regularization import DropBlock, StochasticDepth
+from glasses.nn.regularization import DropBlock, DropPath
 
 
 def test_drop_block():
@@ -18,16 +18,16 @@ def test_drop_block():
     assert drop.__repr__() == "DropBlock(p=0.5)"
 
 
-def test_stocastic_depth():
-    stoc = StochasticDepth()
-    assert stoc.__repr__() == "StochasticDepth(p=0.5)"
+def test_drop_path():
+    stoc = DropPath()
+    assert stoc.__repr__() == "DropPath(p=0.5)"
 
     x = torch.ones((2, 3, 28, 28))
-    stoc = StochasticDepth(p=1)
+    stoc = DropPath(p=1)
     out = stoc(x)
 
     assert out.sum() > 0
 
-    stoc = StochasticDepth(p=10e-6)
+    stoc = DropPath(p=10e-6)
     out = stoc(x)
 
     assert out.sum() == 0
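
A quick usage sketch of the renamed module follows (not part of the diff; the input shape and the value of p are arbitrary). It assumes the import path shown above, glasses.nn.regularization, and relies on the convention used by drop_path that p is the probability of keeping a sample's path.

import torch
from glasses.nn.regularization import DropPath

x = torch.ones((4, 64, 8, 8))
drop = DropPath(p=0.9)

drop.train()
out = drop(x)  # whole samples are zeroed at random; the kept ones are scaled by 1 / 0.9

drop.eval()
assert torch.equal(drop(x), x)  # in eval mode DropPath is the identity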