diff --git a/glasses/models/classification/deit/__init__.py b/glasses/models/classification/deit/__init__.py
index e0d2feb9..238f2a20 100755
--- a/glasses/models/classification/deit/__init__.py
+++ b/glasses/models/classification/deit/__init__.py
@@ -69,7 +69,7 @@ def __init__(
         *args,
         head: nn.Module = DeiTClassificationHead,
         tokens: nn.Module = DeiTTokens,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(*args, head=head, tokens=tokens, **kwargs)
 
@@ -105,6 +105,6 @@ def deit_base_patch16_384(cls, **kwargs):
             depth=12,
             num_heads=12,
             qkv_bias=True,
-            **kwargs
+            **kwargs,
         )
         return model
diff --git a/glasses/models/classification/densenet/__init__.py b/glasses/models/classification/densenet/__init__.py
index 426b4427..3a37c4c2 100755
--- a/glasses/models/classification/densenet/__init__.py
+++ b/glasses/models/classification/densenet/__init__.py
@@ -29,7 +29,7 @@ def __init__(
         out_features: int,
         activation: nn.Module = ReLUInPlace,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         self.block = nn.Sequential(
@@ -72,7 +72,7 @@ def __init__(
         out_features: int,
         activation: nn.Module = ReLUInPlace,
         expansion: int = 4,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(in_features, out_features, activation, **kwargs)
         self.expansion = expansion
@@ -88,7 +88,7 @@ def __init__(
                         self.expanded_features,
                         kernel_size=1,
                         bias=False,
-                        **kwargs
+                        **kwargs,
                     ),
                     "bn2": nn.BatchNorm2d(self.expanded_features),
                     "act2": activation(),
@@ -97,7 +97,7 @@ def __init__(
                         out_features,
                         kernel_size=3,
                         bias=False,
-                        **kwargs
+                        **kwargs,
                     ),
                 }
             )
@@ -154,7 +154,7 @@ def __init__(
         block: nn.Module = DenseBottleNeckBlock,
         transition_block: nn.Module = TransitionBlock,
         *args,
-        **kwargs
+        **kwargs,
     ):
         self.out_features = grow_rate * n + in_features
         super().__init__(
@@ -166,7 +166,7 @@ def __init__(
             # reduce the output features by a factor of 2
             transition_block(self.out_features, *args, **kwargs)
             if transition_block
-            else nn.Identity()
+            else nn.Identity(),
         )
 
 
@@ -191,7 +191,7 @@ def __init__(
         activation: nn.Module = ReLUInPlace,
         block: nn.Module = DenseBottleNeckBlock,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(in_channels)
         self.layers = nn.ModuleList([])
@@ -223,7 +223,7 @@ def __init__(
                 transition_block=lambda x: nn.Sequential(
                     nn.BatchNorm2d(self.widths[-1]), activation()
                 ),
-                **kwargs
+                **kwargs,
             )
         )
 
@@ -275,7 +275,7 @@ def __init__(
         encoder: nn.Module = DenseNetEncoder,
         head: nn.Module = ResNetHead,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(encoder, head, *args, **kwargs)
 
diff --git a/glasses/models/classification/efficientnet/__init__.py b/glasses/models/classification/efficientnet/__init__.py
index c671c204..bc178d4a 100755
--- a/glasses/models/classification/efficientnet/__init__.py
+++ b/glasses/models/classification/efficientnet/__init__.py
@@ -44,7 +44,7 @@ def __init__(
         drop_rate: float = 0.2,
         se: bool = True,
         kernel_size: int = 3,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -138,7 +138,7 @@ def __init__(
         drop_rate: float = 0.2,
         stem: nn.Module = EfficientNetStem,
         activation: nn.Module = partial(nn.SiLU, inplace=True),
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -314,7 +314,7 @@ def __init__(
         encoder: nn.Module = EfficientNetEncoder,
         head: nn.Module = EfficientNetHead,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             encoder, partial(head, drop_rate=kwargs["drop_rate"]), *args, **kwargs
diff --git a/glasses/models/classification/fishnet/__init__.py b/glasses/models/classification/fishnet/__init__.py
index aff4ac00..6993f186 100755
--- a/glasses/models/classification/fishnet/__init__.py
+++ b/glasses/models/classification/fishnet/__init__.py
@@ -65,7 +65,7 @@ def __init__(
         depth: int = 1,
         trans_depth: int = 1,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -79,13 +79,13 @@ def __init__(
                 out_features,
                 shortcut=FishNetChannelReductionShortcut,
                 *args,
-                **kwargs
+                **kwargs,
             ),
             *[
                 block(out_features, out_features, *args, **kwargs)
                 for _ in range(depth - 1)
             ],
-            nn.Upsample(scale_factor=2)
+            nn.Upsample(scale_factor=2),
         )
 
     def forward(self, x: Tensor, res: Tensor) -> Tensor:
@@ -115,7 +115,7 @@ def __init__(
         block: nn.Module = FishNetBottleNeck,
         depth: int = 1,
         trans_depth: int = 1,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             in_features,
@@ -124,13 +124,13 @@ def __init__(
             block,
             depth,
             trans_depth,
-            **kwargs
+            **kwargs,
         )
 
         self.block = nn.Sequential(
             block(in_features, out_features, shortcut=ResNetShorcut, **kwargs),
             *[block(out_features, out_features, **kwargs) for _ in range(depth - 1)],
-            nn.MaxPool2d(kernel_size=2, stride=2)
+            nn.MaxPool2d(kernel_size=2, stride=2),
         )
 
 
@@ -170,7 +170,7 @@ def __init__(
             *[
                 FishNetBottleNeck(out_features, out_features, activation=activation)
                 for _ in range(depth - 1)
-            ]
+            ],
         )
         # very wrong SE implementation and application -> I have contacted the authors and he confirmed they got it wrong.
         self.att = nn.Sequential(
@@ -208,12 +208,12 @@ def __init__(
         depth: int = 1,
         block: nn.Module = FishNetBottleNeck,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             block(in_features, out_features, **kwargs),
             *[block(out_features, out_features, **kwargs) for _ in range(depth - 1)],
-            nn.MaxPool2d(kernel_size=2, stride=2)
+            nn.MaxPool2d(kernel_size=2, stride=2),
         )
 
 
@@ -251,7 +251,7 @@ def __init__(
         block: nn.Module = FishNetBottleNeck,
         stem: nn.Module = ResNetStemC,
         activation: nn.Module = ReLUInPlace,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -269,7 +269,7 @@ def __init__(
                     depth=depth,
                     block=block,
                     activation=activation,
-                    **kwargs
+                    **kwargs,
                 )
                 for (in_features, out_features), depth in zip(
                     self.tail_widths, tail_depths
@@ -301,8 +301,8 @@ def __init__(
                     trans_depth=trans_depth,
                     block=block,
                     activation=activation,
-                    dilation=2 ** i,
-                    padding=2 ** i,
+                    dilation=2**i,
+                    padding=2**i,
                 )
             )
 
@@ -444,7 +444,7 @@ def __init__(
         encoder: nn.Module = FishNetEncoder,
         head: nn.Module = FishNetHead,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(encoder, head, *args, **kwargs)
     @classmethod
@@ -483,7 +483,7 @@ def fishnet99(self, *args, **kwargs) -> FishNet:
             body_trans_depths=body_trans_depths,
             head_depths=head_depths,
             head_trans_depths=head_trans_depths,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -513,5 +513,5 @@ def fishnet150(self, *args, **kwargs) -> FishNet:
             body_trans_depths=body_trans_depths,
             head_depths=head_depths,
             head_trans_depths=head_trans_depths,
-            **kwargs
+            **kwargs,
         )
diff --git a/glasses/models/classification/mobilenet/__init__.py b/glasses/models/classification/mobilenet/__init__.py
index bd06c1fd..01a3eaba 100755
--- a/glasses/models/classification/mobilenet/__init__.py
+++ b/glasses/models/classification/mobilenet/__init__.py
@@ -29,5 +29,5 @@ def mobilenet_v2(cls, *args, **kwargs) -> EfficientNet:
             drop_rate=0,
             activation=nn.ReLU6,
             *args,
-            **kwargs
+            **kwargs,
         )
diff --git a/glasses/models/classification/regnet/__init__.py b/glasses/models/classification/regnet/__init__.py
index 24c9ac13..a576db21 100755
--- a/glasses/models/classification/regnet/__init__.py
+++ b/glasses/models/classification/regnet/__init__.py
@@ -61,7 +61,7 @@ def __init__(
             out_features,
             reduction=1,
             groups=out_features // groups_width,
-            **kwargs
+            **kwargs,
         )
 
 
@@ -291,7 +291,7 @@ def regnety_002(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -303,7 +303,7 @@ def regnety_004(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -315,7 +315,7 @@ def regnety_006(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -327,7 +327,7 @@ def regnety_008(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -339,7 +339,7 @@ def regnety_016(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -351,7 +351,7 @@ def regnety_032(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -363,7 +363,7 @@ def regnety_040(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -375,7 +375,7 @@ def regnety_064(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -387,7 +387,7 @@ def regnety_080(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -399,7 +399,7 @@ def regnety_120(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -411,7 +411,7 @@ def regnety_160(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -423,5 +423,5 @@ def regnety_320(cls, *args, **kwargs):
             widths=widths,
             block=RegNetYBotteneckBlock,
             groups_width=groups_width,
-            **kwargs
+            **kwargs,
         )
diff --git a/glasses/models/classification/resnest/__init__.py b/glasses/models/classification/resnest/__init__.py
index 890433e7..4dae2de7 100755
--- a/glasses/models/classification/resnest/__init__.py
+++ b/glasses/models/classification/resnest/__init__.py
@@ -72,7 +72,7 @@ def __init__(
         reduction: int = 4,
         activation: nn.Module = ReLUInPlace,
         drop_block_p: float = 0,
-        **kwargs
+        **kwargs,
     ):
         """Implementation of ResNeSt Bottleneck Block proposed in proposed in `"ResNeSt: Split-Attention Networks" <https://arxiv.org/abs/2004.08955>`_.
         It subclasses `ResNetXtBottleNeckBlock` to use the inner features calculation based on the reduction and groups widths.
@@ -100,7 +100,7 @@ def __init__(
             activation=activation,
             stride=stride,
             groups=groups,
-            **kwargs
+            **kwargs,
         )
         att_features = max(self.features * radix // reduction, 32)
         pool = (
@@ -149,7 +149,7 @@ def __init__(
         block: nn.Module = ResNeStBottleneckBlock,
         downsample_first: bool = False,
         drop_block_p: float = 0.2,
-        **kwargs
+        **kwargs,
     ):
 
         super().__init__(
@@ -161,7 +161,7 @@ def __init__(
             activation=activation,
             block=block,
             downsample_first=downsample_first,
-            **kwargs
+            **kwargs,
         )
 
         self.layers = nn.ModuleList(
@@ -173,7 +173,7 @@ def __init__(
                     activation=activation,
                     block=block,
                     stride=2 if downsample_first else 1,
-                    **kwargs
+                    **kwargs,
                 ),
                 *[
                     ResNetLayer(
@@ -184,7 +184,7 @@ def __init__(
                         block=block,
                         # add drop block in the last two stages
                         drop_block_p=0 if i < 1 else drop_block_p,
-                        **kwargs
+                        **kwargs,
                     )
                     for i, ((in_features, out_features), n) in enumerate(
                         zip(self.in_out_widths, depths[1:])
@@ -252,7 +252,7 @@ def resnest14d(cls, *args, **kwargs) -> ResNeSt:
             widths=[256, 512, 1024, 2048],
             depths=[1, 1, 1, 1],
             base_width=64,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -281,7 +281,7 @@ def resnest50d_1s4x24d(cls, *args, **kwargs) -> ResNeSt:
             radix=1,
             groups=4,
             base_width=24,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -292,7 +292,7 @@ def resnest50d_4s2x40d(cls, *args, **kwargs) -> ResNeSt:
             radix=4,
             groups=2,
             base_width=40,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -303,7 +303,7 @@ def resnest101e(cls, *args, **kwargs) -> ResNeSt:
             start_features=128,
             block=ResNeStBottleneckBlock,
             base_width=64,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -314,7 +314,7 @@ def resnest200e(cls, *args, **kwargs) -> ResNeSt:
             start_features=128,
             block=ResNeStBottleneckBlock,
             base_width=64,
-            **kwargs
+            **kwargs,
         )
 
     @classmethod
@@ -326,5 +326,5 @@ def resnest269e(cls, *args, **kwargs) -> ResNeSt:
             block=ResNeStBottleneckBlock,
             depths=[3, 30, 48, 8],
             base_width=64,
-            **kwargs
+            **kwargs,
         )
diff --git a/glasses/models/classification/resnetxt/__init__.py b/glasses/models/classification/resnetxt/__init__.py
index 8abb6dc4..1f36526d 100755
--- a/glasses/models/classification/resnetxt/__init__.py
+++ b/glasses/models/classification/resnetxt/__init__.py
@@ -15,7 +15,7 @@ def __init__(
         groups: int = 32,
         base_width: int = 4,
         reduction: int = 4,
-        **kwargs
+        **kwargs,
    ):
         """Basic ResNetXt block build on top of ResNetBottleneckBlock.
         It uses `base_width` to compute the inner features of the 3x3 conv.
@@ -33,7 +33,7 @@ def __init__(
             features=self.features,
             groups=groups,
             reduction=reduction,
-            **kwargs
+            **kwargs,
         )
 
 
diff --git a/glasses/models/classification/vgg/__init__.py b/glasses/models/classification/vgg/__init__.py
index a226f6f2..a4125841 100755
--- a/glasses/models/classification/vgg/__init__.py
+++ b/glasses/models/classification/vgg/__init__.py
@@ -34,7 +34,7 @@ def __init__(
         block: nn.Module = VGGBasicBlock,
         pool: nn.Module = nn.MaxPool2d,
         depth: int = 1,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             block(in_features, out_features, **kwargs),
@@ -60,7 +60,7 @@ def __init__(
         widths: List[int] = [64, 128, 256, 512, 512],
         depths: List[int] = [1, 1, 2, 2, 2],
         block: nn.Module = VGGBasicBlock,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
diff --git a/glasses/models/classification/wide_resnet/__init__.py b/glasses/models/classification/wide_resnet/__init__.py
index 2aae4b2e..a7766c21 100755
--- a/glasses/models/classification/wide_resnet/__init__.py
+++ b/glasses/models/classification/wide_resnet/__init__.py
@@ -21,7 +21,7 @@ def __init__(
         out_features: int,
         width_factor: int = 2,
         reduction: int = 4,
-        **kwargs
+        **kwargs,
     ):
 
         features = int(out_features * width_factor // reduction)
diff --git a/glasses/models/segmentation/unet/__init__.py b/glasses/models/segmentation/unet/__init__.py
index ecfcd101..6d017350 100755
--- a/glasses/models/segmentation/unet/__init__.py
+++ b/glasses/models/segmentation/unet/__init__.py
@@ -18,7 +18,7 @@ def __init__(
         out_features: int,
         activation: nn.Module = partial(nn.ReLU, inplace=True),
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(
             ConvBnAct(
@@ -27,7 +27,7 @@ def __init__(
                 kernel_size=3,
                 activation=activation,
                 *args,
-                **kwargs
+                **kwargs,
             ),
             ConvBnAct(
                 out_features,
@@ -35,7 +35,7 @@ def __init__(
                 kernel_size=3,
                 activation=activation,
                 *args,
-                **kwargs
+                **kwargs,
             ),
         )
 
@@ -62,7 +62,7 @@ def __init__(
         donwsample: bool = True,
         block: nn.Module = DownBlock,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
 
@@ -93,7 +93,7 @@ def __init__(
         lateral_features: int = None,
         block: nn.Module = UpBlock,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         lateral_features = (
@@ -122,7 +122,7 @@ def __init__(
         in_channels: int,
         widths: List[int] = [64, 128, 256, 512, 1024],
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         self.in_out_widths = list(zip(widths, widths[1:]))
@@ -157,7 +157,7 @@ def __init__(
         widths: List[int] = [256, 128, 64, 32],
         lateral_widths: List[int] = None,
         *args,
-        **kwargs
+        **kwargs,
     ):
         super().__init__()
         widths = [start_features, *widths]
@@ -223,7 +223,7 @@ def __init__(
         n_classes: int = 2,
         encoder: Encoder = UNetEncoder,
         decoder: nn.Module = UNetDecoder,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(in_channels, n_classes, encoder, decoder, **kwargs)
 
diff --git a/glasses/nn/__init__.py b/glasses/nn/__init__.py
index 2915af79..ec2231e9 100755
--- a/glasses/nn/__init__.py
+++ b/glasses/nn/__init__.py
@@ -1,12 +1,12 @@
 from .att import ChannelSE, ECA, SpatialChannelSE, SpatialSE, CBAM
 from .blocks import BnActConv, Conv2dPad, ConvAct, ConvBn, ConvBnAct, Lambda
 from .pool import SpatialPyramidPool
-from .regularization import DropBlock, StochasticDepth
+from .regularization import DropBlock, DropPath
 
 __all__ = [
     "ConvBnAct",
     "Conv2dPad",
     "SpatialPyramidPool",
     "DropBlock",
-    "StochasticDepth",
+    "DropPath",
 ]
diff --git a/glasses/nn/regularization/__init__.py b/glasses/nn/regularization/__init__.py
index 4d605849..25187b00 100755
--- a/glasses/nn/regularization/__init__.py
+++ b/glasses/nn/regularization/__init__.py
@@ -1,3 +1,4 @@
+from typing import Tuple
 import torch
 import torch.nn.functional as F
 from torch import nn
@@ -23,7 +24,7 @@ def __init__(self, block_size: int = 7, p: float = 0.5):
         decreasing `keep_prob` over time from 1 to the target value is more robust and adds improvement for the most values of `keep_prob`. In
         our experiments, we use a linear scheme of decreasing the value of `keep_prob`, which tends to work well across many hyperparameter
         settings. This linear scheme is
-        similar to ScheduledDropPath.
+        similar to ScheduledStochasticDepth.
 
         `keep_prob` is `p` in our implementation.
 
@@ -47,7 +48,7 @@ def calculate_gamma(self, x: Tensor) -> float:
         return (
             self.p
             * x.shape[-1] ** 2
-            / (self.block_size ** 2 * (x.shape[-1] - self.block_size + 1) ** 2)
+            / (self.block_size**2 * (x.shape[-1] - self.block_size + 1) ** 2)
         )
 
     def forward(self, x: Tensor) -> Tensor:
@@ -69,8 +70,22 @@ def __repr__(self):
         return f"{self.__class__.__name__}(p={self.p})"
 
 
-class StochasticDepth(nn.Module):
-    """Implementation of Stochastic Depth proposed in `Deep Networks with Stochastic Depth <https://arxiv.org/abs/1603.09382>`_.
+def drop_path(x: Tensor, keep_prob: float = 1.0, inplace: bool = False) -> Tensor:
+    mask_shape: Tuple[int, ...] = (x.shape[0],) + (1,) * (x.ndim - 1)
+    # remember tuples have the * operator -> (1,) * 3 = (1,1,1)
+    mask: Tensor = x.new_empty(mask_shape).bernoulli_(keep_prob)
+    # we divide to scale the input activations
+    # https://wandb.ai/wandb_fc/pytorch-image-models/reports/Revisiting-ResNets-Improved-Training-and-Scaling-Strategies--Vmlldzo2NDE3NTM
+    mask.div_(keep_prob)
+    if inplace:
+        x.mul_(mask)
+    else:
+        x = x * mask
+    return x
+
+
+class DropPath(nn.Module):
+    """Implementation of Drop Path / Stochastic Depth proposed in `Deep Networks with Stochastic Depth <https://arxiv.org/abs/1603.09382>`_.
 
     The main idea is to skip one layer completely.
 
@@ -79,16 +94,14 @@ class StochasticDepth(nn.Module):
 
     """
 
-    def __init__(self, p: float = 0.5):
+    def __init__(self, p: float = 0.5, inplace: bool = False):
         super().__init__()
         self.p = p
+        self.inplace = inplace
 
     def forward(self, x: Tensor) -> Tensor:
         if self.training and self.p > 0:
-            probs = torch.rand(x.shape[0], 1, 1, 1, device=x.device) < self.p
-            # we divide to scale the input activations
-            # https://wandb.ai/wandb_fc/pytorch-image-models/reports/Revisiting-ResNets-Improved-Training-and-Scaling-Strategies--Vmlldzo2NDE3NTM
-            x = x.div_(self.p).mul_(probs)
+            x = drop_path(x, self.p, self.inplace)
         return x
 
     def __repr__(self):
diff --git a/test/test_auto.py b/test/test_auto.py
index adb5fcff..0bcbae72 100755
--- a/test/test_auto.py
+++ b/test/test_auto.py
@@ -49,6 +49,13 @@ def test_AutoModel_from_pretrained(caplog):
     assert "Error(s) in loading state_dict for ResNet:" in caplog.records[1].msg
 
 
+def test_AutoModel_from_pretrained_n_classes(caplog):
+    AutoModel.from_pretrained("resnet18")
+    AutoModel.from_pretrained("resnet18", n_classes=2)
+
+    assert "Error(s) in loading state_dict for ResNet:" in caplog.records[1].msg
+
+
 def test_AutoTransform():
     cfg = AutoTransform.from_name("resnet18")
     assert isinstance(cfg, Transform)
diff --git a/test/test_regularization.py b/test/test_regularization.py
index e239b2ed..0a03865b 100755
--- a/test/test_regularization.py
+++ b/test/test_regularization.py
@@ -1,5 +1,5 @@
 import torch
-from glasses.nn.regularization import DropBlock, StochasticDepth
+from glasses.nn.regularization import DropBlock, DropPath
 
 
 def test_drop_block():
@@ -18,16 +18,16 @@ def test_drop_block():
     assert drop.__repr__() == "DropBlock(p=0.5)"
 
 
-def test_stocastic_depth():
-    stoc = StochasticDepth()
-    assert stoc.__repr__() == "StochasticDepth(p=0.5)"
+def test_drop_path():
+    stoc = DropPath()
+    assert stoc.__repr__() == "DropPath(p=0.5)"
 
     x = torch.ones((2, 3, 28, 28))
-    stoc = StochasticDepth(p=1)
+    stoc = DropPath(p=1)
     out = stoc(x)
 
     assert out.sum() > 0
 
-    stoc = StochasticDepth(p=10e-6)
+    stoc = DropPath(p=10e-6)
     out = stoc(x)
 
     assert out.sum() == 0
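
A quick usage sketch of the renamed module follows (not part of the diff; the input shape and the value of p are arbitrary). It assumes the import path shown above, glasses.nn.regularization, and relies on the convention used by drop_path that p is the probability of keeping a sample's path.

import torch
from glasses.nn.regularization import DropPath

x = torch.ones((4, 64, 8, 8))
drop = DropPath(p=0.9)

drop.train()
out = drop(x)  # whole samples are zeroed at random; the kept ones are scaled by 1 / 0.9

drop.eval()
assert torch.equal(drop(x), x)  # in eval mode DropPath is the identity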