
Commit cf9a6e7

Merge pull request #59 from bonlime/dev
Bunch of fixes for hrnet
2 parents 5f73e3e + 843a03f commit cf9a6e7

File tree

8 files changed: +131 -18 lines changed


pytorch_tools/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = "0.1.3"
+__version__ = "0.1.4"
 
 from . import fit_wrapper
 from . import losses

pytorch_tools/models/hrnet.py

Lines changed: 3 additions & 5 deletions
@@ -185,11 +185,9 @@ def __init__(self, pre_channels, norm_layer=ABN, norm_act="relu"):
 
     def forward(self, x):
        x = [self.incre_modules[i](x[i]) for i in range(4)]
-        y = x[0]
        for i in range(1, 4):
-            y = x[i] + self.downsamp_modules[i-1](y)
-        y = self.final_layer(y)
-        return y
+            x[i] = x[i] + self.downsamp_modules[i-1](x[i-1])
+        return self.final_layer(x[3])
 
 
 class HighResolutionNet(nn.Module):
@@ -359,7 +357,7 @@ def load_state_dict(self, state_dict, **kwargs):
    },
    "hrnet_w44": {
        "default": {"params": {"width": 44}, **DEFAULT_IMAGENET_SETTINGS,},
-        "imagenet": {"url": None},
+        "imagenet": {"url": "https://github.com/bonlime/pytorch-tools/releases/download/v0.1.2/hrnetv2_w44_imagenet_pretrained-8c55086c.pth"},
    },
    "hrnet_w48": {
        "default": {"params": {"width": 48}, **DEFAULT_IMAGENET_SETTINGS,},

pytorch_tools/modules/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -20,6 +20,7 @@
 from .activations import Mish, MishNaive, Swish, SwishNaive
 
 from .activated_batch_norm import ABN
+from .activated_group_norm import AGN
 from inplace_abn import InPlaceABN, InPlaceABNSync
 
 def bn_from_name(norm_name):
@@ -32,5 +33,7 @@ def bn_from_name(norm_name):
        return InPlaceABNSync
    elif norm_name in ("frozen_abn", "frozenabn"):
        return partial(ABN, frozen=True)
+    elif norm_name in ("agn", "groupnorm", "group_norm"):
+        return AGN
    else:
        raise ValueError(f"Normalization {norm_name} not supported")
pytorch_tools/modules/activated_group_norm.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import init
+from torch.nn.parameter import Parameter
+
+from .activations import ACT
+from .activations import ACT_FUNC_DICT
+
+class AGN(nn.Module):
+    """Activated Group Normalization
+    This gathers a GroupNorm and an activation function in a single module
+    Parameters
+    ----------
+    num_features : int
+        Number of feature channels in the input and output.
+    num_groups : int
+        Number of groups to separate the channels into.
+    eps : float
+        Small constant to prevent numerical issues.
+    affine : bool
+        If `True`, apply a learned scale and shift transformation after normalization.
+    activation : str
+        Name of the activation function, one of: `relu`, `leaky_relu`, `elu` or `identity`.
+    activation_param : float
+        Negative slope for the `leaky_relu` activation.
+    """
+
+    def __init__(
+        self,
+        num_features,
+        num_groups=32,
+        eps=1e-5,
+        affine=True,
+        activation="relu",
+        activation_param=0.01,
+    ):
+        super(AGN, self).__init__()
+        self.num_features = num_features
+        self.num_groups = num_groups
+        self.affine = affine
+        self.eps = eps
+        self.activation = ACT(activation)
+        self.activation_param = activation_param
+        if self.affine:
+            self.weight = nn.Parameter(torch.ones(num_features))
+            self.bias = nn.Parameter(torch.zeros(num_features))
+        else:
+            self.register_parameter("weight", None)
+            self.register_parameter("bias", None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        if self.affine:
+            nn.init.constant_(self.weight, 1)
+            nn.init.constant_(self.bias, 0)
+
+    def forward(self, x):
+        x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps)
+        func = ACT_FUNC_DICT[self.activation]
+        if self.activation == ACT.LEAKY_RELU:
+            return func(x, inplace=True, negative_slope=self.activation_param)
+        elif self.activation == ACT.ELU:
+            return func(x, inplace=True, alpha=self.activation_param)
+        else:
+            return func(x, inplace=True)
+
+    def extra_repr(self):
+        rep = "{num_features}, eps={eps}, affine={affine}, activation={activation}"
+        if self.activation in ["leaky_relu", "elu"]:
+            rep += "[{activation_param}]"
+        return rep.format(**self.__dict__)
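
A minimal usage sketch (hedged: it assumes the `ACT`/`ACT_FUNC_DICT` registry resolves "relu" the same way the rest of the library does, and that nothing beyond the constructor arguments above is needed):

import torch
from pytorch_tools.modules import AGN, bn_from_name

x = torch.randn(2, 64, 16, 16)              # (batch, channels, height, width)
agn = AGN(64, num_groups=32, activation="relu")
out = agn(x)                                # GroupNorm over 32 groups, then ReLU, in one call
print(out.shape)                            # torch.Size([2, 64, 16, 16])

# the same class comes back from the registry, so models that build their
# norm layers via `bn_from_name` can switch to GroupNorm by name alone
norm_layer = bn_from_name("group_norm")     # -> AGN
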
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+from torch import nn
+import torch.nn.functional as F
+
+# implements the idea from the `Weight Standardization` paper https://arxiv.org/abs/1903.10520
+# eps is inside the sqrt to avoid overflow, an idea from https://arxiv.org/abs/1911.05920
+class WS_Conv2d(nn.Conv2d):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True):
+        super().__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
+
+    def forward(self, x):
+        weight = self.weight
+        weight = weight.sub(weight.mean(dim=(1, 2, 3), keepdim=True))
+        std = weight.var(dim=(1, 2, 3), keepdim=True).add_(1e-7).sqrt_()
+        weight = weight.div(std.expand_as(weight))
+        return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+# adapted from a GitHub issue: replaces convs followed by a norm layer with weight-standardized convs
+def convertConv2WeightStand(module, nextChild=None):
+    mod = module
+    norm_list = [nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm, nn.LayerNorm]
+    conv_list = [nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d]
+    for norm in norm_list:
+        for conv in conv_list:
+            if isinstance(mod, conv) and isinstance(nextChild, norm):
+                mod = WS_Conv2d(mod.in_channels, mod.out_channels, mod.kernel_size, mod.stride,
+                                mod.padding, mod.dilation, mod.groups, mod.bias is not None)
+
+    moduleChildList = list(module.named_children())
+    for index, [name, child] in enumerate(moduleChildList):
+        nextChild = None
+        if index < len(moduleChildList) - 1:
+            nextChild = moduleChildList[index + 1][1]
+        mod.add_module(name, convertConv2WeightStand(child, nextChild))
+
+    return mod
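
A hedged usage sketch for the new module (the import path is not shown in this diff, so it assumes `WS_Conv2d` and `convertConv2WeightStand` are importable from wherever the file above lives; the model is a placeholder):

import torch
import torch.nn as nn
# WS_Conv2d and convertConv2WeightStand as defined in the file above

model = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1),
    nn.BatchNorm2d(16),
    nn.ReLU(),
)
# the Conv2d is directly followed by BatchNorm2d, so it gets swapped for WS_Conv2d;
# note the replacement conv is freshly initialized, weights are not copied over
ws_model = convertConv2WeightStand(model)
print(ws_model)

x = torch.randn(1, 3, 32, 32)
print(ws_model(x).shape)                    # torch.Size([1, 16, 32, 32])
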

pytorch_tools/optim/__init__.py

Lines changed: 9 additions & 8 deletions
@@ -10,28 +10,29 @@
 
 from torch import optim
 
-
+# 2e-5 is the lowest epsilon that saves from overflow in fp16
 def optimizer_from_name(optim_name):
    optim_name = optim_name.lower()
    if optim_name == "sgd":
        return optim.SGD
    elif optim_name == "sgdw":
        return SGDW
    elif optim_name == "adam":
-        return optim.Adam
+        return partial(optim.Adam, eps=2e-5)
    elif optim_name == "adamw":
-        return optim.AdamW
+        return partial(AdamW_my, eps=2e-5)
    elif optim_name == "adamw_gc":
-        return partial(AdamW_my, center=True)
+        # in this implementation eps is inside the sqrt, so it can be smaller
+        return partial(AdamW_my, center=True, eps=1e-7)
    elif optim_name == "rmsprop":
-        return optim.RMSprop
+        return partial(optim.RMSprop, eps=2e-5)
    elif optim_name == "radam":
-        return RAdam
+        return partial(RAdam, eps=2e-5)
    elif optim_name in ["fused_sgd", "fusedsgd"]:
        return FusedSGD
    elif optim_name in ["fused_adam", "fusedadam"]:
-        return FusedAdam
+        return partial(FusedAdam, eps=2e-5)
    elif optim_name in ["fused_novograd", "fusednovograd", "novograd"]:
-        return FusedNovoGrad
+        return partial(FusedNovoGrad, eps=2e-5)
    else:
        raise ValueError(f"Optimizer {optim_name} not found")
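
A small sketch of how the factory is typically called (the model and hyperparameters are placeholders): the returned object is either the optimizer class itself or a `functools.partial` with the fp16-safe `eps` already bound, so call sites do not change.

import torch.nn as nn
from pytorch_tools.optim import optimizer_from_name

model = nn.Linear(10, 2)                        # placeholder model
opt_cls = optimizer_from_name("adam")           # partial(optim.Adam, eps=2e-5)
optimizer = opt_cls(model.parameters(), lr=1e-3)
print(optimizer.defaults["eps"])                # 2e-05
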

pytorch_tools/optim/adamw.py

Lines changed: 4 additions & 3 deletions
@@ -5,6 +5,7 @@
 # it's a copy from torch.optim with an additional `center` param
 # AdamW only differs from Adam in one line (where weight decay happens)
 # upd. flag `center` comes from the `Gradient Centralization` paper
+# upd. moved `eps` inside the sqrt to avoid nan in gradients
 class AdamW(Optimizer):
    r"""Implements AdamW algorithm.
 
@@ -79,7 +80,7 @@ def step(self, closure=None):
 
                # Gradient Centralization operation for Conv layers
                if group['center'] and len(list(grad.size())) > 3:
-                    grad.add_(-grad.mean(dim = tuple(range(1,len(list(grad.size())))), keepdim = True))
+                    grad.add_(-grad.mean(dim = tuple(range(1, grad.dim())), keepdim = True))
 
                state = self.state[p]
 
@@ -110,9 +111,9 @@ def step(self, closure=None):
                    # Maintains the maximum of all 2nd moment running avg. till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max. for normalizing running avg. of gradient
-                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
+                    denom = (max_exp_avg_sq.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                else:
-                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
+                    denom = (exp_avg_sq.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
 
                step_size = group['lr'] / bias_correction1
pytorch_tools/segmentation_models/hrnet.py

Lines changed: 2 additions & 1 deletion
@@ -39,7 +39,6 @@ class HRNet(nn.Module):
 
    Args:
        encoder_name (str): name of classification model used as feature extractor to build segmentation model.
-            Models expects encoder to have output stride 16 or 8. Only Resnet and Effnet family models are supported for now
        encoder_weights (str): one of ``None`` (random initialization), ``imagenet`` (pre-training on ImageNet).
        num_classes (int): a number of classes for output (output shape - ``(batch, classes, h, w)``).
        pretrained (Union[str, None]): hrnet_w48 and hrnet_w48+OCR have pretrained weights. init models using functions rather than
@@ -203,6 +202,8 @@ def _hrnet(arch, pretrained=None, **kwargs):
    )
    # if there is last_linear in state_dict, it's going to be overwritten
    if cfg_params.get("OCR", False):
+        state_dict["aux_head.2.weight"] = model.state_dict()["aux_head.2.weight"]
+        state_dict["aux_head.2.bias"] = model.state_dict()["aux_head.2.bias"]
        state_dict["head.weight"] = model.state_dict()["head.weight"]
        state_dict["head.bias"] = model.state_dict()["head.bias"]
    else:
