Merge pull request #41 from UofT-EcoSystem/wangshangsam/optim_updates
Update optim to 1.9.1.
wangshangsam authored Oct 1, 2021
2 parents 22a5c66 + 9b208ac commit d214082
Showing 8 changed files with 505 additions and 288 deletions.
5 changes: 2 additions & 3 deletions hfta/ops/utils.py
@@ -2,7 +2,7 @@
import numpy as np
import re

RE_PARSE_RATIO = re.compile('Mismatched elements: \d+ \/ \d+ \((\d+)\.(\d+)%\)')
RE_PARSE_RATIO = re.compile('Mismatched elements: (\d+) \/ (\d+)')


def testcase_automator(testcase, configs):
@@ -48,6 +48,5 @@ def assert_allclose(
if not m:
raise e
else:
if (float('{}.{}'.format(m.group(1), m.group(2))) / 100 >=
population_threshold):
if (int(m.group(1)) / int(m.group(2))) >= population_threshold:
raise e
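
Note: the updated helper in hfta/ops/utils.py now compares the raw mismatch count against the total element count instead of re-parsing the percentage printed in the error message. A minimal sketch of the new parsing logic, with a made-up error message and threshold for illustration:

```python
import re

# Same pattern as the updated utils.py: capture the raw counts, not the percentage.
RE_PARSE_RATIO = re.compile(r'Mismatched elements: (\d+) \/ (\d+)')

msg = 'Mismatched elements: 3 / 1000 (0.3%)'  # example assert_allclose error text
population_threshold = 0.01                   # hypothetical: tolerate up to 1% mismatches

m = RE_PARSE_RATIO.search(msg)
mismatch_ratio = int(m.group(1)) / int(m.group(2))  # 3 / 1000 = 0.003
print(mismatch_ratio >= population_threshold)       # False -> the mismatch is tolerated
```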
127 changes: 127 additions & 0 deletions hfta/optim/_functional.py
@@ -0,0 +1,127 @@
import math
import torch
from torch import Tensor
from typing import List, Optional, Union

from .utils import Coefficient, is_coefficient


def adadelta(
params: List[Tensor],
grads: List[Tensor],
square_avgs: List[Tensor],
acc_deltas: List[Tensor],
*,
lr: Union[float, Coefficient],
rho: Union[float, Coefficient],
eps: Union[float, Coefficient],
weight_decay: Union[float, Coefficient],
):
r"""Functional API that performs Adadelta algorithm computation.
See :class:`~torch.optim.Adadelta` for details.
"""

for (param, grad, square_avg, acc_delta) in zip(params, grads, square_avgs,
acc_deltas):
if is_coefficient(weight_decay) or weight_decay != 0:
if is_coefficient(weight_decay):
grad = grad + weight_decay[param] * param
else:
grad = grad.add(param, alpha=weight_decay)

if is_coefficient(rho):
square_avg.mul_(rho[param]).add_((1 - rho[param]) * grad * grad)
else:
square_avg.mul_(rho).addcmul_(grad, grad, value=1 - rho)
if is_coefficient(eps):
std = square_avg.add(eps[param]).sqrt_()
delta = acc_delta.add(eps[param]).sqrt_().div_(std).mul_(grad)
else:
std = square_avg.add(eps).sqrt_()
delta = acc_delta.add(eps).sqrt_().div_(std).mul_(grad)
if is_coefficient(lr):
param.add_(-lr[param] * delta)
else:
param.add_(delta, alpha=-lr)
if is_coefficient(rho):
acc_delta.mul_(rho[param]).add_((1 - rho[param]) * delta * delta)
else:
acc_delta.mul_(rho).addcmul_(delta, delta, value=1 - rho)


def adam(
params: List[Tensor],
grads: List[Tensor],
exp_avgs: List[Tensor],
exp_avg_sqs: List[Tensor],
max_exp_avg_sqs: List[Tensor],
state_steps: List[int],
*,
amsgrad: bool,
beta1: Union[float, Coefficient],
beta2: Union[float, Coefficient],
lr: Union[float, Coefficient],
weight_decay: Union[float, Coefficient],
eps: Union[float, Coefficient],
):
r"""Functional API that performs Adam algorithm computation.
See :class:`~torch.optim.Adam` for details.
"""

for i, param in enumerate(params):

grad = grads[i]
exp_avg = exp_avgs[i]
exp_avg_sq = exp_avg_sqs[i]
step = state_steps[i]

if is_coefficient(beta1):
bias_correction1 = 1 - beta1[param]**step
else:
bias_correction1 = 1 - beta1**step
if is_coefficient(beta2):
sqrt_bias_correction2 = (1 - beta2[param]**step).sqrt()
else:
sqrt_bias_correction2 = math.sqrt(1 - beta2**step)

if is_coefficient(weight_decay) or weight_decay != 0:
if is_coefficient(weight_decay):
grad = grad + weight_decay[param] * param
else:
grad = grad.add(param, alpha=weight_decay)

# Decay the first and second moment running average coefficient
if is_coefficient(beta1):
exp_avg.mul_(beta1[param]).add_((1 - beta1[param]) * grad)
else:
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
if is_coefficient(beta2):
exp_avg_sq.mul_(beta2[param]).add_(
(1 - beta2[param]) * grad * grad.conj())
else:
exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i])
# Use the max. for normalizing running avg. of gradient
if is_coefficient(eps):
denom = (max_exp_avg_sqs[i].sqrt() / sqrt_bias_correction2).add_(
eps[param])
else:
denom = (max_exp_avg_sqs[i].sqrt() / sqrt_bias_correction2).add_(eps)
else:
if is_coefficient(eps):
denom = (exp_avg_sq.sqrt() / sqrt_bias_correction2).add_(eps[param])
else:
denom = (exp_avg_sq.sqrt() / sqrt_bias_correction2).add_(eps)

if is_coefficient(lr):
step_size = lr[param] / bias_correction1
else:
step_size = lr / bias_correction1

if torch.is_tensor(step_size):
param.add_(-step_size * (exp_avg / denom))
else:
param.addcdiv_(exp_avg, denom, value=-step_size)
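
For orientation, a hedged sketch of driving the new functional kernel directly with plain scalar hyperparameters; in the repository it is normally invoked by the optimizer classes' step() methods, and the import path below is assumed from the file layout.

```python
import torch
from hfta.optim import _functional as F  # path assumed from hfta/optim/_functional.py

# One parameter tensor plus its gradient and Adadelta state buffers.
param = torch.randn(4)
grad = torch.randn(4)
square_avg = torch.zeros_like(param)
acc_delta = torch.zeros_like(param)

# Scalar hyperparameters take the non-Coefficient branches of F.adadelta.
F.adadelta([param], [grad], [square_avg], [acc_delta],
           lr=1.0, rho=0.9, eps=1e-6, weight_decay=0.0)
```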
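Restated for scalar hyperparameters (not taken verbatim from the diff), the adam routine above applies the usual bias-corrected update at step t:

```latex
\begin{aligned}
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\,g_t \\
v_t &= \beta_2 v_{t-1} + (1-\beta_2)\,g_t \odot \bar{g}_t \\
\theta_t &= \theta_{t-1}
  - \frac{\gamma}{1-\beta_1^{\,t}} \cdot
    \frac{m_t}{\sqrt{v_t}\,/\,\sqrt{1-\beta_2^{\,t}} + \epsilon}
\end{aligned}
```

With amsgrad=True, v_t is first replaced by the elementwise running maximum of all second-moment estimates seen so far before the denominator is formed.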
153 changes: 79 additions & 74 deletions hfta/optim/adadelta.py
@@ -1,119 +1,124 @@
import numpy as np
import torch

from . import _functional as F
from torch.optim import Optimizer

from .utils import (_validate_range, _broadcastablize,
_move_coeff_to_same_device, _reduce_array_if_possible_for,
_zero_grad_if_cuda, index_array_or_return_scalar)
from .utils import (make_coefficient, reduce_array_if_possible_for,
index_array_or_return_scalar)
from .partial import PartiallyFusedOptimizer


class Adadelta(Optimizer):
"""Implements Adadelta algorithm.
It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
rho (float or a list/tuple/np.array/torch.Tensor of floats, optional):
coefficient used for computing a running average of squared
gradients (default: 0.9)
eps (float or a list/tuple/np.array/torch.Tensor of floats, optional): term
added to the denominator to improve numerical stability (default: 1e-6)
lr (float or a list/tuple/np.array/torch.Tensor of floats, optional):
coefficient that scale delta before it is applied to the parameters
(default: 1.0)
weight_decay (float or a list/tuple/np.array/torch.Tensor of floats,
optional): weight decay (L2 penalty) (default: 0)
__ https://arxiv.org/abs/1212.5701
"""
r"""Implements Adadelta algorithm.
.. math::
\begin{aligned}
&\rule{110mm}{0.4pt} \\
&\textbf{input} : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)},
\: f(\theta) \text{ (objective)}, \: \rho \text{ (decay)},
\: \lambda \text{ (weight decay)} \\
&\textbf{initialize} : v_0 \leftarrow 0 \: \text{ (square avg)},
\: u_0 \leftarrow 0 \: \text{ (accumulate variables)} \\[-1.ex]
&\rule{110mm}{0.4pt} \\
&\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do} \\
&\hspace{5mm}g_t \leftarrow \nabla_{\theta} f_t (\theta_{t-1}) \\
&\hspace{5mm}if \: \lambda \neq 0 \\
&\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1} \\
&\hspace{5mm} v_t \leftarrow v_{t-1} \rho + g^2_t (1 - \rho) \\
&\hspace{5mm}\Delta x_t \leftarrow \frac{\sqrt{u_{t-1} +
\epsilon }}{ \sqrt{v_t + \epsilon} }g_t \hspace{21mm} \\
&\hspace{5mm} u_t \leftarrow u_{t-1} \rho +
\Delta x^2_t (1 - \rho) \\
&\hspace{5mm}\theta_t \leftarrow \theta_{t-1} - \gamma \Delta x_t \\
&\rule{110mm}{0.4pt} \\[-1.ex]
&\bf{return} \: \theta_t \\[-1.ex]
&\rule{110mm}{0.4pt} \\[-1.ex]
\end{aligned}
For further details regarding the algorithm we refer to `ADADELTA: An Adaptive Learning Rate Method`_.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
rho (float or a list/tuple/np.array/torch.Tensor of floats, optional): coefficient used for computing a running average
of squared gradients (default: 0.9)
eps (float or a list/tuple/np.array/torch.Tensor of floats, optional): term added to the denominator to improve
numerical stability (default: 1e-6)
lr (float or a list/tuple/np.array/torch.Tensor of floats, optional): coefficient that scales delta before it is applied
to the parameters (default: 1.0)
weight_decay (float or a list/tuple/np.array/torch.Tensor of floats, optional): weight decay (L2 penalty) (default: 0)
.. _ADADELTA\: An Adaptive Learning Rate Method:
https://arxiv.org/abs/1212.5701
"""

def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0, B=1):
_validate_range('learning rate', lr, 0.0, float('inf'))
_validate_range('rho value', rho, 0.0, 1.0)
_validate_range('epsilon value', eps, 0.0, float('inf'))
_validate_range('weight_decay value', weight_decay, 0.0, float('inf'))
lr, rho, eps, weight_decay = _reduce_array_if_possible_for(
lr, rho, eps, weight_decay = reduce_array_if_possible_for(
lr, rho, eps, weight_decay)
lr = make_coefficient('learning rate', lr, lb=0.0, ub=float('inf'))
rho = make_coefficient('rho value', rho, lb=0.0, ub=1.0)
eps = make_coefficient('epsilon value', eps, lb=0.0, ub=float('inf'))
weight_decay = make_coefficient('weight_decay value',
weight_decay,
lb=0.0,
ub=float('inf'))

defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay)
super(Adadelta, self).__init__(params, defaults)
_broadcastablize(self, 'lr', B)
_broadcastablize(self, 'rho', B)
_broadcastablize(self, 'eps', B)
_broadcastablize(self, 'weight_decay', B)

def zero_grad(self):
if not _zero_grad_if_cuda(self):
super(Adadelta, self).zero_grad()

@torch.no_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
Args:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()

for group in self.param_groups:
params_with_grad = []
grads = []
square_avgs = []
acc_deltas = []
lr, rho, eps, weight_decay = group['lr'], group['rho'], group[
'eps'], group['weight_decay']

for p in group['params']:
if p.grad is None:
continue
grad = p.grad
if grad.is_sparse:
params_with_grad.append(p)
if p.grad.is_sparse:
raise RuntimeError('Adadelta does not support sparse gradients')
grads.append(p.grad)

state = self.state[p]

# State initialization
# Lazy state initialization
if len(state) == 0:
state['step'] = 0
state['square_avg'] = torch.zeros_like(
p, memory_format=torch.preserve_format)
state['acc_delta'] = torch.zeros_like(
p, memory_format=torch.preserve_format)
_move_coeff_to_same_device(group, 'lr', p)
_move_coeff_to_same_device(group, 'rho', p)
_move_coeff_to_same_device(group, 'eps', p)
_move_coeff_to_same_device(group, 'weight_decay', p)

square_avg, acc_delta = state['square_avg'], state['acc_delta']
lr, rho, eps, weight_decay = (group['lr'], group['rho'], group['eps'],
group['weight_decay'])
square_avgs.append(state['square_avg'])
acc_deltas.append(state['acc_delta'])

state['step'] += 1

if isinstance(weight_decay, dict) or weight_decay != 0:
if isinstance(weight_decay, dict):
grad = grad + weight_decay[p] * p
else:
grad = grad.add(p, alpha=weight_decay)

if isinstance(rho, dict):
square_avg.mul_(rho[p]).add_((1 - rho[p]) * grad * grad)
else:
square_avg.mul_(rho).addcmul_(grad, grad, value=1 - rho)
if isinstance(eps, dict):
std = square_avg.add(eps[p]).sqrt_()
delta = acc_delta.add(eps[p]).sqrt_().div_(std).mul_(grad)
else:
std = square_avg.add(eps).sqrt_()
delta = acc_delta.add(eps).sqrt_().div_(std).mul_(grad)
if isinstance(lr, dict):
p.add_(-lr[p] * delta)
else:
p.add_(delta, alpha=-lr)
if isinstance(rho, dict):
acc_delta.mul_(rho[p]).add_((1 - rho[p]) * delta * delta)
else:
acc_delta.mul_(rho).addcmul_(delta, delta, value=1 - rho)
F.adadelta(params_with_grad,
grads,
square_avgs,
acc_deltas,
lr=lr,
rho=rho,
eps=eps,
weight_decay=weight_decay)

return loss

(The remainder of hfta/optim/adadelta.py and the other 5 changed files are not shown.)
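
To connect the pieces, a hedged construction sketch for the rewritten Adadelta class; the model is only a stand-in, and the import path is assumed from the file layout.

```python
import torch
from hfta.optim.adadelta import Adadelta  # path assumed from hfta/optim/adadelta.py

model = torch.nn.Linear(16, 4)  # plain model; scalar hyperparameters, B=1 (the default)
optimizer = Adadelta(model.parameters(), lr=1.0, rho=0.9, eps=1e-6, weight_decay=0)

# With an HFTA horizontally fused model one could instead pass per-run arrays, e.g.
#   Adadelta(fused_model.parameters(), lr=[1.0, 0.5, 0.1], B=3)
# so each of the B=3 fused runs trains with its own learning rate.

loss = model(torch.randn(8, 16)).sum()
optimizer.zero_grad()
loss.backward()
optimizer.step()  # dispatches to F.adadelta with the group's hyperparameters
```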
