FEAT: added module 1
jacky1c committed Aug 29, 2024
1 parent 36b6669 commit 9535cc8
Showing 8 changed files with 263 additions and 95 deletions.
53 changes: 45 additions & 8 deletions README.md
@@ -1,11 +1,11 @@
# minitorch
This repository is part of my journey to learn the underlying concepts of deep learning systems through implementing a minimal version of PyTorch library.
# MiniTorch: A DIY Course on Machine Learning Engineering
This repository is part of my journey to learn the underlying engineering concepts of deep learning systems by implementing a minimal version of the PyTorch library.

If you're interested in learning more, I highly recommend checking out the excellent [MiniTorch lectures](https://minitorch.github.io) and [Youtube playlist](https://www.youtube.com/playlist?list=PLO45-80-XKkQyROXXpn4PfjF1J2tH46w8) by [Prof. Rush](https://rush-nlp.com), and the [self-study guide](https://github.com/mukobi/Minitorch-Self-Study-Guide-SAIA/tree/main) by [Gabriel Mukobi](https://gabrielmukobi.com) that answers some common questions.

## Setup

My venv is Python 3.8.
My virtual environment uses Python 3.8.

Install dependencies
```bash
@@ -60,15 +60,52 @@ To access the autograder:
- [x] Modules
- [x] Visualization

To run the visualization for Module 0, use:
```bash
streamlit run project/app.py -- 0
```

You should then see interactive data visualizations and can "turn the knobs" of the parameters to explore model behaviour.
![task0.5](./figs/task0.5.png)

## Module 1 - Autodiff

- [ ] Numerical Derivatives
- [ ] Scalars
- [ ] Chain Rule
- [ ] Backpropagation
- [ ] Training
- [x] Numerical Derivatives
- [x] Scalars
- [x] Chain Rule
- [x] Backpropagation
- [x] Training

PyTorch terminology in the computational graph (illustrated in the snippet below):
- Leaf nodes: variables created from scratch on the left-hand side (e.g., `minitorch.Scalar(0.0)`)
- Non-leaf nodes: variables created by applying a Function
- Constant nodes: terms passed in that are not variables (scalars without a history, e.g., `minitorch.Scalar(0.0, None)`)
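
For illustration, here is a minimal sketch of the three node kinds using this repo's `Scalar` class. The variable names are my own, the expected derivatives assume the Module 1 operators implemented in this commit, and the `derivative`/`backward` names follow the MiniTorch scaffold:
```python
import minitorch

x = minitorch.Scalar(2.0)        # leaf node: created from scratch, with a fresh history
y = minitorch.Scalar(3.0)        # another leaf node
c = minitorch.Scalar(5.0, None)  # constant node: no history, so backprop skips it
z = x * y + c                    # non-leaf node: produced by the Mul and Add Functions

z.backward()                       # seeds d(z)/d(z) = 1.0 and runs backpropagation
print(x.derivative, y.derivative)  # expected: 3.0 2.0, since dz/dx = y and dz/dy = x
```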

Backpropagation algorithm (see the sketch after this list):
1. Call topological sort. The resulting order starts from the right of the computational graph (i.e., the output).
2. Create a dict mapping each variable to its accumulated derivative, seeded with the output's derivative.
3. For each node in the topological order:
    1. If the variable is a leaf, accumulate its final derivative.
    2. If the variable is not a leaf:
        1. Call `chain_rule` with the variable's current derivative $d$ (this invokes the Function's backward).
        2. Loop over the returned pairs of input variable and local derivative.
        3. Accumulate each local derivative into the dict entry for that input variable.
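
As a compact summary, here is a sketch of that loop; it mirrors the `backpropagate` implementation in `minitorch/autodiff.py` shown further down in this commit, with `topological_sort`, `chain_rule`, and `accumulate_derivative` coming from the MiniTorch scaffold:
```python
def backpropagate(variable, deriv):
    # 1. Topological order, starting from the output variable.
    order = topological_sort(variable)
    # 2. Dict of accumulated derivatives, seeded with the output's derivative.
    derivatives = {variable.unique_id: deriv}
    # 3. Walk the graph from the output back towards the leaves.
    for var in order:
        d = derivatives[var.unique_id]
        if var.is_leaf():
            var.accumulate_derivative(d)  # 3.1 leaves store their final derivative
        else:
            # 3.2 the chain rule distributes d to the inputs of the Function that made var
            for parent, parent_d in var.chain_rule(d):
                if parent.is_constant():
                    continue  # constants receive no derivative
                derivatives[parent.unique_id] = (
                    derivatives.get(parent.unique_id, 0.0) + parent_d
                )
```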

To run the visualization for Module 1, use:
```bash
streamlit run project/app.py -- 1
```

Here's the training result for the XOR dataset.
![task1.5](./figs/task1.5.png)

The parameters are:
```python
PTS = 50
DATASET = minitorch.datasets["Xor"](PTS)
HIDDEN = 10
RATE = 0.5
```

## Module 2 - Tensors

Binary file added figs/task1.5.png
62 changes: 56 additions & 6 deletions minitorch/autodiff.py
@@ -10,6 +10,7 @@
def central_difference(f: Any, *vals: Any, arg: int = 0, epsilon: float = 1e-6) -> Any:
r"""
Computes an approximation to the derivative of `f` with respect to one arg.
$f'(x) \approx \frac{f(x + \epsilon) - f(x - \epsilon)}{2 \epsilon}$
See :doc:`derivative` or https://en.wikipedia.org/wiki/Finite_difference for more details.
@@ -22,8 +23,11 @@ def central_difference(f: Any, *vals: Any, arg: int = 0, epsilon: float = 1e-6)
Returns:
An approximation of $f'_i(x_0, \ldots, x_{n-1})$
"""
# TODO: Implement for Task 1.1.
raise NotImplementedError("Need to implement for Task 1.1")
# raise NotImplementedError("Need to implement for Task 1.1")
vals_1, vals_0 = list(vals), list(vals)
vals_1[arg] += epsilon
vals_0[arg] -= epsilon
return (f(*vals_1) - f(*vals_0)) / (2 * epsilon)


variable_count = 1
@@ -61,8 +65,33 @@ def topological_sort(variable: Variable) -> Iterable[Variable]:
Returns:
Non-constant Variables in topological order starting from the right.
"""
# TODO: Implement for Task 1.4.
raise NotImplementedError("Need to implement for Task 1.4")
# raise NotImplementedError("Need to implement for Task 1.4")

#### Pseudocode for topo sort
## visit(last)
## function visit(node n):
## if n has a mark then return
## for each node m with an edge from n to m, do:
## visit(m)
## mark n with a permanent mark
## add n to list

result: List[Variable] = []
visited = set()

def visit(var: Variable) -> None:
id = var.unique_id
if id in visited or var.is_constant():
return
if not var.is_leaf():
for m in var.parents:
if not m.is_constant():
visit(m)
visited.add(id)
result.insert(0, var)

visit(variable)
return result


def backpropagate(variable: Variable, deriv: Any) -> None:
@@ -76,8 +105,29 @@ def backpropagate(variable: Variable, deriv: Any) -> None:
No return. Writes its results to the derivative values of each leaf through `accumulate_derivative`.
"""
# TODO: Implement for Task 1.4.
raise NotImplementedError("Need to implement for Task 1.4")
# raise NotImplementedError("Need to implement for Task 1.4")

# get sorted computational graph, where the first element is the output (right of the computational graph)
queue = topological_sort(variable)
derivatives = {}  # key: a variable's unique_id; value: partial derivative of the output w.r.t. that variable
derivatives[variable.unique_id] = deriv

for var in queue:
# for each variable `var`, find its derivative `deriv`
deriv = derivatives[var.unique_id]
if var.is_leaf():
# if `var` is a leaf, update its `derivative` attribute
var.accumulate_derivative(deriv)
else:
# if `var` is created by a function,
# calculate derivatives for all inputs using chain rule.
# `deriv` is the partial derivative of output, w.r.t. `var`
for v, d in var.chain_rule(deriv):
# if input is a constant, ignore
if v.is_constant():
continue
# if input is a variable, accumulate its derivative
derivatives[v.unique_id] = derivatives.get(v.unique_id, 0.0) + d


@dataclass
20 changes: 10 additions & 10 deletions minitorch/operators.py
@@ -13,25 +13,25 @@
def mul(x: float, y: float) -> float:
"$f(x, y) = x * y$"
# raise NotImplementedError("Need to implement for Task 0.1")
return x * y
return float(x * y)


def id(x: float) -> float:
"$f(x) = x$"
# raise NotImplementedError("Need to implement for Task 0.1")
return x
return float(x)


def add(x: float, y: float) -> float:
"$f(x, y) = x + y$"
# raise NotImplementedError("Need to implement for Task 0.1")
return x + y
return float(x + y)


def neg(x: float) -> float:
"$f(x) = -x$"
# raise NotImplementedError("Need to implement for Task 0.1")
return -x
return -1.0 * x


def lt(x: float, y: float) -> float:
@@ -49,13 +49,13 @@ def eq(x: float, y: float) -> float:
def max(x: float, y: float) -> float:
"$f(x) =$ x if x is greater than y else y"
# raise NotImplementedError("Need to implement for Task 0.1")
return x if x > y else y
return float(x) if x > y else float(y)


def is_close(x: float, y: float) -> float:
"$f(x) = |x - y| < 1e-2$"
# raise NotImplementedError("Need to implement for Task 0.1")
return abs(x - y) < 1e-2
return 1.0 if abs(x - y) < 1e-2 else 0.0


def sigmoid(x: float) -> float:
@@ -81,7 +81,7 @@ def relu(x: float) -> float:
(See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) .)
"""
# raise NotImplementedError("Need to implement for Task 0.1")
return x if x > 0 else 0.0
return float(x) if x > 0 else 0.0


EPS = 1e-6
@@ -100,7 +100,7 @@ def exp(x: float) -> float:
def log_back(x: float, d: float) -> float:
r"If $f = log$ as above, compute $d \times f'(x)$"
# raise NotImplementedError("Need to implement for Task 0.1")
return d / x
return d / float(x)


def inv(x: float) -> float:
@@ -112,13 +112,13 @@ def inv(x: float) -> float:
def inv_back(x: float, d: float) -> float:
r"If $f(x) = 1/x$ compute $d \times f'(x)$"
# raise NotImplementedError("Need to implement for Task 0.1")
return -d * (x ** (-2))
return -float(d) * (x ** (-2))


def relu_back(x: float, d: float) -> float:
r"If $f = relu$ compute $d \times f'(x)$"
# raise NotImplementedError("Need to implement for Task 0.1")
return d if x > 0 else 0
return float(d) if x > 0 else 0.0


# ## Task 0.3
49 changes: 27 additions & 22 deletions minitorch/scalar.py
@@ -18,6 +18,7 @@
ReLU,
ScalarFunction,
Sigmoid,
wrap_tuple,
)

ScalarLike = Union[float, int, "Scalar"]
@@ -92,31 +93,31 @@ def __rtruediv__(self, b: ScalarLike) -> Scalar:
return Mul.apply(b, Inv.apply(self))

def __add__(self, b: ScalarLike) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return Add.apply(self, b)

def __bool__(self) -> bool:
return bool(self.data)

def __lt__(self, b: ScalarLike) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return LT.apply(self, b)

def __gt__(self, b: ScalarLike) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return LT.apply(b, self)

def __eq__(self, b: ScalarLike) -> Scalar: # type: ignore[override]
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return EQ.apply(self, b)

def __sub__(self, b: ScalarLike) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return Add.apply(self, Neg.apply(b))

def __neg__(self) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return Neg.apply(self)

def __radd__(self, b: ScalarLike) -> Scalar:
return self + b
@@ -125,20 +126,20 @@ def __rmul__(self, b: ScalarLike) -> Scalar:
return self * b

def log(self) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return Log.apply(self)

def exp(self) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return Exp.apply(self)

def sigmoid(self) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return Sigmoid.apply(self)

def relu(self) -> Scalar:
# TODO: Implement for Task 1.2.
raise NotImplementedError("Need to implement for Task 1.2")
# raise NotImplementedError("Need to implement for Task 1.2")
return ReLU.apply(self)

# Variable elements for backprop

@@ -168,13 +169,17 @@ def parents(self) -> Iterable[Variable]:
return self.history.inputs

def chain_rule(self, d_output: Any) -> Iterable[Tuple[Variable, Any]]:
"Calculates derivative for each input variable"
h = self.history
assert h is not None
assert h.last_fn is not None
assert h.ctx is not None

# TODO: Implement for Task 1.3.
raise NotImplementedError("Need to implement for Task 1.3")
# raise NotImplementedError("Need to implement for Task 1.3")
deriv = h.last_fn._backward(h.ctx, d_output)
return [
(x, y) for x, y in zip(h.inputs, wrap_tuple(deriv)) if (not x.is_constant())
]

def backward(self, d_output: Optional[float] = None) -> None:
"""