diff --git a/README.md b/README.md
index 6edb6f8..fa88587 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,11 @@
-# minitorch
-This repository is part of my journey to learn the underlying concepts of deep learning systems through implementing a minimal version of PyTorch library.
+# MiniTorch: A DIY Course on Machine Learning Engineering
+This repository is part of my journey to learn the underlying engineering concepts of deep learning systems by implementing a minimal version of the PyTorch library.
 
 If you're interested in learning more, I highly recommend checking out the excellent [MiniTorch lectures](https://minitorch.github.io) and [Youtube playlist](https://www.youtube.com/playlist?list=PLO45-80-XKkQyROXXpn4PfjF1J2tH46w8) by [Prof. Rush](https://rush-nlp.com), and the [self-study guide](https://github.com/mukobi/Minitorch-Self-Study-Guide-SAIA/tree/main) by [Gabriel Mukobi](https://gabrielmukobi.com) that answers some common questions.
 
 ## Setup
 
-My venv is Python 3.8.
+My virtual environment is based on Python 3.8.
 
 Install dependencies
 ```bash
@@ -60,15 +60,52 @@ To access the autograder:
 - [x] Modules
 - [x] Visualization
 
+To run the visualization for Module 0, use:
+```bash
+streamlit run project/app.py -- 0
+```
+
+You should then see interactive data visualizations where you can "turn the knobs" of the parameters to explore model behaviour.
 
 ![task0.5](./figs/task0.5.png)
 
 ## Module 1 - Autodiff
 
-- [ ] Numerical Derivatives
-- [ ] Scalars
-- [ ] Chain Rule
-- [ ] Backpropagation
-- [ ] Training
+- [x] Numerical Derivatives
+- [x] Scalars
+- [x] Chain Rule
+- [x] Backpropagation
+- [x] Training
+
+PyTorch terminology for nodes in the computational graph:
+- Leaf nodes: variables created from scratch on the left-hand side (e.g., `minitorch.Scalar(0.0)`)
+- Non-leaf nodes: variables created by a Function
+- Constant nodes: terms passed in that are not variables (scalars without a history, e.g., `minitorch.Scalar(0.0, None)`)
+
+Backpropagation algorithm:
+1. Call topological sort. The result should start from the right of the computational graph (i.e., the output).
+2. Create a dict mapping variables to derivatives.
+3. For each node in the topological order:
+   1. If the variable is a leaf, add its final derivative.
+   2. If the variable is not a leaf:
+      1. Call backward with its derivative as $d$.
+      2. Loop through all the input variables and their local derivatives.
+      3. Accumulate the derivative for each input variable.
+
+To run the visualization for Module 1, use:
+```bash
+streamlit run project/app.py -- 1
+```
+
+Here's the training result for the XOR dataset.
+![task1.5](./figs/task1.5.png)
+
+The parameters are
+```
+PTS = 50
+DATASET = minitorch.datasets["Xor"](PTS)
+HIDDEN = 10
+RATE = 0.5
+```
 
 ## Module 2 - Tensors
diff --git a/figs/task1.5.png b/figs/task1.5.png
new file mode 100644
index 0000000..b55fdee
Binary files /dev/null and b/figs/task1.5.png differ
diff --git a/minitorch/autodiff.py b/minitorch/autodiff.py
index 2b69873..15a86fe 100644
--- a/minitorch/autodiff.py
+++ b/minitorch/autodiff.py
@@ -10,6 +10,7 @@ def central_difference(f: Any, *vals: Any, arg: int = 0, epsilon: float = 1e-6) -> Any:
     r"""
     Computes an approximation to the derivative of `f` with respect to one arg.
+    $f'(x) \approx \frac{f(x + \epsilon) - f(x - \epsilon)}{2 \epsilon}$
 
     See :doc:`derivative` or https://en.wikipedia.org/wiki/Finite_difference for
     more details.
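+
+    For example, for $f(x) = x^2$ at $x = 3$, the approximation evaluates to
+    $\frac{(3 + \epsilon)^2 - (3 - \epsilon)^2}{2 \epsilon} = 6$, which matches $f'(3) = 6$ exactly.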
@@ -22,8 +23,11 @@ def central_difference(f: Any, *vals: Any, arg: int = 0, epsilon: float = 1e-6)
     Returns:
         An approximation of $f'_i(x_0, \ldots, x_{n-1})$
     """
-    # TODO: Implement for Task 1.1.
-    raise NotImplementedError("Need to implement for Task 1.1")
+    # raise NotImplementedError("Need to implement for Task 1.1")
+    vals_1, vals_0 = list(vals), list(vals)
+    vals_1[arg] += epsilon
+    vals_0[arg] -= epsilon
+    return (f(*vals_1) - f(*vals_0)) / (2 * epsilon)
 
 
 variable_count = 1
@@ -61,8 +65,33 @@ def topological_sort(variable: Variable) -> Iterable[Variable]:
     Returns:
         Non-constant Variables in topological order starting from the right.
     """
-    # TODO: Implement for Task 1.4.
-    raise NotImplementedError("Need to implement for Task 1.4")
+    # raise NotImplementedError("Need to implement for Task 1.4")
+
+    #### Pseudocode for topo sort
+    ## visit(last)
+    ## function visit(node n):
+    ##     if n has a mark then return
+    ##     for each node m with an edge from n to m, do:
+    ##         visit(m)
+    ##     mark n with a permanent mark
+    ##     add n to list
+
+    result: List[Variable] = []
+    visited = set()
+
+    def visit(var: Variable) -> None:
+        var_id = var.unique_id
+        if var_id in visited or var.is_constant():
+            return
+        if not var.is_leaf():
+            for m in var.parents:
+                if not m.is_constant():
+                    visit(m)
+        visited.add(var_id)
+        result.insert(0, var)
+
+    visit(variable)
+    return result
 
 
 def backpropagate(variable: Variable, deriv: Any) -> None:
@@ -76,8 +105,29 @@
     No return. Should write to its results to the derivative values of each leaf
     through `accumulate_derivative`.
     """
-    # TODO: Implement for Task 1.4.
-    raise NotImplementedError("Need to implement for Task 1.4")
+    # raise NotImplementedError("Need to implement for Task 1.4")
+
+    # get the sorted computational graph, where the first element is the output (right of the computational graph)
+    queue = topological_sort(variable)
+    derivatives = {}  # key: unique_id; value: partial derivative of the output w.r.t. that variable
+    derivatives[variable.unique_id] = deriv
+
+    for var in queue:
+        # for each variable `var`, find its derivative `deriv`
+        deriv = derivatives[var.unique_id]
+        if var.is_leaf():
+            # if `var` is a leaf, update its `derivative` attribute
+            var.accumulate_derivative(deriv)
+        else:
+            # if `var` is created by a function,
+            # calculate derivatives for all inputs using the chain rule.
+            # `deriv` is the partial derivative of the output w.r.t. `var`
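+            # e.g., if `var` was produced by Mul with inputs a and b, chain_rule(deriv)
+            # yields [(a, b * deriv), (b, a * deriv)] with constants filtered out;
+            # each piece is summed into that input's entry below.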
+            for v, d in var.chain_rule(deriv):
+                # if input is a constant, ignore
+                if v.is_constant():
+                    continue
+                # if input is a variable, accumulate its derivative
+                derivatives[v.unique_id] = derivatives.get(v.unique_id, 0.0) + d
 
 
 @dataclass
diff --git a/minitorch/operators.py b/minitorch/operators.py
index 47c5962..1ffdc37 100644
--- a/minitorch/operators.py
+++ b/minitorch/operators.py
@@ -13,25 +13,25 @@
 def mul(x: float, y: float) -> float:
     "$f(x, y) = x * y$"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return x * y
+    return float(x * y)
 
 
 def id(x: float) -> float:
     "$f(x) = x$"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return x
+    return float(x)
 
 
 def add(x: float, y: float) -> float:
     "$f(x, y) = x + y$"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return x + y
+    return float(x + y)
 
 
 def neg(x: float) -> float:
     "$f(x) = -x$"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return -x
+    return -1.0 * x
 
 
 def lt(x: float, y: float) -> float:
@@ -49,13 +49,13 @@ def eq(x: float, y: float) -> float:
 def max(x: float, y: float) -> float:
     "$f(x) =$ x if x is greater than y else y"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return x if x > y else y
+    return float(x) if x > y else float(y)
 
 
 def is_close(x: float, y: float) -> float:
     "$f(x) = |x - y| < 1e-2$"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return abs(x - y) < 1e-2
+    return 1.0 if abs(x - y) < 1e-2 else 0.0
 
 
 def sigmoid(x: float) -> float:
@@ -81,7 +81,7 @@ def relu(x: float) -> float:
     (See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) .)
     """
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return x if x > 0 else 0.0
+    return float(x) if x > 0 else 0.0
 
 
 EPS = 1e-6
@@ -100,7 +100,7 @@ def exp(x: float) -> float:
 def log_back(x: float, d: float) -> float:
     r"If $f = log$ as above, compute $d \times f'(x)$"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return d / x
+    return d / float(x)
 
 
 def inv(x: float) -> float:
@@ -112,13 +112,13 @@ def inv(x: float) -> float:
 def inv_back(x: float, d: float) -> float:
     r"If $f(x) = 1/x$ compute $d \times f'(x)$"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return -d * (x ** (-2))
+    return -float(d) * (x ** (-2))
 
 
 def relu_back(x: float, d: float) -> float:
     r"If $f = relu$ compute $d \times f'(x)$"
     # raise NotImplementedError("Need to implement for Task 0.1")
-    return d if x > 0 else 0
+    return float(d) if x > 0 else 0.0
 
 
 # ## Task 0.3
diff --git a/minitorch/scalar.py b/minitorch/scalar.py
index f5abbe9..1bec19f 100644
--- a/minitorch/scalar.py
+++ b/minitorch/scalar.py
@@ -18,6 +18,7 @@
     ReLU,
     ScalarFunction,
     Sigmoid,
+    wrap_tuple,
 )
 
 ScalarLike = Union[float, int, "Scalar"]
@@ -92,31 +93,31 @@ def __rtruediv__(self, b: ScalarLike) -> Scalar:
         return Mul.apply(b, Inv.apply(self))
 
     def __add__(self, b: ScalarLike) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return Add.apply(self, b)
 
     def __bool__(self) -> bool:
         return bool(self.data)
 
     def __lt__(self, b: ScalarLike) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return LT.apply(self, b)
 
     def __gt__(self, b: ScalarLike) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return LT.apply(b, self)
 
     def __eq__(self, b: ScalarLike) -> Scalar:  # type: ignore[override]
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return EQ.apply(self, b)
 
     def __sub__(self, b: ScalarLike) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return Add.apply(self, Neg.apply(b))
 
     def __neg__(self) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return Neg.apply(self)
 
     def __radd__(self, b: ScalarLike) -> Scalar:
         return self + b
@@ -125,20 +126,20 @@ def __rmul__(self, b: ScalarLike) -> Scalar:
         return self * b
 
     def log(self) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return Log.apply(self)
 
     def exp(self) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return Exp.apply(self)
 
     def sigmoid(self) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return Sigmoid.apply(self)
 
     def relu(self) -> Scalar:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return ReLU.apply(self)
 
     # Variable elements for backprop
@@ -168,13 +169,17 @@ def parents(self) -> Iterable[Variable]:
         return self.history.inputs
 
     def chain_rule(self, d_output: Any) -> Iterable[Tuple[Variable, Any]]:
+        "Calculates derivative for each input variable"
         h = self.history
         assert h is not None
         assert h.last_fn is not None
         assert h.ctx is not None
-        # TODO: Implement for Task 1.3.
-        raise NotImplementedError("Need to implement for Task 1.3")
+        # raise NotImplementedError("Need to implement for Task 1.3")
+        deriv = h.last_fn._backward(h.ctx, d_output)
+        return [
+            (x, y) for x, y in zip(h.inputs, wrap_tuple(deriv)) if (not x.is_constant())
+        ]
 
     def backward(self, d_output: Optional[float] = None) -> None:
         """
diff --git a/minitorch/scalar_functions.py b/minitorch/scalar_functions.py
index d8d2307..5bcfde5 100644
--- a/minitorch/scalar_functions.py
+++ b/minitorch/scalar_functions.py
@@ -103,13 +103,15 @@ class Mul(ScalarFunction):
     @staticmethod
     def forward(ctx: Context, a: float, b: float) -> float:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        ctx.save_for_backward(a, b)
+        return operators.mul(a, b)
 
     @staticmethod
     def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
-        # TODO: Implement for Task 1.4.
-        raise NotImplementedError("Need to implement for Task 1.4")
+        # raise NotImplementedError("Need to implement for Task 1.4")
+        a, b = ctx.saved_values
+        return b * d_output, a * d_output
 
 
 class Inv(ScalarFunction):
@@ -117,13 +119,15 @@ class Inv(ScalarFunction):
     @staticmethod
     def forward(ctx: Context, a: float) -> float:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        ctx.save_for_backward(a)
+        return operators.inv(a)
 
     @staticmethod
     def backward(ctx: Context, d_output: float) -> float:
-        # TODO: Implement for Task 1.4.
-        raise NotImplementedError("Need to implement for Task 1.4")
+        # raise NotImplementedError("Need to implement for Task 1.4")
+        (a,) = ctx.saved_values
+        return operators.inv_back(a, d_output)
 
 
 class Neg(ScalarFunction):
@@ -131,13 +135,13 @@ class Neg(ScalarFunction):
     @staticmethod
     def forward(ctx: Context, a: float) -> float:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return operators.neg(a)
 
     @staticmethod
     def backward(ctx: Context, d_output: float) -> float:
-        # TODO: Implement for Task 1.4.
-        raise NotImplementedError("Need to implement for Task 1.4")
+        # raise NotImplementedError("Need to implement for Task 1.4")
+        return -d_output
 
 
 class Sigmoid(ScalarFunction):
@@ -145,13 +149,15 @@ class Sigmoid(ScalarFunction):
     @staticmethod
     def forward(ctx: Context, a: float) -> float:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        ctx.save_for_backward(a)
+        return operators.sigmoid(a)
 
     @staticmethod
     def backward(ctx: Context, d_output: float) -> float:
-        # TODO: Implement for Task 1.4.
-        raise NotImplementedError("Need to implement for Task 1.4")
+        # raise NotImplementedError("Need to implement for Task 1.4")
+        (a,) = ctx.saved_values
+        return d_output * operators.sigmoid(a) * (1 - operators.sigmoid(a))
 
 
 class ReLU(ScalarFunction):
@@ -159,13 +165,15 @@ class ReLU(ScalarFunction):
     @staticmethod
     def forward(ctx: Context, a: float) -> float:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        ctx.save_for_backward(a)
+        return operators.relu(a)
 
     @staticmethod
     def backward(ctx: Context, d_output: float) -> float:
-        # TODO: Implement for Task 1.4.
-        raise NotImplementedError("Need to implement for Task 1.4")
+        # raise NotImplementedError("Need to implement for Task 1.4")
+        (a,) = ctx.saved_values
+        return operators.relu_back(a, d_output)
 
 
 class Exp(ScalarFunction):
@@ -173,13 +181,15 @@ class Exp(ScalarFunction):
     @staticmethod
     def forward(ctx: Context, a: float) -> float:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        ctx.save_for_backward(a)
+        return operators.exp(a)
 
     @staticmethod
     def backward(ctx: Context, d_output: float) -> float:
-        # TODO: Implement for Task 1.4.
-        raise NotImplementedError("Need to implement for Task 1.4")
+        # raise NotImplementedError("Need to implement for Task 1.4")
+        (a,) = ctx.saved_values
+        return operators.exp(a) * d_output
 
 
 class LT(ScalarFunction):
@@ -187,13 +197,13 @@ class LT(ScalarFunction):
     @staticmethod
     def forward(ctx: Context, a: float, b: float) -> float:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return operators.lt(a, b)
 
     @staticmethod
     def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
-        # TODO: Implement for Task 1.4.
-        raise NotImplementedError("Need to implement for Task 1.4")
+        # raise NotImplementedError("Need to implement for Task 1.4")
+        return 0.0, 0.0
 
 
 class EQ(ScalarFunction):
@@ -201,10 +211,10 @@ class EQ(ScalarFunction):
     @staticmethod
     def forward(ctx: Context, a: float, b: float) -> float:
-        # TODO: Implement for Task 1.2.
-        raise NotImplementedError("Need to implement for Task 1.2")
+        # raise NotImplementedError("Need to implement for Task 1.2")
+        return operators.eq(a, b)
 
     @staticmethod
     def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
-        # TODO: Implement for Task 1.4.
-        raise NotImplementedError("Need to implement for Task 1.4")
+        # raise NotImplementedError("Need to implement for Task 1.4")
+        return 0.0, 0.0
diff --git a/tests/test_autodiff.py b/tests/test_autodiff.py
index fc40f00..5beb65f 100644
--- a/tests/test_autodiff.py
+++ b/tests/test_autodiff.py
@@ -4,6 +4,11 @@
 import minitorch
 from minitorch import Context, ScalarFunction, ScalarHistory
+from minitorch.autodiff import topological_sort
+
+# from minitorch.scalar_functions import Log, Exp
+# from minitorch.scalar import Scalar
+import math
 
 # ## Task 1.3 - Tests for the autodifferentiation machinery.
@@ -36,44 +41,62 @@ def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
         return d_output * (y + 1), d_output * x
 
 
+class Function3(ScalarFunction):
+    @staticmethod
+    def forward(ctx: Context, x: float, y: float) -> float:
+        "$f(x, y) = log(xy) + exp(xy)$"
+        ctx.save_for_backward(x, y)
+        return math.log(x * y) + math.exp(x * y)
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+        r"Derivatives are $f'_x(x, y) = \frac{1}{x} + exp(xy) \times y$ and $f'_y(x, y) = \frac{1}{y} + exp(xy) \times x$"
+        x, y = ctx.saved_values
+        return d_output * (1 / x + math.exp(x * y) * y), d_output * (
+            1 / y + math.exp(x * y) * x
+        )
+
+
 # Checks for the chain rule function.
 
 
 @pytest.mark.task1_3
 def test_chain_rule1() -> None:
-    x = minitorch.Scalar(0.0)
-    constant = minitorch.Scalar(
-        0.0, ScalarHistory(Function1, ctx=Context(), inputs=[x, x])
-    )
-    back = constant.chain_rule(d_output=5)
-    assert len(list(back)) == 2
+    "Check that constants are ignored."
+    constant = minitorch.Scalar(0.0, None)
+
+    y = Function1.apply(constant, constant)
+
+    back = y.chain_rule(d_output=5)
+    assert len(list(back)) == 0
 
 
 @pytest.mark.task1_3
 def test_chain_rule2() -> None:
-    var = minitorch.Scalar(0.0, ScalarHistory())
-    constant = minitorch.Scalar(
-        0.0, ScalarHistory(Function1, ctx=Context(), inputs=[var, var])
-    )
-    back = constant.chain_rule(d_output=5)
+    "Check that constants are ignored and variables get derivatives."
+    var = minitorch.Scalar(0.0)
+    constant = minitorch.Scalar(0.0, None)
+
+    y = Function1.apply(var, constant)
+
+    back = y.chain_rule(d_output=5)
     back = list(back)
-    assert len(back) == 2
+    assert len(back) == 1
     variable, deriv = back[0]
     assert deriv == 5
 
 
 @pytest.mark.task1_3
 def test_chain_rule3() -> None:
-    "Check that constrants are ignored and variables get derivatives."
-    constant = 10
+    "Check that constants are ignored and variables get derivatives."
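+    # The constant below is created without a history, so chain_rule should
+    # skip it and return only the derivative for `var`.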
     var = minitorch.Scalar(5)
+    constant = minitorch.Scalar(10, None)
 
     y = Function2.apply(constant, var)
 
     back = y.chain_rule(d_output=5)
     back = list(back)
-    assert len(back) == 2
-    variable, deriv = back[1]
+    assert len(back) == 1
+    variable, deriv = back[0]
     # assert variable.name == var.name
     assert deriv == 5 * 10
 
@@ -141,3 +164,14 @@ def test_backprop4() -> None:
     var4 = Function1.apply(var2, var3)
     var4.backward(d_output=5)
     assert var0.derivative == 10
+
+
+@pytest.mark.task1_4
+def test_backprop5() -> None:
+    # Example 5: F2(F2(1, v1), F2(1, v1))
+    var1 = minitorch.Scalar(2)
+    var2 = Function2.apply(1, var1)
+    var3 = Function2.apply(1, var1)
+    var4 = Function2.apply(var2, var3)
+    var4.backward(d_output=1)
+    assert var1.derivative == 7
diff --git a/tests/test_scalar.py b/tests/test_scalar.py
index 38ffc00..2194345 100644
--- a/tests/test_scalar.py
+++ b/tests/test_scalar.py
@@ -65,7 +65,39 @@ def test_simple(a: float, b: float) -> None:
     c = Scalar(a).relu() + Scalar(b).relu()
     assert_close(c.data, minitorch.operators.relu(a) + minitorch.operators.relu(b))
 
-    # Add others if you would like...
+    # Simple sigmoid
+    c = Scalar(a).sigmoid() + Scalar(b).sigmoid()
+    assert_close(
+        c.data, minitorch.operators.sigmoid(a) + minitorch.operators.sigmoid(b)
+    )
+
+    # Simple exp
+    c = Scalar(a).exp() + Scalar(b).exp()
+    assert_close(c.data, minitorch.operators.exp(a) + minitorch.operators.exp(b))
+
+    # Simple log
+    if a > 0:
+        c = Scalar(a).log()
+        assert_close(c.data, minitorch.operators.log(a))
+    if b > 0:
+        c = Scalar(b).log()
+        assert_close(c.data, minitorch.operators.log(b))
+
+    # Simple lt
+    c = Scalar(a) < Scalar(b)
+    assert_close(c.data, a < b)
+
+    # Simple gt
+    c = Scalar(a) > Scalar(b)
+    assert_close(c.data, a > b)
+
+    # Simple eq
+    c = Scalar(a) == Scalar(b)
+    assert_close(c.data, a == b)
+
+    # Simple sub
+    c = Scalar(a) - Scalar(b)
+    assert_close(c.data, a - b)
 
 
 one_arg, two_arg, _ = MathTestVariable._comp_testing()
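
Taken together, the float operators, the `ScalarFunction` forward/backward pairs, and the autodiff machinery in this diff let the backpropagated derivatives of a small expression be checked against `central_difference`. The snippet below is a minimal usage sketch rather than part of the diff: it assumes the package is installed as in the Setup section (so `minitorch` and `minitorch.autodiff.central_difference` are importable) and that `Scalar` multiplication from the existing scaffold is available.

```python
import minitorch
from minitorch.autodiff import central_difference


def f(x: float, y: float) -> float:
    # Plain-float version of the expression, used only for the numerical check.
    return x * y + x


# Build the same expression from Scalars so that a computational graph is recorded.
x = minitorch.Scalar(3.0)
y = minitorch.Scalar(4.0)
out = x * y + x

# Backpropagate d(out)/d(out) = 1.0 from the output back to the leaves.
out.backward(d_output=1.0)

# Each leaf derivative should match the central-difference approximation:
# d(out)/dx = y + 1 = 5, d(out)/dy = x = 3.
for i, leaf in enumerate([x, y]):
    approx = central_difference(f, 3.0, 4.0, arg=i)
    assert abs(leaf.derivative - approx) < 1e-2
```

If the implementations above are correct, both assertions pass silently.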