So, I have been trying to implement a basic autograd and neural network from scratch using NumPy. This is the part of my AD code that matters for this question, greatly shortened down to an MRE. This is grad.py:
from typing import Self
import numpy as np
class Variable:
    def __init__(self, value: np.ndarray = None):
        self.value = value if isinstance(value, np.ndarray) else np.asarray(value)
        self.prev = None

    def _variablify(self, x) -> Self:
        if not isinstance(x, Variable):
            x = Variable(x)
        return x

    def __add__(self, x) -> Self:
        x = self._variablify(x)
        y = Variable(self.value + x.value)
        return y

    def __mul__(self, x) -> Self:
        x = self._variablify(x)
        y = Variable(self.value * x.value)
        return y

    __radd__ = __add__
    __rmul__ = __mul__

    def dot(self, x):
        x = self._variablify(x)
        y = Variable(self.value.dot(x.value))
        return y

    def __lt__(self, other):
        return self.value < other

    def __gt__(self, other):
        return self.value > other


def dot(a: Variable, b: Variable):
    return a.dot(b)
In the other file, main.py, I try to implement a neural net:
from typing import Self
import numpy as np
from grad import Variable
import grad
class Layer:
    def __init__(self, neurons: int):
        self.n_size = neurons
        self.activation = Variable(0)

    def previous(self, layer: Self):
        self.previous_layer = layer
        self.previous_layer.next_layer = self

    def next(self, layer: Self):
        self.next_layer = layer
        self.next_layer.previous_layer = self

    def initialise(self):
        self.weight_matrix = Variable(np.random.normal(0, 0.01, (self.n_size, self.next_layer.n_size)))
        self.bias_vector = Variable(np.random.normal(0, 0.01, (1, self.next_layer.n_size)))
        self.next_layer.x = grad.dot(self.activation, self.weight_matrix) + self.bias_vector
        self.next_layer.activation = np.where(self.next_layer.x > 0, self.next_layer.x, 0.01 * self.next_layer.x)  # Using LeakyReLU


if __name__ == "__main__":
    input_layer = Layer(5)
    input_layer.activation = Variable(np.random.randint(1, 5, (1, 5)))
    h1 = Layer(3)
    h1.previous(input_layer)
    output = Layer(2)
    output.previous(h1)
    input_layer.initialise()
    h1.initialise()
    print(input_layer.activation, h1.activation, output.activation)
So, as you can see, in grad.py I have implemented a wrapper for the dot product. But now comes the error upon running main.py:
Traceback (most recent call last):
File ".../main.py", line 62, in <module>
h1.initialise()
File ".../main.py", line 40, in initialise
self.next_layer.x = grad.dot(self.activation, self.weight_matrix) + self.bias_vector
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File ".../grad.py", line 191, in dot
return a.dot(b)
^^^^^^^^
File ".../grad.py", line 49, in __mul__
y = Variable(self.value * x.value)
~~~~~~~~~~~^~~~~~~~~
ValueError: operands could not be broadcast together with shapes (1,3) (3,2)
Now to me, this is very strange, because the error seems to tell us that a.dot(b) somehow called __mul__, which... it never did. I have absolutely no idea what is going on here. Any help would be greatly appreciated.
Thanks.
I see you are using type annotations, but apparently you aren't using a tool like mypy to enforce them. That's why they give you a false sense of security. It is true that def dot() is annotated to accept two Variables, but you are not passing in two Variables: one of the arguments is a numpy array.

A run of mypy shows that this line is the culprit:
self.next_layer.activation = np.where(self.next_layer.x > 0, self.next_layer.x, 0.01*self.next_layer.x)
You assign a numpy array to activation, but it is supposed and assumed to be a Variable.

This explains the weird traceback: a.dot(b) is not calling your implementation but numpy's instead, and numpy's dot ends up calling __mul__ on the array's elements.
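You can confirm this with a quick throwaway check right after the input_layer.initialise() call in your __main__ block (this is only a diagnostic, not part of the fix, and the exact dtype printed may vary):

print(type(h1.activation))  # <class 'numpy.ndarray'>, not Variable
print(h1.activation.dtype)  # object -- np.where has packed your Variables into an object array

Once h1.activation is an object ndarray, grad.dot(self.activation, self.weight_matrix) resolves to ndarray.dot, which (roughly speaking) multiplies those object elements with Variable.__mul__, and that is where the (1,3) vs (3,2) broadcast error comes from.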
So you need to figure out how to fix that line. And you really should run mypy or something similar if you are using type hints.
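One minimal way to keep the activation a Variable is to apply the LeakyReLU to the underlying ndarray and wrap the result back up. This is only a sketch, not a full solution: it produces the right values and types, but it records nothing for backpropagation, so your real autograd will eventually want a proper leaky-ReLU operation on Variable itself:

def initialise(self):
    self.weight_matrix = Variable(np.random.normal(0, 0.01, (self.n_size, self.next_layer.n_size)))
    self.bias_vector = Variable(np.random.normal(0, 0.01, (1, self.next_layer.n_size)))
    self.next_layer.x = grad.dot(self.activation, self.weight_matrix) + self.bias_vector
    # LeakyReLU on the raw array, then rewrap so the next layer's dot() uses Variable.dot
    x_val = self.next_layer.x.value
    self.next_layer.activation = Variable(np.where(x_val > 0, x_val, 0.01 * x_val))

With that change the activation stays a Variable all the way through, and grad.dot keeps dispatching to your own Variable.dot instead of numpy's.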