Machine Problem: Automatic Differentiation¶

Automatic differentiation is at the heart of the most popular deep learning frameworks like PyTorch and JAX. It allows us to define custom methods as a composition of operations, and autograd can automatically differentiate through these methods.

In this Machine Problem we will be building a fully functional Autograd system to train a simple Multilayer Perceptron model for classification of MNIST digits.

Instructions¶

The first thing you need to do is to download this file: mp05.zip. It has the following content:

  • submitted.py: Your homework. Edit, and then submit to Gradescope.

  • mp05_notebook.ipynb: This is a Jupyter notebook to help you debug. You can completely ignore it if you want, although you might find that it gives you useful instructions.

  • grade.py: Once your homework seems to be working, you can test it by typing python grade.py, which will run the tests in tests/test_visible.py.

  • tests/test_visible.py: This file contains about half of the unit tests that Gradescope will run in order to grade your homework. If you can get a perfect score on these tests, then you should also get a perfect score on the additional hidden tests that Gradescope uses.

  • data: This directory contains the MNIST training data

  • load_mnist.py: This file has a helper method to load the dataset

  • requirements.txt: This tells you which python packages you need to have installed, in order to run grade.py. You can install all of those packages by typing pip install -r requirements.txt or pip3 install -r requirements.txt.

This file (mp05_notebook.ipynb) will walk you through the whole MP, giving you instructions and debugging tips as you go.

About submitted.py¶

Inside submitted.py, some methods are already provided for you! These are clearly marked as methods you should not change.

For the following methods, you should not change anything:

  • ensure_tensor
  • Tensor
  • add (Use this as an example for the other methods)

You will be implementing the following methods:

  • broadcast_grad_accumulate
  • sub
  • mul
  • div
  • matmul
  • log
  • exp
  • sum
  • max
  • backward

After you solve all of the above, the training code to build a Multilayer Perceptron on the MNIST data is provided for you!

Import Dependencies¶

All we need is NumPy (plus Matplotlib for plotting)!

In [7]:
!pip install -q numpy matplotlib

import submitted
import importlib
import numpy as np
import matplotlib.pyplot as plt

Backpropagation¶

Neural networks are just a stack of operations, some of which contain learnable parameters. Our goal is to minimize some loss function w.r.t these learnable parameters. Thus we need to compute the derivative of our loss w.r.t these parameters so we can perform gradient descent.

Case Study: Sigmoid¶

The sigmoid function is a common method used in Deep Learning to convert raw outputs to probabilities.

$$\sigma(x) = \frac{1}{1 + e^{-x}}$$

And its derivative is easily derived as:

$$\frac{d\sigma(x)}{dx} = \sigma(x) (1 - \sigma(x))$$

Let's pretend that you don't know this derivative. Can we compute it differently? How can we compute the derivative of a function without explicitly deriving it?

We can first write the sigmoid function as a composition of simpler operations:

$$ a = -1 \cdot x $$

$$ b = e^{a} $$

$$ c = 1 + b $$

$$ \sigma(x) = \frac{1}{c} $$

What we want is the derivative of the output $ \sigma(x) $ w.r.t the input $x$, so using standard chain rule:

$$\frac{d\sigma(x)}{dx} = \frac{d\sigma(x)}{dc}\frac{dc}{db}\frac{db}{da}\frac{da}{dx}$$

And each of these derivatives are expressed as:

  • $\frac{d\sigma(x)}{dc}$ - Derivative of Division
  • $\frac{dc}{db}$ - Derivative of Sum
  • $\frac{db}{da}$ - Derivative of Exponentiation
  • $\frac{da}{dx}$ - Derivative of Multiplication

So if we can describe these sub-operations and their derivatives, then we can chain their derivatives together to get the final derivative automatically. TL;DR: without explicitly defining the derivative of the sigmoid, we can compute the derivatives of the smaller operations involved and chain them together!
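
To make this concrete, here is a minimal NumPy sketch (illustrative only, not part of the MP code) that evaluates each local derivative and chains them, then checks the result against the closed-form derivative:

import numpy as np

x = 0.5

# Forward pass through the sub-operations
a = -1.0 * x       # a = -x
b = np.exp(a)      # b = e^a
c = 1.0 + b        # c = 1 + b
s = 1.0 / c        # sigma(x) = 1 / c

# Local derivatives of each sub-operation
ds_dc = -1.0 / c**2   # derivative of division
dc_db = 1.0           # derivative of sum
db_da = np.exp(a)     # derivative of exponentiation
da_dx = -1.0          # derivative of multiplication

# Chain rule vs. the closed-form sigma(x) * (1 - sigma(x))
print(ds_dc * dc_db * db_da * da_dx)  # ~0.2350
print(s * (1.0 - s))                  # ~0.2350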

Python Recap #1: Closures¶

We will be using function closures to define our methods. A closure is essentially a function that remembers and has access to variables from its enclosing (outer) scope, even after that outer function has finished executing.

Here is an example:

We will create a function called create_function that takes in a value what_to_add. The inner function add uses this value, and it is remembered even after create_function has finished executing.

For more details on how Python captures variables, see the example below:

In [8]:
def create_function(what_to_add):
  """
  This method returns another method called ```add```. The inner method ```add```
  uses the variable ```what_to_add``` from the outer scope and is remembered
  at execution time of the returned method. This is called a closure.
  """
  def add(x):
      return x + what_to_add  # what_to_add is "remembered" in the closure
  return add

### Create the adding function ###
adding_function = create_function(5)
print(adding_function)

### Run the adding function (adding 5 from the closure to 10) ###
print(adding_function(10))
<function create_function.<locals>.add at 0x7cce39c98e00>
15

Python Recap #2: Dunder Methods¶

In Python, dunder methods (short for double underscore) are special methods that let your objects interact with Python’s built-in syntax and operators.

There are a lot of these but we will mainly be focusing on math operations like sum, multiply, etc...

Let's see how Python defines these for built-in types like integers:

# The type of the number 2 is int
print(type(2)) -> <class 'int'>

# Add together two integers
print(2 + 3)

# Equivalently we can do
(2).__add__(3)

This works because the integer class has the dunder method __add__, which is triggered by the + operator. The actual logic for adding is contained within this dunder method.

In [9]:
# The type of the number 2 is int
print(type(2))

# Add together two integers
print(2 + 3)

# Equivalently we can do
(2).__add__(3)
<class 'int'>
5
Out[9]:
5
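
To see how this connects to the Tensor class we are about to build, here is a minimal sketch of a custom class (a toy example, not the MP's Tensor) that hooks into the + operator by defining __add__:

class Box:
    """A toy wrapper class, for illustration only."""
    def __init__(self, value):
        self.value = value

    def __add__(self, other):
        # Triggered by: Box + Box
        return Box(self.value + other.value)

print((Box(2) + Box(3)).value)  # 5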

Tensor¶

The core part of our Autograd system will be the Tensor class. The Tensor is a wrapper around NumPy arrays that can also track the computational graph to backpropagate on. This means each tensor can do the following:

  1. Store the Parents of an operation
  2. Store the backward method needed for gradient computation
  3. Perform backpropagation using Topological Sort

What operations do we need?¶

Linear Layers¶

$$ y = xW + b $$

  • matrix multiplication
  • addition (adding two different tensors together)

Tanh Activation¶

$$\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}$$

  • exponentiation
  • multiplication
  • division

Stable Softmax¶

$$\text{softmax}(x_i) = \frac{e^{x_i - \max_j x_j}}{\sum_{k=1}^{n} e^{x_k - \max_j x_j}}$$

  • exponentiation
  • sum (reducing along a dimension of a tensor)
  • division
  • max (along a dimension of a tensor)

Cross Entropy Loss¶

$$L = -\sum_i y_i \log(p_i)$$

  • log
  • sum (reducing along a dimension of a tensor)
  • multiplication

Types of Operations¶

Operations can be generally grouped into 3 categories:

  • Binary Operations (between two tensors like A + B)
  • Unary Operations (on a single tensor like log(A))
  • Reduction Operations (like the sum of a tensor, which reduces dimensionality)

Binary Operations:

  • Add
  • Sub
  • Mul
  • Div
  • Matmul

Unary Operations

  • Exp
  • Log

Reduction Operations

  • Sum
  • Max

A Short Example¶

We define a Tensor here with only the __add__ operation. The more complete version we will actually be working with is in submitted.py.

In [10]:
import numpy as np

def ensure_tensor(x):
    """
    
    #### DO NOT CHANGE ####
    
    Quick helper method to check if something is a Tensor,
    and otherwise convert it. Tensors have a .grad attribute,
    unlike NumPy arrays, so we can check for that!
    """
    if hasattr(x, "grad"):
        return x
    else:
        return Tensor(x, requires_grad=False)
        
class Tensor:

    """
    This is the structure for a tensor! There is no need 
    to change any code here; throughout the MP we will be defining 
    all of these methods!

    Args:
        - array: numpy array
        - requires_grad: do we need to track gradients on this tensor
        - parents: the tensors from which this new tensor was derived
        - grad_fn: the backward method for this operation
    """
    def __init__(self,
                 array,
                 requires_grad=False,
                 parents=(),
                 grad_fn=None,
                 grad_fn_name=None):

        self.array = np.array(array, dtype=float)
        self.requires_grad = requires_grad
        self.grad = np.zeros_like(self.array) if requires_grad else None
        self.parents = parents
        self.grad_fn = grad_fn
        self.grad_fn_name = grad_fn_name

    def zero_grad(self):
        """
        Allows us to zero out the gradients. After an optimization step, we can use this
        to zero the grad for the next gradient computation
        """
        if self.requires_grad:
            self.grad = np.zeros_like(self.grad)

    @property
    def shape(self):
        """
        Allows us to do Tensor.shape to get the shape, just like in numpy where you can
        do ndarray.shape to get the shape of the array
        """
        return self.array.shape
        
    def __add__(self, other):
        """
        self + other
        """
        return add(ensure_tensor(self), ensure_tensor(other))

    def __radd__(self, other):
        """
        __r(op)__ is a reverse method in case of failure. For example, if we do:

            Tensor + Int -> (Tensor).__add__(Int). This works, as our Tensor class has an __add__ method that can handle this by converting the int
            to a Tensor first and then performing the operation (this will be defined in our add() method later).

            Int + Tensor -> (Int).__add__(Tensor). This fails, as the built-in Python int doesn't know how to perform an add operation with a Tensor,
            since Tensor is a class we are making and Python's int has never seen it before. Python then falls back to __radd__, which again
            performs Tensor + Int -> (Tensor).__add__(Int) as a backup to see if this works.

        self + other
        """
        return add(ensure_tensor(self), ensure_tensor(other))

    def __repr__(self):

        """
        Just some pretty printing to make it easier to inspect the Tensors!
        """
        data = self.array

        data_str = np.array2string(
            data,
            separator=" ",
            precision=3,
            floatmode="fixed",
            max_line_width=80
        )

        lines = data_str.split("\n")
        if len(lines) > 1:
            indent = " " * len("tensor(")
            data_str = lines[0] + "\n" + "\n".join(indent + line for line in lines[1:])

        grad_info = ""
        if getattr(self, "requires_grad", False):
            if getattr(self, "grad_fn_name", None) is not None:
                grad_info = f", grad_fn={getattr(self, 'grad_fn_name', None)}"
            else:
                grad_info = ", requires_grad=True"
                
        return f"tensor({data_str}{grad_info})"

Let's Do an Example¶

We provide the add method here so you can understand the structure of these methods and the things we need to account for, like broadcasting!

Recap: Broadcasting¶

Broadcasting is commonly found in most array-processing packages like NumPy. The idea is to automatically expand lower-dimensional tensors to match higher-dimensional ones in element-wise operations. For example:

$$ \begin{bmatrix}1 & 2 & 3 \\ 4 & 5 & 6\end{bmatrix} + \begin{bmatrix}10 & 20 & 30\end{bmatrix} = \begin{bmatrix}11 & 22 & 33 \\ 14 & 25 & 36\end{bmatrix} $$

So in NumPy, if we try to add together a (2 x 3) tensor and a (3) vector, it will automatically broadcast over the first dimension, essentially just repeating along that dimension like so:

$$ \begin{bmatrix}1 & 2 & 3 \\ 4 & 5 & 6\end{bmatrix} + \begin{bmatrix}10 & 20 & 30 \\ 10 & 20 & 30\end{bmatrix} = \begin{bmatrix}11 & 22 & 33 \\ 14 & 25 & 36\end{bmatrix} $$

This can be a little challenging, as tensors can have different numbers of dimensions. To simplify, we will always ensure that tensors we want to do any binary operations between have the same number of dimensions, with each dimension we want to broadcast over clearly indicated with a 1. So instead of a (2 x 3) tensor with a (3) vector, we will instead use a (2 x 3) tensor with a (1 x 3) tensor, and the 1 will be the dimension that we broadcast over if needed!

Add Method¶

We will first implement the sum of two tensors (accounting for potential broadcasting). Let's say we have some operation:

$$ y = A + B$$

We want the derivative of the output w.r.t the different inputs:

$$\frac{dy}{dA}, \frac{dy}{dB}$$

Well, that's pretty easy:

$$\frac{dy}{dA} = 1, \frac{dy}{dB} = 1$$

But what we really want isn't just the derivative of $y$ w.r.t these variables, but rather the derivative of the loss $L$ w.r.t these variables. This assumes that we already have $\frac{dL}{dy}$, which is our upstream gradient on the computational graph. Then all we need is:

$$\frac{dL}{dA}=\frac{dL}{dy}\frac{dy}{dA}=\frac{dL}{dy}*1, \frac{dL}{dB}=\frac{dL}{dy}\frac{dy}{dB}=\frac{dL}{dy}*1$$

Adjusting for Broadcasting¶

The upstream gradients are always in the shape of our output tensor. So let's pretend $A$ and $B$ are learnable parameters we are computing gradients for, and in our forward method we did:

$$ y = A + B = \begin{bmatrix}1 & 2 & 3 \\ 4 & 5 & 6\end{bmatrix} + \begin{bmatrix}10 & 20 & 30\end{bmatrix} = \begin{bmatrix}11 & 22 & 33 \\ 14 & 25 & 36\end{bmatrix} $$

In our backward method, the upstream grad would be in the shape of our output (2 x 3):

$$\frac{dL}{dy} = \begin{bmatrix}\nabla_0 & \nabla_1 & \nabla_2\\ \nabla_3 & \nabla_4 & \nabla_5\end{bmatrix}$$

Grad w.r.t A¶

So following our gradient rule for add we have for $A$:

$$\frac{dL}{dA} = \frac{dL}{dy}\frac{dy}{dA}= \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix} \frac{dy}{dA} = \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix} \cdot 1= \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix}$$

And so our gradient is a (2 x 3) tensor and our parameter tensor $A$ is also (2 x 3). Thus we can easily do gradient descent.

$$ A \leftarrow A - \lambda \cdot \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix} = \begin{bmatrix}1 & 2 & 3 \\ 4 & 5 & 6\end{bmatrix} - \lambda \cdot \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix} $$

Grad w.r.t B¶

What about for $B$? Following the same logic we get:

$$\frac{dL}{dB} = \frac{dL}{dy}\frac{dy}{dB}= \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix} \frac{dy}{dB} = \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix} \cdot 1= \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix}$$

And so our gradient is a (2 x 3) tensor, but our parameter tensor $B$ is (1 x 3). We have a shape issue! We can't do an operation between a (2 x 3) and a (1 x 3) tensor.

$$ \begin{bmatrix}10 & 20 & 30\end{bmatrix} - \lambda \cdot \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \\ \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix} $$

But when we did the forward pass, we technically repeated the dimension like this:

$$ \begin{bmatrix}1 & 2 & 3 \\ 4 & 5 & 6\end{bmatrix} + \begin{bmatrix}10 & 20 & 30\end{bmatrix} = \begin{bmatrix}1 & 2 & 3 \\ 4 & 5 & 6\end{bmatrix} + \begin{bmatrix}10 & 20 & 30 \\ 10 & 20 & 30\end{bmatrix} = \begin{bmatrix}11 & 22 & 33 \\ 14 & 25 & 36\end{bmatrix} $$

This means the (1 x 3) tensor $B$ was used twice, making one contribution to the forward pass for each row of $A$. Let's call $y_1$ the first row and $y_2$ the second row of our output $y$, and similarly let's call $A_1$ the first row and $A_2$ the second row of our tensor $A$.

Then the derivative of $L$ with respect to $B$ needs to sum the contributions from each repetition in the forward pass:

$$ \frac{dL}{dB} = \frac{\partial L}{\partial y_1} \frac{\partial y_1}{\partial B} + \frac{\partial L}{\partial y_2} \frac{\partial y_2}{\partial B} $$

where $y_1 = A_1 + B$ and $y_2 = A_2 + B$. Since $\frac{\partial y_1}{\partial B} = \frac{\partial y_2}{\partial B} = 1$, we get:

$$ \frac{dL}{dB} = \begin{bmatrix} \nabla_0 & \nabla_1 & \nabla_2 \end{bmatrix} + \begin{bmatrix} \nabla_3 & \nabla_4 & \nabla_5 \end{bmatrix}= \begin{bmatrix} \nabla_0 + \nabla_3 & \nabla_1 + \nabla_4 & \nabla_2 + \nabla_5 \end{bmatrix} $$

So, when updating $B$ using gradient descent:

$$ B \leftarrow B - \lambda \cdot \frac{dL}{dB} = \begin{bmatrix}10 & 20 & 30\end{bmatrix} - \lambda \cdot \begin{bmatrix} \nabla_0 + \nabla_3 & \nabla_1 + \nabla_4 & \nabla_2 + \nabla_5 \end{bmatrix} $$

This correctly handles the broadcasted gradient, summing contributions across rows.
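
As a concrete NumPy illustration of this idea (not the required implementation of broadcast_grad_accumulate, just the core operation it needs): summing the upstream gradient over the broadcast dimension, with keepdims=True, recovers a gradient with the parameter's shape:

import numpy as np

grad_out = np.array([[1., 4., 7.],
                     [2., 1., 6.]])           # upstream grad, shape (2, 3)
grad_B = grad_out.sum(axis=0, keepdims=True)  # collapse the broadcast dim
print(grad_B)        # [[ 3.  5. 13.]]
print(grad_B.shape)  # (1, 3) -- matches B's shape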

Implement broadcast_grad_accumulate¶

We create a fake tensor that acts as the tensor that was broadcast, as well as a fake upstream gradient. We expect the final grad to sum along dim=0 in this case!

In [11]:
importlib.reload(submitted)

test_tensor = Tensor([[1,2,3]]) # (1,3) tensor that was broadcasted to a (2,3) tensor
dummy_grad = np.array([[1,4,7], [2,1,6]]) # (2,3) upstream gradients we want to accumulate

### In submitted.py implement broadcast_grad_accumulate
accum_grad = submitted.broadcast_grad_accumulate(test_tensor, dummy_grad)

print(accum_grad)
print(accum_grad.shape)
[[ 3  5 13]]
(1, 3)

Study the add method¶

The remainder of the add method is provided here to act as a template for all the other methods to come! Study and understand this code so you can replicate the logic later! It is already in submitted.py, but we include it here as an example as well!

In [12]:
def add(a, b):


    ### Check if this operation requires grad. If neither a nor b requires gradients ###
    ### then we don't need to track the computational graph here ###
    requires_grad = a.requires_grad or b.requires_grad

    ### Compute the forward pass ###
    output = a.array + b.array
    
    ### Define the backward method as a Closure ###
    def backward(grad_out): # grad_out is the upstream gradients

        # grad_out = dL/dy -> our upstream gradients

        ### If a requires grad then we compute it here ###
        if a.requires_grad:

            ### grad_a = dL/dy * 1 = dL/dy
            grad_a = grad_out

            ### Broadcasting accumulation ###
            grad_a = submitted.broadcast_grad_accumulate(a, grad_a)

            ### Accumulate gradients into the .grad attribute in A
            a.grad += grad_a

        ### Same logic! ###
        if b.requires_grad:
            grad_b = grad_out
            grad_b = submitted.broadcast_grad_accumulate(b, grad_b)
            b.grad += grad_b

    ### Create a new tensor with our attributes 
    return Tensor(
        output, # new tensor is A + B so store the output here
        requires_grad=requires_grad, # Do we need to track grads?
        parents=(a, b), # What tensors were used to create this new one?
        grad_fn=backward if requires_grad else None, # Store the backward method for usage later
        grad_fn_name="<AddBackward>" # Keep a name for visualization
    )

Test 1: No Broadcasting¶

Let's just do a simple sum and check the output. For simplicity we will assume the upstream gradient is all ones, in the shape of the output, which is (2,3)! Because we are using a matrix of ones for our upstream grad, and the derivative of a sum is just 1, we expect the gradients for A and B to again be matrices of ones, both of them (2,3).

In [13]:
### Create some dummy tensors to add ###
A = Tensor([[1,2,3],[4,5,6]], requires_grad=True)
B = Tensor([[8,2,5],[2,7,3]], requires_grad=True)

### Perform the Sum ###
out = A + B
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.ones_like(out.array)

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
print("Grad w.r.t B:")
print(B.grad)
print(B.shape)
Output:
tensor([[ 9.000  4.000  8.000]
        [ 6.000 12.000  9.000]], grad_fn=<AddBackward>)
Grad w.r.t A:
[[1. 1. 1.]
 [1. 1. 1.]]
(2, 3)
Grad w.r.t B:
[[1. 1. 1.]
 [1. 1. 1.]]
(2, 3)

Test 2: Broadcasting¶

Let's do the same thing, but now with broadcasting occurring, to see what our gradients look like. $A$ will again be a (2 x 3) tensor, but $B$ will be a (1 x 3) tensor. Therefore, we expect the gradient of $A$ to be an all-ones tensor (just like before), but the gradient of $B$ should be a (1 x 3) tensor of 2s, as we accumulate the gradients along the first dimension.

In [14]:
### Create some dummy tensors to add ###
A = Tensor([[1,2,3],[4,5,6]], requires_grad=True)
B = Tensor([[8,2,5]], requires_grad=True)

### Perform the Sum ###
out = A + B
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.ones_like(out.array)

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
print("Grad w.r.t B:")
print(B.grad)
print(B.shape)
Output:
tensor([[ 9.000  4.000  8.000]
        [12.000  7.000 11.000]], grad_fn=<AddBackward>)
Grad w.r.t A:
[[1. 1. 1.]
 [1. 1. 1.]]
(2, 3)
Grad w.r.t B:
[[2. 2. 2.]]
(1, 3)

Use the Full Tensor Definition¶

We now import the Tensor class from submitted.py, which we will be adding methods to for the rest of this MP and which has all the operations defined!

In [15]:
importlib.reload(submitted)
from submitted import Tensor

Subtraction¶

Implement the forward and backward pass of $A-B$. This is the sub method in submitted.py.
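
As a hint (not spelled out above, but it follows directly from the chain rule, just like the add example, with a sign flip on the second argument):

$$y = A - B \;\Rightarrow\; \frac{\partial y}{\partial A} = 1, \quad \frac{\partial y}{\partial B} = -1 \;\Rightarrow\; \frac{dL}{dA} = \frac{dL}{dy}, \quad \frac{dL}{dB} = -\frac{dL}{dy}$$

followed by the same broadcast_grad_accumulate step as in add.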

In [ ]:
importlib.reload(submitted)
from submitted import Tensor

### Create some dummy tensors to subtract ###
A = Tensor([[1,2,3],[4,5,6]], requires_grad=True)
B = Tensor([[8,2,5]], requires_grad=True)

### Perform the subtraction ###
out = A - B
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.ones_like(out.array)

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
print("Grad w.r.t B:")
print(B.grad)
print(B.shape)
Output:
tensor([[-7.000  0.000 -2.000]
        [-4.000  3.000  1.000]], grad_fn=<SubBackward>)
Grad w.r.t A:
[[1. 1. 1.]
 [1. 1. 1.]]
(2, 3)
Grad w.r.t B:
[[-2. -2. -2.]]
(1, 3)

Multiplication¶

Implement the forward and backward pass of $A*B$ (element-wise). This is the mul method in submitted.py.
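
As a hint, for element-wise multiplication the local derivative with respect to each input is simply the other input:

$$y = A \cdot B \;\Rightarrow\; \frac{dL}{dA} = \frac{dL}{dy} \cdot B, \quad \frac{dL}{dB} = \frac{dL}{dy} \cdot A$$

where all products are element-wise, followed again by broadcast accumulation.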

In [16]:
importlib.reload(submitted)
from submitted import Tensor

### Create some dummy tensors to multiply ###
A = Tensor([[1,2,3],[4,5,6]], requires_grad=True)
B = Tensor([[8,2,5]], requires_grad=True)

### Perform the multiplication ###
out = A * B
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.ones_like(out.array)

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
print("Grad w.r.t B:")
print(B.grad)
print(B.shape)
Output:
tensor([[ 8.000  4.000 15.000]
        [32.000 10.000 30.000]], grad_fn=<MulBackward>)
Grad w.r.t A:
[[8. 2. 5.]
 [8. 2. 5.]]
(2, 3)
Grad w.r.t B:
[[5. 7. 9.]]
(1, 3)

Division¶

Implement the forward and backward pass of $A/B$. This is the div method in submitted.py.
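
As a hint, applying the quotient rule element-wise gives:

$$y = \frac{A}{B} \;\Rightarrow\; \frac{dL}{dA} = \frac{dL}{dy} \cdot \frac{1}{B}, \quad \frac{dL}{dB} = -\frac{dL}{dy} \cdot \frac{A}{B^2}$$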

In [ ]:
importlib.reload(submitted)
from submitted import Tensor

### Create some dummy tensors to divide ###
A = Tensor([[1,2,3],[4,5,6]], requires_grad=True)
B = Tensor([[8,2,5]], requires_grad=True)

### Perform the division ###
out = A / B
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.ones_like(out.array)

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
print("Grad w.r.t B:")
print(B.grad)
print(B.shape)
Output:
tensor([[0.125 1.000 0.600]
        [0.500 2.500 1.200]], grad_fn=<DivBackward>)
Grad w.r.t A:
[[0.125 0.5   0.2  ]
 [0.125 0.5   0.2  ]]
(2, 3)
Grad w.r.t B:
[[-0.078125   -1.75       -0.35999998]]
(1, 3)

Matrix Multiplication¶

Matrix multiplication has no broadcasting. This explicitly does a (p x q) @ (q x r) operation!

Implement the forward and backward pass of $A \cdot B$. This is the matmul method in submitted.py
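
As a hint, the standard matrix-calculus identities are (note the transposes, which make the shapes work out):

$$y = A B \;\Rightarrow\; \frac{dL}{dA} = \frac{dL}{dy} \, B^\top, \quad \frac{dL}{dB} = A^\top \, \frac{dL}{dy}$$

A quick shape check: if $A$ is $(p \times q)$ and $B$ is $(q \times r)$, then $\frac{dL}{dy}$ is $(p \times r)$, so $\frac{dL}{dy} B^\top$ is $(p \times q)$ and $A^\top \frac{dL}{dy}$ is $(q \times r)$, exactly matching the inputs.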

In [17]:
importlib.reload(submitted)
from submitted import Tensor

### Create some dummy tensors to matmul ###
A = Tensor([[1,1,1],[1,1,1]], requires_grad=True)
B = Tensor([[2,2], [2,2], [2,2]], requires_grad=True)

### Perform the matmul ###
out = A @ B
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.ones_like(out.array)

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
print("Grad w.r.t B:")
print(B.grad)
print(B.shape)
Output:
tensor([[6.000 6.000]
        [6.000 6.000]], grad_fn=<MatmulBackward>)
Grad w.r.t A:
[[4. 4. 4.]
 [4. 4. 4.]]
(2, 3)
Grad w.r.t B:
[[2. 2.]
 [2. 2.]
 [2. 2.]]
(3, 2)

Log¶

Log (the natural log, base $e$) is an element-wise (unary) operation.

Implement the forward and backward pass of $\log(A)$. This is the log method in submitted.py
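
As a hint, the element-wise derivative of the natural log is:

$$y = \log(A) \;\Rightarrow\; \frac{dL}{dA} = \frac{dL}{dy} \cdot \frac{1}{A}$$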

In [ ]:
importlib.reload(submitted)
from submitted import Tensor

### Create some dummy tensors to log ###
A = Tensor([[1,1,1],[1,1,1]], requires_grad=True)

### Perform the log ###
out = A.log()
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.ones_like(out.array)

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
Output:
tensor([[0.000 0.000 0.000]
        [0.000 0.000 0.000]], grad_fn=<LogBackward>)
Grad w.r.t A:
[[1. 1. 1.]
 [1. 1. 1.]]
(2, 3)

Exponentiation¶

Exponentiation is an element-wise (unary) operation.

Implement the forward and backward pass of $e^A$. This is the exp method in submitted.py
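
As a hint, the exponential is its own derivative, so the backward pass can simply reuse the forward output:

$$y = e^{A} \;\Rightarrow\; \frac{dL}{dA} = \frac{dL}{dy} \cdot e^{A}$$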

In [18]:
importlib.reload(submitted)
from submitted import Tensor

### Create some dummy tensors to exponentiate ###
A = Tensor([[1,1,1],[1,1,1]], requires_grad=True)

### Perform the exp ###
out = A.exp()
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.ones_like(out.array)

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
Output:
tensor([[2.718 2.718 2.718]
        [2.718 2.718 2.718]], grad_fn=<ExpBackward>)
Grad w.r.t A:
[[2.718282 2.718282 2.718282]
 [2.718282 2.718282 2.718282]]
(2, 3)

Example: Reduction Operations and Their Derivatives¶

Suppose we have a 2×3 matrix:

$$ A = \begin{bmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \end{bmatrix} $$

1. Sum across rows (axis=0)¶

$$ \text{sum}_{\text{rows}}(A) = \begin{bmatrix} 1+4 & 2+5 & 3+6 \end{bmatrix} = \begin{bmatrix} 5 & 7 & 9 \end{bmatrix} $$

Derivative / Backprop:

  • Let the gradient coming from the next layer be

$$ \frac{dL}{d(\text{sum}_{\text{rows}}(A))} = \begin{bmatrix} g_0 & g_1 & g_2 \end{bmatrix} $$

Remember, the shape of the upstream derivative always matches the shape of the output of the operation!

  • Then the gradient w.r.t $A$ is broadcast along the summed axis:

$$ \frac{dL}{dA} = \begin{bmatrix} g_0 & g_1 & g_2 \\ g_0 & g_1 & g_2 \end{bmatrix} $$

2. Sum across columns (axis=1)¶

$$ \text{sum}_{\text{columns}}(A) = \begin{bmatrix} 1+2+3 \\ 4+5+6 \end{bmatrix} = \begin{bmatrix} 6 \\ 15 \end{bmatrix} $$

Derivative / Backprop:

  • Let the gradient coming from the next layer be

$$ \frac{dL}{d(\text{sum}_{\text{columns}}(A))} = \begin{bmatrix} g_0 \\ g_1 \end{bmatrix} $$

  • Then the gradient w.r.t $A$ is broadcast along the summed axis:

$$ \frac{dL}{dA} = \begin{bmatrix} g_0 & g_0 & g_0 \\ g_1 & g_1 & g_1 \end{bmatrix} $$

3. Sum of all elements¶

$$ \text{sum}(A) = 1+2+3+4+5+6 = 21 $$

Derivative / Backprop:

  • Let the gradient from the next layer be $g$
  • Then the gradient w.r.t $A$ is broadcast to all elements:

$$ \frac{dL}{dA} = \begin{bmatrix} g & g & g \\ g & g & g \end{bmatrix} $$

As we saw earlier with add, an addition in the forward pass simply copies the upstream gradient to everything that contributed to the sum in the backward pass. We do the same thing here. If we sum along a row, each value in that row contributed to the result. The derivative w.r.t each of those elements is just 1, so we just copy the gradient into each of those positions!

HINT¶

To make this easier, you can make use of the np.broadcast_to method, which you can read more about in the NumPy documentation.
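
Here is a minimal illustration of np.broadcast_to in this setting (assuming the reduction used keepdims=True; if keepdims was False, you would first need to re-insert the reduced axis, e.g. with np.expand_dims):

import numpy as np

grad_out = np.array([[1.], [2.]])           # dL/d(sum(A, axis=1, keepdims=True)), shape (2, 1)
grad_A = np.broadcast_to(grad_out, (2, 3))  # copy each row's gradient across that row
print(grad_A)
# [[1. 1. 1.]
#  [2. 2. 2.]]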

This is the sum method in submitted.py

In [19]:
importlib.reload(submitted)
from submitted import Tensor

### Create a tensor to reduce sum on
A = Tensor([[1,1,1],[1,1,1]], requires_grad=True)

### Perform the sum reduction
out = A.sum(axis=1, keepdims=False)
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.array([1,2])

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
Output:
tensor([3.000 3.000], grad_fn=<SumBackward>)
Grad w.r.t A:
[[1. 1. 1.]
 [2. 2. 2.]]
(2, 3)

Max¶

Max is not differentiable everywhere, but we can still define a sensible gradient for it. If we take the max of a vector, then in the backward pass the element that was the max gets a gradient of 1 and everything else gets 0. This is because the elements that were not the max contributed nothing to later operations (thus 0), while the element that was the max did contribute (thus 1).

Suppose we have a 2×3 matrix:

$$ A = \begin{bmatrix} 1 & 5 & 3 \\ 4 & 2 & 6 \end{bmatrix} $$

1. Max along rows (axis=0)¶

$$ \text{max}_{\text{rows}}(A) = \begin{bmatrix} \max(1,4) & \max(5,2) & \max(3,6) \end{bmatrix} = \begin{bmatrix} 4 & 5 & 6 \end{bmatrix} $$

Derivative / Backprop:

  • Let the gradient coming from the next layer be

$$ \frac{dL}{d(\text{max}_{\text{rows}}(A))} = \begin{bmatrix} g_0 & g_1 & g_2 \end{bmatrix} $$

  • Gradient w.r.t $A$ is 1 only at the position of the max in each column, 0 elsewhere:

$$ \frac{dL}{dA} = \begin{bmatrix} 0 & 1 \cdot g_1 & 0 \\ 1 \cdot g_0 & 0 & 1 \cdot g_2 \end{bmatrix}= \begin{bmatrix} 0 & g_1 & 0 \\ g_0 & 0 & g_2 \end{bmatrix} $$

2. Max along columns (axis=1)¶

$$ \text{max}_{\text{columns}}(A) = \begin{bmatrix} \max(1,5,3) \\ \max(4,2,6) \end{bmatrix} = \begin{bmatrix} 5 \\ 6 \end{bmatrix} $$

Derivative / Backprop:

  • Let the gradient coming from the next layer be

$$ \frac{dL}{d(\text{max}_{\text{columns}}(A))} = \begin{bmatrix} g_0 \\ g_1 \end{bmatrix} $$

  • Gradient w.r.t $A$ is 1 only at the max in each row, 0 elsewhere:

$$ \frac{dL}{dA} = \begin{bmatrix} 0 & g_0 & 0 \\ 0 & 0 & g_1 \end{bmatrix} $$

3. Max of all elements¶

$$ \text{max}(A) = 6 $$

Derivative / Backprop:

  • Let the gradient from the next layer be $g$
  • Gradient w.r.t $A$ is 1 only at the maximum element:

$$ \frac{dL}{dA} = \begin{bmatrix} 0 & 0 & 0 \\ 0 & 0 & g \end{bmatrix} $$
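
One common implementation trick (a sketch of the idea, not necessarily the exact code expected in submitted.py) is to build a boolean mask marking where the max occurred and route the upstream gradient through it. Note that if there are ties, this particular sketch sends the gradient to every tied position:

import numpy as np

A = np.array([[1., 5., 3.],
              [4., 2., 6.]])
out = A.max(axis=0, keepdims=True)      # shape (1, 3): [[4. 5. 6.]]
mask = (A == out).astype(float)         # 1.0 where the max lives, 0.0 elsewhere
grad_out = np.array([[10., 20., 30.]])  # pretend upstream gradient, shape (1, 3)
grad_A = mask * grad_out                # broadcasts grad_out over the rows
print(grad_A)
# [[ 0. 20.  0.]
#  [10.  0. 30.]]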

This is the max method in submitted.py

In [ ]:
importlib.reload(submitted)
from submitted import Tensor

### Create a tensor to reduce max on
A = Tensor([[1,2,3],[6,4,0]], requires_grad=True)

### Perform the reduce max
out = A.max(axis=1, keepdims=False)
print("Output:")
print(out)

### Create some upstream gradient ###
pretend_upstream_grad = np.array([1,2])

### Perform the grad fn ###
out.grad_fn(pretend_upstream_grad)

print("Grad w.r.t A:")
print(A.grad)
print(A.shape)
Output:
tensor([3.000 6.000], grad_fn=<MaxBackward>)
Grad w.r.t A:
[[0. 0. 1.]
 [2. 0. 0.]]
(2, 3)

Topological Sort¶

Now that our methods and their backward passes are done, we need to write the method that actually performs backpropagation! This is done with the help of an algorithm called topological sort.

Why topological sort?¶

In our autograd system, each Tensor can depend on other Tensors through operations (parents).

  • When performing backpropagation, we need to compute the gradients of children before their parents, because a parent's gradient accumulates contributions from all of the children derived from it.
  • The computation graph is a Directed Acyclic Graph (DAG), where edges point from parents to children.

Topological sort gives us an ordering of the nodes of the graph such that every parent appears before its children; reversing that ordering then lets us propagate gradients correctly. Let's take the following operation:

a = input * 2
b = a + 3
c = b / 4
d = a + c

Our graph would look like

                 ---------------------------------                       
                 △                               ▽
                 |                               |
input --[mul]--▷ A --[sum]--▷ B --[div]--▷ C --[sum]--▷ D
          |            |            |     
          2            3            4        


Now during Backpropagation, we want to compute:

dD/dC, dC/dB, dB/dA, dA/dInput

The issue is, before we compute dA/dInput, notice there are two
contributions to the gradients up to A. We need to make sure we 
sum those contributions up BEFORE we continue further back toward
the input. 

Topological Sort:

This is a linear ordering of vertices in an (Acyclic) Graph such that for EVERY DIRECTED EDGE
u -> v, vertex u must come before v in the ordering. 

In our case all of our edges are:

input → A
A → B
A → D
B → C
C → D

Therefore:
A contributes to B,D
B contributes to C
C contributes to D

So then we can identify dependencies:

A depends on input, so it must come after input
B depends on A so it must come after A
C depends on B so it must come after B
D depends on A AND C, so it must come after both

and so we can make the sort be:

input -> A -> B -> C -> D

Now for the backward pass we just reverse this list!

D -> C -> B -> A -> input

D has two parents: C and A -> compute dD/dC, dD/dA
    This handles the top branch!
C has one parent B -> compute dC/dB
B has one parent A -> compute dB/dA

Now we have computed the two streams UP TO A:

    1) dD/dA
    2) (dD/dC) * (dC/dB) * (dB/dA)

There are no more upstream dependencies for A, so we can now 
continue on to dA/dInput!
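
For reference, here is a minimal sketch of the classic DFS-based topological sort over a tensor's .parents (the names topo_sort, order, and visited are illustrative; your build_topo in submitted.py may differ in its details):

def topo_sort(root):
    order, visited = [], set()

    def build_topo(node):
        if id(node) not in visited:
            visited.add(id(node))
            for parent in node.parents:
                build_topo(parent)  # visit all parents first
            order.append(node)      # a node is appended only after its parents

    build_topo(root)
    return order  # parents before children; reverse it to backpropagate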

Once the backward method is implemented, we can do a quick test with the sigmoid function! Again, as a recap:

$$\sigma(x) = \frac{1}{1 + e^{-x}}$$

And its derivative is easily derived as:

$$\frac{d\sigma(x)}{dx} = \sigma(x) (1 - \sigma(x))$$

We can check that autograd is giving the same values as manually computing the derivative!

HINT:¶

The parents of an operation are stored in the .parents attribute of the Tensor!

In [31]:
### Define some Tensors ###
A = Tensor([[1,2,3],[4,5,6]], requires_grad=True)
B = Tensor([[8,2,5]], requires_grad=True)

### Perform an operation 
Y = A + B
print(Y)

### Check parents ###
Y.parents
tensor([[ 9.000  4.000  8.000]
        [12.000  7.000 11.000]], grad_fn=<AddBackward>)
Out[31]:
(tensor([[1.000 2.000 3.000]
         [4.000 5.000 6.000]], requires_grad=True),
 tensor([[8.000 2.000 5.000]], requires_grad=True))

Now you can solve the backward method in submitted.py. Your main focus is the build_topo function inside it!

In [32]:
importlib.reload(submitted)
from submitted import Tensor

x = Tensor([[1,2,3],[6,4,0]], requires_grad=True)

def sigmoid(x):
    return 1 / (1 + (-1 * x).exp())
    

output = sigmoid(x)
topo = output.backward()

print("Gradient from AutoGrad")
print(x.grad)

print("Manual Gradients")
print(output.array * (1 - output.array))
Gradient from AutoGrad
[[0.19661194 0.10499357 0.04517666]
 [0.00246651 0.01766271 0.25      ]]
Manual Gradients
[[0.19661193 0.10499363 0.04517666]
 [0.00246647 0.01766273 0.25      ]]

DONE!¶

As of this point you have solved all the MP problems and can go ahead and upload submitted.py to Gradescope! But if you want to see your autograd system in action, then keep going. We define activation functions, loss functions, linear layers, and an optimizer using the Tensor class you just built!

Activation Functions¶

Now that we have completed all of our operations and their forward/backward passes, we can write the higher-level functions we need! The two activation functions are Softmax and Tanh!

$$\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}$$

$$\text{softmax}(x_i) = \frac{e^{x_i - \max_j x_j}}{\sum_{k=1}^{n} e^{x_k - \max_j x_j}}$$

In [33]:
def tanh(x):
    
    """
    tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
    """
    
    output = (x.exp() - (-1*x).exp()) / (x.exp() + (-1*x).exp())
    
    return output

def softmax(x):
    """
    Stable Softmax:
    
    softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x))
    """
    shifted = x - x.max(axis=-1, keepdims=True)
    exp = shifted.exp()
    probs = exp / exp.sum(axis=-1, keepdims=True)  # same axis as the max above
    return probs

Linear Layer¶

The main backbone of our model will be the linear projection layers, we can implement that here!

In [34]:
class Linear:

    """
    y = x @ weights + b 
    
    weights: (in_features x out_features)
    bias: (1 x out_features)

    Initialize weights/bias from uniform(-sqrt(k), sqrt(k)) where k = 1 / in_features. This is standard in PyTorch.
    """
    def __init__(self, 
                 in_features,
                 out_features):

        k = 1 / in_features

        # Initialize weights uniformly
        self.W = Tensor(
            np.random.uniform(-np.sqrt(k), np.sqrt(k), size=(in_features, out_features)),
            requires_grad=True # This is a learnable parameter, so we require grad
        )
        
        # Initialize bias uniformly
        self.b = Tensor(
            np.random.uniform(-np.sqrt(k), np.sqrt(k), size=(1, out_features)),
            requires_grad=True # This is a learnable parameter, so we require grad
        )

    def forward(self, x):

        ### Write the operation of a linear layer (projection and a bias)
        return x @ self.W + self.b

    def __call__(self, x):
        """
        Allows us to avoid repeatedly writing .forward; it's just shorthand:

            layer = Linear(input, output)

            layer.forward(x) or layer(x) are equivalent!
            
        """
        return self.forward(x)
        
    def parameters(self):
        """
        These are the parameters we want to update later
        """
        return [self.W, self.b]

Cross Entropy Loss¶

For multiclass classification, the cross-entropy loss is:

$$ L = - \sum_i y_i \log(p_i) $$

where:

  • $p_i$ is the probability for class $i$, obtained from the softmax of the model outputs.
  • $y_i$ is 1 if class $i$ is the correct label, and 0 otherwise (one-hot encoded).

Step 1: Model outputs (logits)¶

Suppose our model outputs a vector of logits for 3 classes:

$$ z = \begin{bmatrix} 2.0 & 1.0 & 0.1 \end{bmatrix} $$

Step 2: Convert logits to probabilities using softmax¶

$$ p_i = \frac{e^{z_i}}{\sum_j e^{z_j}} $$

For our example:

$$ p = \text{softmax}(z) = \begin{bmatrix} \frac{e^2}{e^2 + e^1 + e^{0.1}} & \frac{e^1}{e^2 + e^1 + e^{0.1}} & \frac{e^{0.1}}{e^2 + e^1 + e^{0.1}} \end{bmatrix} \approx \begin{bmatrix} 0.659 & 0.242 & 0.099 \end{bmatrix} $$

Step 3: One-hot encode the true label¶

If the correct class is class 0, the one-hot vector is:

$$ y = \begin{bmatrix} 1 & 0 & 0 \end{bmatrix} $$

Step 4: Compute cross-entropy loss¶

$$ L = - \sum_i y_i \log(p_i) = - \big( 1 \cdot \log(0.659) + 0 \cdot \log(0.242) + 0 \cdot \log(0.099) \big) \approx 0.181 $$

  • Only the log probability of the true class contributes to the loss.
  • For a batch of samples, we average the loss over all examples:

$$ L_\text{batch} = \frac{1}{N} \sum_{n=1}^N - \sum_i y_{n,i} \log(p_{n,i}) $$

In [35]:
def cross_entropy_loss(logits, targets):
    """
    logits: (B x num_classes)
    targets: (B,)

    targets are label encoded (just an integer that tells us which label is the correct answer), so we must 
    convert them to one-hot encoding! This portion of the code is provided, as seen below. 
    """

    batch_size = logits.array.shape[0]

    ### If targets is a tensor convert to a numpy array as thats all we need ###
    if isinstance(targets, Tensor):
        targets = targets.array.astype(int)

    # Apply softmax to get probabilities (uses autograd!)
    probs = softmax(logits)

    # Create one-hot encoded targets [4] -> [0 0 0 0 1 0 0 0 0 0]
    num_classes = logits.array.shape[1]
    targets_one_hot = np.zeros((batch_size, num_classes))
    targets_one_hot[np.arange(batch_size), targets] = 1
    targets_tensor = Tensor(targets_one_hot)

    # Add small epsilon for numerical stability in log
    eps = 1e-8
    probs = probs + eps

    # Compute cross entropy loss -mean(sum(y * log(p)))
    log_probs = probs.log()
    loss = -1 * (targets_tensor * log_probs).sum() / batch_size

    return loss

Define Model¶

We will have a stack of 3 linear layers with tanh activation functions!

NOTE: Make sure not to apply tanh AFTER the last layer. We want raw logit outputs from the model, which will later go through a softmax inside the loss function!

In [36]:
class MLP:

    """
    Lets define a small MLP! It will consist of 3 layers

        self.fc1 -> input_size to hidden_size
        self.fc2 -> hidden_size to hidden_size
        self.fc3 -> hidden_size to num_classes

    """
    def __init__(self, 
                 input_size, # This will be 784, as that is how many pixels we have in mnist
                 hidden_size, # This is a hyperparameter, arbitrary, and lets us control the model's parameter count
                 num_classes): # This is our number of outputs, we have one output per class

        ### We Define 3 Layers here ###
        self.fc1 = Linear(input_size, hidden_size)
        self.fc2 = Linear(hidden_size, hidden_size)
        self.fc3 = Linear(hidden_size, num_classes)

    def forward(self, x):

        """
        Pass data through the layers and use our tanh activation
        function after the first two layers, returning raw logit
        outputs from the last layer
        """
        x = self.fc1(x)
        x = tanh(x)
        x = self.fc2(x)
        x = tanh(x)
        x = self.fc3(x)
        
        return x

    def parameters(self):
        """
        This just returns all the parameters for all the layers in our model (we assume just 3 here)
        """
        return self.fc1.parameters() + self.fc2.parameters() + self.fc3.parameters()

Define Optimizer¶

We will be writing a simple SGD optimizer!

In [37]:
class SGD:

    """
    We implement here a simple Stochastic Gradient Descent Optimizer
    """
    def __init__(self, parameters, lr):
        self.parameters = parameters
        self.lr = lr

    def step(self):

        ### Write the update rule for the parameters. Remember, each param in self.parameters
        ### has a .grad object that has been populated with a gradient!
        for param in self.parameters:
            param.array -= self.lr * param.grad

    def zero_grad(self):
        for param in self.parameters:
            param.zero_grad()

Code for Loading MNIST Data and Evaluation Provided¶

The following loads the MNIST data as NumPy arrays and provides a simple evaluation on the test data.

In [38]:
import load_mnist
importlib.reload(load_mnist)

def plot_mnist_image(x, y=None, index=0):
    """
    Plot a single MNIST image from flattened data.
    
    Args:
        x: np.array of shape (N, 784)
        y: np.array of shape (N, 10) or (N,) (optional, for labels)
        index: which sample to plot
    """
    img = x[index].reshape(28, 28)  # reshape flattened image
    plt.imshow(img, cmap='gray')
    
    if y is not None:
        # If one-hot, convert to integer label
        if len(y.shape) == 2:
            label = np.argmax(y[index])
        else:
            label = y[index]
        plt.title(f"Label: {label}", fontsize=16)
    
    plt.axis('off')
    plt.show()


x_train, y_train, x_test, y_test = load_mnist.load_mnist_numpy()

# Plot the first image in training set
plot_mnist_image(x_train, y_train, index=0)
[Image: the first MNIST training image, displayed with its label]

Simple Evaluation Script¶

This will just compute how our model is doing on the testing dataset!

In [39]:
def evaluate(model, x_test, y_test, batch_size=1000):
    correct = 0 # number correct
    total = 0 # total samples

    ### Loop through batches
    for i in range(0, len(x_test), batch_size):
        batch_x = x_test[i:i+batch_size]
        batch_y = y_test[i:i+batch_size]

        ### Inference with model
        x_tensor = Tensor(batch_x)
        logits = model.forward(x_tensor)

        ### Get the max logit (as that is our prediction)
        pred_labels = np.argmax(logits.array, axis=1)

        ### Compute how many we got correct (matches label)
        correct += np.sum(pred_labels == batch_y)

        ### Iterate total samples
        total += len(batch_y)

    return 100 * correct / total

Training Script¶

We provide the full training script here. The SGD update rule from above handles the parameter updates!

In [40]:
def train(model, 
          x_train, 
          y_train, 
          x_test, 
          y_test, 
          epochs, 
          batch_size, 
          lr):
    """
    Train a model using SGD and track training & validation loss.

    Returns:
        history: dict with keys 'train_loss' and 'val_loss'
    """
    n_samples = x_train.shape[0]
    optimizer = SGD(model.parameters(), lr=lr)

    history = {
        "train_loss": [],
        "val_loss": []
    }
    
    for epoch in range(epochs):
        
        # Shuffle training data for every epoch
        indices = np.random.permutation(n_samples)
        x_train_shuffled = x_train[indices]
        y_train_shuffled = y_train[indices]

        ### Store the training loss for entire epoch so we can compute avg epoch performance
        total_train_loss = 0
        num_batches = 0

        for i in range(0, n_samples, batch_size):

            ### Grab a batch of data
            batch_x = x_train_shuffled[i:i+batch_size]
            batch_y = y_train_shuffled[i:i+batch_size]

            ### Convert to Tensor
            x_tensor = Tensor(batch_x)

            ### Pass through model
            logits = model.forward(x_tensor)

            ### Compute loss ###
            loss = cross_entropy_loss(logits, batch_y)
            
            ### Log Loss ###
            total_train_loss += loss.array
            num_batches += 1

            # Zero gradients, backward, optimizer step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        ### Logging for visuals ###
        avg_train_loss = total_train_loss / num_batches
        history["train_loss"].append(avg_train_loss)

        val_logits = model.forward(Tensor(x_test, requires_grad=False))
        val_loss = cross_entropy_loss(val_logits, y_test)
        val_loss_scalar = val_loss.array
        history["val_loss"].append(val_loss_scalar)

        test_acc = evaluate(model, x_test, y_test)
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss_scalar:.4f} | Test Acc: {test_acc:.2f}%")

    return history

Let's Train!¶

Now that we have everything done we can train our model! A correct implementation should get above a 95% accuracy on the test set!

In [41]:
### SET TRAINING HYPERPARAMETERS ###
LEARNING_RATE = 0.1
BATCH_SIZE = 32
EPOCHS = 5

print(f"Training set: {x_train.shape}")
print(f"Test set: {x_test.shape}")

model = MLP(input_size=784, hidden_size=256, num_classes=10)

print("\nTraining...")
history = train(model, x_train, y_train, x_test, y_test, epochs=EPOCHS, batch_size=BATCH_SIZE, lr=LEARNING_RATE)

print("\nFinal evaluation...")
final_acc = evaluate(model, x_test, y_test)
print(f"Final Test Accuracy: {final_acc:.2f}%")

### Make a Plot of Results ###
plt.plot(history["train_loss"], label="Train Loss")
plt.plot(history["val_loss"], label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()
Training set: (60000, 784)
Test set: (5000, 784)

Training...
Epoch 1/5 | Train Loss: 0.3651 | Val Loss: 0.2985 | Test Acc: 90.84%
Epoch 2/5 | Train Loss: 0.1869 | Val Loss: 0.1853 | Test Acc: 94.44%
Epoch 3/5 | Train Loss: 0.1296 | Val Loss: 0.1549 | Test Acc: 95.16%
Epoch 4/5 | Train Loss: 0.0986 | Val Loss: 0.1380 | Test Acc: 95.54%
Epoch 5/5 | Train Loss: 0.0788 | Val Loss: 0.1189 | Test Acc: 96.02%

Final evaluation...
Final Test Accuracy: 96.02%
[Image: plot of training and validation loss per epoch]