Claude Sonnet 3.5 - Fill-in: minitorch

Failed to run pytests for test tests

Pytest collection failure.

Patch diff

diff --git a/minitorch/autodiff.py b/minitorch/autodiff.py
index 39acbe8..5dc0f71 100644
--- a/minitorch/autodiff.py
+++ b/minitorch/autodiff.py
@@ -19,7 +19,11 @@ def central_difference(f: Any, *vals: Any, arg: int=0, epsilon: float=1e-06
     Returns:
         An approximation of $f'_i(x_0, \\ldots, x_{n-1})$
     """
-    pass
+    vals_plus = list(vals)
+    vals_minus = list(vals)
+    vals_plus[arg] += epsilon
+    vals_minus[arg] -= epsilon
+    return (f(*vals_plus) - f(*vals_minus)) / (2 * epsilon)


 variable_count = 1
@@ -39,7 +43,18 @@ def topological_sort(variable: Variable) ->Iterable[Variable]:
     Returns:
         Non-constant Variables in topological order starting from the right.
     """
-    pass
+    def visit(var: Variable, visited: Set[Variable], result: List[Variable]):
+        if var not in visited:
+            visited.add(var)
+            if hasattr(var, 'history') and var.history:
+                for input_var in var.history.inputs:
+                    visit(input_var, visited, result)
+            result.append(var)
+
+    visited = set()
+    result = []
+    visit(variable, visited, result)
+    return reversed(result)


 def backpropagate(variable: Variable, deriv: Any) ->None:
@@ -53,7 +68,30 @@ def backpropagate(variable: Variable, deriv: Any) ->None:

     No return. Should write to its results to the derivative values of each leaf through `accumulate_derivative`.
     """
-    pass
+    queue = [(variable, deriv)]
+    visited = set()
+
+    while queue:
+        var, d = queue.pop(0)
+        if var in visited:
+            continue
+        visited.add(var)
+
+        if hasattr(var, 'accumulate_derivative'):
+            var.accumulate_derivative(d)
+
+        if hasattr(var, 'history') and var.history:
+            backward = var.history.last_fn.backward
+            context = var.history.ctx
+            inputs = var.history.inputs
+            backward_derivatives = backward(d, inputs, context)
+            
+            if not isinstance(backward_derivatives, tuple):
+                backward_derivatives = (backward_derivatives,)
+
+            for input_var, back_deriv in zip(inputs, backward_derivatives):
+                if back_deriv is not None:
+                    queue.append((input_var, back_deriv))


 @dataclass
@@ -66,4 +104,5 @@ class Context:

     def save_for_backward(self, *values: Any) ->None:
         """Store the given `values` if they need to be used during backpropagation."""
-        pass
+        if not self.no_grad:
+            self.saved_values = values
diff --git a/minitorch/cuda_ops.py b/minitorch/cuda_ops.py
index c1d7c5e..0e7d237 100644
--- a/minitorch/cuda_ops.py
+++ b/minitorch/cuda_ops.py
@@ -16,7 +16,11 @@ class CudaOps(TensorOps):
     @staticmethod
     def map(fn: Callable[[float], float]) ->MapProto:
         """See `tensor_ops.py`"""
-        pass
+        def _map(out: Storage, out_shape: Shape, out_strides: Strides,
+                 in_storage: Storage, in_shape: Shape, in_strides: Strides) ->None:
+            return tensor_map(fn)(out, out_shape, out_strides,
+                                  in_storage, in_shape, in_strides)
+        return _map


 def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
@@ -33,7 +37,18 @@ def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
     Returns:
         Tensor map function.
     """
-    pass
+    def _map(out: Storage, out_shape: Shape, out_strides: Strides,
+             in_storage: Storage, in_shape: Shape, in_strides: Strides) ->None:
+        i = cuda.grid(1)
+        if i < len(out):
+            out_index = cuda.local.array(MAX_DIMS, numba.int32)
+            in_index = cuda.local.array(MAX_DIMS, numba.int32)
+            to_index(i, out_shape, out_index)
+            broadcast_index(out_index, out_shape, in_shape, in_index)
+            in_pos = index_to_position(in_index, in_strides)
+            out_pos = index_to_position(out_index, out_strides)
+            out[out_pos] = fn(in_storage[in_pos])
+    return cuda.jit()(_map)


 def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
@@ -50,7 +65,22 @@ def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
     Returns:
         Tensor zip function.
     """
-    pass
+    def _zip(out: Storage, out_shape: Shape, out_strides: Strides,
+             a_storage: Storage, a_shape: Shape, a_strides: Strides,
+             b_storage: Storage, b_shape: Shape, b_strides: Strides) ->None:
+        i = cuda.grid(1)
+        if i < len(out):
+            out_index = cuda.local.array(MAX_DIMS, numba.int32)
+            a_index = cuda.local.array(MAX_DIMS, numba.int32)
+            b_index = cuda.local.array(MAX_DIMS, numba.int32)
+            to_index(i, out_shape, out_index)
+            broadcast_index(out_index, out_shape, a_shape, a_index)
+            broadcast_index(out_index, out_shape, b_shape, b_index)
+            a_pos = index_to_position(a_index, a_strides)
+            b_pos = index_to_position(b_index, b_strides)
+            out_pos = index_to_position(out_index, out_strides)
+            out[out_pos] = fn(a_storage[a_pos], b_storage[b_pos])
+    return cuda.jit()(_zip)


 def _sum_practice(out: Storage, a: Storage, size: int) ->None:
@@ -74,7 +104,19 @@ def _sum_practice(out: Storage, a: Storage, size: int) ->None:
         size (int):  length of a.

     """
-    pass
+    shared = cuda.shared.array(THREADS_PER_BLOCK, numba.float64)
+    i = cuda.grid(1)
+    local_i = cuda.threadIdx.x
+    shared[local_i] = a[i] if i < size else 0.0
+    cuda.syncthreads()
+
+    for s in [1, 2, 4, 8, 16]:
+        if local_i % (2 * s) == 0 and i + s < size:
+            shared[local_i] += shared[local_i + s]
+        cuda.syncthreads()
+
+    if local_i == 0:
+        out[cuda.blockIdx.x] = shared[0]


 jit_sum_practice = cuda.jit()(_sum_practice)
@@ -92,7 +134,42 @@ def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
         Tensor reduce function.

     """
-    pass
+    def _reduce(out: Storage, out_shape: Shape, out_strides: Strides,
+                a_storage: Storage, a_shape: Shape, a_strides: Strides,
+                reduce_dim: int) ->None:
+        shared = cuda.shared.array(THREADS_PER_BLOCK, numba.float64)
+        i = cuda.grid(1)
+        local_i = cuda.threadIdx.x
+
+        if i < len(out):
+            out_index = cuda.local.array(MAX_DIMS, numba.int32)
+            to_index(i, out_shape, out_index)
+            out_pos = index_to_position(out_index, out_strides)
+
+            a_index = cuda.local.array(MAX_DIMS, numba.int32)
+            for j in range(MAX_DIMS):
+                a_index[j] = out_index[j]
+
+            reduce_size = a_shape[reduce_dim]
+            reduce_stride = a_strides[reduce_dim]
+
+            acc = a_storage[index_to_position(a_index, a_strides)]
+            for j in range(1, reduce_size):
+                a_index[reduce_dim] = j
+                acc = fn(acc, a_storage[index_to_position(a_index, a_strides)])
+
+            shared[local_i] = acc
+            cuda.syncthreads()
+
+            for s in [1, 2, 4, 8, 16]:
+                if local_i % (2 * s) == 0 and local_i + s < THREADS_PER_BLOCK:
+                    shared[local_i] = fn(shared[local_i], shared[local_i + s])
+                cuda.syncthreads()
+
+            if local_i == 0:
+                out[out_pos] = shared[0]
+
+    return cuda.jit()(_reduce)


 def _mm_practice(out: Storage, a: Storage, b: Storage, size: int) ->None:
@@ -125,7 +202,20 @@ def _mm_practice(out: Storage, a: Storage, b: Storage, size: int) ->None:
         b (Storage): storage for `b` tensor.
         size (int): size of the square
     """
-    pass
+    shared_a = cuda.shared.array((32, 32), numba.float64)
+    shared_b = cuda.shared.array((32, 32), numba.float64)
+
+    i, j = cuda.grid(2)
+    if i < size and j < size:
+        shared_a[i, j] = a[i * size + j]
+        shared_b[i, j] = b[i * size + j]
+    cuda.syncthreads()
+
+    if i < size and j < size:
+        acc = 0.0
+        for k in range(size):
+            acc += shared_a[i, k] * shared_b[k, j]
+        out[i * size + j] = acc


 jit_mm_practice = cuda.jit()(_mm_practice)
@@ -151,7 +241,37 @@ def _tensor_matrix_multiply(out: Storage, out_shape: Shape, out_strides:
     Returns:
         None : Fills in `out`
     """
-    pass
+    a_batch_stride = a_strides[0] if len(a_shape) > 2 else 0
+    b_batch_stride = b_strides[0] if len(b_shape) > 2 else 0
+
+    BLOCK_DIM = 32
+    shared_a = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
+    shared_b = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
+
+    batch_id, i, j = cuda.grid(3)
+    tx, ty = cuda.threadIdx.x, cuda.threadIdx.y
+
+    if batch_id < out_shape[0] and i < out_shape[1] and j < out_shape[2]:
+        acc = 0.0
+        for k_start in range(0, a_shape[-1], BLOCK_DIM):
+            if tx < min(BLOCK_DIM, a_shape[-1] - k_start) and i < a_shape[-2]:
+                shared_a[ty, tx] = a_storage[batch_id * a_batch_stride + i * a_strides[-2] + (k_start + tx) * a_strides[-1]]
+            else:
+                shared_a[ty, tx] = 0.0
+
+            if ty < min(BLOCK_DIM, b_shape[-1] - j) and k_start + tx < b_shape[-2]:
+                shared_b[tx, ty] = b_storage[batch_id * b_batch_stride + (k_start + tx) * b_strides[-2] + j * b_strides[-1]]
+            else:
+                shared_b[tx, ty] = 0.0
+
+            cuda.syncthreads()
+
+            for k in range(min(BLOCK_DIM, a_shape[-1] - k_start)):
+                acc += shared_a[ty, k] * shared_b[k, tx]
+
+            cuda.syncthreads()
+
+        out[batch_id * out_strides[0] + i * out_strides[1] + j * out_strides[2]] = acc


 tensor_matrix_multiply = cuda.jit(_tensor_matrix_multiply)
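
The `_sum_practice` kernel above uses the classic shared-memory tree reduction: each block loads a slice into shared memory and the stride between active threads doubles each step. A plain NumPy sketch of the same access pattern (illustrative only, no CUDA required):

import numpy as np

def block_sum(a, block_dim=32):
    """Sum `a` block by block, mimicking the stride-doubling shared-memory reduction."""
    n = len(a)
    n_blocks = (n + block_dim - 1) // block_dim
    out = np.zeros(n_blocks)
    for b in range(n_blocks):
        # "Shared memory" for this block, zero-padded past the end of `a`.
        shared = np.zeros(block_dim)
        chunk = a[b * block_dim:(b + 1) * block_dim]
        shared[:len(chunk)] = chunk
        stride = 1
        while stride < block_dim:
            for i in range(0, block_dim, 2 * stride):
                shared[i] += shared[i + stride]
            stride *= 2
        out[b] = shared[0]
    return out

a = np.arange(70, dtype=np.float64)
assert np.isclose(block_sum(a).sum(), a.sum())
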
diff --git a/minitorch/fast_conv.py b/minitorch/fast_conv.py
index 14f13fc..2c57de4 100644
--- a/minitorch/fast_conv.py
+++ b/minitorch/fast_conv.py
@@ -45,7 +45,32 @@ def _tensor_conv1d(out: Tensor, out_shape: Shape, out_strides: Strides,
         weight_strides (Strides): strides for `input` tensor.
         reverse (bool): anchor weight at left or right
     """
-    pass
+    batch, out_channels, out_width = out_shape
+    batch, in_channels, width = input_shape
+    out_channels, in_channels, k_width = weight_shape
+
+    for i in prange(out_size):
+        out_index = np.empty(3, np.int32)
+        to_index(i, out_shape, out_index)
+        b, oc, x = out_index
+
+        # Initialize accumulator
+        acc = 0.0
+
+        for ic in range(in_channels):
+            for k in range(k_width):
+                # reverse anchors the kernel to the right of x; otherwise to the left.
+                if reverse:
+                    w = x - k
+                else:
+                    w = x + k
+
+                if 0 <= w < width:
+                    # Compute flat storage positions from the strides (numba-friendly).
+                    in_pos = b * input_strides[0] + ic * input_strides[1] + w * input_strides[2]
+                    w_pos = oc * weight_strides[0] + ic * weight_strides[1] + k * weight_strides[2]
+                    acc += input[in_pos] * weight[w_pos]
+
+        out_pos = index_to_position(out_index, out_strides)
+        out[out_pos] = acc


 tensor_conv1d = njit(parallel=True)(_tensor_conv1d)
@@ -60,13 +85,38 @@ class Conv1dFun(Function):

         Args:
             ctx : Context
-            input : batch x in_channel x h x w
-            weight : out_channel x in_channel x kh x kw
+            input : batch x in_channel x w
+            weight : out_channel x in_channel x kw

         Returns:
-            batch x out_channel x h x w
+            batch x out_channel x w
         """
-        pass
+        ctx.save_for_backward(input, weight)
+        batch, in_channels, w = input.shape
+        out_channels, in_channels2, kw = weight.shape
+        assert in_channels == in_channels2
+
+        # Output keeps the input width; taps past the edge read as zero.
+        output = input.zeros((batch, out_channels, w))
+
+        tensor_conv1d(
+            *output.tuple(), output.size,
+            *input.tuple(), *weight.tuple(), False
+        )
+        return output


 conv1d = Conv1dFun.apply
@@ -108,7 +158,33 @@ def _tensor_conv2d(out: Tensor, out_shape: Shape, out_strides: Strides,
         weight_strides (Strides): strides for `input` tensor.
         reverse (bool): anchor weight at top-left or bottom-right
     """
-    pass
+    batch, out_channels, out_height, out_width = out_shape
+    batch, in_channels, in_height, in_width = input_shape
+    out_channels, in_channels, k_height, k_width = weight_shape
+
+    for i in prange(out_size):
+        out_index = np.empty(4, np.int32)
+        to_index(i, out_shape, out_index)
+        b, oc, y, x = out_index
+
+        # Initialize accumulator
+        acc = 0.0
+
+        for ic in range(in_channels):
+            for kh in range(k_height):
+                for kw in range(k_width):
+                    # reverse anchors the kernel at the bottom-right; otherwise at the top-left.
+                    if reverse:
+                        h, w = y - kh, x - kw
+                    else:
+                        h, w = y + kh, x + kw
+
+                    if 0 <= h < in_height and 0 <= w < in_width:
+                        # Compute flat storage positions from the strides (numba-friendly).
+                        in_pos = (b * input_strides[0] + ic * input_strides[1] +
+                                  h * input_strides[2] + w * input_strides[3])
+                        w_pos = (oc * weight_strides[0] + ic * weight_strides[1] +
+                                 kh * weight_strides[2] + kw * weight_strides[3])
+                        acc += input[in_pos] * weight[w_pos]
+
+        out_pos = index_to_position(out_index, out_strides)
+        out[out_pos] = acc


 tensor_conv2d = njit(parallel=True, fastmath=True)(_tensor_conv2d)
@@ -129,7 +205,33 @@ class Conv2dFun(Function):
         Returns:
             (:class:`Tensor`) : batch x out_channel x h x w
         """
-        pass
+        ctx.save_for_backward(input, weight)
+        batch, in_channels, h, w = input.shape
+        out_channels, in_channels2, kh, kw = weight.shape
+        assert in_channels == in_channels2
+
+        # Output keeps the input height and width; taps past the edge read as zero.
+        output = input.zeros((batch, out_channels, h, w))
+
+        tensor_conv2d(
+            *output.tuple(), output.size,
+            *input.tuple(), *weight.tuple(), False
+        )
+        return output


 conv2d = Conv2dFun.apply
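
For reference, the left-anchored 1D convolution that `_tensor_conv1d` is meant to compute can be written directly in NumPy. This is an illustrative cross-check only (shapes follow the docstrings above; it is not the patched code):

import numpy as np

def conv1d_reference(inp, weight):
    """inp: (batch, in_channels, width); weight: (out_channels, in_channels, k_width)."""
    batch, in_channels, width = inp.shape
    out_channels, _, k_width = weight.shape
    out = np.zeros((batch, out_channels, width))
    for b in range(batch):
        for oc in range(out_channels):
            for x in range(width):
                acc = 0.0
                for ic in range(in_channels):
                    for k in range(k_width):
                        w = x + k  # left-anchored kernel; taps past the edge contribute 0
                        if w < width:
                            acc += inp[b, ic, w] * weight[oc, ic, k]
                out[b, oc, x] = acc
    return out

inp = np.random.rand(2, 3, 8)
weight = np.random.rand(4, 3, 3)
assert conv1d_reference(inp, weight).shape == (2, 4, 8)
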
diff --git a/minitorch/fast_ops.py b/minitorch/fast_ops.py
index 71f96b7..19e99f4 100644
--- a/minitorch/fast_ops.py
+++ b/minitorch/fast_ops.py
@@ -18,19 +18,52 @@ class FastOps(TensorOps):
     @staticmethod
     def map(fn: Callable[[float], float]) ->MapProto:
         """See `tensor_ops.py`"""
-        pass
+        f = tensor_map(njit()(fn))
+
+        def _map(a: Tensor, out=None) -> Tensor:
+            if out is None:
+                out = a.zeros(a.shape)
+            f(*out.tuple(), *a.tuple())
+            return out
+
+        return _map

     @staticmethod
-    def zip(fn: Callable[[float, float], float]) ->Callable[[Tensor, Tensor
-        ], Tensor]:
+    def zip(fn: Callable[[float, float], float]) ->Callable[[Tensor, Tensor], Tensor]:
         """See `tensor_ops.py`"""
-        pass
+        def _zip(a: Tensor, b: Tensor) -> Tensor:
+            c_shape = shape_broadcast(a.shape, b.shape)
+            out = a.zeros(c_shape)
+            tensor_zip(njit()(fn))(
+                out._tensor._storage,
+                out._tensor._shape,
+                out._tensor._strides,
+                a._tensor._storage,
+                a._tensor._shape,
+                a._tensor._strides,
+                b._tensor._storage,
+                b._tensor._shape,
+                b._tensor._strides,
+            )
+            return out
+        return _zip

     @staticmethod
-    def reduce(fn: Callable[[float, float], float], start: float=0.0
-        ) ->Callable[[Tensor, int], Tensor]:
+    def reduce(fn: Callable[[float, float], float], start: float=0.0) ->Callable[[Tensor, int], Tensor]:
         """See `tensor_ops.py`"""
-        pass
+        def _reduce(a: Tensor, dim: int) -> Tensor:
+            out_shape = list(a.shape)
+            out_shape[dim] = 1
+
+            # Other values when not sum.
+            out = a.zeros(tuple(out_shape))
+            out._tensor._storage[:] = start
+
+            tensor_reduce(njit()(fn))(
+                out._tensor._storage,
+                out._tensor._shape,
+                out._tensor._strides,
+                a._tensor._storage,
+                a._tensor._shape,
+                a._tensor._strides,
+                dim,
+            )
+            return out
+        return _reduce

     @staticmethod
     def matrix_multiply(a: Tensor, b: Tensor) ->Tensor:
@@ -56,11 +89,34 @@ class FastOps(TensorOps):
         Returns:
             New tensor data
         """
-        pass
-
-
-def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
-    Strides, Storage, Shape, Strides], None]:
+        # Treat 2-D inputs as a batch of one and remember to squeeze the result.
+        both_2d = 0
+        if len(a.shape) == 2:
+            a = a.contiguous().view(1, a.shape[0], a.shape[1])
+            both_2d += 1
+        if len(b.shape) == 2:
+            b = b.contiguous().view(1, b.shape[0], b.shape[1])
+            both_2d += 1
+        both_2d = both_2d == 2
+
+        assert a.shape[-1] == b.shape[-2]
+
+        # Broadcast the batch dimensions and build the output shape.
+        ls = list(shape_broadcast(a.shape[:-2], b.shape[:-2]))
+        ls.append(a.shape[-2])
+        ls.append(b.shape[-1])
+        out = a.zeros(tuple(ls))
+
+        # Perform matrix multiplication
+        tensor_matrix_multiply(*out.tuple(), *a.tuple(), *b.tuple())
+
+        # Undo the added batch dimension if both inputs were 2-D.
+        if both_2d:
+            out = out.view(out.shape[1], out.shape[2])
+        return out
+
+
+def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape, Strides, Storage, Shape, Strides], None]:
     """
     NUMBA low_level tensor_map function. See `tensor_ops.py` for description.

@@ -76,11 +132,34 @@ def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
     Returns:
         Tensor map function.
     """
-    pass
-
-
-def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
-    Shape, Strides, Storage, Shape, Strides, Storage, Shape, Strides], None]:
+    @njit(parallel=True)
+    def _map(out: Storage, out_shape: Shape, out_strides: Strides,
+             in_storage: Storage, in_shape: Shape, in_strides: Strides) -> None:
+        # Calculate total number of elements
+        size = int(np.prod(out_shape))
+
+        # Check if the tensors are stride-aligned
+        is_stride_aligned = np.array_equal(out_strides, in_strides) and np.array_equal(out_shape, in_shape)
+
+        if is_stride_aligned:
+            # If stride-aligned, we can avoid indexing
+            for i in prange(size):
+                out[i] = fn(in_storage[i])
+        else:
+            # If not stride-aligned, we need to use indexing
+            for i in prange(size):
+                out_index = np.zeros(len(out_shape), dtype=np.int32)
+                in_index = np.zeros(len(in_shape), dtype=np.int32)
+                to_index(i, out_shape, out_index)
+                broadcast_index(out_index, out_shape, in_shape, in_index)
+                o = index_to_position(out_index, out_strides)
+                j = index_to_position(in_index, in_strides)
+                out[o] = fn(in_storage[j])
+
+    return _map
+
+
+def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage, Shape, Strides, Storage, Shape, Strides, Storage, Shape, Strides], None]:
     """
     NUMBA higher-order tensor zip function. See `tensor_ops.py` for description.

@@ -97,11 +176,41 @@ def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
     Returns:
         Tensor zip function.
     """
-    pass
-
-
-def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
-    Shape, Strides, Storage, Shape, Strides, int], None]:
+    @njit(parallel=True)
+    def _zip(out: Storage, out_shape: Shape, out_strides: Strides,
+             a_storage: Storage, a_shape: Shape, a_strides: Strides,
+             b_storage: Storage, b_shape: Shape, b_strides: Strides) -> None:
+        # Calculate total number of elements
+        size = int(np.prod(out_shape))
+
+        # Check if all tensors are stride-aligned
+        is_stride_aligned = (np.array_equal(out_strides, a_strides) and
+                             np.array_equal(out_strides, b_strides) and
+                             np.array_equal(out_shape, a_shape) and
+                             np.array_equal(out_shape, b_shape))
+
+        if is_stride_aligned:
+            # If stride-aligned, we can avoid indexing
+            for i in prange(size):
+                out[i] = fn(a_storage[i], b_storage[i])
+        else:
+            # If not stride-aligned, we need to use indexing
+            for i in prange(size):
+                out_index = np.zeros(len(out_shape), dtype=np.int32)
+                a_index = np.zeros(len(a_shape), dtype=np.int32)
+                b_index = np.zeros(len(b_shape), dtype=np.int32)
+                to_index(i, out_shape, out_index)
+                o = index_to_position(out_index, out_strides)
+                broadcast_index(out_index, out_shape, a_shape, a_index)
+                j = index_to_position(a_index, a_strides)
+                broadcast_index(out_index, out_shape, b_shape, b_index)
+                k = index_to_position(b_index, b_strides)
+                out[o] = fn(a_storage[j], b_storage[k])
+
+    return _zip
+
+
+def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage, Shape, Strides, Storage, Shape, Strides, int], None]:
     """
     NUMBA higher-order tensor reduce function. See `tensor_ops.py` for description.

@@ -117,12 +226,39 @@ def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
     Returns:
         Tensor reduce function
     """
-    pass
-
-
-def _tensor_matrix_multiply(out: Storage, out_shape: Shape, out_strides:
-    Strides, a_storage: Storage, a_shape: Shape, a_strides: Strides,
-    b_storage: Storage, b_shape: Shape, b_strides: Strides) ->None:
+    @njit(parallel=True)
+    def _reduce(out: Storage, out_shape: Shape, out_strides: Strides,
+                a_storage: Storage, a_shape: Shape, a_strides: Strides,
+                reduce_dim: int) -> None:
+        # Calculate sizes
+        reduce_size = a_shape[reduce_dim]
+        non_reduce_size = int(np.prod(out_shape))  # out_shape has the reduce dim set to 1
+
+        # Main loop in parallel
+        for i in prange(non_reduce_size):
+            # Calculate the base index for the non-reduce dimensions
+            out_index = np.zeros(len(out_shape), dtype=np.int32)
+            to_index(i, out_shape, out_index)
+            out_pos = index_to_position(out_index, out_strides)
+
+            # Inner reduction loop
+            acc = out[out_pos]  # Initialize accumulator with the starting value
+            for j in range(reduce_size):
+                # The input index matches the output index except along the reduced dim.
+                a_index = out_index.copy()
+                a_index[reduce_dim] = j
+                a_pos = index_to_position(a_index, a_strides)
+                acc = fn(acc, a_storage[a_pos])
+
+            # Store the result
+            out[out_pos] = acc
+
+    return _reduce
+
+
+def _tensor_matrix_multiply(out: Storage, out_shape: Shape, out_strides: Strides,
+                            a_storage: Storage, a_shape: Shape, a_strides: Strides,
+                            b_storage: Storage, b_shape: Shape, b_strides: Strides) ->None:
     """
     NUMBA tensor matrix multiply function.

@@ -153,7 +289,20 @@ def _tensor_matrix_multiply(out: Storage, out_shape: Shape, out_strides:
     Returns:
         None : Fills in `out`
     """
-    pass
+    a_batch_stride = a_strides[0] if a_shape[0] > 1 else 0
+    b_batch_stride = b_strides[0] if b_shape[0] > 1 else 0
+    
+    # Outer loop in parallel
+    for i in prange(out_shape[0]):
+        for j in range(out_shape[1]):
+            for k in range(out_shape[2]):
+                a_inner = 0.0
+                for l in range(a_shape[-1]):
+                    a_index = i * a_batch_stride + j * a_strides[-2] + l * a_strides[-1]
+                    b_index = i * b_batch_stride + l * b_strides[-2] + k * b_strides[-1]
+                    a_inner += a_storage[a_index] * b_storage[b_index]
+                out_index = i * out_strides[0] + j * out_strides[1] + k * out_strides[2]
+                out[out_index] = a_inner


 tensor_matrix_multiply = njit(parallel=True, fastmath=True)(
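
The `_tensor_matrix_multiply` inner loop above is a strided form of a batched matmul. A small NumPy check of the underlying arithmetic (illustrative only, independent of the patch):

import numpy as np

def batched_matmul_loops(a, b):
    """a: (batch, n, m); b: (batch, m, p) -> (batch, n, p), written with explicit loops."""
    batch, n, m = a.shape
    _, _, p = b.shape
    out = np.zeros((batch, n, p))
    for i in range(batch):
        for j in range(n):
            for k in range(p):
                acc = 0.0
                for l in range(m):
                    acc += a[i, j, l] * b[i, l, k]
                out[i, j, k] = acc
    return out

a = np.random.rand(2, 3, 4)
b = np.random.rand(2, 4, 5)
assert np.allclose(batched_matmul_loops(a, b), a @ b)
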
diff --git a/minitorch/module.py b/minitorch/module.py
index 91f0dc2..531d52c 100644
--- a/minitorch/module.py
+++ b/minitorch/module.py
@@ -24,15 +24,19 @@ class Module:

     def modules(self) ->Sequence[Module]:
         """Return the direct child modules of this module."""
-        pass
+        return list(self._modules.values())

     def train(self) ->None:
         """Set the mode of this module and all descendent modules to `train`."""
-        pass
+        self.training = True
+        for module in self.modules():
+            module.train()

     def eval(self) ->None:
         """Set the mode of this module and all descendent modules to `eval`."""
-        pass
+        self.training = False
+        for module in self.modules():
+            module.eval()

     def named_parameters(self) ->Sequence[Tuple[str, Parameter]]:
         """
@@ -42,11 +46,17 @@ class Module:
         Returns:
             The name and `Parameter` of each ancestor parameter.
         """
-        pass
+        result = []
+        for name, param in self._parameters.items():
+            result.append((name, param))
+        for module_name, module in self._modules.items():
+            for sub_name, sub_param in module.named_parameters():
+                result.append((f"{module_name}.{sub_name}", sub_param))
+        return result

     def parameters(self) ->Sequence[Parameter]:
         """Enumerate over all the parameters of this module and its descendents."""
-        pass
+        return [param for _, param in self.named_parameters()]

     def add_parameter(self, k: str, v: Any) ->Parameter:
         """
@@ -59,7 +69,9 @@ class Module:
         Returns:
             Newly created parameter.
         """
-        pass
+        val = Parameter(v, k)
+        self.__dict__['_parameters'][k] = val
+        return val

     def __setattr__(self, key: str, val: Parameter) ->None:
         if isinstance(val, Parameter):
@@ -121,7 +133,11 @@ class Parameter:

     def update(self, x: Any) ->None:
         """Update the parameter value."""
-        pass
+        self.value = x
+        if hasattr(x, 'requires_grad_'):
+            self.value.requires_grad_(True)
+            if self.name:
+                self.value.name = self.name

     def __repr__(self) ->str:
         return repr(self.value)
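
The `named_parameters` fill-in above builds dotted names by prefixing each child module's name. The same recursion can be sketched with plain dictionaries; the class below is a hypothetical stand-in, not minitorch's `Module`:

class TinyModule:
    """Hypothetical stand-in mirroring the recursion in Module.named_parameters."""

    def __init__(self):
        self._parameters = {}   # name -> value
        self._modules = {}      # name -> TinyModule

    def named_parameters(self):
        result = [(name, p) for name, p in self._parameters.items()]
        for mod_name, mod in self._modules.items():
            for sub_name, sub_p in mod.named_parameters():
                result.append((f"{mod_name}.{sub_name}", sub_p))
        return result

root = TinyModule()
root._parameters["bias"] = 0.0
child = TinyModule()
child._parameters["weight"] = 1.0
root._modules["layer1"] = child

assert [n for n, _ in root.named_parameters()] == ["bias", "layer1.weight"]
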
diff --git a/minitorch/nn.py b/minitorch/nn.py
index 577a3ff..1679394 100644
--- a/minitorch/nn.py
+++ b/minitorch/nn.py
@@ -17,7 +17,16 @@ def tile(input: Tensor, kernel: Tuple[int, int]) ->Tuple[Tensor, int, int]:
     Returns:
         Tensor of size batch x channel x new_height x new_width x (kernel_height * kernel_width) as well as the new_height and new_width value.
     """
-    pass
+    batch, channel, height, width = input.shape
+    kh, kw = kernel
+    assert height % kh == 0
+    assert width % kw == 0
+    new_height = height // kh
+    new_width = width // kw
+
+    # Split each spatial dim into (tiles, kernel) and move the kernel dims last.
+    tiled = input.contiguous().view(batch, channel, new_height, kh, new_width, kw)
+    tiled = tiled.permute(0, 1, 2, 4, 3, 5).contiguous()
+    tiled = tiled.view(batch, channel, new_height, new_width, kh * kw)
+
+    return tiled, new_height, new_width


 def avgpool2d(input: Tensor, kernel: Tuple[int, int]) ->Tensor:
@@ -31,7 +40,9 @@ def avgpool2d(input: Tensor, kernel: Tuple[int, int]) ->Tensor:
     Returns:
         Pooled tensor
     """
-    pass
+    batch, channel, height, width = input.shape
+    tiled, new_height, new_width = tile(input, kernel)
+    return tiled.mean(dim=4).view(batch, channel, new_height, new_width)


 max_reduce = FastOps.reduce(operators.max, -1000000000.0)
@@ -50,7 +61,8 @@ def argmax(input: Tensor, dim: int) ->Tensor:
         :class:`Tensor` : tensor with 1 on highest cell in dim, 0 otherwise

     """
-    pass
+    out = max_reduce(input, dim)
+    return out == input


 class Max(Function):
@@ -58,12 +70,16 @@ class Max(Function):
     @staticmethod
     def forward(ctx: Context, input: Tensor, dim: Tensor) ->Tensor:
         """Forward of max should be max reduction"""
-        pass
+        output = max_reduce(input, int(dim.item()))
+        ctx.save_for_backward(input, dim, output)
+        return output

     @staticmethod
     def backward(ctx: Context, grad_output: Tensor) ->Tuple[Tensor, float]:
         """Backward of max should be argmax (see above)"""
-        pass
+        input, dim, output = ctx.saved_values
+        arg_max = argmax(input, int(dim.item()))
+        return grad_output * arg_max, 0.0


 def softmax(input: Tensor, dim: int) ->Tensor:
@@ -81,7 +97,9 @@ def softmax(input: Tensor, dim: int) ->Tensor:
     Returns:
         softmax tensor
     """
-    pass
+    # Subtract the per-dimension max for numerical stability.
+    max_val = max_reduce(input, dim)
+    exp_x = (input - max_val).exp()
+    return exp_x / exp_x.sum(dim=dim)


 def logsoftmax(input: Tensor, dim: int) ->Tensor:
@@ -99,7 +117,9 @@ def logsoftmax(input: Tensor, dim: int) ->Tensor:
     Returns:
          log of softmax tensor
     """
-    pass
+    # Log-sum-exp trick for stability.
+    max_val = max_reduce(input, dim)
+    exp_x = (input - max_val).exp()
+    return input - max_val - exp_x.sum(dim=dim).log()


 def maxpool2d(input: Tensor, kernel: Tuple[int, int]) ->Tensor:
@@ -113,7 +133,9 @@ def maxpool2d(input: Tensor, kernel: Tuple[int, int]) ->Tensor:
     Returns:
         Tensor : pooled tensor
     """
-    pass
+    batch, channel, height, width = input.shape
+    tiled, new_height, new_width = tile(input, kernel)
+    return max_reduce(tiled, 4).view(batch, channel, new_height, new_width)


 def dropout(input: Tensor, rate: float, ignore: bool=False) ->Tensor:
@@ -128,4 +150,8 @@ def dropout(input: Tensor, rate: float, ignore: bool=False) ->Tensor:
     Returns:
         tensor with random positions dropped out
     """
-    pass
+    if ignore or rate == 0:
+        return input
+    
+    # The comparison already yields a 0/1 float tensor; scale to preserve the expectation.
+    mask = rand(input.shape) > rate
+    return (input * mask) / (1.0 - rate)
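
The softmax and logsoftmax fill-ins above subtract the per-dimension max before exponentiating. The identity softmax(x) = softmax(x - max(x)) is easy to confirm in NumPy (illustrative only):

import numpy as np

def softmax(x, dim):
    # Subtracting the max does not change the result but keeps exp() from overflowing.
    shifted = x - x.max(axis=dim, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=dim, keepdims=True)

x = np.array([[1000.0, 1001.0, 1002.0]])
probs = softmax(x, dim=1)
assert np.all(np.isfinite(probs))
assert np.isclose(probs.sum(), 1.0)
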
diff --git a/minitorch/operators.py b/minitorch/operators.py
index 8740347..2315e0a 100644
--- a/minitorch/operators.py
+++ b/minitorch/operators.py
@@ -7,42 +7,42 @@ from typing import Callable, Iterable

 def mul(x: float, y: float) ->float:
     """$f(x, y) = x * y$"""
-    pass
+    return x * y


 def id(x: float) ->float:
     """$f(x) = x$"""
-    pass
+    return x


 def add(x: float, y: float) ->float:
     """$f(x, y) = x + y$"""
-    pass
+    return x + y


 def neg(x: float) ->float:
     """$f(x) = -x$"""
-    pass
+    return -x


 def lt(x: float, y: float) ->float:
     """$f(x) =$ 1.0 if x is less than y else 0.0"""
-    pass
+    return 1.0 if x < y else 0.0


 def eq(x: float, y: float) ->float:
     """$f(x) =$ 1.0 if x is equal to y else 0.0"""
-    pass
+    return 1.0 if x == y else 0.0


 def max(x: float, y: float) ->float:
     """$f(x) =$ x if x is greater than y else y"""
-    pass
+    return x if x > y else y


 def is_close(x: float, y: float) ->float:
     """$f(x) = |x - y| < 1e-2$"""
-    pass
+    return 1.0 if abs(x - y) < 1e-2 else 0.0


 def sigmoid(x: float) ->float:
@@ -57,7 +57,10 @@ def sigmoid(x: float) ->float:

     for stability.
     """
-    pass
+    if x >= 0:
+        return 1.0 / (1.0 + math.exp(-x))
+    else:
+        return math.exp(x) / (1.0 + math.exp(x))


 def relu(x: float) ->float:
@@ -66,7 +69,7 @@ def relu(x: float) ->float:

     (See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) .)
     """
-    pass
+    return max(0.0, x)


 EPS = 1e-06
@@ -74,32 +77,32 @@ EPS = 1e-06

 def log(x: float) ->float:
     """$f(x) = log(x)$"""
-    pass
+    return math.log(x + EPS)


 def exp(x: float) ->float:
     """$f(x) = e^{x}$"""
-    pass
+    return math.exp(x)


 def log_back(x: float, d: float) ->float:
     """If $f = log$ as above, compute $d \\times f'(x)$"""
-    pass
+    return d / (x + EPS)


 def inv(x: float) ->float:
     """$f(x) = 1/x$"""
-    pass
+    return 1.0 / (x + EPS)


 def inv_back(x: float, d: float) ->float:
     """If $f(x) = 1/x$ compute $d \\times f'(x)$"""
-    pass
+    return -d / ((x + EPS) ** 2)


 def relu_back(x: float, d: float) ->float:
     """If $f = relu$ compute $d \\times f'(x)$"""
-    pass
+    return d if x > 0 else 0.0


 def map(fn: Callable[[float], float]) ->Callable[[Iterable[float]],
@@ -116,12 +119,12 @@ def map(fn: Callable[[float], float]) ->Callable[[Iterable[float]],
          A function that takes a list, applies `fn` to each element, and returns a
          new list
     """
-    pass
+    return lambda ls: [fn(x) for x in ls]


 def negList(ls: Iterable[float]) ->Iterable[float]:
     """Use `map` and `neg` to negate each element in `ls`"""
-    pass
+    return map(neg)(ls)


 def zipWith(fn: Callable[[float, float], float]) ->Callable[[Iterable[float
@@ -139,12 +142,12 @@ def zipWith(fn: Callable[[float, float], float]) ->Callable[[Iterable[float
          applying fn(x, y) on each pair of elements.

     """
-    pass
+    return lambda ls1, ls2: [fn(x, y) for x, y in zip(ls1, ls2)]


 def addLists(ls1: Iterable[float], ls2: Iterable[float]) ->Iterable[float]:
     """Add the elements of `ls1` and `ls2` using `zipWith` and `add`"""
-    pass
+    return zipWith(add)(ls1, ls2)


 def reduce(fn: Callable[[float, float], float], start: float) ->Callable[[
@@ -161,14 +164,19 @@ def reduce(fn: Callable[[float, float], float], start: float) ->Callable[[
          $x_1 \\ldots x_n$ and computes the reduction :math:`fn(x_3, fn(x_2,
          fn(x_1, x_0)))`
     """
-    pass
+    def reducer(ls: Iterable[float]) ->float:
+        result = start
+        for x in ls:
+            result = fn(result, x)
+        return result
+    return reducer


 def sum(ls: Iterable[float]) ->float:
     """Sum up a list using `reduce` and `add`."""
-    pass
+    return reduce(add, 0.0)(ls)


 def prod(ls: Iterable[float]) ->float:
     """Product of a list using `reduce` and `mul`."""
-    pass
+    return reduce(mul, 1.0)(ls)
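
Expected behaviour of the higher-order operators, assuming the patched minitorch.operators module imports cleanly (a doctest-style sketch, not part of the patch):

from minitorch.operators import map, zipWith, reduce, neg, add, mul, negList, addLists, sum, prod

assert map(neg)([1.0, 2.0]) == [-1.0, -2.0]
assert negList([1.0, -3.0]) == [-1.0, 3.0]
assert zipWith(add)([1.0, 2.0], [3.0, 4.0]) == [4.0, 6.0]
assert addLists([1.0, 2.0], [3.0, 4.0]) == [4.0, 6.0]
assert reduce(mul, 1.0)([2.0, 3.0, 4.0]) == 24.0
assert sum([1.0, 2.0, 3.0]) == 6.0
assert prod([2.0, 3.0]) == 6.0
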
diff --git a/minitorch/scalar.py b/minitorch/scalar.py
index a8a0420..37b2931 100644
--- a/minitorch/scalar.py
+++ b/minitorch/scalar.py
@@ -67,25 +67,25 @@ class Scalar:
         return Mul.apply(b, Inv.apply(self))

     def __add__(self, b: ScalarLike) ->Scalar:
-        raise NotImplementedError('Need to implement for Task 1.2')
+        return Add.apply(self, b)

     def __bool__(self) ->bool:
         return bool(self.data)

     def __lt__(self, b: ScalarLike) ->Scalar:
-        raise NotImplementedError('Need to implement for Task 1.2')
+        return LT.apply(self, b)

     def __gt__(self, b: ScalarLike) ->Scalar:
-        raise NotImplementedError('Need to implement for Task 1.2')
+        return LT.apply(b, self)

     def __eq__(self, b: ScalarLike) ->Scalar:
-        raise NotImplementedError('Need to implement for Task 1.2')
+        return EQ.apply(self, b)

     def __sub__(self, b: ScalarLike) ->Scalar:
-        raise NotImplementedError('Need to implement for Task 1.2')
+        return Add.apply(self, Neg.apply(b))

     def __neg__(self) ->Scalar:
-        raise NotImplementedError('Need to implement for Task 1.2')
+        return Neg.apply(self)

     def __radd__(self, b: ScalarLike) ->Scalar:
         return self + b
@@ -101,11 +101,13 @@ class Scalar:
         Args:
             x: value to be accumulated
         """
-        pass
+        if self.derivative is None:
+            self.derivative = 0.0
+        self.derivative += x

     def is_leaf(self) ->bool:
         """True if this variable created by the user (no `last_fn`)"""
-        pass
+        return self.history.last_fn is None

     def backward(self, d_output: Optional[float]=None) ->None:
         """
@@ -115,7 +117,9 @@ class Scalar:
             d_output (number, opt): starting derivative to backpropagate through the model
                                    (typically left out, and assumed to be 1.0).
         """
-        pass
+        if d_output is None:
+            d_output = 1.0
+        backpropagate(self, d_output)


 def derivative_check(f: Any, *scalars: Scalar) ->None:
@@ -127,4 +131,12 @@ def derivative_check(f: Any, *scalars: Scalar) ->None:
         f : function from n-scalars to 1-scalar.
         *scalars  : n input scalar values.
     """
-    pass
+    out = f(*scalars)
+    out.backward()
+    
+    for i, x in enumerate(scalars):
+        check = central_difference(f, *scalars, arg=i)
+        # `check` is a Scalar; compare against its raw float data.
+        assert abs(x.derivative - check.data) < 1e-2, (
+            f"Derivative with respect to argument {i} is incorrect. "
+            f"Expected {check.data}, got {x.derivative}"
+        )
diff --git a/minitorch/scalar_functions.py b/minitorch/scalar_functions.py
index c55ef86..7d855f4 100644
--- a/minitorch/scalar_functions.py
+++ b/minitorch/scalar_functions.py
@@ -10,12 +10,16 @@ if TYPE_CHECKING:

 def wrap_tuple(x):
     """Turn a possible value into a tuple"""
-    pass
+    if isinstance(x, tuple):
+        return x
+    return (x,)


 def unwrap_tuple(x):
     """Turn a singleton tuple into a value"""
-    pass
+    if len(x) == 1:
+        return x[0]
+    return x


 class ScalarFunction:
@@ -31,38 +35,132 @@ class ScalarFunction:
 class Add(ScalarFunction):
     """Addition function $f(x, y) = x + y$"""

+    @staticmethod
+    def forward(ctx: Context, a: float, b: float) -> float:
+        return a + b
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+        return d_output, d_output
+

 class Log(ScalarFunction):
     """Log function $f(x) = log(x)$"""

+    @staticmethod
+    def forward(ctx: Context, a: float) -> float:
+        ctx.save_for_backward(a)
+        return operators.log(a)
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> float:
+        (a,) = ctx.saved_values
+        return d_output / a
+

 class Mul(ScalarFunction):
     """Multiplication function"""

+    @staticmethod
+    def forward(ctx: Context, a: float, b: float) -> float:
+        ctx.save_for_backward(a, b)
+        return a * b
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+        a, b = ctx.saved_values
+        return d_output * b, d_output * a
+

 class Inv(ScalarFunction):
     """Inverse function"""

+    @staticmethod
+    def forward(ctx: Context, a: float) -> float:
+        ctx.save_for_backward(a)
+        return operators.inv(a)
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> float:
+        (a,) = ctx.saved_values
+        return -d_output / (a * a)
+

 class Neg(ScalarFunction):
     """Negation function"""

+    @staticmethod
+    def forward(ctx: Context, a: float) -> float:
+        return -a
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> float:
+        return -d_output
+

 class Sigmoid(ScalarFunction):
     """Sigmoid function"""

+    @staticmethod
+    def forward(ctx: Context, a: float) -> float:
+        sigmoid_value = operators.sigmoid(a)
+        ctx.save_for_backward(sigmoid_value)
+        return sigmoid_value
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> float:
+        (sigmoid_value,) = ctx.saved_values
+        return d_output * sigmoid_value * (1 - sigmoid_value)
+

 class ReLU(ScalarFunction):
     """ReLU function"""

+    @staticmethod
+    def forward(ctx: Context, a: float) -> float:
+        ctx.save_for_backward(a)
+        return operators.relu(a)
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> float:
+        (a,) = ctx.saved_values
+        return d_output if a > 0 else 0.0
+

 class Exp(ScalarFunction):
     """Exp function"""

+    @staticmethod
+    def forward(ctx: Context, a: float) -> float:
+        exp_value = operators.exp(a)
+        ctx.save_for_backward(exp_value)
+        return exp_value
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> float:
+        (exp_value,) = ctx.saved_values
+        return d_output * exp_value
+

 class LT(ScalarFunction):
     """Less-than function $f(x) =$ 1.0 if x is less than y else 0.0"""

+    @staticmethod
+    def forward(ctx: Context, a: float, b: float) -> float:
+        return 1.0 if a < b else 0.0
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+        return 0.0, 0.0
+

 class EQ(ScalarFunction):
     """Equal function $f(x) =$ 1.0 if x is equal to y else 0.0"""
+
+    @staticmethod
+    def forward(ctx: Context, a: float, b: float) -> float:
+        return 1.0 if a == b else 0.0
+
+    @staticmethod
+    def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+        return 0.0, 0.0
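
The Sigmoid and Exp backward passes above reuse the saved forward output, relying on the identities sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)) and exp'(x) = exp(x). A quick numerical confirmation in plain Python (independent of minitorch):

import math

def sigmoid(x):
    return 1.0 / (1.0 + math.exp(-x)) if x >= 0 else math.exp(x) / (1.0 + math.exp(x))

x = 0.7
eps = 1e-6
numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
analytic = sigmoid(x) * (1.0 - sigmoid(x))
assert abs(numeric - analytic) < 1e-6
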
diff --git a/minitorch/tensor.py b/minitorch/tensor.py
index dc62ddc..1835693 100644
--- a/minitorch/tensor.py
+++ b/minitorch/tensor.py
@@ -66,7 +66,7 @@ class Tensor:
         Returns:
              Converted to numpy array
         """
-        pass
+        return self._tensor.to_numpy()

     @property
     def shape(self) ->UserShape:
@@ -74,7 +74,7 @@ class Tensor:
         Returns:
              shape of the tensor
         """
-        pass
+        return self._tensor.shape

     @property
     def size(self) ->int:
@@ -82,7 +82,7 @@ class Tensor:
         Returns:
              int : size of the tensor
         """
-        pass
+        return self._tensor.size

     @property
     def dims(self) ->int:
@@ -90,11 +90,16 @@ class Tensor:
         Returns:
              int : dimensionality of the tensor
         """
-        pass
+        return len(self._tensor.shape)

     def _ensure_tensor(self, b: TensorLike) ->Tensor:
         """Turns a python number into a tensor with the same backend."""
-        pass
+        if isinstance(b, (int, float)):
+            return Tensor.make([b], (1,), backend=self.backend)
+        elif isinstance(b, Tensor):
+            return b
+        else:
+            raise TypeError(f"Can't convert {type(b)} to Tensor")

     def __add__(self, b: TensorLike) ->Tensor:
         return Add.apply(self, self._ensure_tensor(b))
@@ -135,23 +140,27 @@ class Tensor:

     def sum(self, dim: Optional[int]=None) ->Tensor:
         """Compute the sum over dimension `dim`"""
-        pass
+        if dim is None:
+            # Sum over all elements by flattening first.
+            return Sum.apply(self.contiguous().view(self.size), self._ensure_tensor(0))
+        return Sum.apply(self, self._ensure_tensor(dim))

     def mean(self, dim: Optional[int]=None) ->Tensor:
         """Compute the mean over dimension `dim`"""
-        pass
+        sum_tensor = self.sum(dim)
+        if dim is None:
+            return sum_tensor / self.size
+        else:
+            return sum_tensor / self.shape[dim]

     def permute(self, *order: int) ->Tensor:
         """Permute tensor dimensions to *order"""
-        pass
+        return Permute.apply(self, order)

     def view(self, *shape: int) ->Tensor:
         """Change the shape of the tensor to a new shape with the same size"""
-        pass
+        return View.apply(self, shape)

     def contiguous(self) ->Tensor:
         """Return a contiguous tensor with the same data"""
-        pass
+        return Copy.apply(self)

     def __repr__(self) ->str:
         return self._tensor.to_string()
@@ -169,7 +178,8 @@ class Tensor:
         strides: Optional[UserStrides]=None, backend: Optional[
         TensorBackend]=None) ->Tensor:
         """Create a new tensor from data"""
-        pass
+        tensor_data = TensorData(storage, shape, strides)
+        return Tensor(tensor_data, backend=backend)

     def expand(self, other: Tensor) ->Tensor:
         """
@@ -185,7 +195,14 @@ class Tensor:
             Expanded version of `other` with the right derivatives

         """
-        pass
+        if self.shape == other.shape:
+            return other
+        
+        expanded = Tensor.make(other._tensor._storage, self.shape, backend=self.backend)
+        for i in range(len(self.shape)):
+            if self.shape[i] != other.shape[i]:
+                expanded = expanded.sum(i)
+        return expanded

     def accumulate_derivative(self, x: Any) ->None:
         """
@@ -195,14 +212,20 @@ class Tensor:
         Args:
             x : value to be accumulated
         """
-        pass
+        if self.is_leaf():
+            if self.grad is None:
+                self.grad = Tensor.make(x._tensor._storage, x.shape, backend=self.backend)
+            else:
+                self.grad += x
+        else:
+            raise RuntimeError("Derivative can only be accumulated on leaf variables.")

     def is_leaf(self) ->bool:
         """True if this variable created by the user (no `last_fn`)"""
-        pass
+        return self.history is None or self.history.last_fn is None

     def zero_grad_(self) ->None:
         """
         Reset the derivative on this variable.
         """
-        pass
+        self.grad = None
diff --git a/minitorch/tensor_data.py b/minitorch/tensor_data.py
index a28b7f8..b106b7c 100644
--- a/minitorch/tensor_data.py
+++ b/minitorch/tensor_data.py
@@ -37,7 +37,7 @@ def index_to_position(index: Index, strides: Strides) ->int:
     Returns:
         Position in storage
     """
-    pass
+    return int(np.sum(index * strides))


 def to_index(ordinal: int, shape: Shape, out_index: OutIndex) ->None:
@@ -53,7 +53,9 @@ def to_index(ordinal: int, shape: Shape, out_index: OutIndex) ->None:
         out_index : return index corresponding to position.

     """
-    pass
+    for i in range(len(shape) - 1, -1, -1):
+        out_index[i] = ordinal % shape[i]
+        ordinal //= shape[i]


 def broadcast_index(big_index: Index, big_shape: Shape, shape: Shape,
@@ -74,7 +76,12 @@ def broadcast_index(big_index: Index, big_shape: Shape, shape: Shape,
     Returns:
         None
     """
-    pass
+    offset = len(big_shape) - len(shape)
+    for i, s in enumerate(shape):
+        if s > 1:
+            out_index[i] = big_index[offset + i]
+        else:
+            out_index[i] = 0


 def shape_broadcast(shape1: UserShape, shape2: UserShape) ->UserShape:
@@ -91,7 +98,16 @@ def shape_broadcast(shape1: UserShape, shape2: UserShape) ->UserShape:
     Raises:
         IndexingError : if cannot broadcast
     """
-    pass
+    max_len = max(len(shape1), len(shape2))
+    new_shape = []
+    # Align shapes from the right, padding the shorter one with 1s.
+    for i in range(1, max_len + 1):
+        dim1 = shape1[-i] if i <= len(shape1) else 1
+        dim2 = shape2[-i] if i <= len(shape2) else 1
+        if dim1 == 1 or dim2 == 1 or dim1 == dim2:
+            new_shape.append(max(dim1, dim2))
+        else:
+            raise IndexingError(f"Cannot broadcast shapes {shape1} and {shape2}")
+    return tuple(reversed(new_shape))


 class TensorData:
@@ -130,7 +146,12 @@ class TensorData:
         Returns:
             bool : True if contiguous
         """
-        pass
+        last_stride = 1
+        for i in range(self.dims - 1, -1, -1):
+            if self._strides[i] < last_stride:
+                return False
+            last_stride = self._strides[i] * self._shape[i]
+        return True

     def permute(self, *order: int) ->TensorData:
         """
@@ -142,4 +163,7 @@ class TensorData:
         Returns:
             New `TensorData` with the same storage and a new dimension order.
         """
-        pass
+        assert len(order) == self.dims, f"Invalid permutation {order} for shape {self.shape}"
+        new_shape = tuple(self.shape[i] for i in order)
+        new_strides = tuple(self.strides[i] for i in order)
+        return TensorData(self._storage, new_shape, new_strides)
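
The `index_to_position` / `to_index` pair patched in above is ordinary row-major stride arithmetic. The NumPy sketch below shows the round trip for one contiguous shape (illustrative only):

import numpy as np

shape = np.array([2, 3, 4])
strides = np.array([12, 4, 1])  # contiguous strides for shape (2, 3, 4)

def index_to_position(index, strides):
    return int(np.sum(index * strides))

def to_index(ordinal, shape):
    out = np.zeros(len(shape), dtype=np.int64)
    for i in range(len(shape) - 1, -1, -1):
        out[i] = ordinal % shape[i]
        ordinal //= shape[i]
    return out

# Every ordinal 0..23 maps to a distinct index and back to itself.
for ordinal in range(int(np.prod(shape))):
    idx = to_index(ordinal, shape)
    assert index_to_position(idx, strides) == ordinal
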
diff --git a/minitorch/tensor_functions.py b/minitorch/tensor_functions.py
index 7602588..5f50827 100644
--- a/minitorch/tensor_functions.py
+++ b/minitorch/tensor_functions.py
@@ -17,79 +17,223 @@ if TYPE_CHECKING:

 def wrap_tuple(x):
     """Turn a possible value into a tuple"""
-    pass
+    if isinstance(x, tuple):
+        return x
+    return (x,)


 class Function:
-    pass
+    @staticmethod
+    def forward(ctx: Context, *inputs: Tensor) -> Tensor:
+        raise NotImplementedError()
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, ...]:
+        raise NotImplementedError()


 class Neg(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor) -> Tensor:
+        return t1.f.neg_map(t1)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+        return (grad_output.f.neg_map(grad_output),)


 class Inv(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor) -> Tensor:
+        ctx.save_for_backward(t1)
+        return t1.f.inv_map(t1)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+        t1 = ctx.saved_values[0]
+        return (grad_output.f.inv_back_zip(t1, grad_output),)


 class Add(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor, t2: Tensor) -> Tensor:
+        return t1.f.add_zip(t1, t2)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, Tensor]:
+        return (grad_output, grad_output)


 class Mul(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor, t2: Tensor) -> Tensor:
+        ctx.save_for_backward(t1, t2)
+        return t1.f.mul_zip(t1, t2)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, Tensor]:
+        t1, t2 = ctx.saved_values
+        return (grad_output.f.mul_zip(grad_output, t2),
+                grad_output.f.mul_zip(grad_output, t1))


 class Sigmoid(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor) -> Tensor:
+        result = t1.f.sigmoid_map(t1)
+        ctx.save_for_backward(result)
+        return result
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+        sigma = ctx.saved_values[0]
+        # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
+        return (sigma * (-sigma + 1.0) * grad_output,)


 class ReLU(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor) -> Tensor:
+        ctx.save_for_backward(t1)
+        return t1.f.relu_map(t1)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+        t1 = ctx.saved_values[0]
+        return (grad_output.f.relu_back_zip(t1, grad_output),)


 class Log(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor) -> Tensor:
+        ctx.save_for_backward(t1)
+        return t1.f.log_map(t1)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+        t1 = ctx.saved_values[0]
+        return (grad_output.f.mul_zip(grad_output, t1.f.inv_map(t1)),)


 class Exp(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor) -> Tensor:
+        result = t1.f.exp_map(t1)
+        ctx.save_for_backward(result)
+        return result
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+        exp_output = ctx.saved_values[0]
+        return (grad_output.f.mul_zip(grad_output, exp_output),)


 class Sum(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, a: Tensor, dim: Tensor) -> Tensor:
+        ctx.save_for_backward(a.shape, dim)
+        # Reduce along `dim` with the backend's add reduction (not Tensor.sum,
+        # which would recurse back into this Function).
+        return a.f.add_reduce(a, int(dim.item()))
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, float]:
+        a_shape, dim = ctx.saved_values
+        # The gradient is broadcast back to the input shape by `expand`;
+        # the dim argument receives a zero gradient.
+        return grad_output, 0.0


 class All(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, a: Tensor, dim: Tensor) -> Tensor:
+        return a.all(int(dim.item()))
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[None, None]:
+        return (None, None)


 class LT(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, a: Tensor, b: Tensor) -> Tensor:
+        return a.f.lt_zip(a, b)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[None, None]:
+        return (None, None)


 class EQ(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, a: Tensor, b: Tensor) -> Tensor:
+        return a.f.eq_zip(a, b)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[None, None]:
+        return (None, None)


 class IsClose(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, a: Tensor, b: Tensor) -> Tensor:
+        return a.f.is_close_zip(a, b)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[None, None]:
+        return (None, None)


 class Permute(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, a: Tensor, order: Tensor) -> Tensor:
+        ctx.save_for_backward(order)
+        return a.permute(*[int(i.item()) for i in order])
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, None]:
+        order = ctx.saved_values[0]
+        inv_order = [0] * len(order)
+        for i, p in enumerate(order):
+            inv_order[int(p.item())] = i
+        return (grad_output.permute(*inv_order), None)


 class View(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, a: Tensor, shape: Tensor) -> Tensor:
+        ctx.save_for_backward(a.shape)
+        return a.view(*[int(i.item()) for i in shape])
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, None]:
+        original_shape = ctx.saved_values[0]
+        return (grad_output.view(original_shape), None)


 class Copy(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, a: Tensor) -> Tensor:
+        return a.f.id_map(a)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+        return (grad_output,)


 class MatMul(Function):
-    pass
+    @staticmethod
+    def forward(ctx: Context, t1: Tensor, t2: Tensor) -> Tensor:
+        ctx.save_for_backward(t1, t2)
+        return t1.f.matrix_multiply(t1, t2)
+
+    @staticmethod
+    def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, Tensor]:
+        t1, t2 = ctx.saved_values
+
+        def transpose(a: Tensor) -> Tensor:
+            # Swap the last two dims without tracking gradients.
+            order = list(range(a.dims))
+            order[-2], order[-1] = order[-1], order[-2]
+            return a._new(a._tensor.permute(*order))
+
+        return (grad_output.f.matrix_multiply(grad_output, transpose(t2)),
+                grad_output.f.matrix_multiply(transpose(t1), grad_output))


 def zeros(shape: UserShape, backend: TensorBackend=SimpleBackend) ->Tensor:
@@ -103,7 +247,7 @@ def zeros(shape: UserShape, backend: TensorBackend=SimpleBackend) ->Tensor:
     Returns:
         new tensor
     """
-    pass
+    return Tensor.make([0] * int(operators.prod(shape)), shape, backend=backend)


 def rand(shape: UserShape, backend: TensorBackend=SimpleBackend,
@@ -119,7 +263,10 @@ def rand(shape: UserShape, backend: TensorBackend=SimpleBackend,
     Returns:
         :class:`Tensor` : new tensor
     """
-    pass
+    vals = [random.random() for _ in range(int(operators.prod(shape)))]
+    tensor = Tensor.make(vals, shape, backend=backend)
+    tensor.requires_grad_(requires_grad)
+    return tensor


 def _tensor(ls: Any, shape: UserShape, backend: TensorBackend=SimpleBackend,
@@ -136,7 +283,9 @@ def _tensor(ls: Any, shape: UserShape, backend: TensorBackend=SimpleBackend,
     Returns:
         new tensor
     """
-    pass
+    tensor = Tensor.make(ls, shape, backend=backend)
+    tensor.requires_grad_(requires_grad)
+    return tensor


 def tensor(ls: Any, backend: TensorBackend=SimpleBackend, requires_grad:
@@ -152,4 +301,12 @@ def tensor(ls: Any, backend: TensorBackend=SimpleBackend, requires_grad:
     Returns:
         :class:`Tensor` : new tensor
     """
-    pass
+    if isinstance(ls, Tensor):
+        return ls
+    if isinstance(ls, (float, int)):
+        return _tensor([ls], (1,), backend=backend, requires_grad=requires_grad)
+    if isinstance(ls, (list, tuple)):
+        arr = np.array(ls, dtype=float)
+        return _tensor(arr.flatten().tolist(), tuple(arr.shape),
+                       backend=backend, requires_grad=requires_grad)
+    raise TypeError("Cannot create tensor from %s" % (type(ls)))
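+
+# For example, tensor([[1.0, 2.0], [3.0, 4.0]]) produces a (2, 2) tensor, and
+# tensor(3.0) produces a tensor of shape (1,).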
diff --git a/minitorch/tensor_ops.py b/minitorch/tensor_ops.py
index e10ff6c..604d078 100644
--- a/minitorch/tensor_ops.py
+++ b/minitorch/tensor_ops.py
@@ -88,7 +88,19 @@ class SimpleOps(TensorOps):
         Returns:
             new tensor data
         """
-        pass
+        f = tensor_map(fn)
+
+        def _map(a: 'Tensor', out: Optional['Tensor'] = None) -> 'Tensor':
+            if out is None:
+                out = a.zeros(a.shape)
+            # The low-level kernel handles broadcasting and any number of dims.
+            f(*out.tuple(), *a.tuple())
+            return out
+
+        return _map

     @staticmethod
     def zip(fn: Callable[[float, float], float]) ->Callable[['Tensor',
@@ -120,7 +132,23 @@ class SimpleOps(TensorOps):
         Returns:
             :class:`TensorData` : new tensor data
         """
-        pass
+        f = tensor_zip(fn)
+
+        def _zip(a: 'Tensor', b: 'Tensor') -> 'Tensor':
+            if a.shape != b.shape:
+                c_shape = shape_broadcast(a.shape, b.shape)
+            else:
+                c_shape = a.shape
+            out = a.zeros(c_shape)
+            f(*out.tuple(), *a.tuple(), *b.tuple())
+            return out
+
+        return _zip

     @staticmethod
     def reduce(fn: Callable[[float, float], float], start: float=0.0
@@ -147,7 +175,27 @@ class SimpleOps(TensorOps):
         Returns:
             :class:`TensorData` : new tensor
         """
-        pass
+        f = tensor_reduce(fn)
+
+        def _reduce(a: 'Tensor', dim: int) -> 'Tensor':
+            out_shape = list(a.shape)
+            out_shape[dim] = 1
+            # Seed the output with the identity element for fn before folding.
+            out = a.zeros(tuple(out_shape))
+            out._tensor._storage[:] = start
+            f(*out.tuple(), *a.tuple(), dim)
+            return out
+
+        return _reduce
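+
+    # Note: reduce keeps the reduced dimension with size 1, e.g. reducing a
+    # (2, 3) tensor over dim=1 yields shape (2, 1), not (2,).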
     is_cuda = False


@@ -175,7 +223,21 @@ def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
     Returns:
         Tensor map function.
     """
-    pass
+    def _map(out: Storage, out_shape: Shape, out_strides: Strides,
+             in_storage: Storage, in_shape: Shape, in_strides: Strides) -> None:
+        out_index = [0] * len(out_shape)
+        in_index = [0] * len(in_shape)
+
+        for i in range(len(out)):
+            # Convert flat position i to an out index, then map it onto the
+            # (possibly broadcast) input shape.
+            to_index(i, out_shape, out_index)
+            broadcast_index(out_index, out_shape, in_shape, in_index)
+            out[index_to_position(out_index, out_strides)] = fn(
+                in_storage[index_to_position(in_index, in_strides)])
+
+    return _map
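+
+# Index bookkeeping shared by these kernels: with out_shape (2, 3) and an input
+# of shape (1, 3), flat position 4 maps via to_index to out_index [1, 1], and
+# broadcast_index collapses it to in_index [0, 1] since the input has size 1 in dim 0.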


 def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
@@ -202,7 +264,25 @@ def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
     Returns:
         Tensor zip function.
     """
-    pass
+    def _zip(out: Storage, out_shape: Shape, out_strides: Strides,
+             a_storage: Storage, a_shape: Shape, a_strides: Strides,
+             b_storage: Storage, b_shape: Shape, b_strides: Strides) -> None:
+        out_index = [0] * len(out_shape)
+        a_index = [0] * len(a_shape)
+        b_index = [0] * len(b_shape)
+
+        for i in range(len(out)):
+            to_index(i, out_shape, out_index)
+            broadcast_index(out_index, out_shape, a_shape, a_index)
+            broadcast_index(out_index, out_shape, b_shape, b_index)
+            out[index_to_position(out_index, out_strides)] = fn(
+                a_storage[index_to_position(a_index, a_strides)],
+                b_storage[index_to_position(b_index, b_strides)])
+
+    return _zip


 def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
@@ -219,7 +299,27 @@ def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
     Returns:
         Tensor reduce function.
     """
-    pass
+    def _reduce(out: Storage, out_shape: Shape, out_strides: Strides,
+                a_storage: Storage, a_shape: Shape, a_strides: Strides,
+                reduce_dim: int) -> None:
+        # `out` is assumed to already hold the start value (see SimpleOps.reduce).
+        out_index = [0] * len(out_shape)
+        a_index = [0] * len(a_shape)
+
+        for i in range(len(out)):
+            to_index(i, out_shape, out_index)
+            out_pos = index_to_position(out_index, out_strides)
+            # Fold every position along reduce_dim into this output cell.
+            for d in range(len(a_shape)):
+                a_index[d] = out_index[d]
+            for j in range(a_shape[reduce_dim]):
+                a_index[reduce_dim] = j
+                out[out_pos] = fn(out[out_pos],
+                                  a_storage[index_to_position(a_index, a_strides)])
+
+    return _reduce
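+
+# Reduce example: for a_shape (2, 3) and reduce_dim=1, out_shape is (2, 1) and
+# the cell at out_index [1, 0] folds a[1, 0], a[1, 1] and a[1, 2] through fn.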


 SimpleBackend = TensorBackend(SimpleOps)
diff --git a/minitorch/testing.py b/minitorch/testing.py
index bc2dc74..690d1dc 100644
--- a/minitorch/testing.py
+++ b/minitorch/testing.py
@@ -8,77 +8,77 @@ class MathTest(Generic[A]):
     @staticmethod
     def neg(a: A) ->A:
         """Negate the argument"""
-        pass
+        return -a

     @staticmethod
     def addConstant(a: A) ->A:
         """Add contant to the argument"""
-        pass
+        return a + 5

     @staticmethod
     def square(a: A) ->A:
         """Manual square"""
-        pass
+        return a * a

     @staticmethod
     def cube(a: A) ->A:
         """Manual cube"""
-        pass
+        return a * a * a

     @staticmethod
     def subConstant(a: A) ->A:
         """Subtract a constant from the argument"""
-        pass
+        return a - 5

     @staticmethod
     def multConstant(a: A) ->A:
         """Multiply a constant to the argument"""
-        pass
+        return a * 5

     @staticmethod
     def div(a: A) ->A:
         """Divide by a constant"""
-        pass
+        return a / 5

     @staticmethod
     def inv(a: A) ->A:
         """Invert after adding"""
-        pass
+        return 1 / (a + 10)

     @staticmethod
     def sig(a: A) ->A:
         """Apply sigmoid"""
-        pass
+        return operators.sigmoid(a)

     @staticmethod
     def log(a: A) ->A:
         """Apply log to a large value"""
-        pass
+        return operators.log(a + 100)

     @staticmethod
     def relu(a: A) ->A:
         """Apply relu"""
-        pass
+        return operators.relu(a)

     @staticmethod
     def exp(a: A) ->A:
         """Apply exp to a smaller value"""
-        pass
+        return operators.exp(a / 100)

     @staticmethod
     def add2(a: A, b: A) ->A:
         """Add two arguments"""
-        pass
+        return a + b

     @staticmethod
     def mul2(a: A, b: A) ->A:
         """Mul two arguments"""
-        pass
+        return a * b

     @staticmethod
     def div2(a: A, b: A) ->A:
         """Divide two arguments"""
-        pass
+        return a / b

     @classmethod
     def _tests(cls) ->Tuple[Tuple[str, Callable[[A], A]], Tuple[str,
@@ -86,7 +86,26 @@ class MathTest(Generic[A]):
         """
         Returns a list of all the math tests.
         """
-        pass
+        one_arg = [
+            ("neg", cls.neg),
+            ("addConstant", cls.addConstant),
+            ("square", cls.square),
+            ("cube", cls.cube),
+            ("subConstant", cls.subConstant),
+            ("multConstant", cls.multConstant),
+            ("div", cls.div),
+            ("inv", cls.inv),
+            ("sig", cls.sig),
+            ("log", cls.log),
+            ("relu", cls.relu),
+            ("exp", cls.exp)
+        ]
+        two_arg = [
+            ("add2", cls.add2),
+            ("mul2", cls.mul2),
+            ("div2", cls.div2)
+        ]
+        return (one_arg, two_arg, [])


 class MathTestVariable(MathTest):
diff --git a/tests/test_conv.py b/tests/test_conv.py
index cb17993..70cf08f 100644
--- a/tests/test_conv.py
+++ b/tests/test_conv.py
@@ -67,3 +67,33 @@ def test_conv2() -> None:
     out.sum().backward()

     minitorch.grad_check(minitorch.Conv2dFun.apply, t, t2)
+
+
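+# The tests below assume minitorch's convolution convention: the output keeps
+# the input width/height, and out[b, co, w] accumulates
+# input[b, ci, w + k] * weight[co, ci, k] over ci and k, with zeros past the edge.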
+@pytest.mark.task4_1
+def test_conv1d_multi_channel_batch() -> None:
+    t = minitorch.tensor([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]]).view(2, 2, 3)
+    t.requires_grad_(True)
+    t2 = minitorch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).view(2, 2, 2)
+    out = minitorch.Conv1dFun.apply(t, t2)
+
+    assert out.shape == (2, 2, 3)
+    assert out[0, 0, 0] == (0 * 1 + 1 * 2) + (3 * 3 + 4 * 4)
+    assert out[1, 1, 1] == (7 * 5 + 8 * 6) + (10 * 7 + 11 * 8)
+    minitorch.grad_check(minitorch.Conv1dFun.apply, t, t2)
+
+
+@pytest.mark.task4_2
+def test_conv2d_multi_channel_batch() -> None:
+    t = minitorch.tensor([[[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
+                           [[9, 10, 11], [12, 13, 14], [15, 16, 17]]],
+                          [[[18, 19, 20], [21, 22, 23], [24, 25, 26]],
+                           [[27, 28, 29], [30, 31, 32], [33, 34, 35]]]]).view(2, 2, 3, 3)
+    t.requires_grad_(True)
+    t2 = minitorch.tensor([[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                           [[[9, 10], [11, 12]], [[13, 14], [15, 16]]]]).view(2, 2, 2, 2)
+    out = minitorch.Conv2dFun.apply(t, t2)
+
+    assert out.shape == (2, 2, 3, 3)
+    assert out[0, 0, 0, 0] == (0*1 + 1*2 + 3*3 + 4*4) + (9*5 + 10*6 + 12*7 + 13*8)
+    assert out[1, 1, 1, 1] == (22*9 + 23*10 + 25*11 + 26*12) + (31*13 + 32*14 + 34*15 + 35*16)
+    minitorch.grad_check(minitorch.Conv2dFun.apply, t, t2)