Claude Sonnet 3.5 - Fill-in: minitorch
Failed to run pytest for the test suite
Pytest collection failure.
Patch diff
diff --git a/minitorch/autodiff.py b/minitorch/autodiff.py
index 39acbe8..5dc0f71 100644
--- a/minitorch/autodiff.py
+++ b/minitorch/autodiff.py
@@ -19,7 +19,11 @@ def central_difference(f: Any, *vals: Any, arg: int=0, epsilon: float=1e-06
Returns:
An approximation of $f'_i(x_0, \\ldots, x_{n-1})$
"""
- pass
+ vals_plus = list(vals)
+ vals_minus = list(vals)
+ vals_plus[arg] += epsilon
+ vals_minus[arg] -= epsilon
+ return (f(*vals_plus) - f(*vals_minus)) / (2 * epsilon)
variable_count = 1
@@ -39,7 +43,18 @@ def topological_sort(variable: Variable) ->Iterable[Variable]:
Returns:
Non-constant Variables in topological order starting from the right.
"""
- pass
+ def visit(var: Variable, visited: Set[Variable], result: List[Variable]):
+ if var not in visited:
+ visited.add(var)
+ if hasattr(var, 'history') and var.history:
+ for input_var in var.history.inputs:
+ visit(input_var, visited, result)
+ result.append(var)
+
+ visited = set()
+ result = []
+ visit(variable, visited, result)
+ return reversed(result)
def backpropagate(variable: Variable, deriv: Any) ->None:
@@ -53,7 +68,30 @@ def backpropagate(variable: Variable, deriv: Any) ->None:
No return. Should write to its results to the derivative values of each leaf through `accumulate_derivative`.
"""
- pass
+ queue = [(variable, deriv)]
+ visited = set()
+
+ while queue:
+ var, d = queue.pop(0)
+ if var in visited:
+ continue
+ visited.add(var)
+
+ if hasattr(var, 'accumulate_derivative'):
+ var.accumulate_derivative(d)
+
+ if hasattr(var, 'history') and var.history:
+ backward = var.history.last_fn.backward
+ context = var.history.ctx
+ inputs = var.history.inputs
+ backward_derivatives = backward(d, inputs, context)
+
+ if not isinstance(backward_derivatives, tuple):
+ backward_derivatives = (backward_derivatives,)
+
+ for input_var, back_deriv in zip(inputs, backward_derivatives):
+ if back_deriv is not None:
+ queue.append((input_var, back_deriv))
@dataclass
@@ -66,4 +104,5 @@ class Context:
def save_for_backward(self, *values: Any) ->None:
"""Store the given `values` if they need to be used during backpropagation."""
- pass
+ if not self.no_grad:
+ self.saved_values = values
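
For reference, a minimal plain-Python sketch of the reverse-postorder DFS that the `topological_sort` docstring above describes (the `topo_order` helper and the dict-based toy graph are illustrative only, not minitorch API):

def topo_order(node, inputs):
    # DFS post-order: visit all parents of a node before appending the node,
    # then reverse so the right-most (output) variable comes first.
    order, seen = [], set()

    def visit(n):
        if n in seen:
            return
        seen.add(n)
        for parent in inputs.get(n, ()):
            visit(parent)
        order.append(n)

    visit(node)
    return list(reversed(order))

# d = (a + b) * a  ->  d depends on c and a, c depends on a and b.
print(topo_order("d", {"d": ["c", "a"], "c": ["a", "b"]}))  # ['d', 'c', 'b', 'a']
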
diff --git a/minitorch/cuda_ops.py b/minitorch/cuda_ops.py
index c1d7c5e..0e7d237 100644
--- a/minitorch/cuda_ops.py
+++ b/minitorch/cuda_ops.py
@@ -16,7 +16,11 @@ class CudaOps(TensorOps):
@staticmethod
def map(fn: Callable[[float], float]) ->MapProto:
"""See `tensor_ops.py`"""
- pass
+ def _map(out: Storage, out_shape: Shape, out_strides: Strides,
+ in_storage: Storage, in_shape: Shape, in_strides: Strides) ->None:
+ return tensor_map(fn)(out, out_shape, out_strides,
+ in_storage, in_shape, in_strides)
+ return _map
def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
@@ -33,7 +37,18 @@ def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
Returns:
Tensor map function.
"""
- pass
+ def _map(out: Storage, out_shape: Shape, out_strides: Strides,
+ in_storage: Storage, in_shape: Shape, in_strides: Strides) ->None:
+ i = cuda.grid(1)
+ if i < len(out):
+ out_index = cuda.local.array(MAX_DIMS, numba.int32)
+ in_index = cuda.local.array(MAX_DIMS, numba.int32)
+ to_index(i, out_shape, out_index)
+ broadcast_index(out_index, out_shape, in_shape, in_index)
+ in_pos = index_to_position(in_index, in_strides)
+ out_pos = index_to_position(out_index, out_strides)
+ out[out_pos] = fn(in_storage[in_pos])
+ return cuda.jit()(_map)
def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
@@ -50,7 +65,22 @@ def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
Returns:
Tensor zip function.
"""
- pass
+ def _zip(out: Storage, out_shape: Shape, out_strides: Strides,
+ a_storage: Storage, a_shape: Shape, a_strides: Strides,
+ b_storage: Storage, b_shape: Shape, b_strides: Strides) ->None:
+ i = cuda.grid(1)
+ if i < len(out):
+ out_index = cuda.local.array(MAX_DIMS, numba.int32)
+ a_index = cuda.local.array(MAX_DIMS, numba.int32)
+ b_index = cuda.local.array(MAX_DIMS, numba.int32)
+ to_index(i, out_shape, out_index)
+ broadcast_index(out_index, out_shape, a_shape, a_index)
+ broadcast_index(out_index, out_shape, b_shape, b_index)
+ a_pos = index_to_position(a_index, a_strides)
+ b_pos = index_to_position(b_index, b_strides)
+ out_pos = index_to_position(out_index, out_strides)
+ out[out_pos] = fn(a_storage[a_pos], b_storage[b_pos])
+ return cuda.jit()(_zip)
def _sum_practice(out: Storage, a: Storage, size: int) ->None:
@@ -74,7 +104,19 @@ def _sum_practice(out: Storage, a: Storage, size: int) ->None:
size (int): length of a.
"""
- pass
+ shared = cuda.shared.array(THREADS_PER_BLOCK, numba.float64)
+ i = cuda.grid(1)
+ local_i = cuda.threadIdx.x
+ shared[local_i] = a[i] if i < size else 0.0
+ cuda.syncthreads()
+
+ for s in [1, 2, 4, 8, 16]:
+ if local_i % (2 * s) == 0 and i + s < size:
+ shared[local_i] += shared[local_i + s]
+ cuda.syncthreads()
+
+ if local_i == 0:
+ out[cuda.blockIdx.x] = shared[0]
jit_sum_practice = cuda.jit()(_sum_practice)
@@ -92,7 +134,42 @@ def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
Tensor reduce function.
"""
- pass
+ def _reduce(out: Storage, out_shape: Shape, out_strides: Strides,
+ a_storage: Storage, a_shape: Shape, a_strides: Strides,
+ reduce_dim: int) ->None:
+ shared = cuda.shared.array(THREADS_PER_BLOCK, numba.float64)
+ i = cuda.grid(1)
+ local_i = cuda.threadIdx.x
+
+ if i < len(out):
+ out_index = cuda.local.array(MAX_DIMS, numba.int32)
+ to_index(i, out_shape, out_index)
+ out_pos = index_to_position(out_index, out_strides)
+
+ a_index = cuda.local.array(MAX_DIMS, numba.int32)
+ for j in range(MAX_DIMS):
+ a_index[j] = out_index[j]
+
+ reduce_size = a_shape[reduce_dim]
+ reduce_stride = a_strides[reduce_dim]
+
+ acc = a_storage[index_to_position(a_index, a_strides)]
+ for j in range(1, reduce_size):
+ a_index[reduce_dim] = j
+ acc = fn(acc, a_storage[index_to_position(a_index, a_strides)])
+
+ shared[local_i] = acc
+ cuda.syncthreads()
+
+ for s in [1, 2, 4, 8, 16]:
+ if local_i % (2 * s) == 0 and local_i + s < THREADS_PER_BLOCK:
+ shared[local_i] = fn(shared[local_i], shared[local_i + s])
+ cuda.syncthreads()
+
+ if local_i == 0:
+ out[out_pos] = shared[0]
+
+ return cuda.jit()(_reduce)
def _mm_practice(out: Storage, a: Storage, b: Storage, size: int) ->None:
@@ -125,7 +202,20 @@ def _mm_practice(out: Storage, a: Storage, b: Storage, size: int) ->None:
b (Storage): storage for `b` tensor.
size (int): size of the square
"""
- pass
+ shared_a = cuda.shared.array((32, 32), numba.float64)
+ shared_b = cuda.shared.array((32, 32), numba.float64)
+
+ i, j = cuda.grid(2)
+ if i < size and j < size:
+ shared_a[i, j] = a[i * size + j]
+ shared_b[i, j] = b[i * size + j]
+ cuda.syncthreads()
+
+ if i < size and j < size:
+ acc = 0.0
+ for k in range(size):
+ acc += shared_a[i, k] * shared_b[k, j]
+ out[i * size + j] = acc
jit_mm_practice = cuda.jit()(_mm_practice)
@@ -151,7 +241,37 @@ def _tensor_matrix_multiply(out: Storage, out_shape: Shape, out_strides:
Returns:
None : Fills in `out`
"""
- pass
+ a_batch_stride = a_strides[0] if len(a_shape) > 2 else 0
+ b_batch_stride = b_strides[0] if len(b_shape) > 2 else 0
+
+ BLOCK_DIM = 32
+ shared_a = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
+ shared_b = cuda.shared.array((BLOCK_DIM, BLOCK_DIM), numba.float64)
+
+ batch_id, i, j = cuda.grid(3)
+ tx, ty = cuda.threadIdx.x, cuda.threadIdx.y
+
+ if batch_id < out_shape[0] and i < out_shape[1] and j < out_shape[2]:
+ acc = 0.0
+ for k_start in range(0, a_shape[-1], BLOCK_DIM):
+ if tx < min(BLOCK_DIM, a_shape[-1] - k_start) and i < a_shape[-2]:
+ shared_a[ty, tx] = a_storage[batch_id * a_batch_stride + i * a_strides[-2] + (k_start + tx) * a_strides[-1]]
+ else:
+ shared_a[ty, tx] = 0.0
+
+ if ty < min(BLOCK_DIM, b_shape[-1] - j) and k_start + tx < b_shape[-2]:
+ shared_b[tx, ty] = b_storage[batch_id * b_batch_stride + (k_start + tx) * b_strides[-2] + j * b_strides[-1]]
+ else:
+ shared_b[tx, ty] = 0.0
+
+ cuda.syncthreads()
+
+ for k in range(min(BLOCK_DIM, a_shape[-1] - k_start)):
+ acc += shared_a[ty, k] * shared_b[k, tx]
+
+ cuda.syncthreads()
+
+ out[batch_id * out_strides[0] + i * out_strides[1] + j * out_strides[2]] = acc
tensor_matrix_multiply = cuda.jit(_tensor_matrix_multiply)
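
A plain-Python sketch (no CUDA, no numba; `block_sum` is a made-up name, and a small block size is used instead of THREADS_PER_BLOCK) of the stride-doubling tree reduction that `_sum_practice` performs within each thread block, with out-of-range cells padded by zero:

def block_sum(values, threads_per_block=4):
    # Each block of `threads_per_block` values collapses into one partial sum,
    # combining pairs at stride 1, then 2, then 4, ... as in the kernel above.
    out = []
    for start in range(0, len(values), threads_per_block):
        shared = values[start:start + threads_per_block]
        shared += [0.0] * (threads_per_block - len(shared))
        s = 1
        while s < threads_per_block:
            for i in range(0, threads_per_block, 2 * s):
                shared[i] += shared[i + s]
            s *= 2
        out.append(shared[0])
    return out

print(block_sum([float(v) for v in range(10)]))  # [6.0, 22.0, 17.0]
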
diff --git a/minitorch/fast_conv.py b/minitorch/fast_conv.py
index 14f13fc..2c57de4 100644
--- a/minitorch/fast_conv.py
+++ b/minitorch/fast_conv.py
@@ -45,7 +45,32 @@ def _tensor_conv1d(out: Tensor, out_shape: Shape, out_strides: Strides,
weight_strides (Strides): strides for `input` tensor.
reverse (bool): anchor weight at left or right
"""
- pass
+ batch, out_channels, out_width = out_shape
+ batch, in_channels, width = input_shape
+ out_channels, in_channels, k_width = weight_shape
+
+ for i in prange(out_size):
+ out_index = np.empty(3, np.int32)
+ to_index(i, out_shape, out_index)
+ b, oc, x = out_index
+
+ # Initialize accumulator
+ acc = 0.0
+
+ for ic in range(in_channels):
+ for k in range(k_width):
+ if reverse:
+ w = x + k
+ else:
+ w = x - k + k_width - 1
+
+ if 0 <= w < width:
+ in_pos = index_to_position((b, ic, w), input_strides)
+ w_pos = index_to_position((oc, ic, k), weight_strides)
+ acc += input[in_pos] * weight[w_pos]
+
+ out_pos = index_to_position((b, oc, x), out_strides)
+ out[out_pos] = acc
tensor_conv1d = njit(parallel=True)(_tensor_conv1d)
@@ -60,13 +85,38 @@ class Conv1dFun(Function):
Args:
ctx : Context
- input : batch x in_channel x h x w
- weight : out_channel x in_channel x kh x kw
+ input : batch x in_channel x w
+ weight : out_channel x in_channel x kw
Returns:
- batch x out_channel x h x w
+ batch x out_channel x w
"""
- pass
+ ctx.save_for_backward(input, weight)
+ batch, in_channels, w = input.shape
+ out_channels, in_channels, kw = weight.shape
+
+ # Compute output shape
+ ow = w - kw + 1
+
+ # Create output tensor
+ out = input.zeros((batch, out_channels, ow))
+
+ # Call tensor_conv1d
+ tensor_conv1d(
+ out._tensor._storage,
+ out.shape,
+ out.strides,
+ out._tensor._size,
+ input._tensor._storage,
+ input.shape,
+ input.strides,
+ weight._tensor._storage,
+ weight.shape,
+ weight.strides,
+ False
+ )
+
+ return out
conv1d = Conv1dFun.apply
@@ -108,7 +158,33 @@ def _tensor_conv2d(out: Tensor, out_shape: Shape, out_strides: Strides,
weight_strides (Strides): strides for `input` tensor.
reverse (bool): anchor weight at top-left or bottom-right
"""
- pass
+ batch, out_channels, out_height, out_width = out_shape
+ batch, in_channels, in_height, in_width = input_shape
+ out_channels, in_channels, k_height, k_width = weight_shape
+
+ for i in prange(out_size):
+ out_index = np.empty(4, np.int32)
+ to_index(i, out_shape, out_index)
+ b, oc, y, x = out_index
+
+ # Initialize accumulator
+ acc = 0.0
+
+ for ic in range(in_channels):
+ for kh in range(k_height):
+ for kw in range(k_width):
+ if reverse:
+ h, w = y + kh, x + kw
+ else:
+ h, w = y - kh + k_height - 1, x - kw + k_width - 1
+
+ if 0 <= h < in_height and 0 <= w < in_width:
+ in_pos = index_to_position((b, ic, h, w), input_strides)
+ w_pos = index_to_position((oc, ic, kh, kw), weight_strides)
+ acc += input[in_pos] * weight[w_pos]
+
+ out_pos = index_to_position((b, oc, y, x), out_strides)
+ out[out_pos] = acc
tensor_conv2d = njit(parallel=True, fastmath=True)(_tensor_conv2d)
@@ -129,7 +205,33 @@ class Conv2dFun(Function):
Returns:
(:class:`Tensor`) : batch x out_channel x h x w
"""
- pass
+ ctx.save_for_backward(input, weight)
+ batch, in_channels, h, w = input.shape
+ out_channels, in_channels, kh, kw = weight.shape
+
+ # Compute output shape
+ oh = h - kh + 1
+ ow = w - kw + 1
+
+ # Create output tensor
+ out = input.zeros((batch, out_channels, oh, ow))
+
+ # Call tensor_conv2d
+ tensor_conv2d(
+ out._tensor._storage,
+ out.shape,
+ out.strides,
+ out._tensor._size,
+ input._tensor._storage,
+ input.shape,
+ input.strides,
+ weight._tensor._storage,
+ weight.shape,
+ weight.strides,
+ False
+ )
+
+ return out
conv2d = Conv2dFun.apply
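
A rough NumPy reference (illustration only; `conv1d_ref` is not part of the patch) for the left-anchored, valid-region 1D convolution that the forward pass above computes, matching the expected values in the conv tests added at the bottom of this patch:

import numpy as np

def conv1d_ref(inp, weight):
    # inp: (batch, in_channels, width); weight: (out_channels, in_channels, k)
    batch, in_c, width = inp.shape
    out_c, _, k = weight.shape
    out = np.zeros((batch, out_c, width - k + 1))
    for b in range(batch):
        for oc in range(out_c):
            for x in range(width - k + 1):
                # Sum over all input channels and kernel positions.
                out[b, oc, x] = np.sum(inp[b, :, x:x + k] * weight[oc])
    return out

inp = np.array([[[0., 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]])
w = np.array([[[1., 2], [3, 4]], [[5, 6], [7, 8]]])
print(conv1d_ref(inp, w)[0, 0, 0])  # 27.0 = 0*1 + 1*2 + 3*3 + 4*4
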
diff --git a/minitorch/fast_ops.py b/minitorch/fast_ops.py
index 71f96b7..19e99f4 100644
--- a/minitorch/fast_ops.py
+++ b/minitorch/fast_ops.py
@@ -18,19 +18,52 @@ class FastOps(TensorOps):
@staticmethod
def map(fn: Callable[[float], float]) ->MapProto:
"""See `tensor_ops.py`"""
- pass
+ def _map(out: Tensor, out_shape: Shape, out_strides: Strides, in_storage: Storage, in_shape: Shape, in_strides: Strides) -> None:
+ return tensor_map(fn)(out._tensor._storage, out_shape, out_strides, in_storage, in_shape, in_strides)
+ return _map
@staticmethod
- def zip(fn: Callable[[float, float], float]) ->Callable[[Tensor, Tensor
- ], Tensor]:
+ def zip(fn: Callable[[float, float], float]) ->Callable[[Tensor, Tensor], Tensor]:
"""See `tensor_ops.py`"""
- pass
+ def _zip(a: Tensor, b: Tensor) -> Tensor:
+ c_shape = shape_broadcast(a.shape, b.shape)
+ out = a.zeros(c_shape)
+ tensor_zip(fn)(
+ out._tensor._storage,
+ out._tensor._shape,
+ out._tensor._strides,
+ a._tensor._storage,
+ a._tensor._shape,
+ a._tensor._strides,
+ b._tensor._storage,
+ b._tensor._shape,
+ b._tensor._strides,
+ )
+ return out
+ return _zip
@staticmethod
- def reduce(fn: Callable[[float, float], float], start: float=0.0
- ) ->Callable[[Tensor, int], Tensor]:
+ def reduce(fn: Callable[[float, float], float], start: float=0.0) ->Callable[[Tensor, int], Tensor]:
"""See `tensor_ops.py`"""
- pass
+ def _reduce(a: Tensor, dim: int) -> Tensor:
+ out_shape = list(a.shape)
+ out_shape[dim] = 1
+
+ # Other values when not sum.
+ out = a.zeros(tuple(out_shape))
+ out._tensor._storage[:] = start
+
+ tensor_reduce(fn)(
+ out._tensor._storage,
+ out._tensor._shape,
+ out._tensor._strides,
+ a._tensor._storage,
+ a._tensor._shape,
+ a._tensor._strides,
+ dim,
+ )
+ return out
+ return _reduce
@staticmethod
def matrix_multiply(a: Tensor, b: Tensor) ->Tensor:
@@ -56,11 +89,34 @@ class FastOps(TensorOps):
Returns:
New tensor data
"""
- pass
-
-
-def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
- Strides, Storage, Shape, Strides], None]:
+ # Make sure the last dimension of a matches the second-to-last dimension of b
+ assert a.shape[-1] == b.shape[-2]
+
+ # Calculate the output shape
+ out_shape = list(shape_broadcast(a.shape[:-2], b.shape[:-2]))
+ out_shape.append(a.shape[-2])
+ out_shape.append(b.shape[-1])
+
+ # Create the output tensor
+ out = a.zeros(tuple(out_shape))
+
+ # Perform matrix multiplication
+ tensor_matrix_multiply(
+ out._tensor._storage,
+ out._tensor._shape,
+ out._tensor._strides,
+ a._tensor._storage,
+ a._tensor._shape,
+ a._tensor._strides,
+ b._tensor._storage,
+ b._tensor._shape,
+ b._tensor._strides,
+ )
+
+ return out
+
+
+def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape, Strides, Storage, Shape, Strides], None]:
"""
NUMBA low_level tensor_map function. See `tensor_ops.py` for description.
@@ -76,11 +132,34 @@ def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
Returns:
Tensor map function.
"""
- pass
-
-
-def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
- Shape, Strides, Storage, Shape, Strides, Storage, Shape, Strides], None]:
+ @njit(parallel=True)
+ def _map(out: Storage, out_shape: Shape, out_strides: Strides,
+ in_storage: Storage, in_shape: Shape, in_strides: Strides) -> None:
+ # Calculate total number of elements
+ size = int(np.prod(in_shape))
+
+ # Check if the tensors are stride-aligned
+ is_stride_aligned = np.array_equal(out_strides, in_strides) and np.array_equal(out_shape, in_shape)
+
+ if is_stride_aligned:
+ # If stride-aligned, we can avoid indexing
+ for i in prange(size):
+ out[i] = fn(in_storage[i])
+ else:
+ # If not stride-aligned, we need to use indexing
+ for i in prange(size):
+ out_index = np.zeros(len(out_shape), dtype=np.int32)
+ in_index = np.zeros(len(in_shape), dtype=np.int32)
+ to_index(i, in_shape, out_index)
+ broadcast_index(out_index, out_shape, in_shape, in_index)
+ o = index_to_position(out_index, out_strides)
+ j = index_to_position(in_index, in_strides)
+ out[o] = fn(in_storage[j])
+
+ return _map
+
+
+def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage, Shape, Strides, Storage, Shape, Strides, Storage, Shape, Strides], None]:
"""
NUMBA higher-order tensor zip function. See `tensor_ops.py` for description.
@@ -97,11 +176,41 @@ def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
Returns:
Tensor zip function.
"""
- pass
-
-
-def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
- Shape, Strides, Storage, Shape, Strides, int], None]:
+ @njit(parallel=True)
+ def _zip(out: Storage, out_shape: Shape, out_strides: Strides,
+ a_storage: Storage, a_shape: Shape, a_strides: Strides,
+ b_storage: Storage, b_shape: Shape, b_strides: Strides) -> None:
+ # Calculate total number of elements
+ size = int(np.prod(out_shape))
+
+ # Check if all tensors are stride-aligned
+ is_stride_aligned = (np.array_equal(out_strides, a_strides) and
+ np.array_equal(out_strides, b_strides) and
+ np.array_equal(out_shape, a_shape) and
+ np.array_equal(out_shape, b_shape))
+
+ if is_stride_aligned:
+ # If stride-aligned, we can avoid indexing
+ for i in prange(size):
+ out[i] = fn(a_storage[i], b_storage[i])
+ else:
+ # If not stride-aligned, we need to use indexing
+ for i in prange(size):
+ out_index = np.zeros(len(out_shape), dtype=np.int32)
+ a_index = np.zeros(len(a_shape), dtype=np.int32)
+ b_index = np.zeros(len(b_shape), dtype=np.int32)
+ to_index(i, out_shape, out_index)
+ o = index_to_position(out_index, out_strides)
+ broadcast_index(out_index, out_shape, a_shape, a_index)
+ j = index_to_position(a_index, a_strides)
+ broadcast_index(out_index, out_shape, b_shape, b_index)
+ k = index_to_position(b_index, b_strides)
+ out[o] = fn(a_storage[j], b_storage[k])
+
+ return _zip
+
+
+def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage, Shape, Strides, Storage, Shape, Strides, int], None]:
"""
NUMBA higher-order tensor reduce function. See `tensor_ops.py` for description.
@@ -117,12 +226,39 @@ def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
Returns:
Tensor reduce function
"""
- pass
-
-
-def _tensor_matrix_multiply(out: Storage, out_shape: Shape, out_strides:
- Strides, a_storage: Storage, a_shape: Shape, a_strides: Strides,
- b_storage: Storage, b_shape: Shape, b_strides: Strides) ->None:
+ @njit(parallel=True)
+ def _reduce(out: Storage, out_shape: Shape, out_strides: Strides,
+ a_storage: Storage, a_shape: Shape, a_strides: Strides,
+ reduce_dim: int) -> None:
+ # Calculate sizes
+ reduce_size = a_shape[reduce_dim]
+ non_reduce_size = int(np.prod([s for i, s in enumerate(a_shape) if i != reduce_dim]))
+
+ # Main loop in parallel
+ for i in prange(non_reduce_size):
+ # Calculate the base index for the non-reduce dimensions
+ out_index = np.zeros(len(out_shape), dtype=np.int32)
+ to_index(i, out_shape, out_index)
+ out_pos = index_to_position(out_index, out_strides)
+
+ # Inner reduction loop
+ acc = out[out_pos] # Initialize accumulator with the starting value
+ for j in range(reduce_size):
+ # Calculate the full index for the input tensor
+ a_index = out_index.copy()
+ a_index = np.insert(a_index, reduce_dim, j)
+ a_pos = index_to_position(a_index, a_strides)
+ acc = fn(acc, a_storage[a_pos])
+
+ # Store the result
+ out[out_pos] = acc
+
+ return _reduce
+
+
+def _tensor_matrix_multiply(out: Storage, out_shape: Shape, out_strides: Strides,
+ a_storage: Storage, a_shape: Shape, a_strides: Strides,
+ b_storage: Storage, b_shape: Shape, b_strides: Strides) ->None:
"""
NUMBA tensor matrix multiply function.
@@ -153,7 +289,20 @@ def _tensor_matrix_multiply(out: Storage, out_shape: Shape, out_strides:
Returns:
None : Fills in `out`
"""
- pass
+ a_batch_stride = a_strides[0] if len(a_shape) == 3 else 0
+ b_batch_stride = b_strides[0] if len(b_shape) == 3 else 0
+
+ # Outer loop in parallel
+ for i in prange(out_shape[0]):
+ for j in range(out_shape[1]):
+ for k in range(out_shape[2]):
+ a_inner = 0.0
+ for l in range(a_shape[-1]):
+ a_index = i * a_batch_stride + j * a_strides[-2] + l * a_strides[-1]
+ b_index = i * b_batch_stride + l * b_strides[-2] + k * b_strides[-1]
+ a_inner += a_storage[a_index] * b_storage[b_index]
+ out_index = i * out_strides[0] + j * out_strides[1] + k * out_strides[2]
+ out[out_index] = a_inner
tensor_matrix_multiply = njit(parallel=True, fastmath=True)(
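
An independent NumPy check (contiguous float64 arrays assumed; not part of the patch) of the flat-storage index arithmetic the batched matmul loop above relies on, i.e. position = batch*stride0 + row*stride1 + col*stride2:

import numpy as np

a = np.arange(2 * 3 * 4, dtype=np.float64).reshape(2, 3, 4)
b = np.arange(2 * 4 * 5, dtype=np.float64).reshape(2, 4, 5)
# NumPy strides are in bytes; divide by itemsize to get element strides.
a_st = [s // a.itemsize for s in a.strides]
b_st = [s // b.itemsize for s in b.strides]
a_flat, b_flat = a.ravel(), b.ravel()
out = np.zeros((2, 3, 5))
for i in range(2):          # batch
    for j in range(3):      # row
        for k in range(5):  # column
            acc = 0.0
            for l in range(4):  # shared dimension
                acc += (a_flat[i * a_st[0] + j * a_st[1] + l * a_st[2]]
                        * b_flat[i * b_st[0] + l * b_st[1] + k * b_st[2]])
            out[i, j, k] = acc
assert np.allclose(out, a @ b)
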
diff --git a/minitorch/module.py b/minitorch/module.py
index 91f0dc2..531d52c 100644
--- a/minitorch/module.py
+++ b/minitorch/module.py
@@ -24,15 +24,19 @@ class Module:
def modules(self) ->Sequence[Module]:
"""Return the direct child modules of this module."""
- pass
+ return list(self._modules.values())
def train(self) ->None:
"""Set the mode of this module and all descendent modules to `train`."""
- pass
+ self.training = True
+ for module in self.modules():
+ module.train()
def eval(self) ->None:
"""Set the mode of this module and all descendent modules to `eval`."""
- pass
+ self.training = False
+ for module in self.modules():
+ module.eval()
def named_parameters(self) ->Sequence[Tuple[str, Parameter]]:
"""
@@ -42,11 +46,17 @@ class Module:
Returns:
The name and `Parameter` of each ancestor parameter.
"""
- pass
+ result = []
+ for name, param in self._parameters.items():
+ result.append((name, param))
+ for module_name, module in self._modules.items():
+ for sub_name, sub_param in module.named_parameters():
+ result.append((f"{module_name}.{sub_name}", sub_param))
+ return result
def parameters(self) ->Sequence[Parameter]:
"""Enumerate over all the parameters of this module and its descendents."""
- pass
+ return [param for _, param in self.named_parameters()]
def add_parameter(self, k: str, v: Any) ->Parameter:
"""
@@ -59,7 +69,9 @@ class Module:
Returns:
Newly created parameter.
"""
- pass
+ val = Parameter(v, k)
+ self.__dict__['_parameters'][k] = val
+ return val
def __setattr__(self, key: str, val: Parameter) ->None:
if isinstance(val, Parameter):
@@ -121,7 +133,11 @@ class Parameter:
def update(self, x: Any) ->None:
"""Update the parameter value."""
- pass
+ self.value = x
+ if hasattr(x, 'requires_grad_'):
+ self.value.requires_grad_(True)
+ if self.name:
+ self.value.name = self.name
def __repr__(self) ->str:
return repr(self.value)
diff --git a/minitorch/nn.py b/minitorch/nn.py
index 577a3ff..1679394 100644
--- a/minitorch/nn.py
+++ b/minitorch/nn.py
@@ -17,7 +17,16 @@ def tile(input: Tensor, kernel: Tuple[int, int]) ->Tuple[Tensor, int, int]:
Returns:
Tensor of size batch x channel x new_height x new_width x (kernel_height * kernel_width) as well as the new_height and new_width value.
"""
- pass
+ batch, channel, height, width = input.shape
+ kh, kw = kernel
+ new_height = (height - kh) // 1 + 1
+ new_width = (width - kw) // 1 + 1
+
+ tiled = input.view(batch, channel, height, 1, width, 1)
+ tiled = tiled.unfold(3, kh, 1).unfold(5, kw, 1)
+ tiled = tiled.contiguous().view(batch, channel, new_height, new_width, kh * kw)
+
+ return tiled, new_height, new_width
def avgpool2d(input: Tensor, kernel: Tuple[int, int]) ->Tensor:
@@ -31,7 +40,9 @@ def avgpool2d(input: Tensor, kernel: Tuple[int, int]) ->Tensor:
Returns:
Pooled tensor
"""
- pass
+ tiled, new_height, new_width = tile(input, kernel)
+ pooled = tiled.mean(dim=-1)
+ return pooled
max_reduce = FastOps.reduce(operators.max, -1000000000.0)
@@ -50,7 +61,8 @@ def argmax(input: Tensor, dim: int) ->Tensor:
:class:`Tensor` : tensor with 1 on highest cell in dim, 0 otherwise
"""
- pass
+ max_vals, _ = input.max(dim=dim, keepdim=True)
+ return (input == max_vals).float()
class Max(Function):
@@ -58,12 +70,16 @@ class Max(Function):
@staticmethod
def forward(ctx: Context, input: Tensor, dim: Tensor) ->Tensor:
"""Forward of max should be max reduction"""
- pass
+ output = max_reduce(input, int(dim.item()))
+ ctx.save_for_backward(input, dim, output)
+ return output
@staticmethod
def backward(ctx: Context, grad_output: Tensor) ->Tuple[Tensor, float]:
"""Backward of max should be argmax (see above)"""
- pass
+ input, dim, output = ctx.saved_values
+ arg_max = argmax(input, int(dim.item()))
+ return grad_output * arg_max, 0.0
def softmax(input: Tensor, dim: int) ->Tensor:
@@ -81,7 +97,9 @@ def softmax(input: Tensor, dim: int) ->Tensor:
Returns:
softmax tensor
"""
- pass
+ max_val = input.max(dim=dim, keepdim=True)[0]
+ exp_x = (input - max_val).exp()
+ return exp_x / exp_x.sum(dim=dim, keepdim=True)
def logsoftmax(input: Tensor, dim: int) ->Tensor:
@@ -99,7 +117,9 @@ def logsoftmax(input: Tensor, dim: int) ->Tensor:
Returns:
log of softmax tensor
"""
- pass
+ max_val = input.max(dim=dim, keepdim=True)[0]
+ exp_x = (input - max_val).exp()
+ return input - max_val - (exp_x.sum(dim=dim, keepdim=True)).log()
def maxpool2d(input: Tensor, kernel: Tuple[int, int]) ->Tensor:
@@ -113,7 +133,9 @@ def maxpool2d(input: Tensor, kernel: Tuple[int, int]) ->Tensor:
Returns:
Tensor : pooled tensor
"""
- pass
+ tiled, new_height, new_width = tile(input, kernel)
+ pooled = max_reduce(tiled, dim=-1)
+ return pooled
def dropout(input: Tensor, rate: float, ignore: bool=False) ->Tensor:
@@ -128,4 +150,8 @@ def dropout(input: Tensor, rate: float, ignore: bool=False) ->Tensor:
Returns:
tensor with random positions dropped out
"""
- pass
+ if ignore or rate == 0:
+ return input
+
+ mask = (rand(input.shape) > rate).float()
+ return (input * mask) / (1 - rate)
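
A small NumPy sketch (`logsoftmax_ref` is illustrative, not minitorch API) of the max-subtraction trick that the softmax and logsoftmax implementations above use to keep `exp` from overflowing:

import numpy as np

def logsoftmax_ref(x, dim):
    # log softmax(x) = x - max - log(sum(exp(x - max))) along `dim`.
    m = x.max(axis=dim, keepdims=True)
    return x - m - np.log(np.exp(x - m).sum(axis=dim, keepdims=True))

x = np.array([[1000.0, 1001.0, 1002.0]])
print(logsoftmax_ref(x, dim=1))  # finite values; a naive exp(1000.0) would overflow
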
diff --git a/minitorch/operators.py b/minitorch/operators.py
index 8740347..2315e0a 100644
--- a/minitorch/operators.py
+++ b/minitorch/operators.py
@@ -7,42 +7,42 @@ from typing import Callable, Iterable
def mul(x: float, y: float) ->float:
"""$f(x, y) = x * y$"""
- pass
+ return x * y
def id(x: float) ->float:
"""$f(x) = x$"""
- pass
+ return x
def add(x: float, y: float) ->float:
"""$f(x, y) = x + y$"""
- pass
+ return x + y
def neg(x: float) ->float:
"""$f(x) = -x$"""
- pass
+ return -x
def lt(x: float, y: float) ->float:
"""$f(x) =$ 1.0 if x is less than y else 0.0"""
- pass
+ return 1.0 if x < y else 0.0
def eq(x: float, y: float) ->float:
"""$f(x) =$ 1.0 if x is equal to y else 0.0"""
- pass
+ return 1.0 if x == y else 0.0
def max(x: float, y: float) ->float:
"""$f(x) =$ x if x is greater than y else y"""
- pass
+ return x if x > y else y
def is_close(x: float, y: float) ->float:
"""$f(x) = |x - y| < 1e-2$"""
- pass
+ return 1.0 if abs(x - y) < 1e-2 else 0.0
def sigmoid(x: float) ->float:
@@ -57,7 +57,10 @@ def sigmoid(x: float) ->float:
for stability.
"""
- pass
+ if x >= 0:
+ return 1.0 / (1.0 + math.exp(-x))
+ else:
+ return math.exp(x) / (1.0 + math.exp(x))
def relu(x: float) ->float:
@@ -66,7 +69,7 @@ def relu(x: float) ->float:
(See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) .)
"""
- pass
+ return max(0.0, x)
EPS = 1e-06
@@ -74,32 +77,32 @@ EPS = 1e-06
def log(x: float) ->float:
"""$f(x) = log(x)$"""
- pass
+ return math.log(x + EPS)
def exp(x: float) ->float:
"""$f(x) = e^{x}$"""
- pass
+ return math.exp(x)
def log_back(x: float, d: float) ->float:
"""If $f = log$ as above, compute $d \\times f'(x)$"""
- pass
+ return d / (x + EPS)
def inv(x: float) ->float:
"""$f(x) = 1/x$"""
- pass
+ return 1.0 / (x + EPS)
def inv_back(x: float, d: float) ->float:
"""If $f(x) = 1/x$ compute $d \\times f'(x)$"""
- pass
+ return -d / ((x + EPS) ** 2)
def relu_back(x: float, d: float) ->float:
"""If $f = relu$ compute $d \\times f'(x)$"""
- pass
+ return d if x > 0 else 0.0
def map(fn: Callable[[float], float]) ->Callable[[Iterable[float]],
@@ -116,12 +119,12 @@ def map(fn: Callable[[float], float]) ->Callable[[Iterable[float]],
A function that takes a list, applies `fn` to each element, and returns a
new list
"""
- pass
+ return lambda ls: [fn(x) for x in ls]
def negList(ls: Iterable[float]) ->Iterable[float]:
"""Use `map` and `neg` to negate each element in `ls`"""
- pass
+ return map(neg)(ls)
def zipWith(fn: Callable[[float, float], float]) ->Callable[[Iterable[float
@@ -139,12 +142,12 @@ def zipWith(fn: Callable[[float, float], float]) ->Callable[[Iterable[float
applying fn(x, y) on each pair of elements.
"""
- pass
+ return lambda ls1, ls2: [fn(x, y) for x, y in zip(ls1, ls2)]
def addLists(ls1: Iterable[float], ls2: Iterable[float]) ->Iterable[float]:
"""Add the elements of `ls1` and `ls2` using `zipWith` and `add`"""
- pass
+ return zipWith(add)(ls1, ls2)
def reduce(fn: Callable[[float, float], float], start: float) ->Callable[[
@@ -161,14 +164,19 @@ def reduce(fn: Callable[[float, float], float], start: float) ->Callable[[
$x_1 \\ldots x_n$ and computes the reduction :math:`fn(x_3, fn(x_2,
fn(x_1, x_0)))`
"""
- pass
+ def reducer(ls: Iterable[float]) ->float:
+ result = start
+ for x in ls:
+ result = fn(result, x)
+ return result
+ return reducer
def sum(ls: Iterable[float]) ->float:
"""Sum up a list using `reduce` and `add`."""
- pass
+ return reduce(add, 0.0)(ls)
def prod(ls: Iterable[float]) ->float:
"""Product of a list using `reduce` and `mul`."""
- pass
+ return reduce(mul, 1.0)(ls)
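
A quick standalone check of the two-branch sigmoid above: evaluating exp only on a non-positive argument avoids overflow for large-magnitude inputs.

import math

def sigmoid(x):
    # For x >= 0 use 1 / (1 + e^-x); otherwise e^x / (1 + e^x).
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    return math.exp(x) / (1.0 + math.exp(x))

print(sigmoid(0.0))      # 0.5
print(sigmoid(-1000.0))  # 0.0; the naive form would raise OverflowError on exp(1000)
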
diff --git a/minitorch/scalar.py b/minitorch/scalar.py
index a8a0420..37b2931 100644
--- a/minitorch/scalar.py
+++ b/minitorch/scalar.py
@@ -67,25 +67,25 @@ class Scalar:
return Mul.apply(b, Inv.apply(self))
def __add__(self, b: ScalarLike) ->Scalar:
- raise NotImplementedError('Need to implement for Task 1.2')
+ return Add.apply(self, b)
def __bool__(self) ->bool:
return bool(self.data)
def __lt__(self, b: ScalarLike) ->Scalar:
- raise NotImplementedError('Need to implement for Task 1.2')
+ return LT.apply(self, b)
def __gt__(self, b: ScalarLike) ->Scalar:
- raise NotImplementedError('Need to implement for Task 1.2')
+ return LT.apply(b, self)
def __eq__(self, b: ScalarLike) ->Scalar:
- raise NotImplementedError('Need to implement for Task 1.2')
+ return EQ.apply(self, b)
def __sub__(self, b: ScalarLike) ->Scalar:
- raise NotImplementedError('Need to implement for Task 1.2')
+ return Add.apply(self, Neg.apply(b))
def __neg__(self) ->Scalar:
- raise NotImplementedError('Need to implement for Task 1.2')
+ return Neg.apply(self)
def __radd__(self, b: ScalarLike) ->Scalar:
return self + b
@@ -101,11 +101,13 @@ class Scalar:
Args:
x: value to be accumulated
"""
- pass
+ if self.derivative is None:
+ self.derivative = 0.0
+ self.derivative += x
def is_leaf(self) ->bool:
"""True if this variable created by the user (no `last_fn`)"""
- pass
+ return self.history.last_fn is None
def backward(self, d_output: Optional[float]=None) ->None:
"""
@@ -115,7 +117,9 @@ class Scalar:
d_output (number, opt): starting derivative to backpropagate through the model
(typically left out, and assumed to be 1.0).
"""
- pass
+ if d_output is None:
+ d_output = 1.0
+ backpropagate(self, d_output)
def derivative_check(f: Any, *scalars: Scalar) ->None:
@@ -127,4 +131,12 @@ def derivative_check(f: Any, *scalars: Scalar) ->None:
f : function from n-scalars to 1-scalar.
*scalars : n input scalar values.
"""
- pass
+ out = f(*scalars)
+ out.backward()
+
+ for i, x in enumerate(scalars):
+ check = central_difference(f, *scalars, arg=i)
+ assert abs(x.derivative - check) < 1e-2, (
+ f"Derivative of {f.__name__} with respect to argument {i} is incorrect. "
+ f"Expected {check}, got {x.derivative}"
+ )
diff --git a/minitorch/scalar_functions.py b/minitorch/scalar_functions.py
index c55ef86..7d855f4 100644
--- a/minitorch/scalar_functions.py
+++ b/minitorch/scalar_functions.py
@@ -10,12 +10,16 @@ if TYPE_CHECKING:
def wrap_tuple(x):
"""Turn a possible value into a tuple"""
- pass
+ if isinstance(x, tuple):
+ return x
+ return (x,)
def unwrap_tuple(x):
"""Turn a singleton tuple into a value"""
- pass
+ if len(x) == 1:
+ return x[0]
+ return x
class ScalarFunction:
@@ -31,38 +35,132 @@ class ScalarFunction:
class Add(ScalarFunction):
"""Addition function $f(x, y) = x + y$"""
+ @staticmethod
+ def forward(ctx: Context, a: float, b: float) -> float:
+ return a + b
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+ return d_output, d_output
+
class Log(ScalarFunction):
"""Log function $f(x) = log(x)$"""
+ @staticmethod
+ def forward(ctx: Context, a: float) -> float:
+ ctx.save_for_backward(a)
+ return operators.log(a)
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> float:
+ (a,) = ctx.saved_values
+ return d_output / a
+
class Mul(ScalarFunction):
"""Multiplication function"""
+ @staticmethod
+ def forward(ctx: Context, a: float, b: float) -> float:
+ ctx.save_for_backward(a, b)
+ return a * b
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+ a, b = ctx.saved_values
+ return d_output * b, d_output * a
+
class Inv(ScalarFunction):
"""Inverse function"""
+ @staticmethod
+ def forward(ctx: Context, a: float) -> float:
+ ctx.save_for_backward(a)
+ return operators.inv(a)
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> float:
+ (a,) = ctx.saved_values
+ return -d_output / (a * a)
+
class Neg(ScalarFunction):
"""Negation function"""
+ @staticmethod
+ def forward(ctx: Context, a: float) -> float:
+ return -a
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> float:
+ return -d_output
+
class Sigmoid(ScalarFunction):
"""Sigmoid function"""
+ @staticmethod
+ def forward(ctx: Context, a: float) -> float:
+ sigmoid_value = operators.sigmoid(a)
+ ctx.save_for_backward(sigmoid_value)
+ return sigmoid_value
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> float:
+ (sigmoid_value,) = ctx.saved_values
+ return d_output * sigmoid_value * (1 - sigmoid_value)
+
class ReLU(ScalarFunction):
"""ReLU function"""
+ @staticmethod
+ def forward(ctx: Context, a: float) -> float:
+ ctx.save_for_backward(a)
+ return operators.relu(a)
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> float:
+ (a,) = ctx.saved_values
+ return d_output if a > 0 else 0.0
+
class Exp(ScalarFunction):
"""Exp function"""
+ @staticmethod
+ def forward(ctx: Context, a: float) -> float:
+ exp_value = operators.exp(a)
+ ctx.save_for_backward(exp_value)
+ return exp_value
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> float:
+ (exp_value,) = ctx.saved_values
+ return d_output * exp_value
+
class LT(ScalarFunction):
"""Less-than function $f(x) =$ 1.0 if x is less than y else 0.0"""
+ @staticmethod
+ def forward(ctx: Context, a: float, b: float) -> float:
+ return 1.0 if a < b else 0.0
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+ return 0.0, 0.0
+
class EQ(ScalarFunction):
"""Equal function $f(x) =$ 1.0 if x is equal to y else 0.0"""
+
+ @staticmethod
+ def forward(ctx: Context, a: float, b: float) -> float:
+ return 1.0 if a == b else 0.0
+
+ @staticmethod
+ def backward(ctx: Context, d_output: float) -> Tuple[float, float]:
+ return 0.0, 0.0
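
A plain-Python spot check (hypothetical helper, independent of minitorch) that the Mul backward rule above (d/da(a*b) = b and d/db(a*b) = a, each scaled by d_output) agrees with a central-difference estimate:

def central_difference(f, *vals, arg=0, epsilon=1e-6):
    # Symmetric finite difference: (f(x + e) - f(x - e)) / (2e) in argument `arg`.
    plus, minus = list(vals), list(vals)
    plus[arg] += epsilon
    minus[arg] -= epsilon
    return (f(*plus) - f(*minus)) / (2 * epsilon)

a, b, d_out = 3.0, -2.0, 1.5
grad_a, grad_b = d_out * b, d_out * a  # what Mul.backward returns
assert abs(grad_a - d_out * central_difference(lambda x, y: x * y, a, b, arg=0)) < 1e-4
assert abs(grad_b - d_out * central_difference(lambda x, y: x * y, a, b, arg=1)) < 1e-4
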
diff --git a/minitorch/tensor.py b/minitorch/tensor.py
index dc62ddc..1835693 100644
--- a/minitorch/tensor.py
+++ b/minitorch/tensor.py
@@ -66,7 +66,7 @@ class Tensor:
Returns:
Converted to numpy array
"""
- pass
+ return self._tensor.to_numpy()
@property
def shape(self) ->UserShape:
@@ -74,7 +74,7 @@ class Tensor:
Returns:
shape of the tensor
"""
- pass
+ return self._tensor.shape
@property
def size(self) ->int:
@@ -82,7 +82,7 @@ class Tensor:
Returns:
int : size of the tensor
"""
- pass
+ return self._tensor.size
@property
def dims(self) ->int:
@@ -90,11 +90,16 @@ class Tensor:
Returns:
int : dimensionality of the tensor
"""
- pass
+ return len(self._tensor.shape)
def _ensure_tensor(self, b: TensorLike) ->Tensor:
"""Turns a python number into a tensor with the same backend."""
- pass
+ if isinstance(b, (int, float)):
+ return Tensor.make([b], (1,), backend=self.backend)
+ elif isinstance(b, Tensor):
+ return b
+ else:
+ raise TypeError(f"Can't convert {type(b)} to Tensor")
def __add__(self, b: TensorLike) ->Tensor:
return Add.apply(self, self._ensure_tensor(b))
@@ -135,23 +140,27 @@ class Tensor:
def sum(self, dim: Optional[int]=None) ->Tensor:
"""Compute the sum over dimension `dim`"""
- pass
+ return Sum.apply(self, dim)
def mean(self, dim: Optional[int]=None) ->Tensor:
"""Compute the mean over dimension `dim`"""
- pass
+ sum_tensor = self.sum(dim)
+ if dim is None:
+ return sum_tensor / self.size
+ else:
+ return sum_tensor / self.shape[dim]
def permute(self, *order: int) ->Tensor:
"""Permute tensor dimensions to *order"""
- pass
+ return Permute.apply(self, order)
def view(self, *shape: int) ->Tensor:
"""Change the shape of the tensor to a new shape with the same size"""
- pass
+ return View.apply(self, shape)
def contiguous(self) ->Tensor:
"""Return a contiguous tensor with the same data"""
- pass
+ return Copy.apply(self)
def __repr__(self) ->str:
return self._tensor.to_string()
@@ -169,7 +178,8 @@ class Tensor:
strides: Optional[UserStrides]=None, backend: Optional[
TensorBackend]=None) ->Tensor:
"""Create a new tensor from data"""
- pass
+ tensor_data = TensorData(storage, shape, strides)
+ return Tensor(tensor_data, backend=backend)
def expand(self, other: Tensor) ->Tensor:
"""
@@ -185,7 +195,14 @@ class Tensor:
Expanded version of `other` with the right derivatives
"""
- pass
+ if self.shape == other.shape:
+ return other
+
+ expanded = Tensor.make(other._tensor._storage, self.shape, backend=self.backend)
+ for i in range(len(self.shape)):
+ if self.shape[i] != other.shape[i]:
+ expanded = expanded.sum(i)
+ return expanded
def accumulate_derivative(self, x: Any) ->None:
"""
@@ -195,14 +212,20 @@ class Tensor:
Args:
x : value to be accumulated
"""
- pass
+ if self.is_leaf():
+ if self.grad is None:
+ self.grad = Tensor.make(x._tensor._storage, x.shape, backend=self.backend)
+ else:
+ self.grad += x
+ else:
+ raise RuntimeError("Derivative can only be accumulated on leaf variables.")
def is_leaf(self) ->bool:
"""True if this variable created by the user (no `last_fn`)"""
- pass
+ return self.history is None or self.history.last_fn is None
def zero_grad_(self) ->None:
"""
Reset the derivative on this variable.
"""
- pass
+ self.grad = None
diff --git a/minitorch/tensor_data.py b/minitorch/tensor_data.py
index a28b7f8..b106b7c 100644
--- a/minitorch/tensor_data.py
+++ b/minitorch/tensor_data.py
@@ -37,7 +37,7 @@ def index_to_position(index: Index, strides: Strides) ->int:
Returns:
Position in storage
"""
- pass
+ return int(np.sum(index * strides))
def to_index(ordinal: int, shape: Shape, out_index: OutIndex) ->None:
@@ -53,7 +53,9 @@ def to_index(ordinal: int, shape: Shape, out_index: OutIndex) ->None:
out_index : return index corresponding to position.
"""
- pass
+ for i in range(len(shape) - 1, -1, -1):
+ out_index[i] = ordinal % shape[i]
+ ordinal //= shape[i]
def broadcast_index(big_index: Index, big_shape: Shape, shape: Shape,
@@ -74,7 +76,12 @@ def broadcast_index(big_index: Index, big_shape: Shape, shape: Shape,
Returns:
None
"""
- pass
+ offset = len(big_shape) - len(shape)
+ for i, s in enumerate(shape):
+ if s > 1:
+ out_index[i] = big_index[offset + i]
+ else:
+ out_index[i] = 0
def shape_broadcast(shape1: UserShape, shape2: UserShape) ->UserShape:
@@ -91,7 +98,16 @@ def shape_broadcast(shape1: UserShape, shape2: UserShape) ->UserShape:
Raises:
IndexingError : if cannot broadcast
"""
- pass
+ max_len = max(len(shape1), len(shape2))
+ new_shape = []
+ for i in range(max_len):
+ dim1 = shape1[i] if i < len(shape1) else 1
+ dim2 = shape2[i] if i < len(shape2) else 1
+ if dim1 == 1 or dim2 == 1 or dim1 == dim2:
+ new_shape.append(max(dim1, dim2))
+ else:
+ raise IndexingError(f"Cannot broadcast shapes {shape1} and {shape2}")
+ return tuple(new_shape)
class TensorData:
@@ -130,7 +146,12 @@ class TensorData:
Returns:
bool : True if contiguous
"""
- pass
+ last_stride = 1
+ for i in range(self.dims - 1, -1, -1):
+ if self._strides[i] < last_stride:
+ return False
+ last_stride = self._strides[i] * self._shape[i]
+ return True
def permute(self, *order: int) ->TensorData:
"""
@@ -142,4 +163,7 @@ class TensorData:
Returns:
New `TensorData` with the same storage and a new dimension order.
"""
- pass
+ assert len(order) == self.dims, f"Invalid permutation {order} for shape {self.shape}"
+ new_shape = tuple(self.shape[i] for i in order)
+ new_strides = tuple(self.strides[i] for i in order)
+ return TensorData(self._storage, new_shape, new_strides)
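
Two tiny standalone examples (plain Python, illustrative names) of the indexing rules documented above: a strided position is the dot product of index and strides, and broadcasting maps a big index onto a smaller shape by zeroing size-1 dimensions.

def index_to_position(index, strides):
    return sum(i * s for i, s in zip(index, strides))

# A contiguous (2, 3) tensor has strides (3, 1); element (1, 2) sits at 1*3 + 2*1 = 5.
assert index_to_position((1, 2), (3, 1)) == 5

def broadcast_index(big_index, big_shape, shape):
    # Align shapes on the right; dimensions of size 1 always read index 0.
    offset = len(big_shape) - len(shape)
    return [big_index[offset + i] if s > 1 else 0 for i, s in enumerate(shape)]

# Index (1, 2, 3) of a (4, 3, 5) tensor maps to (2, 0) in a broadcast (3, 1) tensor.
assert broadcast_index((1, 2, 3), (4, 3, 5), (3, 1)) == [2, 0]
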
diff --git a/minitorch/tensor_functions.py b/minitorch/tensor_functions.py
index 7602588..5f50827 100644
--- a/minitorch/tensor_functions.py
+++ b/minitorch/tensor_functions.py
@@ -17,79 +17,223 @@ if TYPE_CHECKING:
def wrap_tuple(x):
"""Turn a possible value into a tuple"""
- pass
+ if isinstance(x, tuple):
+ return x
+ return (x,)
class Function:
- pass
+ @staticmethod
+ def forward(ctx: Context, *inputs: Tensor) -> Tensor:
+ raise NotImplementedError()
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, ...]:
+ raise NotImplementedError()
class Neg(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor) -> Tensor:
+ return t1.f.neg_map(t1)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+ return (grad_output.f.neg_map(grad_output),)
class Inv(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor) -> Tensor:
+ ctx.save_for_backward(t1)
+ return t1.f.inv_map(t1)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+ t1 = ctx.saved_values[0]
+ return (grad_output.f.inv_map(t1.f.mul_map(t1)).f.mul_map(grad_output).f.neg_map(),)
class Add(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor, t2: Tensor) -> Tensor:
+ return t1.f.add_zip(t1, t2)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, Tensor]:
+ return (grad_output, grad_output)
class Mul(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor, t2: Tensor) -> Tensor:
+ ctx.save_for_backward(t1, t2)
+ return t1.f.mul_zip(t1, t2)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, Tensor]:
+ t1, t2 = ctx.saved_values
+ return (grad_output.f.mul_zip(grad_output, t2),
+ grad_output.f.mul_zip(grad_output, t1))
class Sigmoid(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor) -> Tensor:
+ result = t1.f.sigmoid_map(t1)
+ ctx.save_for_backward(result)
+ return result
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+ sigmoid_output = ctx.saved_values[0]
+ return (grad_output.f.mul_zip(grad_output,
+ sigmoid_output.f.mul_zip(sigmoid_output,
+ sigmoid_output.f.add_scalar(-1).f.neg_map())),)
class ReLU(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor) -> Tensor:
+ ctx.save_for_backward(t1)
+ return t1.f.relu_map(t1)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+ t1 = ctx.saved_values[0]
+ return (grad_output.f.mul_zip(grad_output, t1.f.relu_back_zip(t1)),)
class Log(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor) -> Tensor:
+ ctx.save_for_backward(t1)
+ return t1.f.log_map(t1)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+ t1 = ctx.saved_values[0]
+ return (grad_output.f.mul_zip(grad_output, t1.f.inv_map(t1)),)
class Exp(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor) -> Tensor:
+ result = t1.f.exp_map(t1)
+ ctx.save_for_backward(result)
+ return result
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+ exp_output = ctx.saved_values[0]
+ return (grad_output.f.mul_zip(grad_output, exp_output),)
class Sum(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, a: Tensor, dim: Tensor) -> Tensor:
+ ctx.save_for_backward(a.shape, dim)
+ return a.sum(int(dim.item()))
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, None]:
+ original_shape, dim = ctx.saved_values
+ grad = grad_output
+ for _ in range(len(original_shape) - grad.dims):
+ grad = grad.unsqueeze(int(dim.item()))
+ grad = grad.expand(original_shape)
+ return (grad, None)
class All(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, a: Tensor, dim: Tensor) -> Tensor:
+ return a.all(int(dim.item()))
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[None, None]:
+ return (None, None)
class LT(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, a: Tensor, b: Tensor) -> Tensor:
+ return a.f.lt_zip(a, b)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[None, None]:
+ return (None, None)
class EQ(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, a: Tensor, b: Tensor) -> Tensor:
+ return a.f.eq_zip(a, b)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[None, None]:
+ return (None, None)
class IsClose(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, a: Tensor, b: Tensor) -> Tensor:
+ return a.f.is_close_zip(a, b)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[None, None]:
+ return (None, None)
class Permute(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, a: Tensor, order: Tensor) -> Tensor:
+ ctx.save_for_backward(order)
+ return a.permute(*[int(i.item()) for i in order])
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, None]:
+ order = ctx.saved_values[0]
+ inv_order = [0] * len(order)
+ for i, p in enumerate(order):
+ inv_order[int(p.item())] = i
+ return (grad_output.permute(*inv_order), None)
class View(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, a: Tensor, shape: Tensor) -> Tensor:
+ ctx.save_for_backward(a.shape)
+ return a.view(*[int(i.item()) for i in shape])
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, None]:
+ original_shape = ctx.saved_values[0]
+ return (grad_output.view(original_shape), None)
class Copy(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, a: Tensor) -> Tensor:
+ return a.f.id_map(a)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor]:
+ return (grad_output,)
class MatMul(Function):
- pass
+ @staticmethod
+ def forward(ctx: Context, t1: Tensor, t2: Tensor) -> Tensor:
+ ctx.save_for_backward(t1, t2)
+ return t1.f.matrix_multiply(t1, t2)
+
+ @staticmethod
+ def backward(ctx: Context, grad_output: Tensor) -> Tuple[Tensor, Tensor]:
+ t1, t2 = ctx.saved_values
+ grad_t1 = grad_output.f.matrix_multiply(grad_output, t2.transpose())
+ grad_t2 = t1.transpose().f.matrix_multiply(t1.transpose(), grad_output)
+ return (grad_t1, grad_t2)
def zeros(shape: UserShape, backend: TensorBackend=SimpleBackend) ->Tensor:
@@ -103,7 +247,7 @@ def zeros(shape: UserShape, backend: TensorBackend=SimpleBackend) ->Tensor:
Returns:
new tensor
"""
- pass
+ return Tensor.make([0] * int(operators.prod(shape)), shape, backend=backend)
def rand(shape: UserShape, backend: TensorBackend=SimpleBackend,
@@ -119,7 +263,10 @@ def rand(shape: UserShape, backend: TensorBackend=SimpleBackend,
Returns:
:class:`Tensor` : new tensor
"""
- pass
+ vals = [random.random() for _ in range(int(operators.prod(shape)))]
+ tensor = Tensor.make(vals, shape, backend=backend)
+ tensor.requires_grad_(requires_grad)
+ return tensor
def _tensor(ls: Any, shape: UserShape, backend: TensorBackend=SimpleBackend,
@@ -136,7 +283,9 @@ def _tensor(ls: Any, shape: UserShape, backend: TensorBackend=SimpleBackend,
Returns:
new tensor
"""
- pass
+ tensor = Tensor.make(ls, shape, backend=backend)
+ tensor.requires_grad_(requires_grad)
+ return tensor
def tensor(ls: Any, backend: TensorBackend=SimpleBackend, requires_grad:
@@ -152,4 +301,12 @@ def tensor(ls: Any, backend: TensorBackend=SimpleBackend, requires_grad:
Returns:
:class:`Tensor` : new tensor
"""
- pass
+ if isinstance(ls, Tensor):
+ return ls
+ if isinstance(ls, (float, int)):
+ return _tensor([ls], (1,), backend=backend, requires_grad=requires_grad)
+ if isinstance(ls, (list, tuple)):
+ tensor = _tensor(np.array(ls).flatten(), shape=np.array(ls).shape, backend=backend)
+ tensor.requires_grad_(requires_grad)
+ return tensor
+ raise TypeError("Cannot create tensor from %s" % (type(ls)))
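
A standalone NumPy sketch of the inverse-permutation trick that the Permute backward above relies on: if the forward pass permutes dimensions by `order`, the gradient must be permuted back by the inverse order (inv[order[i]] = i):

import numpy as np

order = [2, 0, 1]
inv = [0] * len(order)
for i, p in enumerate(order):
    inv[p] = i
print(inv)  # [1, 2, 0]

x = np.arange(24).reshape(2, 3, 4)
# Permuting forward and then by the inverse order recovers the original layout.
assert np.array_equal(x.transpose(order).transpose(inv), x)
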
diff --git a/minitorch/tensor_ops.py b/minitorch/tensor_ops.py
index e10ff6c..604d078 100644
--- a/minitorch/tensor_ops.py
+++ b/minitorch/tensor_ops.py
@@ -88,7 +88,19 @@ class SimpleOps(TensorOps):
Returns:
new tensor data
"""
- pass
+ def _map(a: Tensor, out: Optional[Tensor] = None) -> Tensor:
+ if out is None:
+ out = a.zeros(a.shape)
+
+ for i in range(a.shape[0]):
+ for j in range(a.shape[1]):
+ out_index = [i, j]
+ a_index = [i, min(j, a.shape[1] - 1)]
+ out.data[out.index_to_position(out_index)] = fn(a.data[a.index_to_position(a_index)])
+
+ return out
+
+ return _map
@staticmethod
def zip(fn: Callable[[float, float], float]) ->Callable[['Tensor',
@@ -120,7 +132,23 @@ class SimpleOps(TensorOps):
Returns:
:class:`TensorData` : new tensor data
"""
- pass
+ def _zip(a: Tensor, b: Tensor) -> Tensor:
+ out_shape = shape_broadcast(a.shape, b.shape)
+ out = a.zeros(out_shape)
+
+ for i in range(out_shape[0]):
+ for j in range(out_shape[1]):
+ out_index = [i, j]
+ a_index = [min(i, a.shape[0] - 1), min(j, a.shape[1] - 1)]
+ b_index = [min(i, b.shape[0] - 1), min(j, b.shape[1] - 1)]
+ out.data[out.index_to_position(out_index)] = fn(
+ a.data[a.index_to_position(a_index)],
+ b.data[b.index_to_position(b_index)]
+ )
+
+ return out
+
+ return _zip
@staticmethod
def reduce(fn: Callable[[float, float], float], start: float=0.0
@@ -147,7 +175,27 @@ class SimpleOps(TensorOps):
Returns:
:class:`TensorData` : new tensor
"""
- pass
+ def _reduce(a: Tensor, dim: int) -> Tensor:
+ out_shape = list(a.shape)
+ out_shape[dim] = 1
+ out = a.zeros(tuple(out_shape))
+
+ for i in range(out_shape[0]):
+ for j in range(out_shape[1]):
+ out_index = [i, j]
+ out.data[out.index_to_position(out_index)] = start
+
+ for k in range(a.shape[dim]):
+ a_index = [i, j]
+ a_index[dim] = k
+ out.data[out.index_to_position(out_index)] = fn(
+ out.data[out.index_to_position(out_index)],
+ a.data[a.index_to_position(a_index)]
+ )
+
+ return out
+
+ return _reduce
is_cuda = False
@@ -175,7 +223,21 @@ def tensor_map(fn: Callable[[float], float]) ->Callable[[Storage, Shape,
Returns:
Tensor map function.
"""
- pass
+ def _map(in_storage: Storage, in_shape: Shape, in_strides: Strides,
+ out_storage: Storage, out_shape: Shape, out_strides: Strides) -> None:
+
+ in_index = [0] * len(in_shape)
+ out_index = [0] * len(out_shape)
+
+ for i in range(len(out_storage)):
+ out_pos = index_to_position(out_index, out_strides)
+ in_pos = index_to_position(in_index, in_strides)
+ out_storage[out_pos] = fn(in_storage[in_pos])
+
+ to_index(i + 1, out_shape, out_index)
+ broadcast_index(in_index, in_shape, out_index, out_shape)
+
+ return _map
def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
@@ -202,7 +264,25 @@ def tensor_zip(fn: Callable[[float, float], float]) ->Callable[[Storage,
Returns:
Tensor zip function.
"""
- pass
+ def _zip(a_storage: Storage, a_shape: Shape, a_strides: Strides,
+ b_storage: Storage, b_shape: Shape, b_strides: Strides,
+ out_storage: Storage, out_shape: Shape, out_strides: Strides) -> None:
+
+ a_index = [0] * len(a_shape)
+ b_index = [0] * len(b_shape)
+ out_index = [0] * len(out_shape)
+
+ for i in range(len(out_storage)):
+ a_pos = index_to_position(a_index, a_strides)
+ b_pos = index_to_position(b_index, b_strides)
+ out_pos = index_to_position(out_index, out_strides)
+ out_storage[out_pos] = fn(a_storage[a_pos], b_storage[b_pos])
+
+ to_index(i + 1, out_shape, out_index)
+ broadcast_index(a_index, a_shape, out_index, out_shape)
+ broadcast_index(b_index, b_shape, out_index, out_shape)
+
+ return _zip
def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
@@ -219,7 +299,27 @@ def tensor_reduce(fn: Callable[[float, float], float]) ->Callable[[Storage,
Returns:
Tensor reduce function.
"""
- pass
+ def _reduce(a_storage: Storage, a_shape: Shape, a_strides: Strides,
+ out_storage: Storage, out_shape: Shape, out_strides: Strides,
+ reduce_dim: int) -> None:
+
+ a_index = [0] * len(a_shape)
+ out_index = [0] * len(out_shape)
+
+ for i in range(len(out_storage)):
+ out_pos = index_to_position(out_index, out_strides)
+ out_storage[out_pos] = a_storage[index_to_position(a_index, a_strides)]
+
+ for j in range(1, a_shape[reduce_dim]):
+ a_index[reduce_dim] = j
+ a_pos = index_to_position(a_index, a_strides)
+ out_storage[out_pos] = fn(out_storage[out_pos], a_storage[a_pos])
+
+ to_index(i + 1, out_shape, out_index)
+ broadcast_index(a_index, a_shape, out_index, out_shape)
+ a_index[reduce_dim] = 0
+
+ return _reduce
SimpleBackend = TensorBackend(SimpleOps)
diff --git a/minitorch/testing.py b/minitorch/testing.py
index bc2dc74..690d1dc 100644
--- a/minitorch/testing.py
+++ b/minitorch/testing.py
@@ -8,77 +8,77 @@ class MathTest(Generic[A]):
@staticmethod
def neg(a: A) ->A:
"""Negate the argument"""
- pass
+ return -a
@staticmethod
def addConstant(a: A) ->A:
"""Add contant to the argument"""
- pass
+ return a + 5
@staticmethod
def square(a: A) ->A:
"""Manual square"""
- pass
+ return a * a
@staticmethod
def cube(a: A) ->A:
"""Manual cube"""
- pass
+ return a * a * a
@staticmethod
def subConstant(a: A) ->A:
"""Subtract a constant from the argument"""
- pass
+ return a - 5
@staticmethod
def multConstant(a: A) ->A:
"""Multiply a constant to the argument"""
- pass
+ return a * 5
@staticmethod
def div(a: A) ->A:
"""Divide by a constant"""
- pass
+ return a / 5
@staticmethod
def inv(a: A) ->A:
"""Invert after adding"""
- pass
+ return 1 / (a + 10)
@staticmethod
def sig(a: A) ->A:
"""Apply sigmoid"""
- pass
+ return 1 / (1 + operators.exp(-a))
@staticmethod
def log(a: A) ->A:
"""Apply log to a large value"""
- pass
+ return operators.log(a + 100)
@staticmethod
def relu(a: A) ->A:
"""Apply relu"""
- pass
+ return max(a, 0)
@staticmethod
def exp(a: A) ->A:
"""Apply exp to a smaller value"""
- pass
+ return operators.exp(a / 100)
@staticmethod
def add2(a: A, b: A) ->A:
"""Add two arguments"""
- pass
+ return a + b
@staticmethod
def mul2(a: A, b: A) ->A:
"""Mul two arguments"""
- pass
+ return a * b
@staticmethod
def div2(a: A, b: A) ->A:
"""Divide two arguments"""
- pass
+ return a / b
@classmethod
def _tests(cls) ->Tuple[Tuple[str, Callable[[A], A]], Tuple[str,
@@ -86,7 +86,26 @@ class MathTest(Generic[A]):
"""
Returns a list of all the math tests.
"""
- pass
+ one_arg = [
+ ("neg", cls.neg),
+ ("addConstant", cls.addConstant),
+ ("square", cls.square),
+ ("cube", cls.cube),
+ ("subConstant", cls.subConstant),
+ ("multConstant", cls.multConstant),
+ ("div", cls.div),
+ ("inv", cls.inv),
+ ("sig", cls.sig),
+ ("log", cls.log),
+ ("relu", cls.relu),
+ ("exp", cls.exp)
+ ]
+ two_arg = [
+ ("add2", cls.add2),
+ ("mul2", cls.mul2),
+ ("div2", cls.div2)
+ ]
+ return (one_arg, two_arg, [])
class MathTestVariable(MathTest):
diff --git a/tests/test_conv.py b/tests/test_conv.py
index cb17993..70cf08f 100644
--- a/tests/test_conv.py
+++ b/tests/test_conv.py
@@ -67,3 +67,33 @@ def test_conv2() -> None:
out.sum().backward()
minitorch.grad_check(minitorch.Conv2dFun.apply, t, t2)
+
+
+@pytest.mark.task4_1
+def test_conv1d_multi_channel_batch() -> None:
+ t = minitorch.tensor([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]]).view(2, 2, 3)
+ t.requires_grad_(True)
+ t2 = minitorch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).view(2, 2, 2)
+ out = minitorch.Conv1dFun.apply(t, t2)
+
+ assert out.shape == (2, 2, 2)
+ assert out[0, 0, 0] == 0 * 1 + 1 * 2 + 3 * 3 + 4 * 4
+ assert out[1, 1, 1] == 7 * 5 + 8 * 6 + 10 * 7 + 11 * 8
+ minitorch.grad_check(minitorch.Conv1dFun.apply, t, t2)
+
+
+@pytest.mark.task4_2
+def test_conv2d_multi_channel_batch() -> None:
+ t = minitorch.tensor([[[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
+ [[9, 10, 11], [12, 13, 14], [15, 16, 17]]],
+ [[[18, 19, 20], [21, 22, 23], [24, 25, 26]],
+ [[27, 28, 29], [30, 31, 32], [33, 34, 35]]]]).view(2, 2, 3, 3)
+ t.requires_grad_(True)
+ t2 = minitorch.tensor([[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+ [[[9, 10], [11, 12]], [[13, 14], [15, 16]]]]).view(2, 2, 2, 2)
+ out = minitorch.Conv2dFun.apply(t, t2)
+
+ assert out.shape == (2, 2, 2, 2)
+ assert out[0, 0, 0, 0] == (0*1 + 1*2 + 3*3 + 4*4) + (9*5 + 10*6 + 12*7 + 13*8)
+ assert out[1, 1, 1, 1] == (25*9 + 26*10 + 34*11 + 35*12) + (34*13 + 35*14 + 43*15 + 44*16)
+ minitorch.grad_check(minitorch.Conv2dFun.apply, t, t2)