diff --git a/01-saxpy.cu b/01-saxpy.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3d39766df0c563b1d8f66b555c71fc8ab6c47170
--- /dev/null
+++ b/01-saxpy.cu
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N (2048 * 2048) // Number of elements in each vector
+
+/*
+ * SAXPY-style exercise: computes c[i] = 2 * a[i] + b[i] for N-element
+ * integer vectors allocated with cudaMallocManaged.
+ *
+ * Optimize this already-accelerated codebase. Work iteratively,
+ * and use nvprof to support your work.
+ *
+ * Aim to profile `saxpy` (without modifying `N`) running under
+ * 20us.
+ */
+
+/*
+ * Abort with file/line and the CUDA error string if a runtime call fails,
+ * so that failures are reported where they occur.
+ */
+#define CUDA_CHECK(call)                                              \
+    do {                                                              \
+        cudaError_t err_ = (call);                                    \
+        if (err_ != cudaSuccess) {                                    \
+            fprintf(stderr, "CUDA error %s:%d: %s\n",                 \
+                    __FILE__, __LINE__, cudaGetErrorString(err_));    \
+            exit(EXIT_FAILURE);                                       \
+        }                                                             \
+    } while (0)
+
+/*
+ * Element-wise c[i] = 2 * a[i] + b[i] for i in [0, N).
+ *
+ * a, b: input vectors of at least N ints, readable from the device.
+ * c:    output vector of at least N ints, writable from the device.
+ *
+ * Expects a 1D grid of 1D blocks. Each thread starts at its global index
+ * and advances by the total number of launched threads, so any launch
+ * configuration covers all N elements.
+ */
+__global__ void saxpy(int * a, int * b, int * c)
+{
+    // Global index is block offset PLUS the thread's position in the block.
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int stride = gridDim.x * blockDim.x;
+
+    for ( int i = tid; i < N; i += stride )
+        c[i] = 2 * a[i] + b[i];
+}
+
+/*
+ * Allocates and initializes the three vectors, runs `saxpy` on the GPU and
+ * prints the first and last 5 results. With a = 2 and b = 1 every printed
+ * value should be 5.
+ */
+int main()
+{
+    // Element type must match the kernel's int* parameters and the %d
+    // format used when printing the results.
+    int *a, *b, *c;
+
+    size_t size = N * sizeof (int); // The total number of bytes per vector
+
+    CUDA_CHECK(cudaMallocManaged(&a, size));
+    CUDA_CHECK(cudaMallocManaged(&b, size));
+    CUDA_CHECK(cudaMallocManaged(&c, size));
+
+    // Initialize memory
+    for( int i = 0; i < N; ++i )
+    {
+        a[i] = 2;
+        b[i] = 1;
+        c[i] = 0;
+    }
+
+    int threads_per_block = 128;
+    // Ceiling division: enough blocks to give every element its own thread.
+    int number_of_blocks = (N + threads_per_block - 1) / threads_per_block;
+
+    saxpy <<< number_of_blocks, threads_per_block >>> ( a, b, c );
+    CUDA_CHECK(cudaGetLastError());      // Launch-configuration errors
+    // Kernel launches are asynchronous: wait for completion (and surface
+    // execution errors) before the host reads c.
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // Print out the first and last 5 values of c for a quality check
+    for( int i = 0; i < 5; ++i )
+        printf("c[%d] = %d, ", i, c[i]);
+    printf ("\n");
+    for( int i = N-5; i < N; ++i )
+        printf("c[%d] = %d, ", i, c[i]);
+    printf ("\n");
+
+    CUDA_CHECK(cudaFree( a ));
+    CUDA_CHECK(cudaFree( b ));
+    CUDA_CHECK(cudaFree( c ));
+
+    return 0;
+}