diff --git a/.DS_Store b/.DS_Store
index 707c39b8ba9621345fc9797d7071c7eea90abfc7..22ae81ec5f55ce735351902b11b164d295b0e603 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/Makefile b/Makefile
index 2d663dc0d3f81bb59d4d4e4cc7b6f9d4a39ab6c3..7de4f8a509fde54113d2f73210d01171b91747ce 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 all:
-	nvcc src/radix_sort_cuda.cu -o radix_sort_cuda
+	nvcc src/radix_sort_par.cu -o radix_sort_cuda
 
 clean:
 	rm -rf radix_sort_cuda
diff --git a/src/.DS_Store b/src/.DS_Store
index 4912157825bb3ffe363aeada5f93b27d3b9d50db..09bf7c2bc24e9930b0b02dbae5fbbe3a9d080167 100644
Binary files a/src/.DS_Store and b/src/.DS_Store differ
diff --git a/src/rad2 b/src/rad2
deleted file mode 100755
index db2a378010b194cd994b309489505efa5fa44e96..0000000000000000000000000000000000000000
Binary files a/src/rad2 and /dev/null differ
diff --git a/src/radix_par b/src/radix_par
deleted file mode 100755
index 7a027978875c8c9a404b81fda39d4867a1287257..0000000000000000000000000000000000000000
Binary files a/src/radix_par and /dev/null differ
diff --git a/src/radix_sort_cuda b/src/radix_sort_cuda
deleted file mode 100755
index b7f9a8e77e7da0869d7203a4bdc89c10cddf3768..0000000000000000000000000000000000000000
Binary files a/src/radix_sort_cuda and /dev/null differ
diff --git a/src/radix_sort_par.cu b/src/radix_sort_par.cu
index 1718b956ea1a3e16802afbbd5874b95540d76ec3..7eb0da76dfa202b87fea945bbd537454c691ba2f 100755
--- a/src/radix_sort_par.cu
+++ b/src/radix_sort_par.cu
@@ -5,24 +5,30 @@
 using namespace std; 
   
 // A utility function to get maximum value in arr[] 
-__device__ int getMax(int arr[], int n) 
+int getMax(int arr[], int n) 
 { 
     int mx = arr[0]; 
     for (int i = 1; i < n; i++) 
         if (arr[i] > mx) 
             mx = arr[i]; 
     return mx; 
+}
+
+__global__ void copyArray(int *arr, int *output, int n)
+{
+    // One thread per element: arr[] is the destination, output[] the source.
+    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
+    if (idx < n) // the last block may hold more threads than elements remain
+        arr[idx] = output[idx];
 } 
   
 // A function to do counting sort of arr[] according to 
 // the digit represented by exp. 
-__device__ void countSort(int arr[], int n, int exp) 
+void countSort(int arr[], int n, int exp) 
 { 
-//	int index = threadIdx.x;
-//    int stride = blockDim.x;
-
-    int *output= (int*)malloc(sizeof(int)*n); // output array 
-    int i, count[10] = {0}; 
+    int output[n]; // output array 
+    long i;
+    int count[10] = {0}; 
   
     // Store count of occurrences in count[] 
     for (i = 0; i < n; i++) 
@@ -42,42 +48,59 @@ __device__ void countSort(int arr[], int n, int exp)
   
     // Copy the output array to arr[], so that arr[] now 
     // contains sorted numbers according to current digit 
-    for (i = 0; i < n; i++) 
-        arr[i] = output[i]; 
+    //for (i = 0; i < n; i++) 
+        //arr[i] = output[i];
+
+    // Device staging buffers: d_output receives the host-built output[] and
+    // copyArray moves it into d_arr, which is then read back into arr[].
+    int *d_arr = NULL;
+    int *d_output = NULL;
+    const size_t bytes = (size_t)n * sizeof(int);
+    cudaError_t err = cudaMalloc(&d_arr, bytes);
+    if (err == cudaSuccess)
+        err = cudaMalloc(&d_output, bytes);
+    if (err == cudaSuccess)
+        err = cudaMemcpy(d_output, output, bytes, cudaMemcpyHostToDevice);
+
+    if (n > 0 && err == cudaSuccess) {
+        // Number of threads in each thread block
+        const int blockSize = 256;
+        // Number of thread blocks in grid: integer ceil-div, because
+        // (float)n loses precision once n exceeds 2^24.
+        const int gridSize = (n + blockSize - 1) / blockSize;
+
+        copyArray<<<gridSize,blockSize>>>(d_arr,d_output,n);
+        err = cudaGetLastError(); // launch-configuration errors
+    }
+    if (err == cudaSuccess) // blocking copy also surfaces kernel faults
+        err = cudaMemcpy(arr, d_arr, bytes, cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess) {
+        // arr[] would be stale or garbage here, so stop instead of sorting on.
+        fprintf(stderr, "countSort: CUDA error: %s\n", cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+    cudaFree(d_arr);
+    cudaFree(d_output);
+
 } 
   
 // The main function to that sorts arr[] of size n using  
-// Radix Sort
-
-__global__ void radixsort(int *arr, int n) 
+// Radix Sort 
+void radixsort(int arr[], int n) 
 { 
-    //int *d_arr;
-
     // Find the maximum number to know number of digits 
     int m = getMax(arr, n); 
   
     // Do counting sort for every digit. Note that instead 
     // of passing digit number, exp is passed. exp is 10^i 
     // where i is current digit number 
-
-    // allocate device memory
-    //cudaMalloc((void**)&d_arr,sizeof(int)*n);
-
-    //cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
     for (int exp = 1; m/exp > 0; exp *= 10) 
-        //countSort<<<1,1024>>>(d_arr, n, exp); 
-        countSort(arr,n,exp);
-        //transfer data back to host memory
-        //cudaMemcpy(arr, d_arr, sizeof(int)*n, cudaMemcpyDeviceToHost);
-        
-    //deallocate device memory
-    //cudaFree(d_arr);
-    __syncthreads();
-
+        countSort(arr, n, exp); 
 } 
   
 // A utility function to print an array 
-void print(int arr[], int n){ 
+void print(int arr[], int n) 
+{ 
     for (int i = 0; i < n; i++) 
         cout << arr[i] << "\n"; 
 } 
@@ -109,18 +132,12 @@ int main(int argc, char *argv[])
 { 
     timespec start, stop;
     
-    int *d_arr;
     int n;
     n= atoi(argv[1]);
     int arr[n];
     rng(arr,n);
-
-    cudaMalloc((void**)&d_arr,sizeof(int)*n);
-    cudaMemcpy(d_arr, arr, sizeof(int)*n,cudaMemcpyHostToDevice);
-
     clock_gettime(CLOCK_REALTIME, &start);
-    radixsort<<<1,32>>>(d_arr,n);
-    cudaMemcpy(arr, d_arr, sizeof(int)*n,cudaMemcpyDeviceToHost);
+    radixsort(arr,n);
     clock_gettime(CLOCK_REALTIME, &stop);
     print(arr,n);
     
@@ -128,9 +145,5 @@ int main(int argc, char *argv[])
     long time = duration.tv_sec * 1000000 + duration.tv_nsec/1000;
     printf("\n%d.%09d s\n", duration.tv_sec, duration.tv_nsec);
     
-    //deallocate host memory
-    cudaFree(d_arr);
-    
     return 0; 
 } 
-
diff --git a/src/radix_sort_seq b/src/radix_sort_seq
deleted file mode 100755
index 9ad6573e7dcc4676e1900d6c7c830bca4fc5d4eb..0000000000000000000000000000000000000000
Binary files a/src/radix_sort_seq and /dev/null differ