diff --git a/cuda/.vscode/c_cpp_properties.json b/cuda/.vscode/c_cpp_properties.json
new file mode 100644
index 0000000000000000000000000000000000000000..862aed8796dcd7cc17c0264d58b81e6348556d50
--- /dev/null
+++ b/cuda/.vscode/c_cpp_properties.json
@@ -0,0 +1,16 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/**"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/bin/gcc",
+            "cStandard": "gnu17",
+            "cppStandard": "gnu++14",
+            "intelliSenseMode": "linux-gcc-x64"
+        }
+    ],
+    "version": 4
+}
\ No newline at end of file
diff --git a/cuda/.vscode/settings.json b/cuda/.vscode/settings.json
new file mode 100644
index 0000000000000000000000000000000000000000..43f52f3f933b2c66b0fdbbf4b6c93bc6d1af6d7e
--- /dev/null
+++ b/cuda/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "files.associations": {
+        "iostream": "cpp",
+        "chrono": "cpp",
+        "ostream": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/cuda/bin/ciphertext.o b/cuda/bin/ciphertext.o
new file mode 100644
index 0000000000000000000000000000000000000000..5189234927a1d5bb60dcd78fdb25d7e6166b3cfd
Binary files /dev/null and b/cuda/bin/ciphertext.o differ
diff --git a/cuda/bin/ckks.o b/cuda/bin/ckks.o
new file mode 100644
index 0000000000000000000000000000000000000000..d9489959a078a0d0bb0cb622434126ffc14b2fe6
Binary files /dev/null and b/cuda/bin/ckks.o differ
diff --git a/cuda/bin/evalkey.o b/cuda/bin/evalkey.o
new file mode 100644
index 0000000000000000000000000000000000000000..a04a147bcd6515ab49919a770228ea700f2ecc86
Binary files /dev/null and b/cuda/bin/evalkey.o differ
diff --git a/cuda/bin/parallel.o b/cuda/bin/parallel.o
new file mode 100644
index 0000000000000000000000000000000000000000..0901e941e6dfc2ee2d3e62e149e0a842c6be92a6
Binary files /dev/null and b/cuda/bin/parallel.o differ
diff --git a/cuda/bin/polynomial.o b/cuda/bin/polynomial.o
new file mode 100644
index 0000000000000000000000000000000000000000..e4cf05596eee04751b84815c8cce5ef7ec5427eb
Binary files /dev/null and b/cuda/bin/polynomial.o differ
diff --git a/cuda/bin/pubkey.o b/cuda/bin/pubkey.o
new file mode 100644
index 0000000000000000000000000000000000000000..ec6254a2069a800328fa0d7453c6c0a4d97c9bdf
Binary files /dev/null and b/cuda/bin/pubkey.o differ
diff --git a/cuda/bin/seckey.o b/cuda/bin/seckey.o
new file mode 100644
index 0000000000000000000000000000000000000000..a1c6ec9d4839487bf2929a22e8413a9ef22409e9
Binary files /dev/null and b/cuda/bin/seckey.o differ
diff --git a/cuda/ciphertext.cu b/cuda/ciphertext.cu
new file mode 100644
index 0000000000000000000000000000000000000000..80460e0d1fb6cfcf3c3c27c59460070ac9374a2e
--- /dev/null
+++ b/cuda/ciphertext.cu
@@ -0,0 +1,21 @@
+#include <complex>
+#include <cmath>
+#include <random>
+#include "ciphertext.h"
+#include <iostream>
+using namespace std;
+
+
+Ciphertext::Ciphertext(Polynomial _c0, Polynomial _c1){
+    c0 = Polynomial(_c0.degree);
+    c0.setCoeffs(_c0.coeffs);
+    c1 = Polynomial(_c1.degree);
+    c1.setCoeffs(_c1.coeffs);
+}
+
+Ciphertext Ciphertext:: operator + (Ciphertext const &obj){
+    Polynomial out1 = c0 + obj.c0;
+    Polynomial out2 = c1 + obj.c1;
+    Ciphertext out(out1, out2);
+    return out;
+}
\ No newline at end of file
diff --git a/cuda/ciphertext.h b/cuda/ciphertext.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe752a9ce9de7d360d273a33ccda6773cb73c80c
--- /dev/null
+++ b/cuda/ciphertext.h
@@ -0,0 +1,15 @@
+#include "polynomial.h"
+
+#ifndef CIPHERTEXT_H
+#define CIPHERTEXT_H
+
+class Ciphertext {
+  public:
+    Polynomial c0;
+    Polynomial c1;
+
+    Ciphertext(Polynomial _c0, Polynomial _c1);
+    Ciphertext operator + (Ciphertext const &obj);
+};
+
+#endif
\ No newline at end of file
diff --git a/cuda/ckks.cu b/cuda/ckks.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2570db100f31ada490311c8e285abe020f4469d6
--- /dev/null
+++ b/cuda/ckks.cu
@@ -0,0 +1,58 @@
+#include "ckks.h"
+#include <random>
+
+using namespace std;
+
+CKKS::CKKS(int N, PubKey _pk, EvalKey _evk, SecKey _sk){
+  pk = _pk;
+  sk = _sk;
+  evk = _evk;
+  deg = N;
+  ql = q0;
+  for (int i=0; i<6; i++){
+    ql *= pl[i];
+  }
+}
+
+Ciphertext CKKS::encrypt(Polynomial pt){
+  //Polynomial e = genE(pt.degree, 1.0);
+  Polynomial c0 = (pt) + pk.b;
+  Polynomial c1 = pk.a;
+
+  Ciphertext ct(c0.modCoeff(ql), c1.modCoeff(ql));
+  return ct;
+}
+
+Polynomial CKKS::decrypt(Ciphertext ct){
+  Polynomial pt = ct.c0 + (ct.c1 * sk.s);
+  return pt.modCoeff(ql);
+}
+
+Ciphertext CKKS::add(Ciphertext ct1, Ciphertext ct2){
+  Ciphertext out = ct1 + ct2;
+  return out;
+}
+
+Ciphertext CKKS::mult(Ciphertext ct1, Ciphertext ct2){
+  // Multiplication
+  Polynomial d1 = (ct1.c0 * ct2.c0).modCoeff(ql);
+  Polynomial d2 = ((ct1.c0 * ct2.c1) + (ct2.c0 * ct1.c1)).modCoeff(ql); 
+  Polynomial d3 = (ct1.c1 * ct2.c1).modCoeff(ql);
+
+  // Relin 
+  Polynomial d3_0 = (d3 * evk.b).scaleRoundCoeff(1.0/1000.0);
+  Polynomial d3_1 = (d3 * evk.a).scaleRoundCoeff(1.0/1000.0);
+
+  Polynomial outC0 = d1.modCoeff(ql) + d3_0.modCoeff(ql);
+  Polynomial outC1 = d2.modCoeff(ql) + d3_1.modCoeff(ql);
+
+  // Rescale
+  ql = (double)ql / (double)pl[level-1];
+  Polynomial c0 = outC0.scaleRoundCoeff(1.0/(double)pl[level-1]);
+  Polynomial c1 = outC1.scaleRoundCoeff(1.0/(double)pl[level-1]);
+  
+  level -= 1;
+  
+  Ciphertext out(c0.modCoeff(ql), c1.modCoeff(ql));
+  return out;
+}
\ No newline at end of file
diff --git a/cuda/ckks.h b/cuda/ckks.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4ae662e15d8acff8b45cbf98d2f7cd24126286c
--- /dev/null
+++ b/cuda/ckks.h
@@ -0,0 +1,29 @@
+#include "polynomial.h"
+#include "pubkey.h"
+#include "seckey.h"
+#include "evalkey.h"
+#include "ciphertext.h"
+
+#ifndef CKKS_H
+#define CKKS_H
+
+class CKKS {
+  public:
+    int deg;
+    int level = 6;
+    int q0 = 67;
+    int pl[6] = {61, 67, 71, 73, 79, 59};
+    int64_t ql;
+    PubKey pk;
+    SecKey sk;
+    EvalKey evk;
+
+    CKKS(int N, PubKey _pk, EvalKey _evk, SecKey _sk);
+    Polynomial fastMult(Polynomial p1, Polynomial p2);
+    Ciphertext add(Ciphertext ct1, Ciphertext ct2);
+    Ciphertext mult(Ciphertext ct1, Ciphertext ct2);
+    Ciphertext encrypt(Polynomial pt);
+    Polynomial decrypt(Ciphertext ct);
+};
+
+#endif
\ No newline at end of file
diff --git a/cuda/compile.sh b/cuda/compile.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5ab0cdbcef0bc4c0fc02366bb512b7fc347a5383
--- /dev/null
+++ b/cuda/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+nvcc -c -o ./bin/ciphertext.o ciphertext.cu
+nvcc -c -o ./bin/ckks.o ckks.cu
+nvcc -c -o ./bin/parallel.o parallel.cu
+nvcc -c -o ./bin/polynomial.o polynomial.cu
+nvcc -c -o ./bin/pubkey.o pubkey.cu
+nvcc -c -o ./bin/seckey.o seckey.cu
+nvcc -c -o ./bin/evalkey.o evalkey.cu
\ No newline at end of file
diff --git a/cuda/evalkey.cu b/cuda/evalkey.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6c886a601b73311daf883854f9d3cc8d5b01d2a4
--- /dev/null
+++ b/cuda/evalkey.cu
@@ -0,0 +1,54 @@
+#include<iostream>
+#include <complex>
+#include <cmath>
+#include <random>
+#include "evalkey.h"
+
+using namespace std;
+
+EvalKey::EvalKey(Polynomial _s, int degree, int64_t q){
+    a = Polynomial(degree);
+    b = Polynomial(degree);
+    p = 1000;
+
+    generateA(4, 1000);
+    computeB(q, _s);
+}
+
+Polynomial EvalKey::genE(int degree, double var){
+  double errCoeffs[degree];
+  
+  random_device rd;
+  mt19937 gen(rd());
+  normal_distribution<double> dis(0, var);
+  
+  for(int i=0; i<degree; i++){
+    errCoeffs[i] = (double) round(dis(gen));
+  }
+
+  Polynomial err(degree, errCoeffs);
+  return err;
+}
+
+void EvalKey::generateA(int degree, int64_t q){
+    int64_t half_q = (p*q)/2;
+    random_device rd;
+    mt19937 gen(rd());
+    uniform_int_distribution<int64_t> dis(-half_q, 0);
+    double a_coeffs[degree];
+    
+    for (int i=0; i<degree; i++){
+        a_coeffs[i] = (double) dis(gen);
+    }
+    a.setCoeffs(a_coeffs);
+}
+
+void EvalKey::computeB(int q, Polynomial s){
+    Polynomial err = genE(a.degree, 1.0);
+    Polynomial ev = (s * s).scaleCoeff(p);
+    Polynomial temp = ((a.scaleCoeff(-1.0)) * s) + ev + err;
+    Polynomial res = temp.modCoeff(q * p);
+
+    b.setDegree(res.degree);
+    b.setCoeffs(res.coeffs);
+}
\ No newline at end of file
diff --git a/cuda/evalkey.h b/cuda/evalkey.h
new file mode 100644
index 0000000000000000000000000000000000000000..d05eaa25eb22cf63e528b3065d805fceb5a5e96c
--- /dev/null
+++ b/cuda/evalkey.h
@@ -0,0 +1,19 @@
+#include "polynomial.h"
+
+#ifndef EVALKEY_H
+#define EVALKEY_H
+
+class EvalKey{
+  public:
+    Polynomial a;
+    Polynomial b;
+    int64_t p;
+
+    EvalKey() = default;
+    EvalKey(Polynomial _s, int degree, int64_t q);
+    Polynomial genE(int degree, double var);
+    void generateA(int degree, int64_t q);
+    void computeB(int q, Polynomial s);
+};
+
+#endif
\ No newline at end of file
diff --git a/cuda/ntt.cu b/cuda/ntt.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1d2ee8159869cedcc9ad7fe19a2df1a4a5c030a2
--- /dev/null
+++ b/cuda/ntt.cu
@@ -0,0 +1,154 @@
+#include "ntt.h"
+
+int64_t NTT::modExp(int64_t base, int64_t power, int64_t mod){
+    int64_t res = 1;
+    int64_t p = power;
+    int64_t b = base % mod;
+    while (p > 0){
+        if (p & 1){
+            res = (res * b) % mod;
+        }
+        b = (b*b) % mod;
+        p = p >> 1;
+    }
+
+    return res;
+}
+
+int64_t NTT::modInv(int64_t x, int64_t mod){
+    int64_t t = 0;
+    int64_t t1 = 1;
+    int64_t r = mod;
+    int64_t r1 = x;
+
+    while (r1 != 0){
+        int64_t quot = (int64_t) (r/r1);
+        int64_t temp_t = t;
+        int64_t temp_r = r;
+        t = t1; 
+        t1 = (temp_t - quot * t1);
+        r = r1;
+        r1 = (temp_r % r1);
+    }
+
+    if (t < 0){
+        t = t + mod;
+    }
+    
+    return t;
+}
+
+int NTT::bitLength(int x){
+    int a = x;
+    int len = 0;
+    while (a!=0){
+        a >>= 1;
+        len += 1;
+    }
+    return len;
+}
+
+bool NTT::existSmallerN(int r, int mod, int n){
+    for(int k=2; k<n; k++){
+        if(modExp(r, k, mod) == 1){
+            return true;
+        }
+    }
+    return false;
+}
+
+int64_t NTT::genNthRoot(int mod, int n){
+    int p = mod - 1;
+    int range = mod - 1 + 1;
+    while (true){
+        int64_t a = rand() % range + 1;
+        int64_t b = modExp(a, p/n, mod);
+        if (!existSmallerN(b, mod, n)){
+            return b;
+        }
+    }
+}
+
+void NTT::reverse(vector<int64_t> &in, int bitLen){
+    for (int i=0; i<size(in); i++){
+        int64_t revN = 0;
+        for(int j=0; j<bitLen ; j++){
+            if ((i >> j) & 1){
+                revN |= 1 << (bitLen-1-j);
+            }
+        }
+        int64_t coeff = in[i];
+
+        if (revN > i){
+            coeff ^= in[revN];
+            in[revN] ^= coeff;
+            coeff ^= in[revN];
+            in[i] = coeff;
+        }
+    }
+}
+
+void NTT::_ntt(vector<int64_t> &in, int64_t w){
+    int N = size(in);
+    int nBit = bitLength(N) - 1;
+    reverse(in, nBit);
+
+    vector<int> points(N, 0);
+    for(int i=0; i<nBit; i++){
+        vector<int64_t> p1;
+        vector<int64_t> p2;
+        for(int j=0; j<N/2; j++){
+            int shift = nBit - i - 1;
+            int64_t P = (j >> shift) << shift;
+            int64_t wP = modExp(w, P, M);
+            int64_t odd = in[2*j+1] * wP;
+            int64_t even = in[2*j];
+            p1.push_back((even + odd) % M);
+            p2.push_back((even - odd) % M);
+        }
+
+        for(int k=0; k<N/2; k++){
+            points[k] = p1[k];
+            points[k+N/2] = p2[k];
+        }
+
+        if(1!=nBit){
+            for(int k=0; k<N; k++){
+                in[k] = points[k];
+            }
+        }
+    }
+
+    for(int k=0; k<N; k++){
+        in[k] = points[k];
+        if(in[k] < 0){
+            in[k] += M;
+        }
+    }
+}
+
+vector<int64_t> NTT::ntt(Polynomial in, int degree, int64_t w){
+    vector<int64_t> out(degree,0);
+    for(int i=0; i<in.degree; i++){
+        out[i] = (int64_t) in.coeffs[i];
+    }
+    _ntt(out, w);
+    return out;
+}
+
+Polynomial NTT::intt(vector<int64_t> &in, int w){
+    int N = size(in);
+    Polynomial pOut(N);
+    double* coeff = (double*) malloc(N*sizeof(double));
+
+    int64_t wInv = modInv(w, M);
+    int64_t nInv = modInv(N, M);
+
+    _ntt(in, wInv);
+    for(int i=0; i<size(in); i++){
+        coeff[i] = (in[i] * nInv) % M;
+    }
+    pOut.setCoeffs(coeff);
+    
+    return pOut;
+}
\ No newline at end of file
diff --git a/cuda/ntt.h b/cuda/ntt.h
new file mode 100644
index 0000000000000000000000000000000000000000..5078f1f82c2edb248ad84898dd70fc22fb95d45c
--- /dev/null
+++ b/cuda/ntt.h
@@ -0,0 +1,27 @@
+#include "polynomial.h"
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+#define size(c) ((int)(c).size())
+
+#ifndef NTT_H
+#define NTT_H
+
+class NTT{
+    public:
+        int M = 2013265921;
+        int64_t modExp(int64_t base, int64_t power, int64_t mod);
+        int64_t modInv(int64_t x, int64_t mod);
+        int bitLength(int x);
+        void reverse(vector<int64_t> &in, int bitLen);
+        bool existSmallerN(int r, int mod, int n);
+        int64_t genNthRoot(int mod, int n);
+        void _ntt(vector<int64_t> &in, int64_t w);
+        vector<int64_t> ntt(Polynomial in, int degree, int64_t w);
+        Polynomial intt(vector<int64_t> &in, int w);
+};
+
+
+#endif
\ No newline at end of file
diff --git a/cuda/parallel b/cuda/parallel
new file mode 100755
index 0000000000000000000000000000000000000000..05253c6f0005e9a40ed5142e70b69c188422d14f
Binary files /dev/null and b/cuda/parallel differ
diff --git a/cuda/parallel.cu b/cuda/parallel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..29d587b9e64c28b0b32d26a4729b96d8ecb586f0
--- /dev/null
+++ b/cuda/parallel.cu
@@ -0,0 +1,179 @@
+#include <iostream>
+#include <complex>
+#include <cuComplex.h>
+#include <cmath>
+#include <cuda_runtime.h>
+#include <curand.h>
+#include <chrono>
+#include <curand_kernel.h>
+#include "cublas_v2.h"
+#include "parallel.h"
+
+using namespace std;
+
+using namespace std::chrono;
+
+
+__device__ cuDoubleComplex power(cuDoubleComplex base, int power){
+  cuDoubleComplex res = make_cuDoubleComplex(1.0, 0.0);
+  while(power > 0){
+    if (power & 1){
+      res = cuCmul(res, base);
+    }
+
+    power = power >> 1;
+    base = cuCmul(base, base);
+  }
+  return res;
+}
+
+__global__ void initVandermonde(cuDoubleComplex *vand, int M, cuDoubleComplex xi){
+  int ROW = blockIdx.y*blockDim.y+threadIdx.y;
+  int COL = blockIdx.x*blockDim.x+threadIdx.x;
+  int exp = (2*ROW+1)*COL;
+  vand[ROW * M + COL] = power(xi, exp);
+  if(ROW == 0 && COL == 0) printf("test = %f\n", cuCreal(power(xi, exp)));
+}
+
+__global__ void transpose(cuDoubleComplex *dest, cuDoubleComplex *source, int N){
+  int ROW = blockIdx.y*blockDim.y+threadIdx.y;
+  int COL = blockIdx.x*blockDim.x+threadIdx.x;
+
+  if (ROW < N && COL < N){
+    int idxIn = ROW * N + COL;
+    int idxOut = COL * N + ROW;
+    dest[idxOut] = source[idxIn];
+  }
+}
+
+__global__ void expScale(cuDoubleComplex *target, int N, double scale){
+  int idx = threadIdx.x + blockIdx.x*blockDim.x;
+  target[idx].x = target[idx].x * scale;
+  target[idx].y = target[idx].y * scale;
+  target[N-idx-1] = cuConj(target[idx]);
+}
+
+__device__ cuDoubleComplex vdotc(cuDoubleComplex *a, cuDoubleComplex *b, int inca, int incb, int N){
+  cuDoubleComplex res = make_cuDoubleComplex(0.0, 0.0);
+  for(int i=0; i<N; i++){
+    res = cuCadd(res, cuCmul(a[i+inca], cuConj(b[i+incb])));
+  }
+  return res;
+}
+
+__global__ void computeCoord(double *res, cuDoubleComplex *target, cuDoubleComplex *vand, int M){
+  int idx = threadIdx.x + blockIdx.x*blockDim.x;
+  double temp = cuCreal(vdotc(vand, target, idx*M, 0, M))/cuCreal(vdotc(vand, vand, idx*M, idx*M, M));
+  res[idx] = temp;
+}
+
+__global__ void setup_kernel(curandState *state){
+  int idx = threadIdx.x+blockDim.x*blockIdx.x;
+  curand_init(1234, idx, 0, &state[idx]);
+}
+
+__global__ void coordWRR(double *target){
+  int idx = threadIdx.x + blockIdx.x*blockDim.x;
+  target[idx] = (double) round(target[idx]);
+}
+
+__global__ void unscale(double *target, double scale){
+  int idx = threadIdx.x + blockIdx.x*blockDim.x;
+  target[idx] = target[idx]/scale;
+}
+
+
+__global__ void sigma(double *pol, cuDoubleComplex *vand, int M, cuDoubleComplex *target){
+  int idx = threadIdx.x + blockIdx.x*blockDim.x;
+  target[idx] = make_cuDoubleComplex(0.0, 0.0);
+  for(int i=0; i<M; i++){
+    target[idx].x += vand[idx*M+i].x * pol[i];
+    target[idx].y += vand[idx*M+i].y * pol[i];
+  }
+}
+
+Encoder::Encoder(int Min, double s){
+  M = Min;
+  scale = s;
+  root = exp((2*M_PI/M) * J);
+  cuDoubleComplex xi = make_cuDoubleComplex(real(root), imag(root));
+
+  int N = M/2;
+  cudaMalloc((void**)&vand, N*N*sizeof(cuDoubleComplex));
+  dim3 threadsPerBlock(N, N);
+  dim3 blocksPerGrid(1, 1);
+  
+  if (N*N > 512){
+    threadsPerBlock.x = 16;
+    threadsPerBlock.y = 16;
+    blocksPerGrid.x = ceil(double(N)/double(threadsPerBlock.x));
+    blocksPerGrid.y = ceil(double(N)/double(threadsPerBlock.y));
+  }
+
+  initVandermonde<<<blocksPerGrid, threadsPerBlock>>>(vand, M/2, xi);
+  cudaDeviceSynchronize();
+
+  dcomplex *test = (dcomplex *) malloc(64*64*sizeof(dcomplex));
+  cudaMemcpy(test, vand, 64*64*sizeof(dcomplex), cudaMemcpyDeviceToHost);
+
+  cudaMalloc((void**)&sigmaRBasis, N*N*sizeof(cuDoubleComplex));
+  transpose<<<blocksPerGrid, threadsPerBlock>>>(sigmaRBasis, vand, M/2);
+}
+
+Polynomial Encoder::encode(dcomparr input){
+  cuDoubleComplex *target;
+  size_t size = M/2;
+  Polynomial out(M/2);
+
+  cudaMalloc((void**) &target, size * sizeof(cuDoubleComplex));
+  cudaMemcpy(target, input.arr, M/4 * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
+
+  int block = 1;
+  int threadPerBlock = M/2;
+  if(threadPerBlock > 1024){
+    threadPerBlock = 1024;
+    block = ceil((double)(M/2)/1024.0);
+  }
+
+  auto start = high_resolution_clock::now();
+  expScale<<<block,threadPerBlock/2>>>(target, M/2, scale);
+
+  double *res;
+  cudaMalloc((void**)&res, M/2*sizeof(double));
+  computeCoord<<<block,threadPerBlock>>>(res, target, sigmaRBasis, M/2);
+
+  coordWRR<<<block,threadPerBlock>>>(res);
+
+  out.setCoeffs(res);
+  auto stop = high_resolution_clock::now();
+  auto duration = duration_cast<microseconds>(stop - start);
+  cout << "Encoding: " << duration.count() << endl;
+  return out;
+}
+
+dcomparr Encoder::decode(Polynomial pt){
+  auto start = high_resolution_clock::now();
+  
+  int block = 1;
+  int threadPerBlock = M/2;
+  if(threadPerBlock > 1024){
+    threadPerBlock = 1024;
+    block = ceil((double)(M/2)/1024.0);
+  }
+  
+  unscale<<<block,threadPerBlock>>>(pt.coeffs, scale);
+
+  cuDoubleComplex *target;
+  cudaMalloc((void**)&target, M/4 * sizeof(cuDoubleComplex));
+  sigma<<<block,threadPerBlock/2>>>(pt.coeffs, vand, M/2, target);
+
+  dcomparr output(M/4);
+  size_t size_out = (M/4)*sizeof(dcomplex);
+  cudaMemcpy(output.arr, target, size_out, cudaMemcpyDeviceToHost);
+
+  auto stop = high_resolution_clock::now();
+  auto duration = duration_cast<microseconds>(stop - start);
+  cout << "Decoding: " << duration.count() << endl;
+  return output;
+}
+
diff --git a/cuda/parallel.h b/cuda/parallel.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ef3eb634df2a134e3bca97eb2ca17a16373fd1b
--- /dev/null
+++ b/cuda/parallel.h
@@ -0,0 +1,36 @@
+#include <complex>
+#include <cuComplex.h>
+#include "polynomial.h"
+
+using namespace std;
+
+#define J dcomplex(0.0,1.0)
+#define I make_cuDoubleComplex(0.0, 1.0)
+#define SIZE 10000
+
+typedef complex<double> dcomplex;
+
+struct dcomparr {
+  dcomplex* arr;
+  dcomparr(int size){
+    arr = (dcomplex*) malloc(size*sizeof(dcomplex));
+  }
+};
+
+#ifndef PARALLEL_H
+#define PARALLEL_H
+
+class Encoder{
+  public:
+    int M;
+    double scale;
+    dcomplex root;
+    cuDoubleComplex *vand;
+    cuDoubleComplex *sigmaRBasis;
+
+    Encoder(int M, double scale);
+    Polynomial encode(dcomparr input);
+    dcomparr decode(Polynomial pt);
+};
+
+#endif
\ No newline at end of file
diff --git a/cuda/polynomial.cu b/cuda/polynomial.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8f5944cddeb19130950fb94caabe2bb79b0bd82b
--- /dev/null
+++ b/cuda/polynomial.cu
@@ -0,0 +1,127 @@
+#include <complex>
+#include <cmath>
+#include <random>
+#include <algorithm>
+#include <string.h>
+#include <iostream>
+#include "polynomial.h"
+
+using namespace std;
+
+
+__global__ void polyAdd(double* a, int aDeg, double* b, int bDeg, double *res){
+    int idx = threadIdx.x;
+    res[idx] = 0.0;
+    if(idx < aDeg) res[idx] += a[idx];
+    if(idx < bDeg) res[idx] += b[idx];
+}
+
+__global__ void polyMod(double *res, double *in, int mod){
+    int idx = threadIdx.x;
+    res[idx] = (double) ((int) in[idx] % mod);
+}
+
+__global__ void polyScale(double *res, double *in, double scale){
+    int idx = threadIdx.x;
+    res[idx] = in[idx] * scale;
+}
+
+__global__ void polyRoundScale(double *res, double *in, double scale){
+    int idx = threadIdx.x;
+    res[idx] = (double) round(in[idx] * scale);
+}
+
+__global__ void polyMult(double *res, double *a, double *b, int deg){
+    int idx = threadIdx.x;
+    res[idx] = 0;
+    for(int i=0; i<deg; i++){
+        for(int j=0; j<deg; j++){
+            if(j+i == idx){
+                res[idx] += a[i] * b[j];
+            }
+        }
+    }
+}
+
+Polynomial::Polynomial(int deg){
+    degree = deg;
+    cudaMalloc((void**)&coeffs, degree*sizeof(double));
+}
+
+Polynomial::Polynomial(int deg, double* coeff){
+    degree = deg;
+    cudaMalloc((void**)&coeffs, degree*sizeof(double));
+    cudaMemcpy(coeffs, coeff, degree*sizeof(double), cudaMemcpyHostToDevice);
+}
+
+void Polynomial::setCoeffs(double *coeff){
+    cudaMemcpy(coeffs, coeff, degree*sizeof(double), cudaMemcpyDeviceToDevice);
+}
+
+Polynomial Polynomial::operator + (Polynomial const &obj){
+    int deg = max(obj.degree, degree);
+    double* resCoeff;
+    cudaMalloc((void**)&resCoeff, deg*sizeof(double));
+    polyAdd<<<1,deg>>>(coeffs, degree, obj.coeffs, obj.degree, resCoeff);
+    
+    Polynomial out(deg);
+    out.setCoeffs(resCoeff);
+    return out;
+}
+
+Polynomial Polynomial::operator * (Polynomial const &obj){
+    int deg = obj.degree + degree;
+    double *resCoeff;
+    cudaMalloc((void**)&resCoeff, deg*sizeof(double));
+    polyMult<<<1, deg>>>(resCoeff, coeffs, obj.coeffs, deg);
+    Polynomial out(deg);
+    out.setCoeffs(resCoeff);
+    return out;
+}
+
+void Polynomial::setDegree(int deg){
+    degree = deg;
+}
+
+Polynomial Polynomial::modCoeff(int q){
+    Polynomial out(degree);
+    double *resCoeff;
+    cudaMalloc((void**)&resCoeff, degree*sizeof(double));
+    polyMod<<<1, degree>>>(resCoeff, coeffs, q);
+    out.setCoeffs(resCoeff);
+    return out;
+}
+
+Polynomial Polynomial::scaleCoeff(double scale){
+    Polynomial out(degree);
+    double *resCoeff;
+    cudaMalloc((void**)&resCoeff, degree*sizeof(double));
+    polyScale<<<1, degree>>>(resCoeff, coeffs, scale);
+    out.setCoeffs(resCoeff);
+    return out;
+}
+
+Polynomial Polynomial::scaleRoundCoeff(double scale){
+    Polynomial out(degree);
+    double *resCoeff;
+    cudaMalloc((void**)&resCoeff, degree*sizeof(double));
+    polyRoundScale<<<1, degree>>>(resCoeff, coeffs, scale);
+    out.setCoeffs(resCoeff);
+    return out;
+}
+
+void Polynomial::printPol(){
+    double *temp = (double*) malloc(degree * sizeof(double));
+    cudaMemcpy(temp, coeffs, degree * sizeof(double), cudaMemcpyDeviceToHost);
+    for(int i=0; i<degree; i++){
+        cout << temp[i];
+        if(i != 0){
+            cout << " x^" << i;
+        }
+
+        if (i != degree-1){
+            cout << " + ";
+        }
+    }
+    cout << endl;
+}
diff --git a/cuda/polynomial.h b/cuda/polynomial.h
new file mode 100644
index 0000000000000000000000000000000000000000..98f3882683aad99b1c404d547665ea34257bfdb3
--- /dev/null
+++ b/cuda/polynomial.h
@@ -0,0 +1,26 @@
+#include<iostream>
+
+#ifndef POLYNOMIAL_H
+#define POLYNOMIAL_H
+
+class Polynomial {
+  public:
+    int degree;
+    double* coeffs;
+
+    Polynomial() = default;
+    Polynomial(int deg);
+    Polynomial(int deg, double* coeff);
+
+    void setCoeffs(double* coeff);
+    void setDegree(int deg);
+    Polynomial modCoeff(int q);
+    Polynomial scaleCoeff(double scale);
+    Polynomial scaleRoundCoeff(double scale);
+    Polynomial operator + (Polynomial const &obj);
+    Polynomial operator * (Polynomial const &obj);
+    void printPol();
+};
+
+
+#endif
\ No newline at end of file
diff --git a/cuda/pubkey.cu b/cuda/pubkey.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4c1fb7bd951208c6c7abd0a534aac93ac4c298fe
--- /dev/null
+++ b/cuda/pubkey.cu
@@ -0,0 +1,52 @@
+#include<iostream>
+#include <complex>
+#include <cmath>
+#include <random>
+#include "pubkey.h"
+
+using namespace std;
+
+PubKey::PubKey(Polynomial _s, int degree, int64_t q){
+    a = Polynomial(degree);
+    b = Polynomial(degree);
+
+    generateA(4, 100);
+    computeB(q, _s);
+}
+
+Polynomial PubKey::genE(int degree, double var){
+  double errCoeffs[degree];
+  
+  random_device rd;
+  mt19937 gen(rd());
+  normal_distribution<double> dis(0, var);
+  
+  for(int i=0; i<degree; i++){
+    errCoeffs[i] = (double) round(dis(gen));
+  }
+
+  Polynomial err(degree, errCoeffs);
+  return err;
+}
+
+void PubKey::generateA(int degree, int64_t q){
+    int64_t half_q = q/2;
+    random_device rd;
+    mt19937 gen(rd());
+    uniform_int_distribution<int64_t> dis(-half_q, 0);
+    double a_coeffs[degree];
+    
+    for (int i=0; i<degree; i++){
+        a_coeffs[i] = (double) dis(gen);
+    }
+    a.setCoeffs(a_coeffs);
+}
+
+void PubKey::computeB(int q, Polynomial s){
+    Polynomial err = genE(a.degree, 1.0);
+    Polynomial temp = (a.scaleCoeff(-1.0)) * s + err;
+    Polynomial res = temp.modCoeff(q);
+
+    b.setDegree(res.degree);
+    b.setCoeffs(res.coeffs);
+}
diff --git a/cuda/pubkey.h b/cuda/pubkey.h
new file mode 100644
index 0000000000000000000000000000000000000000..0958da4d274f216e977f729d26403a71a1c31020
--- /dev/null
+++ b/cuda/pubkey.h
@@ -0,0 +1,18 @@
+#include "polynomial.h"
+
+#ifndef PUBKEY_H
+#define PUBKEY_H
+
+class PubKey{
+  public:
+    Polynomial a;
+    Polynomial b;
+
+    PubKey() = default;
+    PubKey(Polynomial s, int degree, int64_t q);
+    Polynomial genE(int degree, double var);
+    void generateA(int degree, int64_t q);
+    void computeB(int q, Polynomial s);
+};
+
+#endif
\ No newline at end of file
diff --git a/cuda/run b/cuda/run
new file mode 100755
index 0000000000000000000000000000000000000000..20804cb980d4bb16791febdebd25492dfe56c61e
Binary files /dev/null and b/cuda/run differ
diff --git a/cuda/run.sh b/cuda/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..06ce13796cf74e4ebff1061bb82a4ca50de95c3c
--- /dev/null
+++ b/cuda/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+./compile.sh
+nvcc -c -o test.o test.cu 
+nvcc -o run test.o ./bin/ciphertext.o ./bin/ckks.o ./bin/parallel.o ./bin/polynomial.o ./bin/pubkey.o ./bin/seckey.o ./bin/evalkey.o 
+./run
\ No newline at end of file
diff --git a/cuda/seckey.cu b/cuda/seckey.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d0efd3915d2130c7dc3c2ea2a5065ce2c084e7b3
--- /dev/null
+++ b/cuda/seckey.cu
@@ -0,0 +1,29 @@
+#include <random>
+#include <string>
+#include <algorithm>
+#include <random>
+#include "seckey.h"
+
+using namespace std;
+
+SecKey::SecKey(int deg){
+    double coeff[deg];
+
+    std::random_device rd; 
+    std::mt19937 gen(rd()); 
+    std::uniform_int_distribution<> distr(deg/2, deg);
+
+    int h = distr(gen);
+    string ones(h, '1');
+    string zeros((deg-h), '0');
+    
+    string key = ones + zeros;
+    random_shuffle(key.begin(), key.end());
+
+    for(int i=0; i<deg; i++){
+        char temp = key.at(i);
+        coeff[i] = (double) (atoi(&temp));
+    }
+
+    s = Polynomial(deg, coeff);
+}
\ No newline at end of file
diff --git a/cuda/seckey.h b/cuda/seckey.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fa51cc835de9e1d2986b668fdd8ab3b279136de
--- /dev/null
+++ b/cuda/seckey.h
@@ -0,0 +1,15 @@
+#include "polynomial.h"
+#include <string>
+
+#ifndef SECKEY_H
+#define SECKEY_H
+
+
+class SecKey{
+  public:
+    Polynomial s;
+    SecKey() = default;
+    SecKey(int deg);
+};
+
+#endif
\ No newline at end of file
diff --git a/cuda/test.cu b/cuda/test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c9d4cc1e7e4131a3a989663395bf79409c788b92
--- /dev/null
+++ b/cuda/test.cu
@@ -0,0 +1,170 @@
+#include <iostream>
+#include <chrono>
+
+#include "polynomial.h"
+#include "parallel.h"
+#include "ckks.h"
+#include "pubkey.h"
+#include "seckey.h"
+#include "evalkey.h"
+
+
+using namespace std;
+
+using namespace std::chrono;
+
+dcomparr randomFill(int n){
+    dcomparr out(n);
+    for(int i=0; i<n; i++){
+        out.arr[i] = 1.0 + 1.0*J;
+    }
+    return out;
+}
+
+int main(){
+    // Encoder enc(128, 128.0);
+    // dcomplex a = 1.0 + 1.0 *J; 
+    // dcomparr input;
+    // for(int i=0; i<32; i++){
+    //     input.arr[i] = a;
+    // }
+
+    // Encoder enc(8, 64.0);
+    // dcomplex a1 = 3.0 + 4.0 *J; 
+    // dcomplex a2 = 2.0 - 1.0*J;
+    // dcomparr input(2);
+    // input.arr[0] = a1;
+    // input.arr[1] = a2;
+
+    // Polynomial pt = enc.encode(input);
+    // pt.printPol();
+
+    int q0 = 67;
+    int pl[6] = {61, 67, 71, 73, 79, 59};
+    int64_t ql = q0;
+
+    for (int i=0; i<3; i++){
+        ql *= pl[i];
+    }
+
+    // SecKey sk = SecKey(4);
+    // PubKey pk(sk.s, 4, ql);
+    // EvalKey evk(sk.s, 4, ql);
+
+    // cout << "decode" << endl;
+    // dcomparr out = enc.decode(pt);
+    // for(int i=0; i<2; i++){
+    //     cout << out.arr[i] << " ";
+    // }
+    // cout << endl;
+
+    bool error = false;
+    // for(int i=1; i<5; i++){
+    //     cout << i << endl;
+    //     int N = pow(2, i);
+    //     Encoder enc(N*4, 64.0);
+    //     dcomparr in = randomFill(N);
+    //     Polynomial test = enc.encode(in);
+    //     dcomparr out = enc.decode(test);
+    //     for(int j=0; j<N; j++){
+    //         if (abs(in.arr[i]-out.arr[i]) > 2.0){
+    //             error = true;
+    //             break;
+    //         }
+    //     }
+    //     if (error) break;
+    // }
+
+    // for(int i=1; i<5; i++){
+    //     int N = pow(2, i);
+    //     Encoder enc(N*4, 64.0);
+
+    //     SecKey sk = SecKey(N);
+    //     PubKey pk(sk.s, (N)+1, ql);
+    //     EvalKey evk(sk.s, (N)+1, ql);
+    //     CKKS ckks(N, pk, evk, sk);
+
+    //     dcomparr in = randomFill(N);
+        
+    //     Polynomial test = enc.encode(in);
+    //     Ciphertext ct = ckks.encrypt(test);
+    //     Polynomial testOut = ckks.decrypt(ct);
+
+    //     dcomparr out = enc.decode(testOut);
+    //     for(int j=0; j<N; j++){
+    //         if (abs(in.arr[j]-out.arr[j]) > 2.0){
+    //             error = true;
+    //             break;
+    //         }
+    //     }
+    //     if (error) break;
+    // }
+
+    // for(int i=1; i<5; i++){
+    //     int N = pow(2, i);
+    //     Encoder enc(N*4, 64.0);
+
+    //     SecKey sk = SecKey(N);
+    //     PubKey pk(sk.s, (N)+1, ql);
+    //     EvalKey evk(sk.s, (N)+1, ql);
+    //     CKKS ckks(N, pk, evk, sk);
+
+    //     dcomparr in = randomFill(N);
+        
+    //     Polynomial test = enc.encode(in);
+    //     Ciphertext ct = ckks.encrypt(test);
+    //     Ciphertext ctadd = ckks.add(ct,ct);
+    //     Polynomial testOut = ckks.decrypt(ctadd);
+
+    //     dcomparr out = enc.decode(testOut);
+    //     dcomplex scale(2.0, 0.0);
+    //     for(int j=0; j<N; j++){
+    //         if (abs((scale*in.arr[j])-out.arr[j]) > 2.0){
+    //             error = true;
+    //             break;
+    //         }
+    //     }
+    //     if (error) break;
+    // }
+
+    // for(int i=1; i<5; i++){
+    //     int N = pow(2, i);
+    //     Encoder enc(N*4, 64.0);
+
+    //     SecKey sk = SecKey(N);
+    //     PubKey pk(sk.s, (N)+1, ql);
+    //     EvalKey evk(sk.s, (N)+1, ql);
+    //     CKKS ckks(N, pk, evk, sk);
+
+    //     dcomparr in = randomFill(N);
+        
+    //     Polynomial test = enc.encode(in);
+    //     Ciphertext ct = ckks.encrypt(test);
+    //     Ciphertext ctadd = ckks.mult(ct,ct);
+    //     Polynomial testOut = ckks.decrypt(ctadd);
+
+    //     Polynomial control = (test * test).scaleRoundCoeff(1.0/64.0);
+    //     dcomparr out = enc.decode(testOut);
+    //     dcomparr check = enc.decode(control);
+
+    //     for(int j=0; j<N*2; j++){
+    //         if (abs(check.arr[j]-out.arr[j]) > 2.0){
+    //             error = true;
+    //             break;
+    //         }
+    //     }
+    //     if (error) break;
+    // }
+
+
+    if(error) {
+        cout << "Error" << endl;
+    } else {
+        cout << "Finish" << endl;
+    }
+
+    return 0;
+
+
+
+}
\ No newline at end of file
diff --git a/cuda/test.o b/cuda/test.o
new file mode 100644
index 0000000000000000000000000000000000000000..296bb9efce91360217237fc9e23e287a11053363
Binary files /dev/null and b/cuda/test.o differ
diff --git a/cuda/tester b/cuda/tester
new file mode 100755
index 0000000000000000000000000000000000000000..f7055a3ef821cb040ae63d7f2b5d2311584f23c4
Binary files /dev/null and b/cuda/tester differ