Daniel Mario Reynaldi · Daniel Mario Reynaldi · ce7b48ec · ce7b48ec · ce7b48ec · ce7b48ec
--- a/bin/encoder.o
+++ b/bin/encoder.o
--- a/bin/ntt.o
+++ b/bin/ntt.o
--- a/cuda/bin/ciphertext.o
+++ b/cuda/bin/ciphertext.o
--- a/cuda/bin/ckks.o
+++ b/cuda/bin/ckks.o
--- a/cuda/bin/evalkey.o
+++ b/cuda/bin/evalkey.o
--- a/cuda/bin/ntt.o
+++ b/cuda/bin/ntt.o
--- a/cuda/bin/parallel.o
+++ b/cuda/bin/parallel.o
--- a/cuda/bin/polynomial.o
+++ b/cuda/bin/polynomial.o
--- a/cuda/bin/pubkey.o
+++ b/cuda/bin/pubkey.o
--- a/cuda/bin/seckey.o
+++ b/cuda/bin/seckey.o
--- a/cuda/ckks.cu
+++ b/cuda/ckks.cu
@@ -3,18 +3,16 @@

 using namespace std;

-CKKS::CKKS(int N, PubKey _pk, EvalKey _evk, SecKey _sk){
-  pk = _pk;
-  sk = _sk;
+CKKS::CKKS(EvalKey _evk){
  evk = _evk;
-  deg = N;
  ql = q0;
  for (int i=0; i<6; i++){
    ql *= pl[i];
  }
 }

-Ciphertext CKKS::encrypt(Polynomial pt){
+
+Ciphertext CKKS::encrypt(Polynomial pt, PubKey pk){
  //Polynomial e = genE(pt.degree, 1.0);
  Polynomial c0 = (pt) + pk.b;
  Polynomial c1 = pk.a;
@@ -23,7 +21,7 @@ Ciphertext CKKS::encrypt(Polynomial pt){
  return ct;
 }

-Polynomial CKKS::decrypt(Ciphertext ct){
+Polynomial CKKS::decrypt(Ciphertext ct, SecKey sk){
  Polynomial pt = ct.c0 + (ct.c1 * sk.s);
  return pt.modCoeff(ql);
 }
@@ -33,6 +31,20 @@ Ciphertext CKKS::add(Ciphertext ct1, Ciphertext ct2){
  return out;
 }

+Polynomial CKKS::fastMult(Polynomial p1, Polynomial p2, int deg, int64_t w){
+  vector<int64_t> pol1 = ntt.ntt(p1, deg, w);
+  vector<int64_t> pol2 = ntt.ntt(p2, deg, w);
+  
+  int64_t p = 4179340454199820289;
+  vector<int64_t> mult(deg, 0);
+  for(int i=0; i<deg; i++){
+    mult.push_back(NTT::modMul(pol1[i], pol2[i], p));
+  }
+
+  Polynomial out = ntt.intt(mult, w);
+  return out;
+}
+
 Ciphertext CKKS::mult(Ciphertext ct1, Ciphertext ct2){
  // Multiplication
  Polynomial d1 = (ct1.c0 * ct2.c0).modCoeff(ql);
@@ -55,4 +67,18 @@ Ciphertext CKKS::mult(Ciphertext ct1, Ciphertext ct2){
  
  Ciphertext out(c0.modCoeff(ql), c1.modCoeff(ql));
  return out;
+}
+
+int getNextPow(int deg){
+  int out = 2;
+  while(out < deg){
+    out <<= 1;
+  }
+  return out;
+}
+
+Ciphertext CKKS::multNTT(Ciphertext ct1, Ciphertext ct2){
+  level += 1;
+  Ciphertext out = mult(ct1, ct2);
+  return out;
 }
\ No newline at end of file
--- a/cuda/ckks.h
+++ b/cuda/ckks.h
 #include "polynomial.h"
+#include "ntt.h"
 #include "pubkey.h"
 #include "seckey.h"
 #include "evalkey.h"
@@ -8,22 +9,22 @@
 #define CKKS_H

 class CKKS {
-  public:
-    int deg;
+  private:
    int level = 6;
    int q0 = 67;
-    int pl[6] = {61, 67, 71, 73, 79, 59};
+    int pl[6] = {61, 67, 71, 73, 79, 67};
    int64_t ql;
-    PubKey pk;
-    SecKey sk;
    EvalKey evk;
+    NTT ntt;

-    CKKS(int N, PubKey _pk, EvalKey _evk, SecKey _sk);
-    Polynomial fastMult(Polynomial p1, Polynomial p2);
+  public:
+    CKKS(EvalKey _evk);
+    Polynomial fastMult(Polynomial p1, Polynomial p2, int deg, int64_t w);
    Ciphertext add(Ciphertext ct1, Ciphertext ct2);
    Ciphertext mult(Ciphertext ct1, Ciphertext ct2);
-    Ciphertext encrypt(Polynomial pt);
-    Polynomial decrypt(Ciphertext ct);
+    Ciphertext multNTT(Ciphertext ct1, Ciphertext ct2);
+    Ciphertext encrypt(Polynomial pt, PubKey pk);
+    Polynomial decrypt(Ciphertext ct, SecKey sk);
 };

 #endif
\ No newline at end of file
--- a/cuda/evalkey.cu
+++ b/cuda/evalkey.cu
@@ -29,10 +29,10 @@ Polynomial EvalKey::genE(int degree, double var){
 }

 void EvalKey::generateA(int degree, int64_t q){
-    int64_t half_q = (p*q)/2;
+    int64_t half_q = (q)/2;
    random_device rd;
    mt19937 gen(rd());
-    uniform_int_distribution<int64_t> dis(-half_q, 0);
+    uniform_int_distribution<int64_t> dis(0, half_q);
    double a_coeffs[degree];
    
    for (int i=0; i<degree; i++){
@@ -45,9 +45,10 @@ void EvalKey::generateA(int degree, int64_t q){
 void EvalKey::computeB(int q, Polynomial s){
    Polynomial err = genE(a.degree, 1.0);
    Polynomial ev = (s * s).scaleCoeff(p);
-    Polynomial temp = ((a.scaleCoeff(-1.0)) * s) + ev + err;
+    Polynomial temp = ((a.scaleCoeff(-1.0)) * s) + ev;
    Polynomial res = temp.modCoeff(q * p);

    b = Polynomial(res.degree);
    b.setCoeffs(res.coeffs);
+    
 }
\ No newline at end of file
--- a/cuda/ntt.cu
+++ b/cuda/ntt.cu
 #include "ntt.h"
+#include <chrono>
+
+using namespace std::chrono;

 __device__ int64_t devModExp(int64_t base, int64_t power, int64_t mod){
    int64_t res = 1;
@@ -108,7 +111,7 @@ int64_t NTT::genNthRoot(int mod, int n){
 }

 void NTT::reverse(vector<int64_t> &in, int bitLen){
-    for (int i=0; i<size(in); i++){
+    for (int i=0; i<in.size(); i++){
        int64_t revN = 0;
        for(int j=0; j<bitLen ; j++){
            if ((i >> j) & 1){
@@ -145,8 +148,22 @@ __global__ void correctNeg(int64_t* points, int M){
    if(points[idx] < 0) points[idx] += M;
 }

+int64_t NTT::modMul(int64_t a, int64_t b, int64_t mod){
+    int64_t res = 0;
+    a = a % mod;
+    while(b){
+        if (b & 1){
+            res = (res + a) % mod;
+        }
+        a = (a*2) % mod;
+        b >>= 1;
+    }
+    
+    return res;
+}
+
 void NTT::_ntt(vector<int64_t> &in, int64_t w){
-    int N = size(in);
+    int N = in.size();
    int nBit = bitLength(N) - 1;
    reverse(in, nBit);

@@ -159,12 +176,19 @@ void NTT::_ntt(vector<int64_t> &in, int64_t w){

    int64_t* points;
    cudaMalloc((void**)&points, N*sizeof(int64_t));
+
+    int block = 1;
+    int threadPerBlock = N/2;
+    if(N/2 > 1024){
+        threadPerBlock = 1024;
+        block = ceil((double)(N/2)/1024.0);
+    }
    
    for(int i=0; i<nBit; i++){
        int64_t* p1;
        cudaMalloc((void**)&p1, N*sizeof(int64_t));

-        computeNTT<<<1, N/2>>>(p1, N, i, w, nBit, input);
+        computeNTT<<<block, threadPerBlock>>>(p1, N, i, w, nBit, input);
        cudaDeviceSynchronize();

        cudaMemcpy(points, p1, N*sizeof(int64_t), cudaMemcpyDeviceToDevice);
@@ -176,10 +200,13 @@ void NTT::_ntt(vector<int64_t> &in, int64_t w){
        cudaFree(p1);
    }

-    correctNeg<<<1, N>>>(points, 2013265921);
+    correctNeg<<<block*2, threadPerBlock>>>(points, 2013265921);
    cudaMemcpy(temp, points, N*sizeof(int64_t), cudaMemcpyDeviceToHost);

-    for(int i=0; i<N; i++) in[i] = temp[i];
+    for(int i=0; i<N; i++) {
+        in[i] = temp[i];
+    }
+
 }

 vector<int64_t> NTT::ntt(Polynomial in, int degree, int64_t w){
@@ -191,21 +218,22 @@ vector<int64_t> NTT::ntt(Polynomial in, int degree, int64_t w){
    for(int i=0; i<in.degree; i++){
        out[i] = (int64_t) input[i];
    }
-    
+
    _ntt(out, w);
    return out;
 }

 Polynomial NTT::intt(vector<int64_t> &in, int w){
-    int N = size(in);
+    int N = in.size();
    double* coeff = (double*) malloc(N*sizeof(double));

    int64_t wInv = modInv(w, M);
    int64_t nInv = modInv(N, M);

    _ntt(in, wInv);
-    for(int i=0; i<size(in); i++){
-        coeff[i] = (in[i] * nInv) % M;
+
+    for(int i=0; i<in.size(); i++){
+        coeff[i] = modMul(in[i], nInv, M);
    }

    Polynomial pOut(N, coeff);

--- a/cuda/ntt.h
+++ b/cuda/ntt.h
@@ -4,7 +4,7 @@

 using namespace std;

-#define size(c) ((int)(c).size())
+

 #ifndef NTT_H
 #define NTT_H
@@ -12,15 +12,19 @@ using namespace std;
 class NTT{
    public:
        int M = 2013265921;
+        
+        vector<int64_t> ntt(Polynomial in, int degree, int64_t w);
+        Polynomial intt(vector<int64_t> &in, int w);
+        static int64_t modMul(int64_t a, int64_t b, int64_t mod);
+        int64_t genNthRoot(int mod, int n);
+    
+    private:
        int64_t modExp(int64_t base, int64_t power, int64_t mod);
        int64_t modInv(int64_t x, int64_t mod);
        int bitLength(int x);
        void reverse(vector<int64_t> &in, int bitLen);
        bool existSmallerN(int r, int mod, int n);
-        int64_t genNthRoot(int mod, int n);
        void _ntt(vector<int64_t> &in, int64_t w);
-        vector<int64_t> ntt(Polynomial in, int degree, int64_t w);
-        Polynomial intt(vector<int64_t> &in, int w);
 };



--- a/cuda/parallel.cu
+++ b/cuda/parallel.cu
@@ -30,9 +30,9 @@ __device__ cuDoubleComplex power(cuDoubleComplex base, int power){
 __global__ void initVandermonde(cuDoubleComplex *vand, int M, cuDoubleComplex xi){
  int ROW = blockIdx.y*blockDim.y+threadIdx.y;
  int COL = blockIdx.x*blockDim.x+threadIdx.x;
+
  int exp = (2*ROW+1)*COL;
  vand[ROW * M + COL] = power(xi, exp);
-  if(ROW == 0 && COL == 0) printf("test = %f\n", cuCreal(power(xi, exp)));
 }

 __global__ void transpose(cuDoubleComplex *dest, cuDoubleComplex *source, int N){
@@ -67,6 +67,23 @@ __global__ void computeCoord(double *res, cuDoubleComplex *target, cuDoubleCompl
  res[idx] = temp;
 }

+__global__ void computeLargeCoord(double *res, cuDoubleComplex *target, cuDoubleComplex xi, int M){
+  int idx = threadIdx.x + blockIdx.x*blockDim.x;
+  double temp;
+
+  cuDoubleComplex a = make_cuDoubleComplex(0.0,0.0);
+  cuDoubleComplex b = make_cuDoubleComplex(0.0,0.0);
+  for(int i=0; i<M; i++){
+    int exp = (2*i+1) * idx;
+    cuDoubleComplex vandBase = power(xi, exp);
+    a = cuCadd(a, cuCmul(vandBase, cuConj(target[i])));
+    b = cuCadd(b, cuCmul(vandBase, cuConj(vandBase)));
+  }
+
+  temp = cuCreal(a)/cuCreal(b);
+  res[idx] = temp;
+}
+
 __global__ void setup_kernel(curandState *state){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  curand_init(1234, idx, 0, &state[idx]);
@@ -83,12 +100,23 @@ __global__ void unscale(double *target, double scale){
 }


-__global__ void sigma(double *pol, cuDoubleComplex *vand, int M, cuDoubleComplex *target){
+__global__ void sigma(double *pol, cuDoubleComplex *vand, int M, int offset, cuDoubleComplex *target){
  int idx = threadIdx.x + blockIdx.x*blockDim.x;
  target[idx] = make_cuDoubleComplex(0.0, 0.0);
+
  for(int i=0; i<M; i++){
-    target[idx].x += vand[idx*M+i].x * pol[i];
-    target[idx].y += vand[idx*M+i].y * pol[i];
+    target[idx].x += vand[idx*offset+i].x * pol[i];
+    target[idx].y += vand[idx*offset+i].y * pol[i];
+  }
+}
+
+__global__ void largeSigma(double *pol, cuDoubleComplex xi, int M, cuDoubleComplex *target){
+  int idx = threadIdx.x + blockIdx.x*blockDim.x;
+  target[idx] = make_cuDoubleComplex(0.0, 0.0);
+  for(int i=0; i<M; i++){
+    cuDoubleComplex x = power(xi, (2*idx+1)*i);
+    target[idx].x += x.x * pol[i];
+    target[idx].y += x.y * pol[i];
  }
 }

@@ -110,17 +138,20 @@ Encoder::Encoder(int Min, double s){
    blocksPerGrid.y = ceil(double(N)/double(threadsPerBlock.y));
  }

-  initVandermonde<<<blocksPerGrid, threadsPerBlock>>>(vand, M/2, xi);
-  cudaDeviceSynchronize();
+  if(M/4 <= pow(2,12)){
+    initVandermonde<<<blocksPerGrid, threadsPerBlock>>>(vand, M/2, xi);
+    cudaDeviceSynchronize();

-  dcomplex *test = (dcomplex *) malloc(64*64*sizeof(dcomplex));
-  cudaMemcpy(test, vand, 64*64*sizeof(dcomplex), cudaMemcpyDeviceToHost);
+    dcomplex *test = (dcomplex *) malloc(64*64*sizeof(dcomplex));
+    cudaMemcpy(test, vand, 64*64*sizeof(dcomplex), cudaMemcpyDeviceToHost);

-  cudaMalloc((void**)&sigmaRBasis, N*N*sizeof(cuDoubleComplex));
-  transpose<<<blocksPerGrid, threadsPerBlock>>>(sigmaRBasis, vand, M/2);
+    cudaMalloc((void**)&sigmaRBasis, N*N*sizeof(cuDoubleComplex));
+    transpose<<<blocksPerGrid, threadsPerBlock>>>(sigmaRBasis, vand, M/2);
+  } 
 }

 Polynomial Encoder::encode(dcomparr input){
+  
  cuDoubleComplex *target;
  size_t size = M/2;
  Polynomial out(M/2);
@@ -128,26 +159,37 @@ Polynomial Encoder::encode(dcomparr input){
  cudaMalloc((void**) &target, size * sizeof(cuDoubleComplex));
  cudaMemcpy(target, input.arr, M/4 * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

+  auto start = high_resolution_clock::now();
+
  int block = 1;
  int threadPerBlock = M/2;
  if(threadPerBlock > 1024){
    threadPerBlock = 1024;
-    block = ceil((double)(M/2)/1024.0);
+    block = ceil((double)(M/2)/1024.0);  
  }

-  auto start = high_resolution_clock::now();
  expScale<<<block,threadPerBlock/2>>>(target, M/2, scale);

  double *res;
  cudaMalloc((void**)&res, M/2*sizeof(double));
-  computeCoord<<<block,threadPerBlock>>>(res, target, sigmaRBasis, M/2);

-  coordWRR<<<block,threadPerBlock>>>(res);
+  if(M/4 <= pow(2,12)){
+    computeCoord<<<block,threadPerBlock>>>(res, target, sigmaRBasis, M/2);
+  }else {
+    root = exp((2*M_PI/M) * J);
+    cuDoubleComplex xi = make_cuDoubleComplex(real(root), imag(root));
+    computeLargeCoord<<<block,threadPerBlock>>>(res, target, xi, M/2);
+  }

+  coordWRR<<<block,threadPerBlock>>>(res);
  out.setCoeffs(res);
+
  auto stop = high_resolution_clock::now();
  auto duration = duration_cast<microseconds>(stop - start);
-  cout << "Encoding: " << duration.count() << endl;
+  // cout << "Encoding: " << duration.count() << endl;
+
+  cudaFree(res);
+  cudaFree(target);
  return out;
 }

@@ -155,7 +197,7 @@ dcomparr Encoder::decode(Polynomial pt){
  auto start = high_resolution_clock::now();
  
  int block = 1;
-  int threadPerBlock = M/2;
+  int threadPerBlock = pt.degree;
  if(threadPerBlock > 1024){
    threadPerBlock = 1024;
    block = ceil((double)(M/2)/1024.0);
@@ -165,15 +207,56 @@ dcomparr Encoder::decode(Polynomial pt){

  cuDoubleComplex *target;
  cudaMalloc((void**)&target, M/4 * sizeof(cuDoubleComplex));
-  sigma<<<block,threadPerBlock/2>>>(pt.coeffs, vand, M/2, target);
+  
+  if(M/4 <= pow(2,12)){
+    if(pt.degree == M/2){
+      sigma<<<block,threadPerBlock/2>>>(pt.coeffs, vand, M/2, M/2, target);
+    } else {
+      int R = 2;
+      while (R < pt.degree){
+        R <<=1;
+      }
+
+      cuDoubleComplex *tempVand;
+      cudaMalloc((void**)&tempVand, R*R*sizeof(cuDoubleComplex));
+
+      cuDoubleComplex xi = make_cuDoubleComplex(real(root), imag(root));
+      dim3 threads(R, R);
+      dim3 blocksPerGrid(1, 1);
+  
+      if (R*R > 512){
+        threads.x = 16;
+        threads.y = 16;
+        blocksPerGrid.x = ceil(double(R)/double(threads.x));
+        blocksPerGrid.y = ceil(double(R)/double(threads.y));
+      }
+
+      initVandermonde<<<blocksPerGrid, threads>>>(tempVand, R, xi);
+
+      threadPerBlock = M/2;
+      if(threadPerBlock > 1024){
+        threadPerBlock = 1024;
+        block = ceil((double)(M/2)/1024.0);
+      }
+
+      sigma<<<block,threadPerBlock/2>>>(pt.coeffs, tempVand, pt.degree, R, target);
+      cudaFree(tempVand);
+    }
+    
+  }else {
+    cuDoubleComplex xi = make_cuDoubleComplex(real(root), imag(root));
+    largeSigma<<<block,threadPerBlock/2>>>(pt.coeffs, xi, M/2, target);
+  }
+
+  auto stop = high_resolution_clock::now();
+  auto duration = duration_cast<microseconds>(stop - start);
+  // cout << "Decoding: " << duration.count() << endl;

  dcomparr output(M/4);
  size_t size_out = (M/4)*sizeof(dcomplex);
  cudaMemcpy(output.arr, target, size_out, cudaMemcpyDeviceToHost);

-  auto stop = high_resolution_clock::now();
-  auto duration = duration_cast<microseconds>(stop - start);
-  cout << "Decoding: " << duration.count() << endl;
+  cudaFree(target);
+
  return output;
 }
-
--- a/cuda/polynomial.cu
+++ b/cuda/polynomial.cu
@@ -31,11 +31,11 @@ __global__ void polyRoundScale(double *res, double *in, double scale){
    res[idx] = (double) round(in[idx] * scale);
 }

-__global__ void polyMult(double *res, double *a, double *b, int deg){
+__global__ void polyMult(double *res, double *a, double *b, int degA, int degB){
    int idx = threadIdx.x;
-    res[idx] = 0;
-    for(int i=0; i<deg; i++){
-        for(int j=0; j<deg; j++){
+    res[idx] = 0.0;
+    for(int i=0; i<degA; i++){
+        for(int j=0; j<degB; j++){
            if(j+i == idx){
                res[idx] += a[i] * b[j];
            }
@@ -62,7 +62,15 @@ Polynomial Polynomial::operator + (Polynomial const &obj){
    int deg = max(obj.degree, degree);
    double* resCoeff;
    cudaMalloc((void**)&resCoeff, deg*sizeof(double));
-    polyAdd<<<1,deg>>>(coeffs, degree, obj.coeffs, obj.degree, resCoeff);
+    
+    int block = 1;
+    int threadPerBlock = deg;
+    if(threadPerBlock > 1024){
+        threadPerBlock = 1024;
+        block = ceil((double)(deg)/1024.0);
+    }
+
+    polyAdd<<<block,threadPerBlock>>>(coeffs, degree, obj.coeffs, obj.degree, resCoeff);
    
    Polynomial out(deg);
    out.setCoeffs(resCoeff);
@@ -70,12 +78,21 @@ Polynomial Polynomial::operator + (Polynomial const &obj){
 }

 Polynomial Polynomial::operator * (Polynomial const &obj){
-    int deg = obj.degree + degree;
+    int deg = obj.degree + degree-1;
    double *resCoeff;
    cudaMalloc((void**)&resCoeff, deg*sizeof(double));
-    polyMult<<<1, deg>>>(resCoeff, coeffs, obj.coeffs, deg);
+    
+    int block = 1;
+    int threadPerBlock = deg;
+    if(threadPerBlock > 1024){
+        threadPerBlock = 1024;
+        block = ceil((double)(deg)/1024.0);
+    }
+
+    polyMult<<<block,threadPerBlock>>>(resCoeff, coeffs, obj.coeffs, degree, obj.degree);
    Polynomial out(deg);
    out.setCoeffs(resCoeff);
+
    return out;
 }

@@ -87,7 +104,15 @@ Polynomial Polynomial::modCoeff(int q){
    Polynomial out(degree);
    double *resCoeff;
    cudaMalloc((void**)&resCoeff, degree*sizeof(double));
-    polyMod<<<1, degree>>>(resCoeff, coeffs, q);
+
+    int block = 1;
+    int threadPerBlock = degree;
+    if(threadPerBlock > 1024){
+        threadPerBlock = 1024;
+        block = ceil((double)(degree)/1024.0);
+    }
+
+    polyMod<<<block,threadPerBlock>>>(resCoeff, coeffs, q);
    out.setCoeffs(resCoeff);
    return out;
 }
@@ -96,7 +121,15 @@ Polynomial Polynomial::scaleCoeff(double scale){
    Polynomial out(degree);
    double *resCoeff;
    cudaMalloc((void**)&resCoeff, degree*sizeof(double));
-    polyScale<<<1, degree>>>(resCoeff, coeffs, scale);
+
+    int block = 1;
+    int threadPerBlock = degree;
+    if(threadPerBlock > 1024){
+        threadPerBlock = 1024;
+        block = ceil((double)(degree)/1024.0);
+    }
+
+    polyScale<<<block,threadPerBlock>>>(resCoeff, coeffs, scale);
    out.setCoeffs(resCoeff);
    return out;
 }
@@ -105,7 +138,15 @@ Polynomial Polynomial::scaleRoundCoeff(double scale){
    Polynomial out(degree);
    double *resCoeff;
    cudaMalloc((void**)&resCoeff, degree*sizeof(double));
-    polyRoundScale<<<1, degree>>>(resCoeff, coeffs, scale);
+
+    int block = 1;
+    int threadPerBlock = degree;
+    if(threadPerBlock > 1024){
+        threadPerBlock = 1024;
+        block = ceil((double)(degree)/1024.0);
+    }
+
+    polyRoundScale<<<block,threadPerBlock>>>(resCoeff, coeffs, scale);
    out.setCoeffs(resCoeff);
    return out;
 }

--- a/cuda/pubkey.cu
+++ b/cuda/pubkey.cu
@@ -30,7 +30,7 @@ void PubKey::generateA(int degree, int64_t q){
    int64_t half_q = q/2;
    random_device rd;
    mt19937 gen(rd());
-    uniform_int_distribution<int64_t> dis(-half_q, 0);
+    uniform_int_distribution<int64_t> dis(0, half_q);
    double a_coeffs[degree];
    
    for (int i=0; i<degree; i++){
@@ -41,9 +41,10 @@ void PubKey::generateA(int degree, int64_t q){

 void PubKey::computeB(int q, Polynomial s){
    Polynomial err = genE(a.degree, 1.0);
-    Polynomial temp = (a.scaleCoeff(-1.0)) * s + err;
+    Polynomial temp = (a.scaleCoeff(-1.0)) * s;
    Polynomial res = temp.modCoeff(q);

+
    b = Polynomial(res.degree);
    b.setCoeffs(res.coeffs);
 }
--- a/cuda/run
+++ b/cuda/run
--- a/cuda/seckey.cu
+++ b/cuda/seckey.cu
@@ -4,15 +4,22 @@
 #include <random>
 #include "seckey.h"

+#include <chrono>
+using namespace std::chrono;
+
+
 using namespace std;

 SecKey::SecKey(int deg){
    double coeff[deg];
+    auto start1 = high_resolution_clock::now();

    std::random_device rd; 
    std::mt19937 gen(rd()); 
    std::uniform_int_distribution<> distr(deg/2, deg);

+    
+
    int h = distr(gen);
    string ones(h, '1');
    string zeros((deg-h), '0');
@@ -21,9 +28,20 @@ SecKey::SecKey(int deg){
    random_shuffle(key.begin(), key.end());

    for(int i=0; i<deg; i++){
-        char temp = key.at(i);
-        coeff[i] = (double) (atoi(&temp));
+        char temp = key[i];
+        int coef = (int(temp)-48);
+
+        if(coef != 0){
+            coeff[i] = (double) -1 * coef;
+        } else {
+            coeff[i] = 0.0;
+        }
+        
    }

+    auto stop1 = high_resolution_clock::now();
+    auto duration1 = duration_cast<microseconds>(stop1 - start1);
+    cout << "secret key: " << duration1.count() << endl;
+
    s = Polynomial(deg, coeff);
 }
\ No newline at end of file
No results found