Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
No results found
Show changes
Commits on Source (2)
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
......@@ -3,18 +3,16 @@
using namespace std;
CKKS::CKKS(int N, PubKey _pk, EvalKey _evk, SecKey _sk){
pk = _pk;
sk = _sk;
CKKS::CKKS(EvalKey _evk){
evk = _evk;
deg = N;
ql = q0;
for (int i=0; i<6; i++){
ql *= pl[i];
}
}
Ciphertext CKKS::encrypt(Polynomial pt){
Ciphertext CKKS::encrypt(Polynomial pt, PubKey pk){
//Polynomial e = genE(pt.degree, 1.0);
Polynomial c0 = (pt) + pk.b;
Polynomial c1 = pk.a;
......@@ -23,7 +21,7 @@ Ciphertext CKKS::encrypt(Polynomial pt){
return ct;
}
Polynomial CKKS::decrypt(Ciphertext ct){
Polynomial CKKS::decrypt(Ciphertext ct, SecKey sk){
Polynomial pt = ct.c0 + (ct.c1 * sk.s);
return pt.modCoeff(ql);
}
......@@ -33,6 +31,20 @@ Ciphertext CKKS::add(Ciphertext ct1, Ciphertext ct2){
return out;
}
Polynomial CKKS::fastMult(Polynomial p1, Polynomial p2, int deg, int64_t w){
vector<int64_t> pol1 = ntt.ntt(p1, deg, w);
vector<int64_t> pol2 = ntt.ntt(p2, deg, w);
int64_t p = 4179340454199820289;
vector<int64_t> mult(deg, 0);
for(int i=0; i<deg; i++){
mult.push_back(NTT::modMul(pol1[i], pol2[i], p));
}
Polynomial out = ntt.intt(mult, w);
return out;
}
Ciphertext CKKS::mult(Ciphertext ct1, Ciphertext ct2){
// Multiplication
Polynomial d1 = (ct1.c0 * ct2.c0).modCoeff(ql);
......@@ -55,4 +67,18 @@ Ciphertext CKKS::mult(Ciphertext ct1, Ciphertext ct2){
Ciphertext out(c0.modCoeff(ql), c1.modCoeff(ql));
return out;
}
int getNextPow(int deg){
int out = 2;
while(out < deg){
out <<= 1;
}
return out;
}
Ciphertext CKKS::multNTT(Ciphertext ct1, Ciphertext ct2){
level += 1;
Ciphertext out = mult(ct1, ct2);
return out;
}
\ No newline at end of file
#include "polynomial.h"
#include "ntt.h"
#include "pubkey.h"
#include "seckey.h"
#include "evalkey.h"
......@@ -8,22 +9,22 @@
#define CKKS_H
class CKKS {
public:
int deg;
private:
int level = 6;
int q0 = 67;
int pl[6] = {61, 67, 71, 73, 79, 59};
int pl[6] = {61, 67, 71, 73, 79, 67};
int64_t ql;
PubKey pk;
SecKey sk;
EvalKey evk;
NTT ntt;
CKKS(int N, PubKey _pk, EvalKey _evk, SecKey _sk);
Polynomial fastMult(Polynomial p1, Polynomial p2);
public:
CKKS(EvalKey _evk);
Polynomial fastMult(Polynomial p1, Polynomial p2, int deg, int64_t w);
Ciphertext add(Ciphertext ct1, Ciphertext ct2);
Ciphertext mult(Ciphertext ct1, Ciphertext ct2);
Ciphertext encrypt(Polynomial pt);
Polynomial decrypt(Ciphertext ct);
Ciphertext multNTT(Ciphertext ct1, Ciphertext ct2);
Ciphertext encrypt(Polynomial pt, PubKey pk);
Polynomial decrypt(Ciphertext ct, SecKey sk);
};
#endif
\ No newline at end of file
......@@ -29,10 +29,10 @@ Polynomial EvalKey::genE(int degree, double var){
}
void EvalKey::generateA(int degree, int64_t q){
int64_t half_q = (p*q)/2;
int64_t half_q = (q)/2;
random_device rd;
mt19937 gen(rd());
uniform_int_distribution<int64_t> dis(-half_q, 0);
uniform_int_distribution<int64_t> dis(0, half_q);
double a_coeffs[degree];
for (int i=0; i<degree; i++){
......@@ -45,9 +45,10 @@ void EvalKey::generateA(int degree, int64_t q){
void EvalKey::computeB(int q, Polynomial s){
Polynomial err = genE(a.degree, 1.0);
Polynomial ev = (s * s).scaleCoeff(p);
Polynomial temp = ((a.scaleCoeff(-1.0)) * s) + ev + err;
Polynomial temp = ((a.scaleCoeff(-1.0)) * s) + ev;
Polynomial res = temp.modCoeff(q * p);
b = Polynomial(res.degree);
b.setCoeffs(res.coeffs);
}
\ No newline at end of file
#include "ntt.h"
#include <chrono>
using namespace std::chrono;
__device__ int64_t devModExp(int64_t base, int64_t power, int64_t mod){
int64_t res = 1;
......@@ -108,7 +111,7 @@ int64_t NTT::genNthRoot(int mod, int n){
}
void NTT::reverse(vector<int64_t> &in, int bitLen){
for (int i=0; i<size(in); i++){
for (int i=0; i<in.size(); i++){
int64_t revN = 0;
for(int j=0; j<bitLen ; j++){
if ((i >> j) & 1){
......@@ -145,8 +148,22 @@ __global__ void correctNeg(int64_t* points, int M){
if(points[idx] < 0) points[idx] += M;
}
int64_t NTT::modMul(int64_t a, int64_t b, int64_t mod){
int64_t res = 0;
a = a % mod;
while(b){
if (b & 1){
res = (res + a) % mod;
}
a = (a*2) % mod;
b >>= 1;
}
return res;
}
void NTT::_ntt(vector<int64_t> &in, int64_t w){
int N = size(in);
int N = in.size();
int nBit = bitLength(N) - 1;
reverse(in, nBit);
......@@ -159,12 +176,19 @@ void NTT::_ntt(vector<int64_t> &in, int64_t w){
int64_t* points;
cudaMalloc((void**)&points, N*sizeof(int64_t));
int block = 1;
int threadPerBlock = N/2;
if(N/2 > 1024){
threadPerBlock = 1024;
block = ceil((double)(N/2)/1024.0);
}
for(int i=0; i<nBit; i++){
int64_t* p1;
cudaMalloc((void**)&p1, N*sizeof(int64_t));
computeNTT<<<1, N/2>>>(p1, N, i, w, nBit, input);
computeNTT<<<block, threadPerBlock>>>(p1, N, i, w, nBit, input);
cudaDeviceSynchronize();
cudaMemcpy(points, p1, N*sizeof(int64_t), cudaMemcpyDeviceToDevice);
......@@ -176,10 +200,13 @@ void NTT::_ntt(vector<int64_t> &in, int64_t w){
cudaFree(p1);
}
correctNeg<<<1, N>>>(points, 2013265921);
correctNeg<<<block*2, threadPerBlock>>>(points, 2013265921);
cudaMemcpy(temp, points, N*sizeof(int64_t), cudaMemcpyDeviceToHost);
for(int i=0; i<N; i++) in[i] = temp[i];
for(int i=0; i<N; i++) {
in[i] = temp[i];
}
}
vector<int64_t> NTT::ntt(Polynomial in, int degree, int64_t w){
......@@ -191,21 +218,22 @@ vector<int64_t> NTT::ntt(Polynomial in, int degree, int64_t w){
for(int i=0; i<in.degree; i++){
out[i] = (int64_t) input[i];
}
_ntt(out, w);
return out;
}
Polynomial NTT::intt(vector<int64_t> &in, int w){
int N = size(in);
int N = in.size();
double* coeff = (double*) malloc(N*sizeof(double));
int64_t wInv = modInv(w, M);
int64_t nInv = modInv(N, M);
_ntt(in, wInv);
for(int i=0; i<size(in); i++){
coeff[i] = (in[i] * nInv) % M;
for(int i=0; i<in.size(); i++){
coeff[i] = modMul(in[i], nInv, M);
}
Polynomial pOut(N, coeff);
......
......@@ -4,7 +4,7 @@
using namespace std;
#define size(c) ((int)(c).size())
#ifndef NTT_H
#define NTT_H
......@@ -12,15 +12,19 @@ using namespace std;
class NTT{
public:
int M = 2013265921;
vector<int64_t> ntt(Polynomial in, int degree, int64_t w);
Polynomial intt(vector<int64_t> &in, int w);
static int64_t modMul(int64_t a, int64_t b, int64_t mod);
int64_t genNthRoot(int mod, int n);
private:
int64_t modExp(int64_t base, int64_t power, int64_t mod);
int64_t modInv(int64_t x, int64_t mod);
int bitLength(int x);
void reverse(vector<int64_t> &in, int bitLen);
bool existSmallerN(int r, int mod, int n);
int64_t genNthRoot(int mod, int n);
void _ntt(vector<int64_t> &in, int64_t w);
vector<int64_t> ntt(Polynomial in, int degree, int64_t w);
Polynomial intt(vector<int64_t> &in, int w);
};
......
......@@ -30,9 +30,9 @@ __device__ cuDoubleComplex power(cuDoubleComplex base, int power){
__global__ void initVandermonde(cuDoubleComplex *vand, int M, cuDoubleComplex xi){
int ROW = blockIdx.y*blockDim.y+threadIdx.y;
int COL = blockIdx.x*blockDim.x+threadIdx.x;
int exp = (2*ROW+1)*COL;
vand[ROW * M + COL] = power(xi, exp);
if(ROW == 0 && COL == 0) printf("test = %f\n", cuCreal(power(xi, exp)));
}
__global__ void transpose(cuDoubleComplex *dest, cuDoubleComplex *source, int N){
......@@ -67,6 +67,23 @@ __global__ void computeCoord(double *res, cuDoubleComplex *target, cuDoubleCompl
res[idx] = temp;
}
__global__ void computeLargeCoord(double *res, cuDoubleComplex *target, cuDoubleComplex xi, int M){
int idx = threadIdx.x + blockIdx.x*blockDim.x;
double temp;
cuDoubleComplex a = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex b = make_cuDoubleComplex(0.0,0.0);
for(int i=0; i<M; i++){
int exp = (2*i+1) * idx;
cuDoubleComplex vandBase = power(xi, exp);
a = cuCadd(a, cuCmul(vandBase, cuConj(target[i])));
b = cuCadd(b, cuCmul(vandBase, cuConj(vandBase)));
}
temp = cuCreal(a)/cuCreal(b);
res[idx] = temp;
}
__global__ void setup_kernel(curandState *state){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
curand_init(1234, idx, 0, &state[idx]);
......@@ -83,12 +100,23 @@ __global__ void unscale(double *target, double scale){
}
__global__ void sigma(double *pol, cuDoubleComplex *vand, int M, cuDoubleComplex *target){
__global__ void sigma(double *pol, cuDoubleComplex *vand, int M, int offset, cuDoubleComplex *target){
int idx = threadIdx.x + blockIdx.x*blockDim.x;
target[idx] = make_cuDoubleComplex(0.0, 0.0);
for(int i=0; i<M; i++){
target[idx].x += vand[idx*M+i].x * pol[i];
target[idx].y += vand[idx*M+i].y * pol[i];
target[idx].x += vand[idx*offset+i].x * pol[i];
target[idx].y += vand[idx*offset+i].y * pol[i];
}
}
__global__ void largeSigma(double *pol, cuDoubleComplex xi, int M, cuDoubleComplex *target){
int idx = threadIdx.x + blockIdx.x*blockDim.x;
target[idx] = make_cuDoubleComplex(0.0, 0.0);
for(int i=0; i<M; i++){
cuDoubleComplex x = power(xi, (2*idx+1)*i);
target[idx].x += x.x * pol[i];
target[idx].y += x.y * pol[i];
}
}
......@@ -110,17 +138,20 @@ Encoder::Encoder(int Min, double s){
blocksPerGrid.y = ceil(double(N)/double(threadsPerBlock.y));
}
initVandermonde<<<blocksPerGrid, threadsPerBlock>>>(vand, M/2, xi);
cudaDeviceSynchronize();
if(M/4 <= pow(2,12)){
initVandermonde<<<blocksPerGrid, threadsPerBlock>>>(vand, M/2, xi);
cudaDeviceSynchronize();
dcomplex *test = (dcomplex *) malloc(64*64*sizeof(dcomplex));
cudaMemcpy(test, vand, 64*64*sizeof(dcomplex), cudaMemcpyDeviceToHost);
dcomplex *test = (dcomplex *) malloc(64*64*sizeof(dcomplex));
cudaMemcpy(test, vand, 64*64*sizeof(dcomplex), cudaMemcpyDeviceToHost);
cudaMalloc((void**)&sigmaRBasis, N*N*sizeof(cuDoubleComplex));
transpose<<<blocksPerGrid, threadsPerBlock>>>(sigmaRBasis, vand, M/2);
cudaMalloc((void**)&sigmaRBasis, N*N*sizeof(cuDoubleComplex));
transpose<<<blocksPerGrid, threadsPerBlock>>>(sigmaRBasis, vand, M/2);
}
}
Polynomial Encoder::encode(dcomparr input){
cuDoubleComplex *target;
size_t size = M/2;
Polynomial out(M/2);
......@@ -128,26 +159,37 @@ Polynomial Encoder::encode(dcomparr input){
cudaMalloc((void**) &target, size * sizeof(cuDoubleComplex));
cudaMemcpy(target, input.arr, M/4 * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
auto start = high_resolution_clock::now();
int block = 1;
int threadPerBlock = M/2;
if(threadPerBlock > 1024){
threadPerBlock = 1024;
block = ceil((double)(M/2)/1024.0);
block = ceil((double)(M/2)/1024.0);
}
auto start = high_resolution_clock::now();
expScale<<<block,threadPerBlock/2>>>(target, M/2, scale);
double *res;
cudaMalloc((void**)&res, M/2*sizeof(double));
computeCoord<<<block,threadPerBlock>>>(res, target, sigmaRBasis, M/2);
coordWRR<<<block,threadPerBlock>>>(res);
if(M/4 <= pow(2,12)){
computeCoord<<<block,threadPerBlock>>>(res, target, sigmaRBasis, M/2);
}else {
root = exp((2*M_PI/M) * J);
cuDoubleComplex xi = make_cuDoubleComplex(real(root), imag(root));
computeLargeCoord<<<block,threadPerBlock>>>(res, target, xi, M/2);
}
coordWRR<<<block,threadPerBlock>>>(res);
out.setCoeffs(res);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
cout << "Encoding: " << duration.count() << endl;
// cout << "Encoding: " << duration.count() << endl;
cudaFree(res);
cudaFree(target);
return out;
}
......@@ -155,7 +197,7 @@ dcomparr Encoder::decode(Polynomial pt){
auto start = high_resolution_clock::now();
int block = 1;
int threadPerBlock = M/2;
int threadPerBlock = pt.degree;
if(threadPerBlock > 1024){
threadPerBlock = 1024;
block = ceil((double)(M/2)/1024.0);
......@@ -165,15 +207,56 @@ dcomparr Encoder::decode(Polynomial pt){
cuDoubleComplex *target;
cudaMalloc((void**)&target, M/4 * sizeof(cuDoubleComplex));
sigma<<<block,threadPerBlock/2>>>(pt.coeffs, vand, M/2, target);
if(M/4 <= pow(2,12)){
if(pt.degree == M/2){
sigma<<<block,threadPerBlock/2>>>(pt.coeffs, vand, M/2, M/2, target);
} else {
int R = 2;
while (R < pt.degree){
R <<=1;
}
cuDoubleComplex *tempVand;
cudaMalloc((void**)&tempVand, R*R*sizeof(cuDoubleComplex));
cuDoubleComplex xi = make_cuDoubleComplex(real(root), imag(root));
dim3 threads(R, R);
dim3 blocksPerGrid(1, 1);
if (R*R > 512){
threads.x = 16;
threads.y = 16;
blocksPerGrid.x = ceil(double(R)/double(threads.x));
blocksPerGrid.y = ceil(double(R)/double(threads.y));
}
initVandermonde<<<blocksPerGrid, threads>>>(tempVand, R, xi);
threadPerBlock = M/2;
if(threadPerBlock > 1024){
threadPerBlock = 1024;
block = ceil((double)(M/2)/1024.0);
}
sigma<<<block,threadPerBlock/2>>>(pt.coeffs, tempVand, pt.degree, R, target);
cudaFree(tempVand);
}
}else {
cuDoubleComplex xi = make_cuDoubleComplex(real(root), imag(root));
largeSigma<<<block,threadPerBlock/2>>>(pt.coeffs, xi, M/2, target);
}
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
// cout << "Decoding: " << duration.count() << endl;
dcomparr output(M/4);
size_t size_out = (M/4)*sizeof(dcomplex);
cudaMemcpy(output.arr, target, size_out, cudaMemcpyDeviceToHost);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
cout << "Decoding: " << duration.count() << endl;
cudaFree(target);
return output;
}
......@@ -31,11 +31,11 @@ __global__ void polyRoundScale(double *res, double *in, double scale){
res[idx] = (double) round(in[idx] * scale);
}
__global__ void polyMult(double *res, double *a, double *b, int deg){
__global__ void polyMult(double *res, double *a, double *b, int degA, int degB){
int idx = threadIdx.x;
res[idx] = 0;
for(int i=0; i<deg; i++){
for(int j=0; j<deg; j++){
res[idx] = 0.0;
for(int i=0; i<degA; i++){
for(int j=0; j<degB; j++){
if(j+i == idx){
res[idx] += a[i] * b[j];
}
......@@ -62,7 +62,15 @@ Polynomial Polynomial::operator + (Polynomial const &obj){
int deg = max(obj.degree, degree);
double* resCoeff;
cudaMalloc((void**)&resCoeff, deg*sizeof(double));
polyAdd<<<1,deg>>>(coeffs, degree, obj.coeffs, obj.degree, resCoeff);
int block = 1;
int threadPerBlock = deg;
if(threadPerBlock > 1024){
threadPerBlock = 1024;
block = ceil((double)(deg)/1024.0);
}
polyAdd<<<block,threadPerBlock>>>(coeffs, degree, obj.coeffs, obj.degree, resCoeff);
Polynomial out(deg);
out.setCoeffs(resCoeff);
......@@ -70,12 +78,21 @@ Polynomial Polynomial::operator + (Polynomial const &obj){
}
Polynomial Polynomial::operator * (Polynomial const &obj){
int deg = obj.degree + degree;
int deg = obj.degree + degree-1;
double *resCoeff;
cudaMalloc((void**)&resCoeff, deg*sizeof(double));
polyMult<<<1, deg>>>(resCoeff, coeffs, obj.coeffs, deg);
int block = 1;
int threadPerBlock = deg;
if(threadPerBlock > 1024){
threadPerBlock = 1024;
block = ceil((double)(deg)/1024.0);
}
polyMult<<<block,threadPerBlock>>>(resCoeff, coeffs, obj.coeffs, degree, obj.degree);
Polynomial out(deg);
out.setCoeffs(resCoeff);
return out;
}
......@@ -87,7 +104,15 @@ Polynomial Polynomial::modCoeff(int q){
Polynomial out(degree);
double *resCoeff;
cudaMalloc((void**)&resCoeff, degree*sizeof(double));
polyMod<<<1, degree>>>(resCoeff, coeffs, q);
int block = 1;
int threadPerBlock = degree;
if(threadPerBlock > 1024){
threadPerBlock = 1024;
block = ceil((double)(degree)/1024.0);
}
polyMod<<<block,threadPerBlock>>>(resCoeff, coeffs, q);
out.setCoeffs(resCoeff);
return out;
}
......@@ -96,7 +121,15 @@ Polynomial Polynomial::scaleCoeff(double scale){
Polynomial out(degree);
double *resCoeff;
cudaMalloc((void**)&resCoeff, degree*sizeof(double));
polyScale<<<1, degree>>>(resCoeff, coeffs, scale);
int block = 1;
int threadPerBlock = degree;
if(threadPerBlock > 1024){
threadPerBlock = 1024;
block = ceil((double)(degree)/1024.0);
}
polyScale<<<block,threadPerBlock>>>(resCoeff, coeffs, scale);
out.setCoeffs(resCoeff);
return out;
}
......@@ -105,7 +138,15 @@ Polynomial Polynomial::scaleRoundCoeff(double scale){
Polynomial out(degree);
double *resCoeff;
cudaMalloc((void**)&resCoeff, degree*sizeof(double));
polyRoundScale<<<1, degree>>>(resCoeff, coeffs, scale);
int block = 1;
int threadPerBlock = degree;
if(threadPerBlock > 1024){
threadPerBlock = 1024;
block = ceil((double)(degree)/1024.0);
}
polyRoundScale<<<block,threadPerBlock>>>(resCoeff, coeffs, scale);
out.setCoeffs(resCoeff);
return out;
}
......
......@@ -30,7 +30,7 @@ void PubKey::generateA(int degree, int64_t q){
int64_t half_q = q/2;
random_device rd;
mt19937 gen(rd());
uniform_int_distribution<int64_t> dis(-half_q, 0);
uniform_int_distribution<int64_t> dis(0, half_q);
double a_coeffs[degree];
for (int i=0; i<degree; i++){
......@@ -41,9 +41,10 @@ void PubKey::generateA(int degree, int64_t q){
void PubKey::computeB(int q, Polynomial s){
Polynomial err = genE(a.degree, 1.0);
Polynomial temp = (a.scaleCoeff(-1.0)) * s + err;
Polynomial temp = (a.scaleCoeff(-1.0)) * s;
Polynomial res = temp.modCoeff(q);
b = Polynomial(res.degree);
b.setCoeffs(res.coeffs);
}
No preview for this file type
......@@ -4,15 +4,22 @@
#include <random>
#include "seckey.h"
#include <chrono>
using namespace std::chrono;
using namespace std;
SecKey::SecKey(int deg){
double coeff[deg];
auto start1 = high_resolution_clock::now();
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> distr(deg/2, deg);
int h = distr(gen);
string ones(h, '1');
string zeros((deg-h), '0');
......@@ -21,9 +28,20 @@ SecKey::SecKey(int deg){
random_shuffle(key.begin(), key.end());
for(int i=0; i<deg; i++){
char temp = key.at(i);
coeff[i] = (double) (atoi(&temp));
char temp = key[i];
int coef = (int(temp)-48);
if(coef != 0){
coeff[i] = (double) -1 * coef;
} else {
coeff[i] = 0.0;
}
}
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
cout << "secret key: " << duration1.count() << endl;
s = Polynomial(deg, coeff);
}
\ No newline at end of file