|
|
@ -1,13 +1,14 @@ |
|
|
|
// Parallelisierung: |
|
|
|
// Parallelization: |
|
|
|
// |
|
|
|
// |
|
|
|
// FFT_8 wird 2 mal 8-fach parallel ausgeführt (in FFT_64) |
|
|
|
// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) |
|
|
|
// und 1 mal 16-fach parallel (in FFT_128_full) |
|
|
|
// and 1 time 16-fach parallel (in FFT_128_full) |
|
|
|
// |
|
|
|
// |
|
|
|
// STEP8_IF und STEP8_MAJ beinhalten je zwei 8-fach parallele Operationen |
|
|
|
// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations |
|
|
|
|
|
|
|
|
|
|
|
#define TPB 64 |
|
|
|
#define TPB 64 |
|
|
|
|
|
|
|
|
|
|
|
#include "cuda_helper.h" |
|
|
|
#include "cuda_helper.h" |
|
|
|
|
|
|
|
#include <stdio.h> |
|
|
|
|
|
|
|
|
|
|
|
// aus heavy.cu |
|
|
|
// aus heavy.cu |
|
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
|
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
|
|
@ -20,14 +21,14 @@ texture<uint4, 1, cudaReadModeElementType> texRef1D_128; |
|
|
|
|
|
|
|
|
|
|
|
__constant__ uint32_t c_perm[8][8]; |
|
|
|
__constant__ uint32_t c_perm[8][8]; |
|
|
|
const uint32_t h_perm[8][8] = { |
|
|
|
const uint32_t h_perm[8][8] = { |
|
|
|
{ 2,3,6,7,0,1,4,5 }, |
|
|
|
{ 2, 3, 6, 7, 0, 1, 4, 5 }, |
|
|
|
{ 6,7,2,3,4,5,0,1 }, |
|
|
|
{ 6, 7, 2, 3, 4, 5, 0, 1 }, |
|
|
|
{ 7,6,5,4,3,2,1,0 }, |
|
|
|
{ 7, 6, 5, 4, 3, 2, 1, 0 }, |
|
|
|
{ 1,0,3,2,5,4,7,6 }, |
|
|
|
{ 1, 0, 3, 2, 5, 4, 7, 6 }, |
|
|
|
{ 0,1,4,5,6,7,2,3 }, |
|
|
|
{ 0, 1, 4, 5, 6, 7, 2, 3 }, |
|
|
|
{ 6,7,2,3,0,1,4,5 }, |
|
|
|
{ 6, 7, 2, 3, 0, 1, 4, 5 }, |
|
|
|
{ 6,7,0,1,4,5,2,3 }, |
|
|
|
{ 6, 7, 0, 1, 4, 5, 2, 3 }, |
|
|
|
{ 4,5,2,3,6,7,0,1 } |
|
|
|
{ 4, 5, 2, 3, 6, 7, 0, 1 } |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
__constant__ uint32_t c_IV_512[32]; |
|
|
|
__constant__ uint32_t c_IV_512[32]; |
|
|
@ -43,30 +44,30 @@ static const int h_FFT128_8_16_Twiddle[128] = { |
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
|
|
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, |
|
|
|
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, |
|
|
|
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, |
|
|
|
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, |
|
|
|
1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111, |
|
|
|
1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2,-123, 17,-111, |
|
|
|
1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122, |
|
|
|
1,-118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123,-122, |
|
|
|
1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, |
|
|
|
1, 116, 92,-122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, |
|
|
|
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, |
|
|
|
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, |
|
|
|
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 |
|
|
|
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
__constant__ int c_FFT256_2_128_Twiddle[128]; |
|
|
|
__constant__ int c_FFT256_2_128_Twiddle[128]; |
|
|
|
static const int h_FFT256_2_128_Twiddle[128] = { |
|
|
|
static const int h_FFT256_2_128_Twiddle[128] = { |
|
|
|
1, 41, -118, 45, 46, 87, -31, 14, |
|
|
|
1, 41,-118, 45, 46, 87, -31, 14, |
|
|
|
60, -110, 116, -127, -67, 80, -61, 69, |
|
|
|
60,-110, 116,-127, -67, 80, -61, 69, |
|
|
|
2, 82, 21, 90, 92, -83, -62, 28, |
|
|
|
2, 82, 21, 90, 92, -83, -62, 28, |
|
|
|
120, 37, -25, 3, 123, -97, -122, -119, |
|
|
|
120, 37, -25, 3, 123, -97,-122,-119, |
|
|
|
4, -93, 42, -77, -73, 91, -124, 56, |
|
|
|
4, -93, 42, -77, -73, 91,-124, 56, |
|
|
|
-17, 74, -50, 6, -11, 63, 13, 19, |
|
|
|
-17, 74, -50, 6, -11, 63, 13, 19, |
|
|
|
8, 71, 84, 103, 111, -75, 9, 112, |
|
|
|
8, 71, 84, 103, 111, -75, 9, 112, |
|
|
|
-34, -109, -100, 12, -22, 126, 26, 38, |
|
|
|
-34,-109,-100, 12, -22, 126, 26, 38, |
|
|
|
16, -115, -89, -51, -35, 107, 18, -33, |
|
|
|
16,-115, -89, -51, -35, 107, 18, -33, |
|
|
|
-68, 39, 57, 24, -44, -5, 52, 76, |
|
|
|
-68, 39, 57, 24, -44, -5, 52, 76, |
|
|
|
32, 27, 79, -102, -70, -43, 36, -66, |
|
|
|
32, 27, 79,-102, -70, -43, 36, -66, |
|
|
|
121, 78, 114, 48, -88, -10, 104, -105, |
|
|
|
121, 78, 114, 48, -88, -10, 104,-105, |
|
|
|
64, 54, -99, 53, 117, -86, 72, 125, |
|
|
|
64, 54, -99, 53, 117, -86, 72, 125, |
|
|
|
-15, -101, -29, 96, 81, -20, -49, 47, |
|
|
|
-15,-101, -29, 96, 81, -20, -49, 47, |
|
|
|
128, 108, 59, 106, -23, 85, -113, -7, |
|
|
|
128, 108, 59, 106, -23, 85,-113, -7, |
|
|
|
-30, 55, -58, -65, -95, -40, -98, 94 |
|
|
|
-30, 55, -58, -65, -95, -40, -98, 94 |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
@ -76,6 +77,7 @@ static const int h_FFT256_2_128_Twiddle[128] = { |
|
|
|
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) |
|
|
|
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) |
|
|
|
|
|
|
|
|
|
|
|
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) |
|
|
|
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) |
|
|
|
|
|
|
|
|
|
|
|
#include "x11/simd_functions.cu" |
|
|
|
#include "x11/simd_functions.cu" |
|
|
|
|
|
|
|
|
|
|
|
/********************* Message expansion ************************/ |
|
|
|
/********************* Message expansion ************************/ |
|
|
@ -84,24 +86,26 @@ static const int h_FFT256_2_128_Twiddle[128] = { |
|
|
|
* Reduce modulo 257; result is in [-127; 383] |
|
|
|
* Reduce modulo 257; result is in [-127; 383] |
|
|
|
* REDUCE(x) := (x&255) - (x>>8) |
|
|
|
* REDUCE(x) := (x&255) - (x>>8) |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
#define REDUCE(x) (((x)&255) - ((x)>>8)) |
|
|
|
#define REDUCE(x) \ |
|
|
|
|
|
|
|
(((x)&255) - ((x)>>8)) |
|
|
|
|
|
|
|
|
|
|
|
/* |
|
|
|
/* |
|
|
|
* Reduce from [-127; 383] to [-128; 128] |
|
|
|
* Reduce from [-127; 383] to [-128; 128] |
|
|
|
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 |
|
|
|
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
#define EXTRA_REDUCE_S(x) \ |
|
|
|
#define EXTRA_REDUCE_S(x) \ |
|
|
|
((x)<=128 ? (x) : (x)-257) |
|
|
|
((x)<=128 ? (x) : (x)-257) |
|
|
|
|
|
|
|
|
|
|
|
/* |
|
|
|
/* |
|
|
|
* Reduce modulo 257; result is in [-128; 128] |
|
|
|
* Reduce modulo 257; result is in [-128; 128] |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
#define REDUCE_FULL_S(x) \ |
|
|
|
#define REDUCE_FULL_S(x) \ |
|
|
|
EXTRA_REDUCE_S(REDUCE(x)) |
|
|
|
EXTRA_REDUCE_S(REDUCE(x)) |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void FFT_8(int *y, int stripe) { |
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void FFT_8(int *y, int stripe) { |
|
|
|
|
|
|
|
|
|
|
|
/* |
|
|
|
/* |
|
|
|
* FFT_8 using w=4 as 8th root of unity |
|
|
|
* FFT_8 using w=4 as 8th root of unity |
|
|
|
* Unrolled decimation in frequency (DIF) radix-2 NTT. |
|
|
|
* Unrolled decimation in frequency (DIF) radix-2 NTT. |
|
|
|
* Output data is in revbin_permuted order. |
|
|
|
* Output data is in revbin_permuted order. |
|
|
@ -110,20 +114,20 @@ __device__ __forceinline__ void FFT_8(int *y, int stripe) { |
|
|
|
#define X(i) y[stripe*i] |
|
|
|
#define X(i) y[stripe*i] |
|
|
|
|
|
|
|
|
|
|
|
#define DO_REDUCE(i) \ |
|
|
|
#define DO_REDUCE(i) \ |
|
|
|
X(i) = REDUCE(X(i)) |
|
|
|
X(i) = REDUCE(X(i)) |
|
|
|
|
|
|
|
|
|
|
|
#define DO_REDUCE_FULL_S(i) \ |
|
|
|
#define DO_REDUCE_FULL_S(i) \ |
|
|
|
do { \ |
|
|
|
do { \ |
|
|
|
X(i) = REDUCE(X(i)); \ |
|
|
|
X(i) = REDUCE(X(i)); \ |
|
|
|
X(i) = EXTRA_REDUCE_S(X(i)); \ |
|
|
|
X(i) = EXTRA_REDUCE_S(X(i)); \ |
|
|
|
} while(0) |
|
|
|
} while(0) |
|
|
|
|
|
|
|
|
|
|
|
#define BUTTERFLY(i,j,n) \ |
|
|
|
#define BUTTERFLY(i,j,n) \ |
|
|
|
do { \ |
|
|
|
do { \ |
|
|
|
int u= X(i); \ |
|
|
|
int u= X(i); \ |
|
|
|
int v= X(j); \ |
|
|
|
int v= X(j); \ |
|
|
|
X(i) = u+v; \ |
|
|
|
X(i) = u+v; \ |
|
|
|
X(j) = (u-v) << (2*n); \ |
|
|
|
X(j) = (u-v) << (2*n); \ |
|
|
|
} while(0) |
|
|
|
} while(0) |
|
|
|
|
|
|
|
|
|
|
|
BUTTERFLY(0, 4, 0); |
|
|
|
BUTTERFLY(0, 4, 0); |
|
|
@ -163,17 +167,16 @@ X(j) = (u-v) << (2*n); \ |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void FFT_16(int *y) { |
|
|
|
__device__ __forceinline__ void FFT_16(int *y) { |
|
|
|
|
|
|
|
|
|
|
|
/* |
|
|
|
/** |
|
|
|
* FFT_16 using w=2 as 16th root of unity |
|
|
|
* FFT_16 using w=2 as 16th root of unity |
|
|
|
* Unrolled decimation in frequency (DIF) radix-2 NTT. |
|
|
|
* Unrolled decimation in frequency (DIF) radix-2 NTT. |
|
|
|
* Output data is in revbin_permuted order. |
|
|
|
* Output data is in revbin_permuted order. |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
|
|
#define DO_REDUCE_FULL_S(i) \ |
|
|
|
#define DO_REDUCE_FULL_S(i) \ |
|
|
|
do { \ |
|
|
|
do { \ |
|
|
|
y[i] = REDUCE(y[i]); \ |
|
|
|
y[i] = REDUCE(y[i]); \ |
|
|
|
y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
} while(0) |
|
|
|
} while(0) |
|
|
|
|
|
|
|
|
|
|
|
int u,v; |
|
|
|
int u,v; |
|
|
|
|
|
|
|
|
|
|
@ -186,8 +189,8 @@ y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
// BUTTERFLY(6, 14, 6); |
|
|
|
// BUTTERFLY(6, 14, 6); |
|
|
|
// BUTTERFLY(7, 15, 7); |
|
|
|
// BUTTERFLY(7, 15, 7); |
|
|
|
{ |
|
|
|
{ |
|
|
|
u= y[0]; // 0..7 |
|
|
|
u = y[0]; // 0..7 |
|
|
|
v= y[1]; // 8..15 |
|
|
|
v = y[1]; // 8..15 |
|
|
|
y[0] = u+v; |
|
|
|
y[0] = u+v; |
|
|
|
y[1] = (u-v) << (threadIdx.x&7); |
|
|
|
y[1] = (u-v) << (threadIdx.x&7); |
|
|
|
} |
|
|
|
} |
|
|
@ -204,8 +207,8 @@ y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
// BUTTERFLY( 2, 6, 4); |
|
|
|
// BUTTERFLY( 2, 6, 4); |
|
|
|
// BUTTERFLY( 3, 7, 6); |
|
|
|
// BUTTERFLY( 3, 7, 6); |
|
|
|
{ |
|
|
|
{ |
|
|
|
u= __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 |
|
|
|
u = __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 |
|
|
|
v= __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 |
|
|
|
v = __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 |
|
|
|
y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); |
|
|
|
y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -214,8 +217,8 @@ y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
// BUTTERFLY(10, 14, 4); |
|
|
|
// BUTTERFLY(10, 14, 4); |
|
|
|
// BUTTERFLY(11, 15, 6); |
|
|
|
// BUTTERFLY(11, 15, 6); |
|
|
|
{ |
|
|
|
{ |
|
|
|
u= __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 |
|
|
|
u = __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 |
|
|
|
v= __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 |
|
|
|
v = __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 |
|
|
|
y[1] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); |
|
|
|
y[1] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -233,8 +236,8 @@ y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
// BUTTERFLY( 4, 6, 0); |
|
|
|
// BUTTERFLY( 4, 6, 0); |
|
|
|
// BUTTERFLY( 5, 7, 4); |
|
|
|
// BUTTERFLY( 5, 7, 4); |
|
|
|
{ |
|
|
|
{ |
|
|
|
u= __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 |
|
|
|
u = __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 |
|
|
|
v= __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 |
|
|
|
v = __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 |
|
|
|
y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); |
|
|
|
y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -243,8 +246,8 @@ y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
// BUTTERFLY(12, 14, 0); |
|
|
|
// BUTTERFLY(12, 14, 0); |
|
|
|
// BUTTERFLY(13, 15, 4); |
|
|
|
// BUTTERFLY(13, 15, 4); |
|
|
|
{ |
|
|
|
{ |
|
|
|
u= __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 |
|
|
|
u = __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 |
|
|
|
v= __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 |
|
|
|
v = __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 |
|
|
|
y[1] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); |
|
|
|
y[1] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -253,8 +256,8 @@ y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
// BUTTERFLY( 4, 5, 0); |
|
|
|
// BUTTERFLY( 4, 5, 0); |
|
|
|
// BUTTERFLY( 6, 7, 0); |
|
|
|
// BUTTERFLY( 6, 7, 0); |
|
|
|
{ |
|
|
|
{ |
|
|
|
u= __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 |
|
|
|
u = __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 |
|
|
|
v= __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 |
|
|
|
v = __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 |
|
|
|
y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); |
|
|
|
y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -263,8 +266,8 @@ y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
// BUTTERFLY(12, 13, 0); |
|
|
|
// BUTTERFLY(12, 13, 0); |
|
|
|
// BUTTERFLY(14, 15, 0); |
|
|
|
// BUTTERFLY(14, 15, 0); |
|
|
|
{ |
|
|
|
{ |
|
|
|
u= __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 |
|
|
|
u = __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 |
|
|
|
v= __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 |
|
|
|
v = __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 |
|
|
|
y[1] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); |
|
|
|
y[1] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -274,7 +277,9 @@ y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
#undef DO_REDUCE_FULL_S |
|
|
|
#undef DO_REDUCE_FULL_S |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void FFT_128_full(int y[128]) { |
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void FFT_128_full(int y[128]) |
|
|
|
|
|
|
|
{ |
|
|
|
int i; |
|
|
|
int i; |
|
|
|
|
|
|
|
|
|
|
|
FFT_8(y+0,2); // eight parallel FFT8's |
|
|
|
FFT_8(y+0,2); // eight parallel FFT8's |
|
|
@ -289,11 +294,9 @@ __device__ __forceinline__ void FFT_128_full(int y[128]) { |
|
|
|
FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads |
|
|
|
FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ |
|
|
|
__device__ __forceinline__ void FFT_256_halfzero(int y[256]) { |
|
|
|
void FFT_256_halfzero(int y[256]) |
|
|
|
|
|
|
|
{ |
|
|
|
int i; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* |
|
|
|
/* |
|
|
|
* FFT_256 using w=41 as 256th root of unity. |
|
|
|
* FFT_256 using w=41 as 256th root of unity. |
|
|
|
* Decimation in frequency (DIF) NTT. |
|
|
|
* Decimation in frequency (DIF) NTT. |
|
|
@ -303,10 +306,10 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) { |
|
|
|
const int tmp = y[15]; |
|
|
|
const int tmp = y[15]; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 8 |
|
|
|
for (i=0; i<8; i++) |
|
|
|
for (int i=0; i<8; i++) |
|
|
|
y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); |
|
|
|
y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); |
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 8 |
|
|
|
for (i=8; i<16; i++) |
|
|
|
for (int i=8; i<16; i++) |
|
|
|
y[16+i] = 0; |
|
|
|
y[16+i] = 0; |
|
|
|
|
|
|
|
|
|
|
|
/* handle X^255 with an additional butterfly */ |
|
|
|
/* handle X^255 with an additional butterfly */ |
|
|
@ -323,19 +326,18 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) { |
|
|
|
|
|
|
|
|
|
|
|
/***************************************************/ |
|
|
|
/***************************************************/ |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) |
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void Expansion(const uint32_t *data, uint4 *g_temp4) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int i; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Message Expansion using Number Theoretical Transform similar to FFT */ |
|
|
|
/* Message Expansion using Number Theoretical Transform similar to FFT */ |
|
|
|
int expanded[32]; |
|
|
|
int expanded[32]; |
|
|
|
#pragma unroll 4 |
|
|
|
#pragma unroll 4 |
|
|
|
for (i=0; i < 4; i++) { |
|
|
|
for (int i=0; i < 4; i++) { |
|
|
|
expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; |
|
|
|
expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; |
|
|
|
expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; |
|
|
|
expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; |
|
|
|
} |
|
|
|
} |
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 8 |
|
|
|
for (i=8; i < 16; i++) |
|
|
|
for (int i=8; i < 16; i++) |
|
|
|
expanded[i] = 0; |
|
|
|
expanded[i] = 0; |
|
|
|
|
|
|
|
|
|
|
|
FFT_256_halfzero(expanded); |
|
|
|
FFT_256_halfzero(expanded); |
|
|
@ -447,7 +449,6 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) |
|
|
|
//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, |
|
|
|
//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, |
|
|
|
//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, |
|
|
|
//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 |
|
|
|
bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 |
|
|
|
|
|
|
|
|
|
|
|
P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
@ -474,7 +475,6 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) |
|
|
|
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 |
|
|
|
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 |
|
|
|
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 |
|
|
|
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
@ -552,7 +552,7 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/***************************************************/ |
|
|
|
/***************************************************/ |
|
|
|
// Die Hash-Funktion |
|
|
|
|
|
|
|
__global__ void __launch_bounds__(TPB,4) |
|
|
|
__global__ void __launch_bounds__(TPB,4) |
|
|
|
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4) |
|
|
|
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4) |
|
|
|
{ |
|
|
|
{ |
|
|
@ -567,9 +567,10 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u |
|
|
|
|
|
|
|
|
|
|
|
// Hash einlesen und auf 8 Threads und 2 Register verteilen |
|
|
|
// Hash einlesen und auf 8 Threads und 2 Register verteilen |
|
|
|
uint32_t Hash[2]; |
|
|
|
uint32_t Hash[2]; |
|
|
|
#pragma unroll 2 |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 2 |
|
|
|
for (int i=0; i<2; i++) |
|
|
|
for (int i=0; i<2; i++) |
|
|
|
Hash[i] = inpHash[8*i+(threadIdx.x&7)]; |
|
|
|
Hash[i] = inpHash[8*i + (threadIdx.x & 7)]; |
|
|
|
|
|
|
|
|
|
|
|
// Puffer für expandierte Nachricht |
|
|
|
// Puffer für expandierte Nachricht |
|
|
|
uint4 *temp4 = &g_temp4[64 * hashPosition]; |
|
|
|
uint4 *temp4 = &g_temp4[64 * hashPosition]; |
|
|
@ -622,20 +623,13 @@ x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, ui |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Setup-Funktionen |
|
|
|
__host__ |
|
|
|
__host__ void x11_simd512_cpu_init(int thr_id, int threads) |
|
|
|
void x11_simd512_cpu_init(int thr_id, int threads) |
|
|
|
{ |
|
|
|
{ |
|
|
|
cudaMalloc( &d_state[thr_id], 32*sizeof(int)*threads ); |
|
|
|
CUDA_SAFE_CALL(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads)); |
|
|
|
cudaMalloc( &d_temp4[thr_id], 64*sizeof(uint4)*threads ); |
|
|
|
CUDA_SAFE_CALL(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads)); |
|
|
|
|
|
|
|
|
|
|
|
// Textur für 128 Bit Zugriffe |
|
|
|
|
|
|
|
cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>(); |
|
|
|
|
|
|
|
texRef1D_128.normalized = 0; |
|
|
|
|
|
|
|
texRef1D_128.filterMode = cudaFilterModePoint; |
|
|
|
|
|
|
|
texRef1D_128.addressMode[0] = cudaAddressModeClamp; |
|
|
|
|
|
|
|
cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cudaMemcpyToSymbol( c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); |
|
|
@ -644,28 +638,31 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads) |
|
|
|
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Texture for 128-Bit Zugriffe |
|
|
|
|
|
|
|
cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>(); |
|
|
|
|
|
|
|
texRef1D_128.normalized = 0; |
|
|
|
|
|
|
|
texRef1D_128.filterMode = cudaFilterModePoint; |
|
|
|
|
|
|
|
texRef1D_128.addressMode[0] = cudaAddressModeClamp; |
|
|
|
|
|
|
|
CUDA_SAFE_CALL(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) |
|
|
|
__host__ |
|
|
|
|
|
|
|
void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const int threadsperblock = TPB; |
|
|
|
const int threadsperblock = TPB; |
|
|
|
|
|
|
|
|
|
|
|
// Größe des dynamischen Shared Memory Bereichs |
|
|
|
|
|
|
|
size_t shared_size = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// berechne wie viele Thread Blocks wir brauchen |
|
|
|
|
|
|
|
dim3 block(threadsperblock); |
|
|
|
dim3 block(threadsperblock); |
|
|
|
|
|
|
|
|
|
|
|
dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8); |
|
|
|
dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8); |
|
|
|
x11_simd512_gpu_expand_64<<<grid8, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
x11_simd512_gpu_expand_64 <<<grid8, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
|
|
|
|
|
|
|
|
|
|
|
// künstlich die Occupancy limitieren, um das totale Erschöpfen des Texture Cache zu vermeiden |
|
|
|
x11_simd512_gpu_compress1_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
x11_simd512_gpu_compress1_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
x11_simd512_gpu_compress2_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
x11_simd512_gpu_compress2_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
x11_simd512_gpu_final_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
MyStreamSynchronize(NULL, order, thr_id); |
|
|
|
MyStreamSynchronize(NULL, order, thr_id); |
|
|
|
} |
|
|
|
} |
|
|
|