|
|
@ -1,9 +1,24 @@ |
|
|
|
|
|
|
|
// Parallelisierung: |
|
|
|
|
|
|
|
// |
|
|
|
|
|
|
|
// FFT_8 wird 2 mal 8-fach parallel ausgeführt (in FFT_64) |
|
|
|
|
|
|
|
// und 1 mal 16-fach parallel (in FFT_128_full) |
|
|
|
|
|
|
|
// |
|
|
|
|
|
|
|
// STEP8_IF und STEP8_MAJ beinhalten je zwei 8-fach parallele Operationen |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define TPB 256 |
|
|
|
|
|
|
|
|
|
|
|
// aus heavy.cu |
|
|
|
// aus heavy.cu |
|
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
|
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
|
|
|
|
|
|
|
|
|
|
|
typedef unsigned int uint32_t; |
|
|
|
typedef unsigned int uint32_t; |
|
|
|
typedef unsigned long long uint64_t; |
|
|
|
typedef unsigned long long uint64_t; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int *d_state[8]; |
|
|
|
|
|
|
|
uint4 *d_temp4[8]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// texture bound to d_temp4[thr_id], for read access in Compaction kernel |
|
|
|
|
|
|
|
texture<uint4, 1, cudaReadModeElementType> texRef1D_128; |
|
|
|
|
|
|
|
|
|
|
|
#define C32(x) ((uint32_t)(x ## U)) |
|
|
|
#define C32(x) ((uint32_t)(x ## U)) |
|
|
|
#define T32(x) ((x) & C32(0xFFFFFFFF)) |
|
|
|
#define T32(x) ((x) & C32(0xFFFFFFFF)) |
|
|
|
|
|
|
|
|
|
|
@ -23,99 +38,6 @@ const uint32_t h_IV_512[32] = { |
|
|
|
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 |
|
|
|
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
__constant__ int c_FFT[256]; |
|
|
|
|
|
|
|
const int h_FFT[256] = |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// this is the FFT result in revbin permuted order |
|
|
|
|
|
|
|
4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92, |
|
|
|
|
|
|
|
-119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63, |
|
|
|
|
|
|
|
-63, 107, -107, -76, 76, -119, 119, -83, 83, 126, -126, 94, -94, -23, 23, -76, |
|
|
|
|
|
|
|
76, -47, 47, 92, -92, -117, 117, 73, -73, -53, 53, 88, -88, -80, 80, -47, 47, |
|
|
|
|
|
|
|
5, -5, 67, -67, 34, -34, 4, -4, 87, -87, -28, 28, -70, 70, -110, 110, -18, 18, 93, |
|
|
|
|
|
|
|
-93, 51, -51, 36, -36, 118, -118, -106, 106, 45, -45, -108, 108, -44, 44, 117, |
|
|
|
|
|
|
|
-117, -121, 121, -37, 37, 65, -65, 37, -37, 40, -40, -42, 42, 91, -91, -128, 128, |
|
|
|
|
|
|
|
-21, 21, 94, -94, -98, 98, -47, 47, 28, -28, 115, -115, 16, -16, -20, 20, 122, |
|
|
|
|
|
|
|
-122, 115, -115, 46, -46, 84, -84, -127, 127, 57, -57, 127, -127, -80, 80, 24, |
|
|
|
|
|
|
|
-24, 15, -15, 29, -29, -78, 78, -126, 126, 16, -16, 52, -52, 55, -55, 110, -110, |
|
|
|
|
|
|
|
-51, 51, -120, 120, -124, 124, -24, 24, -76, 76, 26, -26, -21, 21, -64, 64, -99, |
|
|
|
|
|
|
|
99, 85, -85, -15, 15, -120, 120, -116, 116, 85, -85, 12, -12, -24, 24, 4, -4, |
|
|
|
|
|
|
|
79, -79, 76, -76, 23, -23, 4, -4, -108, 108, -20, 20, 73, -73, -42, 42, -7, 7, |
|
|
|
|
|
|
|
-29, 29, -123, 123, 49, -49, -96, 96, -68, 68, -112, 112, 116, -116, -24, 24, 93, |
|
|
|
|
|
|
|
-93, -125, 125, -86, 86, 117, -117, -91, 91, 42, -42, 87, -87, -117, 117, 102, -102 |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__constant__ int c_P8[32][8]; |
|
|
|
|
|
|
|
static const int h_P8[32][8] = { |
|
|
|
|
|
|
|
{ 2, 66, 34, 98, 18, 82, 50, 114 }, |
|
|
|
|
|
|
|
{ 6, 70, 38, 102, 22, 86, 54, 118 }, |
|
|
|
|
|
|
|
{ 0, 64, 32, 96, 16, 80, 48, 112 }, |
|
|
|
|
|
|
|
{ 4, 68, 36, 100, 20, 84, 52, 116 }, |
|
|
|
|
|
|
|
{ 14, 78, 46, 110, 30, 94, 62, 126 }, |
|
|
|
|
|
|
|
{ 10, 74, 42, 106, 26, 90, 58, 122 }, |
|
|
|
|
|
|
|
{ 12, 76, 44, 108, 28, 92, 60, 124 }, |
|
|
|
|
|
|
|
{ 8, 72, 40, 104, 24, 88, 56, 120 }, |
|
|
|
|
|
|
|
{ 15, 79, 47, 111, 31, 95, 63, 127 }, |
|
|
|
|
|
|
|
{ 13, 77, 45, 109, 29, 93, 61, 125 }, |
|
|
|
|
|
|
|
{ 3, 67, 35, 99, 19, 83, 51, 115 }, |
|
|
|
|
|
|
|
{ 1, 65, 33, 97, 17, 81, 49, 113 }, |
|
|
|
|
|
|
|
{ 9, 73, 41, 105, 25, 89, 57, 121 }, |
|
|
|
|
|
|
|
{ 11, 75, 43, 107, 27, 91, 59, 123 }, |
|
|
|
|
|
|
|
{ 5, 69, 37, 101, 21, 85, 53, 117 }, |
|
|
|
|
|
|
|
{ 7, 71, 39, 103, 23, 87, 55, 119 }, |
|
|
|
|
|
|
|
{ 8, 72, 40, 104, 24, 88, 56, 120 }, |
|
|
|
|
|
|
|
{ 4, 68, 36, 100, 20, 84, 52, 116 }, |
|
|
|
|
|
|
|
{ 14, 78, 46, 110, 30, 94, 62, 126 }, |
|
|
|
|
|
|
|
{ 2, 66, 34, 98, 18, 82, 50, 114 }, |
|
|
|
|
|
|
|
{ 6, 70, 38, 102, 22, 86, 54, 118 }, |
|
|
|
|
|
|
|
{ 10, 74, 42, 106, 26, 90, 58, 122 }, |
|
|
|
|
|
|
|
{ 0, 64, 32, 96, 16, 80, 48, 112 }, |
|
|
|
|
|
|
|
{ 12, 76, 44, 108, 28, 92, 60, 124 }, |
|
|
|
|
|
|
|
{ 134, 198, 166, 230, 150, 214, 182, 246 }, |
|
|
|
|
|
|
|
{ 128, 192, 160, 224, 144, 208, 176, 240 }, |
|
|
|
|
|
|
|
{ 136, 200, 168, 232, 152, 216, 184, 248 }, |
|
|
|
|
|
|
|
{ 142, 206, 174, 238, 158, 222, 190, 254 }, |
|
|
|
|
|
|
|
{ 140, 204, 172, 236, 156, 220, 188, 252 }, |
|
|
|
|
|
|
|
{ 138, 202, 170, 234, 154, 218, 186, 250 }, |
|
|
|
|
|
|
|
{ 130, 194, 162, 226, 146, 210, 178, 242 }, |
|
|
|
|
|
|
|
{ 132, 196, 164, 228, 148, 212, 180, 244 }, |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__constant__ int c_Q8[32][8]; |
|
|
|
|
|
|
|
static const int h_Q8[32][8] = { |
|
|
|
|
|
|
|
{ 130, 194, 162, 226, 146, 210, 178, 242 }, |
|
|
|
|
|
|
|
{ 134, 198, 166, 230, 150, 214, 182, 246 }, |
|
|
|
|
|
|
|
{ 128, 192, 160, 224, 144, 208, 176, 240 }, |
|
|
|
|
|
|
|
{ 132, 196, 164, 228, 148, 212, 180, 244 }, |
|
|
|
|
|
|
|
{ 142, 206, 174, 238, 158, 222, 190, 254 }, |
|
|
|
|
|
|
|
{ 138, 202, 170, 234, 154, 218, 186, 250 }, |
|
|
|
|
|
|
|
{ 140, 204, 172, 236, 156, 220, 188, 252 }, |
|
|
|
|
|
|
|
{ 136, 200, 168, 232, 152, 216, 184, 248 }, |
|
|
|
|
|
|
|
{ 143, 207, 175, 239, 159, 223, 191, 255 }, |
|
|
|
|
|
|
|
{ 141, 205, 173, 237, 157, 221, 189, 253 }, |
|
|
|
|
|
|
|
{ 131, 195, 163, 227, 147, 211, 179, 243 }, |
|
|
|
|
|
|
|
{ 129, 193, 161, 225, 145, 209, 177, 241 }, |
|
|
|
|
|
|
|
{ 137, 201, 169, 233, 153, 217, 185, 249 }, |
|
|
|
|
|
|
|
{ 139, 203, 171, 235, 155, 219, 187, 251 }, |
|
|
|
|
|
|
|
{ 133, 197, 165, 229, 149, 213, 181, 245 }, |
|
|
|
|
|
|
|
{ 135, 199, 167, 231, 151, 215, 183, 247 }, |
|
|
|
|
|
|
|
{ 9, 73, 41, 105, 25, 89, 57, 121 }, |
|
|
|
|
|
|
|
{ 5, 69, 37, 101, 21, 85, 53, 117 }, |
|
|
|
|
|
|
|
{ 15, 79, 47, 111, 31, 95, 63, 127 }, |
|
|
|
|
|
|
|
{ 3, 67, 35, 99, 19, 83, 51, 115 }, |
|
|
|
|
|
|
|
{ 7, 71, 39, 103, 23, 87, 55, 119 }, |
|
|
|
|
|
|
|
{ 11, 75, 43, 107, 27, 91, 59, 123 }, |
|
|
|
|
|
|
|
{ 1, 65, 33, 97, 17, 81, 49, 113 }, |
|
|
|
|
|
|
|
{ 13, 77, 45, 109, 29, 93, 61, 125 }, |
|
|
|
|
|
|
|
{ 135, 199, 167, 231, 151, 215, 183, 247 }, |
|
|
|
|
|
|
|
{ 129, 193, 161, 225, 145, 209, 177, 241 }, |
|
|
|
|
|
|
|
{ 137, 201, 169, 233, 153, 217, 185, 249 }, |
|
|
|
|
|
|
|
{ 143, 207, 175, 239, 159, 223, 191, 255 }, |
|
|
|
|
|
|
|
{ 141, 205, 173, 237, 157, 221, 189, 253 }, |
|
|
|
|
|
|
|
{ 139, 203, 171, 235, 155, 219, 187, 251 }, |
|
|
|
|
|
|
|
{ 131, 195, 163, 227, 147, 211, 179, 243 }, |
|
|
|
|
|
|
|
{ 133, 197, 165, 229, 149, 213, 181, 245 }, |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__constant__ int c_FFT128_8_16_Twiddle[128]; |
|
|
|
__constant__ int c_FFT128_8_16_Twiddle[128]; |
|
|
|
static const int h_FFT128_8_16_Twiddle[128] = { |
|
|
|
static const int h_FFT128_8_16_Twiddle[128] = { |
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
|
@ -160,67 +82,7 @@ static const int h_FFT256_2_128_Twiddle[128] = { |
|
|
|
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) |
|
|
|
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) |
|
|
|
|
|
|
|
|
|
|
|
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) |
|
|
|
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) |
|
|
|
|
|
|
|
#include "x11/simd_functions.cu" |
|
|
|
__device__ __forceinline__ void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int j; |
|
|
|
|
|
|
|
uint32_t R[8]; |
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
|
|
|
for(j=0; j<8; j++) { |
|
|
|
|
|
|
|
R[j] = ROTL32(A[j], r); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
|
|
|
for(j=0; j<8; j++) { |
|
|
|
|
|
|
|
D[j] = D[j] + w[j] + IF(A[j], B[j], C[j]); |
|
|
|
|
|
|
|
D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]); |
|
|
|
|
|
|
|
A[j] = R[j]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void STEP8_MAJ(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int j; |
|
|
|
|
|
|
|
uint32_t R[8]; |
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
|
|
|
for(j=0; j<8; j++) { |
|
|
|
|
|
|
|
R[j] = ROTL32(A[j], r); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
|
|
|
for(j=0; j<8; j++) { |
|
|
|
|
|
|
|
D[j] = D[j] + w[j] + MAJ(A[j], B[j], C[j]); |
|
|
|
|
|
|
|
D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]); |
|
|
|
|
|
|
|
A[j] = R[j]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void Round8(uint32_t A[32], const int y[256], int i, |
|
|
|
|
|
|
|
int r, int s, int t, int u) { |
|
|
|
|
|
|
|
int code = i<2? 185: 233; |
|
|
|
|
|
|
|
uint32_t w[8][8]; |
|
|
|
|
|
|
|
int a, b; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* |
|
|
|
|
|
|
|
* The FFT output y is in revbin permuted order, |
|
|
|
|
|
|
|
* but this is included in the tables P and Q |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
|
|
|
for(a=0; a<8; a++) |
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
|
|
|
for(b=0; b<8; b++) |
|
|
|
|
|
|
|
w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
STEP8_IF(w[0], 8*i+0, r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
|
|
|
|
STEP8_IF(w[1], 8*i+1, s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
|
|
|
|
STEP8_IF(w[2], 8*i+2, t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
|
|
|
|
STEP8_IF(w[3], 8*i+3, u, r, &A[8], &A[16], &A[24], A); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
STEP8_MAJ(w[4], 8*i+4, r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
|
|
|
|
STEP8_MAJ(w[5], 8*i+5, s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
|
|
|
|
STEP8_MAJ(w[6], 8*i+6, t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
|
|
|
|
STEP8_MAJ(w[7], 8*i+7, u, r, &A[8], &A[16], &A[24], A); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/********************* Message expansion ************************/ |
|
|
|
/********************* Message expansion ************************/ |
|
|
|
|
|
|
|
|
|
|
@ -305,7 +167,7 @@ X(j) = (u-v) << (2*n); \ |
|
|
|
#undef BUTTERFLY |
|
|
|
#undef BUTTERFLY |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void FFT_16(int *y, int stripe) { |
|
|
|
__device__ __forceinline__ void FFT_16(int *y) { |
|
|
|
|
|
|
|
|
|
|
|
/* |
|
|
|
/* |
|
|
|
* FFT_16 using w=2 as 16th root of unity |
|
|
|
* FFT_16 using w=2 as 16th root of unity |
|
|
@ -313,115 +175,124 @@ __device__ __forceinline__ void FFT_16(int *y, int stripe) { |
|
|
|
* Output data is in revbin_permuted order. |
|
|
|
* Output data is in revbin_permuted order. |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
|
|
#define X(i) y[stripe*i] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define DO_REDUCE(i) \ |
|
|
|
|
|
|
|
X(i) = REDUCE(X(i)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define DO_REDUCE_FULL_S(i) \ |
|
|
|
#define DO_REDUCE_FULL_S(i) \ |
|
|
|
do { \ |
|
|
|
do { \ |
|
|
|
X(i) = REDUCE(X(i)); \ |
|
|
|
y[i] = REDUCE(y[i]); \ |
|
|
|
X(i) = EXTRA_REDUCE_S(X(i)); \ |
|
|
|
y[i] = EXTRA_REDUCE_S(y[i]); \ |
|
|
|
} while(0) |
|
|
|
} while(0) |
|
|
|
|
|
|
|
|
|
|
|
#define BUTTERFLY(i,j,n) \ |
|
|
|
int u,v; |
|
|
|
do { \ |
|
|
|
|
|
|
|
int u= X(i); \ |
|
|
|
|
|
|
|
int v= X(j); \ |
|
|
|
|
|
|
|
X(i) = u+v; \ |
|
|
|
|
|
|
|
X(j) = (u-v) << n; \ |
|
|
|
|
|
|
|
} while(0) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
BUTTERFLY(0, 8, 0); |
|
|
|
// BUTTERFLY(0, 8, 0); |
|
|
|
BUTTERFLY(1, 9, 1); |
|
|
|
// BUTTERFLY(1, 9, 1); |
|
|
|
BUTTERFLY(2, 10, 2); |
|
|
|
// BUTTERFLY(2, 10, 2); |
|
|
|
BUTTERFLY(3, 11, 3); |
|
|
|
// BUTTERFLY(3, 11, 3); |
|
|
|
BUTTERFLY(4, 12, 4); |
|
|
|
// BUTTERFLY(4, 12, 4); |
|
|
|
BUTTERFLY(5, 13, 5); |
|
|
|
// BUTTERFLY(5, 13, 5); |
|
|
|
BUTTERFLY(6, 14, 6); |
|
|
|
// BUTTERFLY(6, 14, 6); |
|
|
|
BUTTERFLY(7, 15, 7); |
|
|
|
// BUTTERFLY(7, 15, 7); |
|
|
|
|
|
|
|
{ |
|
|
|
DO_REDUCE(11); |
|
|
|
u= y[0]; // 0..7 |
|
|
|
DO_REDUCE(12); |
|
|
|
v= y[1]; // 8..15 |
|
|
|
DO_REDUCE(13); |
|
|
|
y[0] = u+v; |
|
|
|
DO_REDUCE(14); |
|
|
|
y[1] = (u-v) << (threadIdx.x&7); |
|
|
|
DO_REDUCE(15); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
BUTTERFLY( 0, 4, 0); |
|
|
|
// DO_REDUCE(11); |
|
|
|
BUTTERFLY( 1, 5, 2); |
|
|
|
// DO_REDUCE(12); |
|
|
|
BUTTERFLY( 2, 6, 4); |
|
|
|
// DO_REDUCE(13); |
|
|
|
BUTTERFLY( 3, 7, 6); |
|
|
|
// DO_REDUCE(14); |
|
|
|
|
|
|
|
// DO_REDUCE(15); |
|
|
|
BUTTERFLY( 8, 12, 0); |
|
|
|
if ((threadIdx.x&7) >=3) y[1] = REDUCE(y[1]); // 11...15 |
|
|
|
BUTTERFLY( 9, 13, 2); |
|
|
|
|
|
|
|
BUTTERFLY(10, 14, 4); |
|
|
|
// BUTTERFLY( 0, 4, 0); |
|
|
|
BUTTERFLY(11, 15, 6); |
|
|
|
// BUTTERFLY( 1, 5, 2); |
|
|
|
|
|
|
|
// BUTTERFLY( 2, 6, 4); |
|
|
|
DO_REDUCE(5); |
|
|
|
// BUTTERFLY( 3, 7, 6); |
|
|
|
DO_REDUCE(7); |
|
|
|
{ |
|
|
|
DO_REDUCE(13); |
|
|
|
u= __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 |
|
|
|
DO_REDUCE(15); |
|
|
|
v= __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 |
|
|
|
|
|
|
|
y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); |
|
|
|
BUTTERFLY( 0, 2, 0); |
|
|
|
} |
|
|
|
BUTTERFLY( 1, 3, 4); |
|
|
|
|
|
|
|
BUTTERFLY( 4, 6, 0); |
|
|
|
// BUTTERFLY( 8, 12, 0); |
|
|
|
BUTTERFLY( 5, 7, 4); |
|
|
|
// BUTTERFLY( 9, 13, 2); |
|
|
|
|
|
|
|
// BUTTERFLY(10, 14, 4); |
|
|
|
BUTTERFLY( 8, 10, 0); |
|
|
|
// BUTTERFLY(11, 15, 6); |
|
|
|
BUTTERFLY(12, 14, 0); |
|
|
|
{ |
|
|
|
BUTTERFLY( 9, 11, 4); |
|
|
|
u= __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 |
|
|
|
BUTTERFLY(13, 15, 4); |
|
|
|
v= __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 |
|
|
|
|
|
|
|
y[1] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); |
|
|
|
|
|
|
|
} |
|
|
|
BUTTERFLY( 0, 1, 0); |
|
|
|
|
|
|
|
BUTTERFLY( 2, 3, 0); |
|
|
|
// DO_REDUCE(5); |
|
|
|
BUTTERFLY( 4, 5, 0); |
|
|
|
// DO_REDUCE(7); |
|
|
|
BUTTERFLY( 6, 7, 0); |
|
|
|
// DO_REDUCE(13); |
|
|
|
|
|
|
|
// DO_REDUCE(15); |
|
|
|
BUTTERFLY( 8, 9, 0); |
|
|
|
if ((threadIdx.x&1) && (threadIdx.x&7) >= 4) { |
|
|
|
BUTTERFLY(10, 11, 0); |
|
|
|
y[0] = REDUCE(y[0]); // 5, 7 |
|
|
|
BUTTERFLY(12, 13, 0); |
|
|
|
y[1] = REDUCE(y[1]); // 13, 15 |
|
|
|
BUTTERFLY(14, 15, 0); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
DO_REDUCE_FULL_S( 0); |
|
|
|
// BUTTERFLY( 0, 2, 0); |
|
|
|
DO_REDUCE_FULL_S( 1); |
|
|
|
// BUTTERFLY( 1, 3, 4); |
|
|
|
DO_REDUCE_FULL_S( 2); |
|
|
|
// BUTTERFLY( 4, 6, 0); |
|
|
|
DO_REDUCE_FULL_S( 3); |
|
|
|
// BUTTERFLY( 5, 7, 4); |
|
|
|
DO_REDUCE_FULL_S( 4); |
|
|
|
{ |
|
|
|
DO_REDUCE_FULL_S( 5); |
|
|
|
u= __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 |
|
|
|
DO_REDUCE_FULL_S( 6); |
|
|
|
v= __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 |
|
|
|
DO_REDUCE_FULL_S( 7); |
|
|
|
y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); |
|
|
|
DO_REDUCE_FULL_S( 8); |
|
|
|
} |
|
|
|
DO_REDUCE_FULL_S( 9); |
|
|
|
|
|
|
|
DO_REDUCE_FULL_S(10); |
|
|
|
// BUTTERFLY( 8, 10, 0); |
|
|
|
DO_REDUCE_FULL_S(11); |
|
|
|
// BUTTERFLY( 9, 11, 4); |
|
|
|
DO_REDUCE_FULL_S(12); |
|
|
|
// BUTTERFLY(12, 14, 0); |
|
|
|
DO_REDUCE_FULL_S(13); |
|
|
|
// BUTTERFLY(13, 15, 4); |
|
|
|
DO_REDUCE_FULL_S(14); |
|
|
|
{ |
|
|
|
DO_REDUCE_FULL_S(15); |
|
|
|
u= __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 |
|
|
|
|
|
|
|
v= __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 |
|
|
|
|
|
|
|
y[1] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// BUTTERFLY( 0, 1, 0); |
|
|
|
|
|
|
|
// BUTTERFLY( 2, 3, 0); |
|
|
|
|
|
|
|
// BUTTERFLY( 4, 5, 0); |
|
|
|
|
|
|
|
// BUTTERFLY( 6, 7, 0); |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
u= __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 |
|
|
|
|
|
|
|
v= __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 |
|
|
|
|
|
|
|
y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// BUTTERFLY( 8, 9, 0); |
|
|
|
|
|
|
|
// BUTTERFLY(10, 11, 0); |
|
|
|
|
|
|
|
// BUTTERFLY(12, 13, 0); |
|
|
|
|
|
|
|
// BUTTERFLY(14, 15, 0); |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
u= __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 |
|
|
|
|
|
|
|
v= __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 |
|
|
|
|
|
|
|
y[1] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
DO_REDUCE_FULL_S( 0); // 0...7 |
|
|
|
|
|
|
|
DO_REDUCE_FULL_S( 1); // 8...15 |
|
|
|
|
|
|
|
|
|
|
|
#undef X |
|
|
|
|
|
|
|
#undef DO_REDUCE |
|
|
|
|
|
|
|
#undef DO_REDUCE_FULL_S |
|
|
|
#undef DO_REDUCE_FULL_S |
|
|
|
#undef BUTTERFLY |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void FFT_128_full(int *y) { |
|
|
|
__device__ __forceinline__ void FFT_128_full(int y[128]) { |
|
|
|
int i; |
|
|
|
int i; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 16 |
|
|
|
FFT_8(y+0,2); // eight parallel FFT8's |
|
|
|
for (i=0; i<16; i++) { |
|
|
|
FFT_8(y+1,2); // eight parallel FFT8's |
|
|
|
FFT_8(y+i,16); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 128 |
|
|
|
#pragma unroll 16 |
|
|
|
for (i=0; i<128; i++) |
|
|
|
for (i=0; i<16; i++) |
|
|
|
/*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i]); |
|
|
|
/*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]); |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 8 |
|
|
|
for (i=0; i<8; i++) { |
|
|
|
for (i=0; i<8; i++) |
|
|
|
FFT_16(y+16*i,1); |
|
|
|
FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -435,116 +306,323 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) { |
|
|
|
* Output data is in revbin_permuted order. |
|
|
|
* Output data is in revbin_permuted order. |
|
|
|
* In place. |
|
|
|
* In place. |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
const int tmp = y[127]; |
|
|
|
const int tmp = y[15]; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 127 |
|
|
|
#pragma unroll 8 |
|
|
|
for (i=0; i<127; i++) |
|
|
|
for (i=0; i<8; i++) |
|
|
|
y[128+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[i]); |
|
|
|
y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); |
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
|
|
|
for (i=8; i<16; i++) |
|
|
|
|
|
|
|
y[16+i] = 0; |
|
|
|
|
|
|
|
|
|
|
|
/* handle X^255 with an additionnal butterfly */ |
|
|
|
/* handle X^255 with an additional butterfly */ |
|
|
|
y[127] = REDUCE(tmp + 1); |
|
|
|
if ((threadIdx.x&7) == 7) |
|
|
|
y[255] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); |
|
|
|
{ |
|
|
|
|
|
|
|
y[15] = REDUCE(tmp + 1); |
|
|
|
|
|
|
|
y[31] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
FFT_128_full(y); |
|
|
|
FFT_128_full(y); |
|
|
|
FFT_128_full(y+128); |
|
|
|
FFT_128_full(y+16); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void SIMD_Compress(uint32_t A[32], const int *expanded, const uint32_t *M) { |
|
|
|
/***************************************************/ |
|
|
|
|
|
|
|
|
|
|
|
uint32_t IV[4][8]; |
|
|
|
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) |
|
|
|
|
|
|
|
{ |
|
|
|
int i; |
|
|
|
int i; |
|
|
|
|
|
|
|
|
|
|
|
/* Save the chaining value for the feed-forward */ |
|
|
|
/* Message Expansion using Number Theoretical Transform similar to FFT */ |
|
|
|
|
|
|
|
int expanded[32]; |
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 4 |
|
|
|
for(i=0; i<8; i++) { |
|
|
|
for (i=0; i < 4; i++) { |
|
|
|
IV[0][i] = A[i]; |
|
|
|
expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; |
|
|
|
IV[1][i] = (&A[8])[i]; |
|
|
|
expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; |
|
|
|
IV[2][i] = (&A[16])[i]; |
|
|
|
|
|
|
|
IV[3][i] = (&A[24])[i]; |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/* XOR the message to the chaining value */ |
|
|
|
|
|
|
|
/* we can XOR word-by-word */ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
#pragma unroll 8 |
|
|
|
for(i=0; i<8; i++) { |
|
|
|
for (i=8; i < 16; i++) |
|
|
|
A[i] ^= M[i]; |
|
|
|
expanded[i] = 0; |
|
|
|
(&A[8])[i] ^= M[8+i]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Run the feistel ladders with the expanded message */ |
|
|
|
FFT_256_halfzero(expanded); |
|
|
|
{ |
|
|
|
|
|
|
|
Round8(A, expanded, 0, 3, 23, 17, 27); |
|
|
|
|
|
|
|
Round8(A, expanded, 1, 28, 19, 22, 7); |
|
|
|
|
|
|
|
Round8(A, expanded, 2, 29, 9, 15, 5); |
|
|
|
|
|
|
|
Round8(A, expanded, 3, 4, 13, 10, 25); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
STEP8_IF(IV[0], 32, 4, 13, A, &A[8], &A[16], &A[24]); |
|
|
|
|
|
|
|
STEP8_IF(IV[1], 33, 13, 10, &A[24], A, &A[8], &A[16]); |
|
|
|
|
|
|
|
STEP8_IF(IV[2], 34, 10, 25, &A[16], &A[24], A, &A[8]); |
|
|
|
|
|
|
|
STEP8_IF(IV[3], 35, 25, 4, &A[8], &A[16], &A[24], A); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// store w matrices in global memory |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#define mul_185(x) ( (x)*185 ) |
|
|
|
|
|
|
|
#define mul_233(x) ( (x)*233 ) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint4 vec0; |
|
|
|
|
|
|
|
int P, Q, P1, Q1, P2, Q2; |
|
|
|
|
|
|
|
bool even = (threadIdx.x & 1) == 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 2 6 0 4 |
|
|
|
|
|
|
|
const int perm0[8] = { 2,3,6,7,0,1,4,5 }; // TODO: das landet im lmem. doof. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
g_temp4[threadIdx.x&7] = vec0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 |
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 |
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 |
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 6 2 4 0 |
|
|
|
|
|
|
|
const int perm1[8] = { 6,7,2,3,4,5,0,1 }; // TODO: das landet im lmem. doof. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
g_temp4[8+(threadIdx.x&7)] = vec0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 |
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 7 5 3 1 |
|
|
|
|
|
|
|
const int perm2[8] = { 7,6,5,4,3,2,1,0 }; // TODO: das landet im lmem. doof. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool hi = (threadIdx.x&7)>=4; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
g_temp4[16+(threadIdx.x&7)] = vec0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 |
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 1 3 5 7 |
|
|
|
|
|
|
|
const int perm3[8] = { 1,0,3,2,5,4,7,6 }; // TODO: das landet im lmem. doof. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool lo = (threadIdx.x&7)<4; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; |
|
|
|
|
|
|
|
Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
g_temp4[24+(threadIdx.x&7)] = vec0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 |
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//{ 8, 72, 40, 104, 24, 88, 56, 120 }, { 9, 73, 41, 105, 25, 89, 57, 121 }, |
|
|
|
|
|
|
|
//{ 4, 68, 36, 100, 20, 84, 52, 116 }, { 5, 69, 37, 101, 21, 85, 53, 117 }, |
|
|
|
|
|
|
|
//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, |
|
|
|
|
|
|
|
//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int perm4[8] = { 0,1,4,5,6,7,2,3 }; // TODO: das landet im lmem. doof. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
g_temp4[32+(threadIdx.x&7)] = vec0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 |
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 |
|
|
|
|
|
|
|
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 |
|
|
|
|
|
|
|
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int perm5[8] = { 6,7,2,3,0,1,4,5 }; // TODO: das landet im lmem. doof. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
g_temp4[40+(threadIdx.x&7)] = vec0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 |
|
|
|
|
|
|
|
// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 |
|
|
|
|
|
|
|
// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 |
|
|
|
|
|
|
|
// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int perm6[8] = { 6,7,0,1,4,5,2,3 }; // TODO: das landet im lmem. doof. |
|
|
|
|
|
|
|
// sel markiert threads 2,3,4,5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int t; |
|
|
|
|
|
|
|
t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
g_temp4[48+(threadIdx.x&7)] = vec0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 |
|
|
|
|
|
|
|
// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 |
|
|
|
|
|
|
|
// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 |
|
|
|
|
|
|
|
// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const int perm7[8] = { 4,5,2,3,6,7,0,1 }; // TODO: das landet im lmem. doof. |
|
|
|
|
|
|
|
// sel markiert threads 2,3,4,5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); |
|
|
|
|
|
|
|
P = even? P1 : P2; Q = even? Q1 : Q2; |
|
|
|
|
|
|
|
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
g_temp4[56+(threadIdx.x&7)] = vec0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#undef mul_185 |
|
|
|
|
|
|
|
#undef mul_233 |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/***************************************************/ |
|
|
|
/***************************************************/ |
|
|
|
|
|
|
|
// Die Hash-Funktion |
|
|
|
|
|
|
|
__global__ void __launch_bounds__(TPB,4) |
|
|
|
|
|
|
|
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x)/8; |
|
|
|
|
|
|
|
if (thread < threads) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void SIMDHash(const uint32_t *data, uint32_t *hashval) { |
|
|
|
int hashPosition = nounce - startNounce; |
|
|
|
|
|
|
|
|
|
|
|
uint32_t A[32]; |
|
|
|
uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; |
|
|
|
int i; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t buffer[16]; |
|
|
|
// Hash einlesen und auf 8 Threads und 2 Register verteilen |
|
|
|
|
|
|
|
uint32_t Hash[2]; |
|
|
|
|
|
|
|
#pragma unroll 2 |
|
|
|
|
|
|
|
for (int i=0; i<2; i++) |
|
|
|
|
|
|
|
Hash[i] = inpHash[8*i+(threadIdx.x&7)]; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 32 |
|
|
|
// Puffer für expandierte Nachricht |
|
|
|
for (i=0; i < 32; i++) A[i] = c_IV_512[i]; |
|
|
|
uint4 *temp4 = &g_temp4[64 * hashPosition]; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 16 |
|
|
|
Expansion(Hash, temp4); |
|
|
|
for (i=0; i < 16; i++) buffer[i] = data[i]; |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/* Message Expansion using Number Theoretical Transform similar to FFT */ |
|
|
|
__global__ void __launch_bounds__(TPB,4) |
|
|
|
int expanded[256]; |
|
|
|
x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
|
|
|
|
if (thread < threads) |
|
|
|
{ |
|
|
|
{ |
|
|
|
#pragma unroll 16 |
|
|
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); |
|
|
|
for(i=0; i<64; i+=4) { |
|
|
|
|
|
|
|
expanded[i+0] = __byte_perm(buffer[i/4],0,0x4440); |
|
|
|
|
|
|
|
expanded[i+1] = __byte_perm(buffer[i/4],0,0x4441); |
|
|
|
|
|
|
|
expanded[i+2] = __byte_perm(buffer[i/4],0,0x4442); |
|
|
|
|
|
|
|
expanded[i+3] = __byte_perm(buffer[i/4],0,0x4443); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
#pragma unroll 16 |
|
|
|
|
|
|
|
for(i=64; i<128; i+=4) { |
|
|
|
|
|
|
|
expanded[i+0] = 0; |
|
|
|
|
|
|
|
expanded[i+1] = 0; |
|
|
|
|
|
|
|
expanded[i+2] = 0; |
|
|
|
|
|
|
|
expanded[i+3] = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FFT_256_halfzero(expanded); |
|
|
|
int hashPosition = nounce - startNounce; |
|
|
|
} |
|
|
|
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; |
|
|
|
|
|
|
|
|
|
|
|
/* Compression Function */ |
|
|
|
Compression1(Hash, hashPosition, g_fft4, g_state); |
|
|
|
SIMD_Compress(A, expanded, buffer); |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/* Padding Round with known input (hence the FFT can be precomputed) */ |
|
|
|
__global__ void __launch_bounds__(TPB,4) |
|
|
|
buffer[0] = 512; |
|
|
|
x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state) |
|
|
|
#pragma unroll 15 |
|
|
|
{ |
|
|
|
for (i=1; i < 16; i++) buffer[i] = 0; |
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
|
|
|
|
if (thread < threads) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); |
|
|
|
|
|
|
|
|
|
|
|
SIMD_Compress(A, c_FFT, buffer); |
|
|
|
int hashPosition = nounce - startNounce; |
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll 16 |
|
|
|
Compression2(hashPosition, g_fft4, g_state); |
|
|
|
for (i=0; i < 16; i++) |
|
|
|
} |
|
|
|
hashval[i] = A[i]; |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/***************************************************/ |
|
|
|
__global__ void __launch_bounds__(TPB,4) |
|
|
|
// Die Hash-Funktion |
|
|
|
x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, int *g_state) |
|
|
|
__global__ void x11_simd512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) |
|
|
|
|
|
|
|
{ |
|
|
|
{ |
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
if (thread < threads) |
|
|
|
if (thread < threads) |
|
|
@ -554,36 +632,60 @@ __global__ void x11_simd512_gpu_hash_64(int threads, uint32_t startNounce, uint6 |
|
|
|
int hashPosition = nounce - startNounce; |
|
|
|
int hashPosition = nounce - startNounce; |
|
|
|
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; |
|
|
|
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; |
|
|
|
|
|
|
|
|
|
|
|
SIMDHash(Hash, Hash); |
|
|
|
Final(Hash, hashPosition, g_fft4, g_state); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Setup-Funktionen |
|
|
|
// Setup-Funktionen |
|
|
|
__host__ void x11_simd512_cpu_init(int thr_id, int threads) |
|
|
|
__host__ void x11_simd512_cpu_init(int thr_id, int threads) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
cudaMalloc( &d_state[thr_id], 32*sizeof(int)*threads ); |
|
|
|
|
|
|
|
cudaMalloc( &d_temp4[thr_id], 64*sizeof(uint4)*threads ); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#if 1 |
|
|
|
|
|
|
|
// Textur für 128 Bit Zugriffe |
|
|
|
|
|
|
|
cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>(); |
|
|
|
|
|
|
|
texRef1D_128.normalized = 0; |
|
|
|
|
|
|
|
texRef1D_128.filterMode = cudaFilterModePoint; |
|
|
|
|
|
|
|
texRef1D_128.addressMode[0] = cudaAddressModeClamp; |
|
|
|
|
|
|
|
cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads); |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); |
|
|
|
cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// CH |
|
|
|
|
|
|
|
cudaMemcpyToSymbol( d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
cudaMemcpyToSymbol( d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
cudaMemcpyToSymbol( d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
cudaMemcpyToSymbol( d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// cudaFuncSetCacheConfig(x11_simd512_gpu_compress1_64, cudaFuncCachePreferL1); |
|
|
|
|
|
|
|
// cudaFuncSetCacheConfig(x11_simd512_gpu_compress2_64, cudaFuncCachePreferL1); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) |
|
|
|
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) |
|
|
|
{ |
|
|
|
{ |
|
|
|
const int threadsperblock = 256; |
|
|
|
const int threadsperblock = TPB; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Größe des dynamischen Shared Memory Bereichs |
|
|
|
|
|
|
|
size_t shared_size = 0; |
|
|
|
|
|
|
|
|
|
|
|
// berechne wie viele Thread Blocks wir brauchen |
|
|
|
// berechne wie viele Thread Blocks wir brauchen |
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
|
|
|
|
|
|
|
dim3 block(threadsperblock); |
|
|
|
dim3 block(threadsperblock); |
|
|
|
|
|
|
|
|
|
|
|
// Größe des dynamischen Shared Memory Bereichs |
|
|
|
dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8); |
|
|
|
size_t shared_size = 0; |
|
|
|
x11_simd512_gpu_expand_64<<<grid8, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); |
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// künstlich die Occupancy limitieren, um das totale Erschöpfen des Texture Cache zu vermeiden |
|
|
|
|
|
|
|
x11_simd512_gpu_compress1_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
|
|
|
|
x11_simd512_gpu_compress2_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
x11_simd512_gpu_final_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
x11_simd512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); |
|
|
|
|
|
|
|
MyStreamSynchronize(NULL, order, thr_id); |
|
|
|
MyStreamSynchronize(NULL, order, thr_id); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|