optimize x11 simd512 (+100KH/s)

change picked from tsiv repo
This commit is contained in:
Tanguy Pruvot 2014-10-25 22:12:22 +02:00
parent 1b241df5c0
commit b465fe6825

View File

@ -5,7 +5,7 @@
// //
// STEP8_IF und STEP8_MAJ beinhalten je zwei 8-fach parallele Operationen // STEP8_IF und STEP8_MAJ beinhalten je zwei 8-fach parallele Operationen
#define TPB 256 #define TPB 64
#include "cuda_helper.h" #include "cuda_helper.h"
@ -18,52 +18,58 @@ uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel // texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128; texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
__constant__ uint32_t c_IV_512[32]; __constant__ uint32_t c_perm[8][8];
const uint32_t h_IV_512[32] = { const uint32_t h_perm[8][8] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, { 2,3,6,7,0,1,4,5 },
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, { 6,7,2,3,4,5,0,1 },
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, { 7,6,5,4,3,2,1,0 },
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 { 1,0,3,2,5,4,7,6 },
{ 0,1,4,5,6,7,2,3 },
{ 6,7,2,3,0,1,4,5 },
{ 6,7,0,1,4,5,2,3 },
{ 4,5,2,3,6,7,0,1 }
}; };
__constant__ int c_FFT128_8_16_Twiddle[128]; __constant__ uint32_t c_IV_512[32];
static const int h_FFT128_8_16_Twiddle[128] = { const uint32_t h_IV_512[32] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111, 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122, };
1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62,
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114,
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79};
__constant__ int c_FFT128_8_16_Twiddle[128];
static const int h_FFT128_8_16_Twiddle[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111,
1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122,
1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62,
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114,
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79
};
__constant__ int c_FFT256_2_128_Twiddle[128]; __constant__ int c_FFT256_2_128_Twiddle[128];
static const int h_FFT256_2_128_Twiddle[128] = { static const int h_FFT256_2_128_Twiddle[128] = {
1, 41, -118, 45, 46, 87, -31, 14, 1, 41, -118, 45, 46, 87, -31, 14,
60, -110, 116, -127, -67, 80, -61, 69, 60, -110, 116, -127, -67, 80, -61, 69,
2, 82, 21, 90, 92, -83, -62, 28, 2, 82, 21, 90, 92, -83, -62, 28,
120, 37, -25, 3, 123, -97, -122, -119, 120, 37, -25, 3, 123, -97, -122, -119,
4, -93, 42, -77, -73, 91, -124, 56, 4, -93, 42, -77, -73, 91, -124, 56,
-17, 74, -50, 6, -11, 63, 13, 19, -17, 74, -50, 6, -11, 63, 13, 19,
8, 71, 84, 103, 111, -75, 9, 112, 8, 71, 84, 103, 111, -75, 9, 112,
-34, -109, -100, 12, -22, 126, 26, 38, -34, -109, -100, 12, -22, 126, 26, 38,
16, -115, -89, -51, -35, 107, 18, -33, 16, -115, -89, -51, -35, 107, 18, -33,
-68, 39, 57, 24, -44, -5, 52, 76, -68, 39, 57, 24, -44, -5, 52, 76,
32, 27, 79, -102, -70, -43, 36, -66, 32, 27, 79, -102, -70, -43, 36, -66,
121, 78, 114, 48, -88, -10, 104, -105, 121, 78, 114, 48, -88, -10, 104, -105,
64, 54, -99, 53, 117, -86, 72, 125, 64, 54, -99, 53, 117, -86, 72, 125,
-15, -101, -29, 96, 81, -20, -49, 47, -15, -101, -29, 96, 81, -20, -49, 47,
128, 108, 59, 106, -23, 85, -113, -7, 128, 108, 59, 106, -23, 85, -113, -7,
-30, 55, -58, -65, -95, -40, -98, 94}; -30, 55, -58, -65, -95, -40, -98, 94
};
#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \
((x)%7) == 1 ? 6 : \
((x)%7) == 2 ? 2 : \
((x)%7) == 3 ? 3 : \
((x)%7) == 4 ? 5 : \
((x)%7) == 5 ? 7 : \
4 )
/************* the round function ****************/ /************* the round function ****************/
@ -349,20 +355,19 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
// 2 6 0 4 // 2 6 0 4
const int perm0[8] = { 2,3,6,7,0,1,4,5 }; // TODO: das landet im lmem. doof.
P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm0[threadIdx.x&7], 8); vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8);
g_temp4[threadIdx.x&7] = vec0; g_temp4[threadIdx.x&7] = vec0;
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
@ -371,20 +376,19 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
// 6 2 4 0 // 6 2 4 0
const int perm1[8] = { 6,7,2,3,4,5,0,1 }; // TODO: das landet im lmem. doof.
P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2;
Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm1[threadIdx.x&7], 8); vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8);
g_temp4[8+(threadIdx.x&7)] = vec0; g_temp4[8+(threadIdx.x&7)] = vec0;
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
@ -393,22 +397,21 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
// 7 5 3 1 // 7 5 3 1
const int perm2[8] = { 7,6,5,4,3,2,1,0 }; // TODO: das landet im lmem. doof.
bool hi = (threadIdx.x&7)>=4; bool hi = (threadIdx.x&7)>=4;
P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm2[threadIdx.x&7], 8); vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8);
g_temp4[16+(threadIdx.x&7)] = vec0; g_temp4[16+(threadIdx.x&7)] = vec0;
// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 // 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
@ -417,22 +420,21 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 // 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
// 1 3 5 7 // 1 3 5 7
const int perm3[8] = { 1,0,3,2,5,4,7,6 }; // TODO: das landet im lmem. doof.
bool lo = (threadIdx.x&7)<4; bool lo = (threadIdx.x&7)<4;
P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2;
Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), perm3[threadIdx.x&7], 8); vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8);
g_temp4[24+(threadIdx.x&7)] = vec0; g_temp4[24+(threadIdx.x&7)] = vec0;
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 // 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
@ -445,26 +447,25 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, //{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 },
//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, //{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 },
const int perm4[8] = { 0,1,4,5,6,7,2,3 }; // TODO: das landet im lmem. doof.
bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5
P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8); Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8); P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8); Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8); P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8); Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8); P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8); Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm4[threadIdx.x&7], 8); vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8);
g_temp4[32+(threadIdx.x&7)] = vec0; g_temp4[32+(threadIdx.x&7)] = vec0;
@ -473,24 +474,23 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 // 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 // 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5
const int perm5[8] = { 6,7,2,3,0,1,4,5 }; // TODO: das landet im lmem. doof.
P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8); P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8); Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8); P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8); Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8); P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8);
Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8); Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm5[threadIdx.x&7], 8); vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8);
g_temp4[40+(threadIdx.x&7)] = vec0; g_temp4[40+(threadIdx.x&7)] = vec0;
@ -499,26 +499,25 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 // 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 // 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7
const int perm6[8] = { 6,7,0,1,4,5,2,3 }; // TODO: das landet im lmem. doof.
// sel markiert threads 2,3,4,5 // sel markiert threads 2,3,4,5
int t; int t;
t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8); t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8); t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8); t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8); t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8); t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8); t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8); t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8); t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm6[threadIdx.x&7], 8); vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8);
g_temp4[48+(threadIdx.x&7)] = vec0; g_temp4[48+(threadIdx.x&7)] = vec0;
@ -527,25 +526,24 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 // 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 // 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5
const int perm7[8] = { 4,5,2,3,6,7,0,1 }; // TODO: das landet im lmem. doof.
// sel markiert threads 2,3,4,5 // sel markiert threads 2,3,4,5
t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
P = even? P1 : P2; Q = even? Q1 : Q2; P = even? P1 : P2; Q = even? Q1 : Q2;
vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), perm7[threadIdx.x&7], 8); vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
g_temp4[56+(threadIdx.x&7)] = vec0; g_temp4[56+(threadIdx.x&7)] = vec0;
@ -637,6 +635,7 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads)
texRef1D_128.addressMode[0] = cudaAddressModeClamp; texRef1D_128.addressMode[0] = cudaAddressModeClamp;
cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads); cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads);
cudaMemcpyToSymbol( c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);