__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u)
@ -1070,16 +1074,20 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t,
@@ -1070,16 +1074,20 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u)
@ -1094,16 +1102,20 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t,
@@ -1094,16 +1102,20 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u)
@ -1118,16 +1130,20 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t,
@@ -1118,16 +1130,20 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
@ -193,7 +195,6 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
@@ -193,7 +195,6 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
{
uint32_t w[8][8];
int code = i<2? 185: 233;
int a, b;
/*
* The FFT output y is in revbin permuted order,
@ -201,9 +202,9 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
@@ -201,9 +202,9 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
@ -244,27 +245,27 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
@@ -244,27 +245,27 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
/*
 * FFT_8: in-place 8-point transform over the strided elements
 * y[0], y[stripe], ..., y[7*stripe] (see the X(i) accessor below).
 *
 * Parameters:
 *   y      - base pointer of the data; elements are read and overwritten in place.
 *   stripe - distance, in ints, between consecutive points of the transform.
 *
 * REDUCE and EXTRA_REDUCE_S are defined outside this block; their exact
 * semantics are not visible here.
 *
 * NOTE(review): this span looks like a diff excerpt in which both the old and
 * the new side of each changed line were kept. Every helper macro is defined
 * twice, and DO_REDUCE_FULL_S / BUTTERFLY are defined with two different
 * bodies (do { } while(0) versus a bare { } block). Confirm which form is
 * intended and drop the other.
 */
__device__ __forceinline__
void FFT_8(int *y, int stripe)
{
/*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
/* X(i): i-th point of the transform, located stripe*i ints past y. */
#define X(i) y[stripe*i]
/*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
#define X(i) y[stripe*i]
/* DO_REDUCE(i): replace point i with REDUCE applied to it (single pass). */
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
/* DO_REDUCE_FULL_S(i): REDUCE followed by EXTRA_REDUCE_S on point i. */
#define DO_REDUCE_FULL_S(i) do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
/* NOTE(review): second definition of DO_REDUCE_FULL_S with a bare-brace body. */
#define DO_REDUCE_FULL_S(i) { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
}
/*
 * BUTTERFLY(i,j,n): X(i) receives the sum of the two points, X(j) receives
 * their difference shifted left by 2*n bits, i.e. multiplied by 4^n
 * (w = 4 per the comment above).
 */
#define BUTTERFLY(i,j,n) do { \
int u= X(i); \
int v= X(j); \
X(i) = u+v; \
X(j) = (u-v) << (2*n); \
} while(0)
/* NOTE(review): second definition of BUTTERFLY with a bare-brace body. */
#define BUTTERFLY(i,j,n) { \
int u= X(i); \
int v= X(j); \
X(i) = u+v; \
X(j) = (u-v) << (2*n); \
}
/* First butterfly stage: pairs point k with point k+4, twiddle exponent k. */
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
/*
 * NOTE(review): the two lines below are diff hunk markers, not source code.
 * The remaining butterflies and reductions that sit between them in the real
 * file are not visible in this excerpt.
 */
@ -295,10 +296,10 @@ void FFT_8(int *y, int stripe)
@@ -295,10 +296,10 @@ void FFT_8(int *y, int stripe)
DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7);
/* Drop the local helper macros so they do not leak past this function. */
#undef X
#undef DO_REDUCE
#undef DO_REDUCE_FULL_S
#undef BUTTERFLY
#undef X
#undef DO_REDUCE
#undef DO_REDUCE_FULL_S
#undef BUTTERFLY
}
__device__ __forceinline__
@ -315,19 +316,17 @@ void FFT_16(int *y, int stripe)
@@ -315,19 +316,17 @@ void FFT_16(int *y, int stripe)
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \
do { \
#define DO_REDUCE_FULL_S(i) { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
}
#define BUTTERFLY(i,j,n) \
do { \
#define BUTTERFLY(i,j,n) { \
int u= X(i); \
int v= X(j); \
X(i) = u+v; \
X(j) = (u-v) << n; \
} while(0)
}
BUTTERFLY(0, 8, 0);
BUTTERFLY(1, 9, 1);
@ -396,10 +395,10 @@ void FFT_16(int *y, int stripe)
@@ -396,10 +395,10 @@ void FFT_16(int *y, int stripe)