|
|
@ -898,7 +898,9 @@ __device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, con |
|
|
|
A[j] = R[j]; |
|
|
|
A[j] = R[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int j; |
|
|
|
int j; |
|
|
|
uint32_t temp; |
|
|
|
uint32_t temp; |
|
|
@ -928,7 +930,9 @@ __device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, con |
|
|
|
A[j] = R[j]; |
|
|
|
A[j] = R[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int j; |
|
|
|
int j; |
|
|
|
uint32_t temp; |
|
|
|
uint32_t temp; |
|
|
@ -958,7 +962,9 @@ __device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, con |
|
|
|
A[j] = R[j]; |
|
|
|
A[j] = R[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int j; |
|
|
|
int j; |
|
|
|
uint32_t temp; |
|
|
|
uint32_t temp; |
|
|
@ -988,7 +994,9 @@ __device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, cons |
|
|
|
A[j] = R[j]; |
|
|
|
A[j] = R[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int j; |
|
|
|
int j; |
|
|
|
uint32_t temp; |
|
|
|
uint32_t temp; |
|
|
@ -1018,7 +1026,9 @@ __device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, cons |
|
|
|
A[j] = R[j]; |
|
|
|
A[j] = R[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int j; |
|
|
|
int j; |
|
|
|
uint32_t temp; |
|
|
|
uint32_t temp; |
|
|
@ -1048,7 +1058,9 @@ __device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, cons |
|
|
|
A[j] = R[j]; |
|
|
|
A[j] = R[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
|
|
|
void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int j; |
|
|
|
int j; |
|
|
|
uint32_t temp; |
|
|
|
uint32_t temp; |
|
|
@ -1078,8 +1090,9 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons |
|
|
|
A[j] = R[j]; |
|
|
|
A[j] = R[j]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
static __constant__ uint32_t d_cw0[8][8]; |
|
|
|
|
|
|
|
static const uint32_t h_cw0[8][8] = { |
|
|
|
__device__ __constant__ |
|
|
|
|
|
|
|
static const uint32_t d_cw0[8][8] = { |
|
|
|
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, |
|
|
|
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, |
|
|
|
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, |
|
|
|
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, |
|
|
|
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, |
|
|
|
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, |
|
|
@ -1089,10 +1102,8 @@ static const uint32_t h_cw0[8][8] = { |
|
|
|
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, |
|
|
|
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, |
|
|
|
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 |
|
|
|
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 |
|
|
|
}; |
|
|
|
}; |
|
|
|
__device__ __forceinline__ void Round8_0_final(uint32_t *A, |
|
|
|
__device__ __forceinline__ |
|
|
|
int r, int s, int t, int u) { |
|
|
|
void Round8_0_final(uint32_t *A, int r, int s, int t, int u) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); |
|
|
@ -1102,8 +1113,9 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, |
|
|
|
STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); |
|
|
|
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); |
|
|
|
} |
|
|
|
} |
|
|
|
static __constant__ uint32_t d_cw1[8][8]; |
|
|
|
|
|
|
|
static const uint32_t h_cw1[8][8] = { |
|
|
|
__device__ __constant__ |
|
|
|
|
|
|
|
static const uint32_t d_cw1[8][8] = { |
|
|
|
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, |
|
|
|
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, |
|
|
|
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, |
|
|
|
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, |
|
|
|
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, |
|
|
|
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, |
|
|
@ -1113,10 +1125,8 @@ static const uint32_t h_cw1[8][8] = { |
|
|
|
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, |
|
|
|
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, |
|
|
|
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 |
|
|
|
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 |
|
|
|
}; |
|
|
|
}; |
|
|
|
__device__ __forceinline__ void Round8_1_final(uint32_t *A, |
|
|
|
__device__ __forceinline__ |
|
|
|
int r, int s, int t, int u) { |
|
|
|
void Round8_1_final(uint32_t *A, int r, int s, int t, int u) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); |
|
|
@ -1126,8 +1136,9 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, |
|
|
|
STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); |
|
|
|
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); |
|
|
|
} |
|
|
|
} |
|
|
|
static __constant__ uint32_t d_cw2[8][8]; |
|
|
|
|
|
|
|
static const uint32_t h_cw2[8][8] = { |
|
|
|
__device__ __constant__ |
|
|
|
|
|
|
|
static const uint32_t d_cw2[8][8] = { |
|
|
|
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, |
|
|
|
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, |
|
|
|
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, |
|
|
|
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, |
|
|
|
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, |
|
|
|
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, |
|
|
@ -1137,10 +1148,8 @@ static const uint32_t h_cw2[8][8] = { |
|
|
|
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, |
|
|
|
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, |
|
|
|
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE |
|
|
|
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE |
|
|
|
}; |
|
|
|
}; |
|
|
|
__device__ __forceinline__ void Round8_2_final(uint32_t *A, |
|
|
|
__device__ __forceinline__ |
|
|
|
int r, int s, int t, int u) { |
|
|
|
void Round8_2_final(uint32_t *A, int r, int s, int t, int u) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); |
|
|
@ -1150,8 +1159,9 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, |
|
|
|
STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); |
|
|
|
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); |
|
|
|
} |
|
|
|
} |
|
|
|
static __constant__ uint32_t d_cw3[8][8]; |
|
|
|
|
|
|
|
static const uint32_t h_cw3[8][8] = { |
|
|
|
__device__ __constant__ |
|
|
|
|
|
|
|
static const uint32_t d_cw3[8][8] = { |
|
|
|
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, |
|
|
|
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, |
|
|
|
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, |
|
|
|
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, |
|
|
|
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, |
|
|
|
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, |
|
|
@ -1161,10 +1171,8 @@ static const uint32_t h_cw3[8][8] = { |
|
|
|
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, |
|
|
|
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, |
|
|
|
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D |
|
|
|
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D |
|
|
|
}; |
|
|
|
}; |
|
|
|
__device__ __forceinline__ void Round8_3_final(uint32_t *A, |
|
|
|
__device__ __forceinline__ |
|
|
|
int r, int s, int t, int u) { |
|
|
|
void Round8_3_final(uint32_t *A, int r, int s, int t, int u) { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); |
|
|
|
STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); |
|
|
|
STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); |
|
|
|
STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); |
|
|
@ -1182,8 +1190,8 @@ __device__ __forceinline__ void Round8_3_final(uint32_t *A, |
|
|
|
#define expanded_vector(x) __ldg(&g_fft4[x]) |
|
|
|
#define expanded_vector(x) __ldg(&g_fft4[x]) |
|
|
|
#endif |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
__device__ __forceinline__ void Round8_0(uint32_t *A, const int thr_offset, |
|
|
|
__device__ __forceinline__ |
|
|
|
int r, int s, int t, int u, uint4 *g_fft4) { |
|
|
|
void Round8_0(uint32_t *A, const int thr_offset, int r, int s, int t, int u, uint4 *g_fft4) { |
|
|
|
uint32_t w[8]; |
|
|
|
uint32_t w[8]; |
|
|
|
uint4 hv1, hv2; |
|
|
|
uint4 hv1, hv2; |
|
|
|
|
|
|
|
|
|
|
|