You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
111 lines
3.0 KiB
111 lines
3.0 KiB
/** |
|
* Simplified groestl512 big perm code |
|
* tpruvot - 2017 |
|
*/ |
|
|
|
/* Editor-only shim: active solely when __INTELLISENSE__ is defined. */
#if defined(__INTELLISENSE__)
/* Fake a device compilation pass before the headers below are parsed. */
#define __CUDA_ARCH__ 210
#define __CUDACC__
#include <cuda_helper.h>
#include <cuda_texture_types.h>
/* Stand-ins that simply yield one of their arguments. */
#define __byte_perm(a,b,c) (a)
#define tex1Dfetch(t, n) (n)
#endif /* __INTELLISENSE__ */
|
|
|
// todo: merge with cuda_quark_groestl512_sm20.cu (used for groestl512-80) |
|
|
|
#if __CUDA_ARCH__ < 300 || defined(_DEBUG) |
|
|
|
/* 32-bit constant helpers; only defined here when nothing else has provided them. */
#if !defined(SPH_C32)
/* Appends the U suffix to a literal and casts it to uint32_t. */
#define SPH_C32(x) ((uint32_t)(x ## U))
/* Masks a value with 0xFFFFFFFF. */
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#endif /* SPH_C32 */

/* Round-constant helpers; only defined here when nothing else has provided them.
 * j and r are the two operands supplied by the permutation loops. */
#if !defined(PC32up)
/* P, first word of a pair: j + r. */
#define PC32up(j, r) ((uint32_t)((j) + (r)))
/* P, second word of a pair: always zero. */
#define PC32dn(j, r) 0
/* Q, first word of a pair: all bits set. */
#define QC32up(j, r) 0xFFFFFFFF
/* Q, second word of a pair: r in the top byte XOR the complement of (j << 24). */
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
#endif /* PC32up */
|
|
|
/* Table lookups: each wrapper fetches entry x through tex1Dfetch from the
 * texture reference named in its body (declared outside this file). */

/* "up" tables */
#define tT0up(x) tex1Dfetch(t0up1, (x))
#define tT1up(x) tex1Dfetch(t1up1, (x))
#define tT2up(x) tex1Dfetch(t2up1, (x))
#define tT3up(x) tex1Dfetch(t3up1, (x))

/* "dn" tables */
#define tT0dn(x) tex1Dfetch(t0dn1, (x))
#define tT1dn(x) tex1Dfetch(t1dn1, (x))
#define tT2dn(x) tex1Dfetch(t2dn1, (x))
#define tT3dn(x) tex1Dfetch(t3dn1, (x))

/* Replace any earlier B32_0 with a plain mask of the low 8 bits. */
#undef B32_0
#define B32_0(x) ((x) & 0xFFu)
|
|
|
/**
 * groestl512 "P" permutation, table-lookup variant.
 *
 * Runs 14 rounds in place on the state `a`, which is accessed as 32
 * uint32_t words (indices 0..31). Every round:
 *   1. XORs PC32up(col << 4, round) into each even-indexed word; the
 *      odd-indexed words are not touched by this step.
 *   2. Builds each output word pair as the XOR of eight tT*up / tT*dn
 *      lookups, indexed by bytes picked from words at fixed offsets
 *      (wrapping modulo 32) from the pair's own position.
 *   3. Copies the freshly built words back over `a`.
 *
 * B32_1 .. B32_3 are not defined in this file; they are expected to be
 * provided by the including translation unit.
 *
 * @param a  state words, read and overwritten in place
 */
__device__ __forceinline__
static void tex_groestl512_perm_P(uint32_t *a)
{
	#pragma unroll 1
	for (int round = 0; round < 14; ++round)
	{
		uint32_t mixed[32];

		// Step 1: round constant, even-indexed words only.
		#pragma unroll 16
		for (int col = 0; col < 16; ++col)
			a[2 * col] ^= PC32up(col << 4, round);

		// Step 2: one iteration per word pair (lo, lo + 1).
		#pragma unroll 16
		for (int col = 0; col < 16; ++col)
		{
			const int lo = 2 * col;

			// Selector bytes taken from words at even offsets from lo ...
			const uint32_t u0 = B32_0(a[(lo     ) & 0x1f]);
			const uint32_t u1 = B32_1(a[(lo +  2) & 0x1f]);
			const uint32_t u2 = B32_2(a[(lo +  4) & 0x1f]);
			const uint32_t u3 = B32_3(a[(lo +  6) & 0x1f]);

			// ... and from words at odd offsets from lo.
			const uint32_t d0 = B32_0(a[(lo +  9) & 0x1f]);
			const uint32_t d1 = B32_1(a[(lo + 11) & 0x1f]);
			const uint32_t d2 = B32_2(a[(lo + 13) & 0x1f]);
			const uint32_t d3 = B32_3(a[(lo + 23) & 0x1f]);

			mixed[lo] =
				tT0up(u0) ^ tT1up(u1) ^ tT2up(u2) ^ tT3up(u3) ^
				tT0dn(d0) ^ tT1dn(d1) ^ tT2dn(d2) ^ tT3dn(d3);

			// Same selectors, with the up/dn tables swapped.
			mixed[lo + 1] =
				tT0dn(u0) ^ tT1dn(u1) ^ tT2dn(u2) ^ tT3dn(u3) ^
				tT0up(d0) ^ tT1up(d1) ^ tT2up(d2) ^ tT3up(d3);
		}

		// Step 3: commit the round result.
		#pragma unroll 32
		for (int i = 0; i < 32; ++i)
			a[i] = mixed[i];
	}
}
|
|
|
/**
 * groestl512 "Q" permutation, table-lookup variant.
 *
 * Runs 14 rounds in place on the state `a`, which is accessed as 32
 * uint32_t words (indices 0..31). Every round:
 *   1. XORs QC32up(col << 4, round) into each even-indexed word and
 *      QC32dn(col << 4, round) into the odd-indexed word that follows it.
 *   2. Builds each output word pair as the XOR of eight tT*up / tT*dn
 *      lookups, indexed by bytes picked from words at fixed offsets
 *      (wrapping modulo 32) from the pair's own position.
 *   3. Copies the freshly built words back over `a`.
 *
 * B32_1 .. B32_3 are not defined in this file; they are expected to be
 * provided by the including translation unit.
 *
 * @param a  state words, read and overwritten in place
 */
__device__ __forceinline__
static void tex_groestl512_perm_Q(uint32_t *a)
{
	#pragma unroll 1
	for (int round = 0; round < 14; ++round)
	{
		uint32_t mixed[32];

		// Step 1: round constants on both words of every pair.
		#pragma unroll 16
		for (int col = 0; col < 16; ++col)
		{
			a[2 * col]     ^= QC32up(col << 4, round);
			a[2 * col + 1] ^= QC32dn(col << 4, round);
		}

		// Step 2: one iteration per word pair (lo, lo + 1).
		#pragma unroll 16
		for (int col = 0; col < 16; ++col)
		{
			const int lo = 2 * col;

			// Selector bytes taken from words at even offsets from lo ...
			const uint32_t u0 = B32_0(a[(lo +  2) & 0x1f]);
			const uint32_t u1 = B32_1(a[(lo +  6) & 0x1f]);
			const uint32_t u2 = B32_2(a[(lo + 10) & 0x1f]);
			const uint32_t u3 = B32_3(a[(lo + 22) & 0x1f]);

			// ... and from words at odd offsets from lo.
			const uint32_t d0 = B32_0(a[(lo +  1) & 0x1f]);
			const uint32_t d1 = B32_1(a[(lo +  5) & 0x1f]);
			const uint32_t d2 = B32_2(a[(lo +  9) & 0x1f]);
			const uint32_t d3 = B32_3(a[(lo + 13) & 0x1f]);

			mixed[lo] =
				tT0up(u0) ^ tT1up(u1) ^ tT2up(u2) ^ tT3up(u3) ^
				tT0dn(d0) ^ tT1dn(d1) ^ tT2dn(d2) ^ tT3dn(d3);

			// Same selectors, with the up/dn tables swapped.
			mixed[lo + 1] =
				tT0dn(u0) ^ tT1dn(u1) ^ tT2dn(u2) ^ tT3dn(u3) ^
				tT0up(d0) ^ tT1up(d1) ^ tT2up(d2) ^ tT3up(d3);
		}

		// Step 3: commit the round result.
		#pragma unroll 32
		for (int i = 0; i < 32; ++i)
			a[i] = mixed[i];
	}
}
|
|
|
#endif
|
|
|