You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
112 lines
3.0 KiB
112 lines
3.0 KiB
8 years ago
|
/**
|
||
|
* Simplified groestl512 big perm code
|
||
|
* tpruvot - 2017
|
||
|
*/
|
||
|
|
||
|
#ifdef __INTELLISENSE__
|
||
|
#define __CUDA_ARCH__ 210
|
||
|
#define __CUDACC__
|
||
|
#include <cuda_helper.h>
|
||
|
#include <cuda_texture_types.h>
|
||
|
#define __byte_perm(a,b,c) (a)
|
||
|
#define tex1Dfetch(t, n) (n)
|
||
|
#endif
|
||
|
|
||
|
// todo: merge with cuda_quark_groestl512_sm20.cu (used for groestl512-80)
|
||
|
|
||
|
#if __CUDA_ARCH__ < 300 || defined(_DEBUG)
|
||
|
|
||
|
#ifndef SPH_C32
|
||
|
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||
|
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||
|
#endif
|
||
|
|
||
|
#ifndef PC32up
|
||
|
#define PC32up(j, r) ((uint32_t)((j) + (r)))
|
||
|
#define PC32dn(j, r) 0
|
||
|
#define QC32up(j, r) 0xFFFFFFFF
|
||
|
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
|
||
|
#endif
|
||
|
|
||
|
#define tT0up(x) tex1Dfetch(t0up1, x)
|
||
|
#define tT0dn(x) tex1Dfetch(t0dn1, x)
|
||
|
#define tT1up(x) tex1Dfetch(t1up1, x)
|
||
|
#define tT1dn(x) tex1Dfetch(t1dn1, x)
|
||
|
#define tT2up(x) tex1Dfetch(t2up1, x)
|
||
|
#define tT2dn(x) tex1Dfetch(t2dn1, x)
|
||
|
#define tT3up(x) tex1Dfetch(t3up1, x)
|
||
|
#define tT3dn(x) tex1Dfetch(t3dn1, x)
|
||
|
|
||
|
#undef B32_0
|
||
|
#define B32_0(x) ((x) & 0xFFu)
|
||
|
|
||
|
__device__ __forceinline__
|
||
|
static void tex_groestl512_perm_P(uint32_t *a)
|
||
|
{
|
||
|
#pragma unroll 1
|
||
|
for(int r=0; r<14; r++)
|
||
|
{
|
||
|
uint32_t t[32];
|
||
|
|
||
|
#pragma unroll 16
|
||
|
for (int k=0; k<16; k++)
|
||
|
a[(k*2)+0] ^= PC32up(k<< 4, r);
|
||
|
|
||
|
#pragma unroll 16
|
||
|
for(int k=0; k<32; k+=2)
|
||
|
{
|
||
|
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
|
||
|
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
|
||
|
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
|
||
|
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
|
||
|
|
||
|
t[k + 0] = tT0up( t0_0 ) ^ tT1up( t2_1 ) ^ tT2up( t4_2 ) ^ tT3up( t6_3 ) ^
|
||
|
tT0dn( t9_0 ) ^ tT1dn( t11_1 ) ^ tT2dn( t13_2 ) ^ tT3dn( t23_3 );
|
||
|
|
||
|
t[k + 1] = tT0dn( t0_0 ) ^ tT1dn( t2_1 ) ^ tT2dn( t4_2 ) ^ tT3dn( t6_3 ) ^
|
||
|
tT0up( t9_0 ) ^ tT1up( t11_1 ) ^ tT2up( t13_2 ) ^ tT3up( t23_3 );
|
||
|
}
|
||
|
|
||
|
#pragma unroll 32
|
||
|
for(int k=0; k<32; k++)
|
||
|
a[k] = t[k];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
__device__ __forceinline__
|
||
|
static void tex_groestl512_perm_Q(uint32_t *a)
|
||
|
{
|
||
|
#pragma unroll 1
|
||
|
for(int r=0; r<14; r++)
|
||
|
{
|
||
|
uint32_t t[32];
|
||
|
|
||
|
#pragma unroll 16
|
||
|
for (int k=0; k<16; k++) {
|
||
|
a[(k*2)+0] ^= QC32up(k<< 4, r);
|
||
|
a[(k*2)+1] ^= QC32dn(k<< 4, r);
|
||
|
}
|
||
|
|
||
|
#pragma unroll 16
|
||
|
for(int k=0; k<32; k+=2)
|
||
|
{
|
||
|
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]);
|
||
|
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]);
|
||
|
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]);
|
||
|
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
|
||
|
|
||
|
t[k + 0] = tT0up( t2_0 ) ^ tT1up( t6_1 ) ^ tT2up( t10_2 ) ^ tT3up( t22_3 ) ^
|
||
|
tT0dn( t1_0 ) ^ tT1dn( t5_1 ) ^ tT2dn( t9_2 ) ^ tT3dn( t13_3 );
|
||
|
|
||
|
t[k + 1] = tT0dn( t2_0 ) ^ tT1dn( t6_1 ) ^ tT2dn( t10_2 ) ^ tT3dn( t22_3 ) ^
|
||
|
tT0up( t1_0 ) ^ tT1up( t5_1 ) ^ tT2up( t9_2 ) ^ tT3up( t13_3 );
|
||
|
}
|
||
|
|
||
|
#pragma unroll 32
|
||
|
for(int k=0; k<32; k++)
|
||
|
a[k] = t[k];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#endif
|