@@ -113,9 +113,135 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
         (((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
 #endif
 
-// these 64-bit rotates are accelerated by the funnel shifter on Compute 3.5 (and better)
-#if __CUDA_ARCH__ >= 350
-__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
+/*********************************************************************/
+// Macro to catch CUDA errors in CUDA runtime calls
+#define CUDA_SAFE_CALL(call) \
+do { \
+    cudaError_t err = call; \
+    if (cudaSuccess != err) { \
+        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
+                __FILE__, __LINE__, cudaGetErrorString(err)); \
+        exit(EXIT_FAILURE); \
+    } \
+} while (0)
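// Illustrative usage sketch (annotation, not part of the patch): CUDA_SAFE_CALL wraps any
// runtime call that returns a cudaError_t and aborts with file/line context on failure.
// The helper below and its buffer size are invented for the example; it assumes the
// header's own includes (cuda_runtime, stdio, stdlib, stdint).
static uint64_t *alloc_state(size_t words)
{
    uint64_t *d_state = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_state, words * sizeof(uint64_t)));
    CUDA_SAFE_CALL(cudaMemset(d_state, 0, words * sizeof(uint64_t)));
    return d_state;
}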
+
+/*********************************************************************/
+
+// device asm for whirlpool
+__device__ __forceinline__
+uint64_t xor1(uint64_t a, uint64_t b)
+{
+    uint64_t result;
+    asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a) ,"l"(b));
+    return result;
+}
+
+// device asm for whirlpool
+__device__ __forceinline__
+uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
+{
+    uint64_t result;
+    asm("{\n\t"
+        " .reg .u64 t1;\n\t"
+        "xor.b64 t1, %2, %3;\n\t"
+        "xor.b64 %0, %1, t1;\n\t"
+        "}"
+        : "=l"(result) : "l"(a) ,"l"(b),"l"(c));
+    return result;
+}
+
+// device asm for whirlpool
+__device__ __forceinline__
+uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e, uint64_t f, uint64_t g, uint64_t h)
+{
+    uint64_t result;
+    asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
+    asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
+    return result;
+}
+
+// device asm for whirlpool
+__device__ __forceinline__
+uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .u64 m,n;\n\t"
+        "xor.b64 m, %2,%3;\n\t"
+        "and.b64 n, m,%1;\n\t"
+        "xor.b64 %0, n,%3;\n\t"
+        "}\n\t"
+        : "=l"(result) : "l"(a), "l"(b), "l"(c));
+    return result;
+}
+
+// device asm for whirlpool
+__device__ __forceinline__
+uint64_t sph_t64(uint64_t x)
+{
+    uint64_t result;
+    asm("{\n\t"
+        "and.b64 %0,%1,0xFFFFFFFFFFFFFFFF;\n\t"
+        "}\n\t"
+        : "=l"(result) : "l"(x));
+    return result;
+}
+
+// device asm for whirlpool
+__device__ __forceinline__
+uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .u64 m,n,o;\n\t"
+        "and.b64 m, %1, %2;\n\t"
+        " or.b64 n, %1, %2;\n\t"
+        "and.b64 o, n, %3;\n\t"
+        " or.b64 %0, m, o ;\n\t"
+        "}\n\t"
+        : "=l"(result) : "l"(a), "l"(b), "l"(c));
+    return result;
+}
+
+// device asm for whirlpool
+__device__ __forceinline__
+uint64_t shr_t64(uint64_t x, uint32_t n)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .u64 m;\n\t"
+        "shr.b64 m,%1,%2;\n\t"
+        "and.b64 %0,m,0xFFFFFFFFFFFFFFFF;\n\t"
+        "}\n\t"
+        : "=l"(result) : "l"(x), "r"(n));
+    return result;
+}
+
+// device asm for whirlpool
+__device__ __forceinline__
+uint64_t shl_t64(uint64_t x, uint32_t n)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .u64 m;\n\t"
+        "shl.b64 m,%1,%2;\n\t"
+        "and.b64 %0,m,0xFFFFFFFFFFFFFFFF;\n\t"
+        "}\n\t"
+        : "=l"(result) : "l"(x), "r"(n));
+    return result;
+}
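// Reference sketch (annotation, not part of the patch): plain CUDA C equivalents of the
// PTX helpers above, useful for checking what each asm block computes. The _ref names
// are invented for this sketch; it assumes the header's own includes.
__device__ __forceinline__ uint64_t xor3_ref(uint64_t a, uint64_t b, uint64_t c) {
    return a ^ b ^ c;                 // xor3: three-way XOR
}
__device__ __forceinline__ uint64_t xandx_ref(uint64_t a, uint64_t b, uint64_t c) {
    return ((b ^ c) & a) ^ c;         // xandx: selects b where a is 1, c where a is 0
}
__device__ __forceinline__ uint64_t andor_ref(uint64_t a, uint64_t b, uint64_t c) {
    return (a & b) | ((a | b) & c);   // andor: bitwise majority of a, b, c
}
__device__ __forceinline__ uint64_t shr_t64_ref(uint64_t x, uint32_t n) {
    return x >> n;                    // shr_t64: the final AND with 0xFF..FF is a no-op on 64-bit values
}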
+
+// 64-bit ROTATE RIGHT
+#ifdef DJM_SM35_ROT64
+/* complicated sm >= 3.5 one (accelerated with the funnel shifter), to bench */
+__device__ __forceinline__
+uint64_t ROTR64(const uint64_t value, const int offset) {
     uint2 result;
     if(offset < 32) {
         asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
@@ -126,13 +252,32 @@ __device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
     }
     return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
+#elif __CUDA_ARCH__ >= 120
+__device__ __forceinline__
+uint64_t ROTR64(const uint64_t x, const int offset)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .b64 lhs, rhs;\n\t"
+        ".reg .u32 amt2;\n\t"
+        "shr.b64 lhs, %1, %2;\n\t"
+        "sub.u32 amt2, 64, %2;\n\t"
+        "shl.b64 rhs, %1, amt2;\n\t"
+        "add.u64 %0, lhs, rhs;\n\t"
+        "}\n\t"
+        : "=l"(result) : "l"(x), "r"(offset));
+    return result;
+}
 #else
+/* host */
 #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
 #endif
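// Equivalent formulation (annotation, not part of the patch): on sm_32+ the same
// wrap-around funnel shift is exposed as the __funnelshift_r() intrinsic, so the
// shf.r.wrap.b32 asm above can be written without PTX. The name rotr64_funnel is
// invented for this sketch.
__device__ __forceinline__ uint64_t rotr64_funnel(uint64_t value, int offset)
{
    uint32_t lo = (uint32_t)value;
    uint32_t hi = (uint32_t)(value >> 32);
    uint2 r;
    if (offset < 32) {
        r.x = __funnelshift_r(lo, hi, offset);   // low word receives bits shifted in from hi
        r.y = __funnelshift_r(hi, lo, offset);   // high word receives bits shifted in from lo
    } else {                                     // offset >= 32: halves swap roles (shift wraps mod 32)
        r.x = __funnelshift_r(hi, lo, offset);
        r.y = __funnelshift_r(lo, hi, offset);
    }
    return ((uint64_t)r.y << 32) | r.x;
}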
 
-// these 64-bit rotates are accelerated by the funnel shifter on Compute 3.5 (and better)
-#if __CUDA_ARCH__ >= 350
-__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
+// 64-bit ROTATE LEFT
+#ifdef DJM_SM35_ROT64
+/* complicated sm >= 3.5 one, to bench */
+__device__ __forceinline__
+uint64_t ROTL64(const uint64_t value, const int offset) {
     uint2 result;
     if(offset >= 32) {
         asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
@@ -143,19 +288,25 @@ __device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
     }
     return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
+#elif __CUDA_ARCH__ >= 120
+__device__ __forceinline__
+uint64_t ROTL64(const uint64_t x, const int offset)
+{
+    uint64_t result;
+    asm("{\n\t"
+        ".reg .b64 lhs, rhs;\n\t"
+        ".reg .u32 amt2;\n\t"
+        "shl.b64 lhs, %1, %2;\n\t"
+        "sub.u32 amt2, 64, %2;\n\t"
+        "shr.b64 rhs, %1, amt2;\n\t"
+        "add.u64 %0, lhs, rhs;\n\t"
+        "}\n\t"
+        : "=l"(result) : "l"(x), "r"(offset));
+    return result;
+}
 #else
+/* host */
 #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
 #endif
 
-
-// Macro to catch CUDA errors in CUDA runtime calls
-#define CUDA_SAFE_CALL(call) \
-do { \
-    cudaError_t err = call; \
-    if (cudaSuccess != err) { \
-        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
-                __FILE__, __LINE__, cudaGetErrorString(err)); \
-        exit(EXIT_FAILURE); \
-    } \
-} while (0)
 #endif // #ifndef CUDA_HELPER_H
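// Quick host-side check (annotation, not part of the patch): the fallback ROTR64/ROTL64
// macros and the PTX paths should agree; a tiny self-test of the host macros could look
// like this. The test values are arbitrary.
#include <assert.h>
#include <stdint.h>
static void rotate_selftest(void)
{
    uint64_t x = 0x0123456789abcdefULL;
    assert(ROTR64(x, 8) == 0xef0123456789abcdULL);   // low byte wraps to the top
    assert(ROTL64(x, 8) == 0x23456789abcdef01ULL);   // high byte wraps to the bottom
    assert(ROTR64(ROTL64(x, 17), 17) == x);          // the two rotations are inverses
}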