From 3f6ebc10cc15521b84ef3e6adb419015ba9ca3f2 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 22 Aug 2014 00:12:13 +0200 Subject: [PATCH] whirlpool: x64 asm is very slow (30ms win32 vs 90) --- cuda_helper.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/cuda_helper.h b/cuda_helper.h index 8563737..db49b51 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -123,16 +123,26 @@ do { \ } while (0) /*********************************************************************/ +#ifdef _WIN64 +#define USE_XOR_ASM_OPTS 0 +#else +#define USE_XOR_ASM_OPTS 1 +#endif +#if USE_XOR_ASM_OPTS // device asm for whirpool __device__ __forceinline__ uint64_t xor1(uint64_t a, uint64_t b) { uint64_t result; - asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a),"l"(b)); + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); return result; } +#else +#define xor1(a,b) (a ^ b) +#endif +#if USE_XOR_ASM_OPTS // device asm for whirpool __device__ __forceinline__ uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) @@ -146,7 +156,11 @@ uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) : "=l"(result) : "l"(a) ,"l"(b),"l"(c)); return result; } +#else +#define xor3(a,b,c) (a ^ b ^ c) +#endif +#if USE_XOR_ASM_OPTS // device asm for whirpool __device__ __forceinline__ uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h) @@ -161,6 +175,9 @@ uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); return result; } +#else +#define xor8(a,b,c,d,e,f,g,h) (a^b^c^d^e^f^g^h) +#endif // device asm for whirpool __device__ __forceinline__ @@ -189,7 +206,7 @@ uint64_t sph_t64(uint64_t x) return result; } -// device asm for whirpool +// device asm for ? __device__ __forceinline__ uint64_t andor(uint64_t a, uint64_t b, uint64_t c) { @@ -205,7 +222,7 @@ uint64_t andor(uint64_t a, uint64_t b, uint64_t c) return result; } -// device asm for whirpool +// device asm for ? __device__ __forceinline__ uint64_t shr_t64(uint64_t x, uint32_t n) { @@ -219,7 +236,7 @@ uint64_t shr_t64(uint64_t x, uint32_t n) return result; } -// device asm for whirpool +// device asm for ? __device__ __forceinline__ uint64_t shl_t64(uint64_t x, uint32_t n) {