Browse Source

whirlpool: x64 asm is very slow (30ms win32 vs 90)

master
Tanguy Pruvot 10 years ago
parent
commit
3f6ebc10cc
  1. 25
      cuda_helper.h

25
cuda_helper.h

@ -123,16 +123,26 @@ do { \
} while (0) } while (0)
/*********************************************************************/ /*********************************************************************/
#ifdef _WIN64
#define USE_XOR_ASM_OPTS 0
#else
#define USE_XOR_ASM_OPTS 1
#endif
#if USE_XOR_ASM_OPTS
// device asm for whirpool // device asm for whirpool
__device__ __forceinline__ __device__ __forceinline__
uint64_t xor1(uint64_t a, uint64_t b) uint64_t xor1(uint64_t a, uint64_t b)
{ {
uint64_t result; uint64_t result;
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a),"l"(b)); asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b));
return result; return result;
} }
#else
#define xor1(a,b) (a ^ b)
#endif
#if USE_XOR_ASM_OPTS
// device asm for whirpool // device asm for whirpool
__device__ __forceinline__ __device__ __forceinline__
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
@ -146,7 +156,11 @@ uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
: "=l"(result) : "l"(a) ,"l"(b),"l"(c)); : "=l"(result) : "l"(a) ,"l"(b),"l"(c));
return result; return result;
} }
#else
#define xor3(a,b,c) (a ^ b ^ c)
#endif
#if USE_XOR_ASM_OPTS
// device asm for whirpool // device asm for whirpool
__device__ __forceinline__ __device__ __forceinline__
uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h) uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h)
@ -161,6 +175,9 @@ uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
return result; return result;
} }
#else
#define xor8(a,b,c,d,e,f,g,h) (a^b^c^d^e^f^g^h)
#endif
// device asm for whirpool // device asm for whirpool
__device__ __forceinline__ __device__ __forceinline__
@ -189,7 +206,7 @@ uint64_t sph_t64(uint64_t x)
return result; return result;
} }
// device asm for whirpool // device asm for ?
__device__ __forceinline__ __device__ __forceinline__
uint64_t andor(uint64_t a, uint64_t b, uint64_t c) uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
{ {
@ -205,7 +222,7 @@ uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
return result; return result;
} }
// device asm for whirpool // device asm for ?
__device__ __forceinline__ __device__ __forceinline__
uint64_t shr_t64(uint64_t x, uint32_t n) uint64_t shr_t64(uint64_t x, uint32_t n)
{ {
@ -219,7 +236,7 @@ uint64_t shr_t64(uint64_t x, uint32_t n)
return result; return result;
} }
// device asm for whirpool // device asm for ?
__device__ __forceinline__ __device__ __forceinline__
uint64_t shl_t64(uint64_t x, uint32_t n) uint64_t shl_t64(uint64_t x, uint32_t n)
{ {

Loading…
Cancel
Save