|
|
@@ -231,6 +231,7 @@ uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t
 __device__ __forceinline__
 uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
 {
+#ifdef __CUDA_ARCH__
 	uint64_t result;
 	asm("{\n\t"
 		".reg .u64 n;\n\t"
|
|
@@ -240,24 +241,32 @@ uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
 		"}\n"
 	: "=l"(result) : "l"(a), "l"(b), "l"(c));
 	return result;
+#else
+	return ((b^c) & a) ^ c;
+#endif
 }
 
 // device asm for x17
 __device__ __forceinline__
 uint64_t sph_t64(uint64_t x)
 {
+#ifdef __CUDA_ARCH__
 	uint64_t result;
 	asm("{\n\t"
 		"and.b64 %0,%1,0xFFFFFFFFFFFFFFFF;\n\t"
 		"}\n"
 	: "=l"(result) : "l"(x));
 	return result;
+#else
+	return x;
+#endif
 }
 
 // device asm for x17
 __device__ __forceinline__
 uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
 {
+#ifdef __CUDA_ARCH__
 	uint64_t result;
 	asm("{\n\t"
 		".reg .u64 m,n;\n\t"
|
|
@@ -268,33 +277,45 @@ uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
 		"}\n"
 	: "=l"(result) : "l"(a), "l"(b), "l"(c));
 	return result;
+#else
+	return ((a | b) & c) | (a & b);
+#endif
 }
 
 // device asm for x17
 __device__ __forceinline__
 uint64_t shr_t64(uint64_t x, uint32_t n)
 {
+#ifdef __CUDA_ARCH__
 	uint64_t result;
 	asm("shr.b64 %0,%1,%2;\n\t"
 		"and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */
 	: "=l"(result) : "l"(x), "r"(n));
 	return result;
+#else
+	return x >> n;
+#endif
 }
 
 // device asm for ?
 __device__ __forceinline__
 uint64_t shl_t64(uint64_t x, uint32_t n)
 {
+#ifdef __CUDA_ARCH__
 	uint64_t result;
 	asm("shl.b64 %0,%1,%2;\n\t"
 		"and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */
 	: "=l"(result) : "l"(x), "r"(n));
 	return result;
+#else
+	return x << n;
+#endif
 }
 
 // device asm 32 for pluck
 __device__ __forceinline__
 uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) {
+#ifdef __CUDA_ARCH__
 	uint32_t result;
 	asm("{ .reg .u32 m,n,o;\n\t"
 		"and.b32 m, %1, %2;\n\t"
|
|
@@ -304,10 +325,15 @@ uint32_t andor32(uint32_t a, uint32_t b, uint32_t c) {
 		"}\n\t"
 	: "=r"(result) : "r"(a), "r"(b), "r"(c));
 	return result;
+#else
+	// unused on host...
+	return 0;
+#endif
 }
 
 __device__ __forceinline__
 uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) {
+#ifdef __CUDA_ARCH__
 	uint32_t result;
 	asm("{ .reg .u32 t1;\n\t"
 		"xor.b32 t1, %2, %3;\n\t"
|
|
@@ -315,20 +341,31 @@ uint32_t xor3b(uint32_t a, uint32_t b, uint32_t c) {
 		"}"
 	: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
 	return result;
+#else
+	return a^b^c;
+#endif
 }
 
 __device__ __forceinline__
 uint32_t shr_t32(uint32_t x,uint32_t n) {
+#ifdef __CUDA_ARCH__
 	uint32_t result;
 	asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
 	return result;
+#else
+	return x >> n;
+#endif
 }
 
 __device__ __forceinline__
 uint32_t shl_t32(uint32_t x,uint32_t n) {
+#ifdef __CUDA_ARCH__
 	uint32_t result;
 	asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n));
 	return result;
+#else
+	return x << n;
+#endif
 }
 
 #ifndef USE_ROT_ASM_OPT
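Note: unlike andor32, the fallbacks for xor3b, shr_t32 and shl_t32 (a^b^c, x >> n, x << n) are exact equivalents of their PTX paths, so these helpers remain safe to call from host code as well.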
|
|
@@ -442,24 +479,32 @@ uint64_t SWAPDWORDS(uint64_t value)
 __device__ __forceinline__
 void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) {
+#ifdef __CUDA_ARCH__
 	asm("mov.b64 {%0,%1},%2; \n\t"
 	: "=r"(lo), "=r"(hi) : "l"(x));
+#else
+	lo = _LOWORD(x);
+	hi = _HIWORD(x);
+#endif
 }
 
+static __device__ __forceinline__ uint64_t devectorize(uint2 v) { return MAKE_ULONGLONG(v.x, v.y); }
 static __device__ __forceinline__ uint2 vectorize(uint64_t v) {
 	uint2 result;
 	LOHI(result.x, result.y, v);
 	return result;
 }
 
+static __device__ __forceinline__ uint64_t devectorize(uint2 v) { return MAKE_ULONGLONG(v.x, v.y); }
 
 static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); }
 static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); }
 static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); }
 static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); }
 static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
 
 static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b)
 {
+#ifdef __CUDA_ARCH__
 	uint2 result;
 	asm("{\n\t"
 		"add.cc.u32 %0,%2,%4; \n\t"
|
|
@@ -467,11 +512,16 @@ static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b)
 		"}\n\t"
 	: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
 	return result;
+#else
+	// incorrect but unused host equiv
+	return make_uint2(a.x + b.x, a.y + b.y);
+#endif
 }
 static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; }
 
 static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b)
 {
+#ifdef __CUDA_ARCH__
 	uint2 result;
 	asm("{\n\t"
 		"sub.cc.u32 %0,%2,%4; \n\t"
|
|
@@ -479,6 +529,10 @@ static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b)
 		"}\n\t"
 	: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
 	return result;
+#else
+	// incorrect but unused host equiv
+	return make_uint2(a.x - b.x, a.y - b.y);
+#endif
 }
 
 /**
|
|
|
@@ -487,6 +541,7 @@ static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b)
 */
 static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
 {
+#ifdef __CUDA_ARCH__
 	uint2 result;
 	asm("{\n\t"
 		"mul.lo.u32 %0,%2,%4; \n\t"
|
|
@@ -496,6 +551,10 @@ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
 		"}\n\t"
 	: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
 	return result;
+#else
+	// incorrect but unused host equiv
+	return make_uint2(a.x * b.x, a.y * b.y);
+#endif
 }
 
 // uint2 ROR/ROL methods
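Note: the visible mul.lo.u32 %0,%2,%4 computes the low word of a.x * b.x; the instructions elided from this hunk presumably accumulate the cross products a.x*b.y and a.y*b.x into the high word, making the operator the low 64 bits of the product of the two uint2-encoded values. The host branch is again per-component and marked incorrect but unused; an exact host version would presumably be:

	/* hypothetical exact host equivalent of the uint2 64-bit multiply */
	static uint2 uint2_mul64_host(uint2 a, uint2 b)
	{
		uint64_t p = MAKE_ULONGLONG(a.x, a.y) * MAKE_ULONGLONG(b.x, b.y);
		return make_uint2((uint32_t)p, (uint32_t)(p >> 32));
	}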
|
|
|
|