forgot this file in previous commit
This commit is contained in:
parent
6c8eff98c0
commit
f22ae4ebde
86
cuda_helper.h
Normal file
86
cuda_helper.h
Normal file
@ -0,0 +1,86 @@
|
||||
#ifndef CUDA_HELPER_H
|
||||
#define CUDA_HELPER_H
|
||||
|
||||
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return __double_as_longlong(__hiloint2double(HI, LO));
|
||||
#else
|
||||
return (unsigned long long)LO | (((unsigned long long)HI) << 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Hi Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t HIWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x >> 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Hi Word in einem 64 Bit Typen ersetzen
|
||||
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
|
||||
}
|
||||
|
||||
// das Lo Word aus einem 64 Bit Typen extrahieren
|
||||
static __device__ uint32_t LOWORD(const uint64_t &x) {
|
||||
#if __CUDA_ARCH__ >= 130
|
||||
return (uint32_t)__double2loint(__longlong_as_double(x));
|
||||
#else
|
||||
return (uint32_t)(x & 0xFFFFFFFFULL);
|
||||
#endif
|
||||
}
|
||||
|
||||
// das Lo Word in einem 64 Bit Typen ersetzen
|
||||
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
|
||||
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
|
||||
}
|
||||
|
||||
// Endian Drehung für 32 Bit Typen
|
||||
static __device__ uint32_t cuda_swab32(uint32_t x)
|
||||
{
|
||||
return __byte_perm(x, x, 0x0123);
|
||||
}
|
||||
|
||||
// Endian Drehung für 64 Bit Typen
|
||||
static __device__ uint64_t cuda_swab64(uint64_t x) {
|
||||
return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x)));
|
||||
}
|
||||
|
||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset < 32) {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#else
|
||||
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
|
||||
#endif
|
||||
|
||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||
uint2 result;
|
||||
if(offset >= 32) {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
} else {
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
|
||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||
}
|
||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||
}
|
||||
#else
|
||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||
#endif
|
||||
|
||||
#endif // #ifndef CUDA_HELPER_H
|
Loading…
Reference in New Issue
Block a user