You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
99 lines
4.3 KiB
99 lines
4.3 KiB
#ifndef CUDA_HELPER_H |
|
#define CUDA_HELPER_H |
|
|
|
#include <cuda_runtime.h> |
|
|
|
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI) |
|
{ |
|
#if __CUDA_ARCH__ >= 130 |
|
return __double_as_longlong(__hiloint2double(HI, LO)); |
|
#else |
|
return (unsigned long long)LO | (((unsigned long long)HI) << 32); |
|
#endif |
|
} |
|
|
|
// das Hi Word aus einem 64 Bit Typen extrahieren |
|
static __device__ uint32_t HIWORD(const uint64_t &x) { |
|
#if __CUDA_ARCH__ >= 130 |
|
return (uint32_t)__double2hiint(__longlong_as_double(x)); |
|
#else |
|
return (uint32_t)(x >> 32); |
|
#endif |
|
} |
|
|
|
// das Hi Word in einem 64 Bit Typen ersetzen |
|
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { |
|
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); |
|
} |
|
|
|
// das Lo Word aus einem 64 Bit Typen extrahieren |
|
static __device__ uint32_t LOWORD(const uint64_t &x) { |
|
#if __CUDA_ARCH__ >= 130 |
|
return (uint32_t)__double2loint(__longlong_as_double(x)); |
|
#else |
|
return (uint32_t)(x & 0xFFFFFFFFULL); |
|
#endif |
|
} |
|
|
|
// das Lo Word in einem 64 Bit Typen ersetzen |
|
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { |
|
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); |
|
} |
|
|
|
// Endian Drehung für 32 Bit Typen |
|
static __device__ uint32_t cuda_swab32(uint32_t x) |
|
{ |
|
return __byte_perm(x, x, 0x0123); |
|
} |
|
|
|
// Endian Drehung für 64 Bit Typen |
|
static __device__ uint64_t cuda_swab64(uint64_t x) { |
|
return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x))); |
|
} |
|
|
|
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt |
|
#if __CUDA_ARCH__ >= 350 |
|
__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) { |
|
uint2 result; |
|
if(offset < 32) { |
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); |
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); |
|
} else { |
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); |
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); |
|
} |
|
return __double_as_longlong(__hiloint2double(result.y, result.x)); |
|
} |
|
#else |
|
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) |
|
#endif |
|
|
|
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt |
|
#if __CUDA_ARCH__ >= 350 |
|
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { |
|
uint2 result; |
|
if(offset >= 32) { |
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); |
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); |
|
} else { |
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); |
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); |
|
} |
|
return __double_as_longlong(__hiloint2double(result.y, result.x)); |
|
} |
|
#else |
|
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) |
|
#endif |
|
|
|
// Macro to catch CUDA errors in CUDA runtime calls |
|
#define CUDA_SAFE_CALL(call) \ |
|
do { \ |
|
cudaError_t err = call; \ |
|
if (cudaSuccess != err) { \ |
|
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\ |
|
__FILE__, __LINE__, cudaGetErrorString(err) ); \ |
|
exit(EXIT_FAILURE); \ |
|
} \ |
|
} while (0) |
|
|
|
#endif // #ifndef CUDA_HELPER_H
|
|
|