From da2e2528a778856300bbebd6171aa78b0feca228 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 19 Dec 2014 21:45:37 +0100 Subject: [PATCH] uint2: fix SM 3.0 ROR and ROL Not sure its the fastest way, but it works for offsets 0-63 + 64 Also note than asm SM 3.5+ doesn't support ROR with offset 64 --- cuda_helper.h | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/cuda_helper.h b/cuda_helper.h index d1bdc32..cc84dcb 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -427,32 +427,41 @@ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) return result; } -// uint2 method -#if __CUDA_ARCH__ >= 350 -__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) { +// uint2 ROR/ROL methods +__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) +{ uint2 result; +#if __CUDA_ARCH__ > 300 if (offset < 32) { asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); - } - else { + } else /* if (offset < 64) */ { + /* offset SHOULD BE < 64 ! */ asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); } - return result; -} #else -__device__ __inline__ uint2 ROR2(const uint2 v, const int n) { - uint2 result; - result.x = (((v.x) >> (n)) | ((v.x) << (64 - (n)))); - result.y = (((v.y) >> (n)) | ((v.y) << (64 - (n)))); + if (!offset) + result = a; + else if (offset < 32) { + result.y = ((a.y >> offset) | (a.x << (32 - offset))); + result.x = ((a.x >> offset) | (a.y << (32 - offset))); + } else if (offset == 32) { + result.y = a.x; + result.x = a.y; + } else { + result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); + result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); + } +#endif return result; } -#endif -#if __CUDA_ARCH__ >= 350 -__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { + +__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) +{ uint2 result; +#if __CUDA_ARCH__ > 300 if (offset >= 32) { asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); @@ -461,16 +470,13 @@ __inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); } - return result; -} #else -__inline__ __device__ uint2 ROL2(const uint2 v, const int n) { - uint2 result; - result.x = (((v.x) << (n)) | ((v.x) >> (64 - (n)))); - result.y = (((v.y) << (n)) | ((v.y) >> (64 - (n)))); + if (!offset) + result = a; + else + result = ROR2(a, 64 - offset); +#endif return result; } -#endif - #endif // #ifndef CUDA_HELPER_H