1
0
mirror of https://github.com/GOSTSec/ccminer synced 2025-01-10 23:08:02 +00:00

uint2: fix SM 3.0 ROR and ROL

Not sure it's the fastest way, but it works for offsets 0-63 as well as 64

Also note that the SM 3.5+ asm path doesn't support ROR with an offset of 64
This commit is contained in:
Tanguy Pruvot 2014-12-19 21:45:37 +01:00
parent 2dce6733a7
commit da2e2528a7

View File

@ -427,32 +427,41 @@ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
return result; return result;
} }
// uint2 method // uint2 ROR/ROL methods
#if __CUDA_ARCH__ >= 350 __device__ __inline__ uint2 ROR2(const uint2 a, const int offset)
__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) { {
uint2 result; uint2 result;
#if __CUDA_ARCH__ > 300
if (offset < 32) { if (offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
} } else /* if (offset < 64) */ {
else { /* offset SHOULD BE < 64 ! */
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
} }
return result;
}
#else #else
__device__ __inline__ uint2 ROR2(const uint2 v, const int n) { if (!offset)
uint2 result; result = a;
result.x = (((v.x) >> (n)) | ((v.x) << (64 - (n)))); else if (offset < 32) {
result.y = (((v.y) >> (n)) | ((v.y) << (64 - (n)))); result.y = ((a.y >> offset) | (a.x << (32 - offset)));
return result; result.x = ((a.x >> offset) | (a.y << (32 - offset)));
} else if (offset == 32) {
result.y = a.x;
result.x = a.y;
} else {
result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
} }
#endif #endif
return result;
}
#if __CUDA_ARCH__ >= 350
__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { __inline__ __device__ uint2 ROL2(const uint2 a, const int offset)
{
uint2 result; uint2 result;
#if __CUDA_ARCH__ > 300
if (offset >= 32) { if (offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
@ -461,16 +470,13 @@ __inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
} }
return result;
}
#else #else
__inline__ __device__ uint2 ROL2(const uint2 v, const int n) { if (!offset)
uint2 result; result = a;
result.x = (((v.x) << (n)) | ((v.x) >> (64 - (n)))); else
result.y = (((v.y) << (n)) | ((v.y) >> (64 - (n)))); result = ROR2(a, 64 - offset);
#endif
return result; return result;
} }
#endif
#endif // #ifndef CUDA_HELPER_H #endif // #ifndef CUDA_HELPER_H