mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-25 14:04:34 +00:00
uint2: fix SM 3.0 ROR and ROL
Not sure it's the fastest way, but it works for offsets 0-63 as well as 64. Also note that the asm path for SM 3.5+ doesn't support ROR with offset 64.
This commit is contained in:
parent
2dce6733a7
commit
da2e2528a7
@ -427,32 +427,41 @@ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// uint2 method
|
// uint2 ROR/ROL methods
|
||||||
#if __CUDA_ARCH__ >= 350
|
__device__ __inline__ uint2 ROR2(const uint2 a, const int offset)
|
||||||
__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) {
|
{
|
||||||
uint2 result;
|
uint2 result;
|
||||||
|
#if __CUDA_ARCH__ > 300
|
||||||
if (offset < 32) {
|
if (offset < 32) {
|
||||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||||
}
|
} else /* if (offset < 64) */ {
|
||||||
else {
|
/* offset SHOULD BE < 64 ! */
|
||||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||||
}
|
}
|
||||||
return result;
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
__device__ __inline__ uint2 ROR2(const uint2 v, const int n) {
|
if (!offset)
|
||||||
uint2 result;
|
result = a;
|
||||||
result.x = (((v.x) >> (n)) | ((v.x) << (64 - (n))));
|
else if (offset < 32) {
|
||||||
result.y = (((v.y) >> (n)) | ((v.y) << (64 - (n))));
|
result.y = ((a.y >> offset) | (a.x << (32 - offset)));
|
||||||
|
result.x = ((a.x >> offset) | (a.y << (32 - offset)));
|
||||||
|
} else if (offset == 32) {
|
||||||
|
result.y = a.x;
|
||||||
|
result.x = a.y;
|
||||||
|
} else {
|
||||||
|
result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset)));
|
||||||
|
result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset)));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= 350
|
|
||||||
__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
|
__inline__ __device__ uint2 ROL2(const uint2 a, const int offset)
|
||||||
|
{
|
||||||
uint2 result;
|
uint2 result;
|
||||||
|
#if __CUDA_ARCH__ > 300
|
||||||
if (offset >= 32) {
|
if (offset >= 32) {
|
||||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||||
@ -461,16 +470,13 @@ __inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
|
|||||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
|
||||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
|
||||||
}
|
}
|
||||||
return result;
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
__inline__ __device__ uint2 ROL2(const uint2 v, const int n) {
|
if (!offset)
|
||||||
uint2 result;
|
result = a;
|
||||||
result.x = (((v.x) << (n)) | ((v.x) >> (64 - (n))));
|
else
|
||||||
result.y = (((v.y) << (n)) | ((v.y) >> (64 - (n))));
|
result = ROR2(a, 64 - offset);
|
||||||
|
#endif
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
#endif // #ifndef CUDA_HELPER_H
|
#endif // #ifndef CUDA_HELPER_H
|
||||||
|
Loading…
x
Reference in New Issue
Block a user