Mirror of https://github.com/GOSTSec/ccminer (synced 2025-01-10 14:57:53 +00:00)

blake512: use a new SWAPDWORDS asm func (0.05ms)

Small improvement; apply it to the pentablake and heavy variants too.

Based on an sp commit (but SWAP32 is already used here for 32-bit ints).
Tanguy Pruvot 2014-11-09 01:13:28 +01:00
parent 2d98d127f8
commit a747e4ca0f
5 changed files with 26 additions and 12 deletions
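
Why this helps: rotating a 64-bit word by 32 bits simply exchanges its two 32-bit halves, so the rotate can be replaced by a register-level word swap that needs no shift instructions at all. A minimal host-side sketch of the identity (illustrative names, not code from this commit):

#include <stdint.h>
#include <stdio.h>

/* reference rotate-left, as in the pre-sm_32 fallback below */
static uint64_t rotl64_ref(uint64_t x, unsigned n) {
	return (x << n) | (x >> (64 - n));
}

/* the same operation expressed as a half-word swap */
static uint64_t swap_dwords_ref(uint64_t x) {
	return (x << 32) | (x >> 32);
}

int main(void) {
	uint64_t v = 0x0123456789abcdefULL;
	/* both lines print 89abcdef01234567 */
	printf("%016llx\n", (unsigned long long)rotl64_ref(v, 32));
	printf("%016llx\n", (unsigned long long)swap_dwords_ref(v));
	return 0;
}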

View File

@@ -36,7 +36,7 @@ extern const uint3 threadIdx;
 #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-#if __CUDA_ARCH__ < 350
+#if __CUDA_ARCH__ < 320
 // Kepler (Compute 3.0)
 #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
 #else
@@ -253,7 +253,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
 #endif
 // 64-bit ROTATE RIGHT
-#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
 /* complicated sm >= 3.5 one (accelerated by the funnel shifter), to bench */
 __device__ __forceinline__
 uint64_t ROTR64(const uint64_t value, const int offset) {
@@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
 #endif
 // 64-bit ROTATE LEFT
-#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
 __device__ __forceinline__
 uint64_t ROTL64(const uint64_t value, const int offset) {
 	uint2 result;
@@ -342,4 +342,17 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
 #endif
+
+__device__ __forceinline__
+uint64_t SWAPDWORDS(uint64_t value)
+{
+#if __CUDA_ARCH__ >= 320
+	uint2 temp;
+	asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
+	asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
+	return value;
+#else
+	return ROTL64(value, 32);
+#endif
+}
 
 #endif // #ifndef CUDA_HELPER_H
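
The inline PTX simply splits the 64-bit register pair into its two 32-bit halves and recombines them in reverse order, which ptxas can usually resolve as register renaming rather than real shifts. An equivalent plain-CUDA formulation (a sketch with a hypothetical name, not the committed code):

__device__ __forceinline__
uint64_t swapdwords_plain(uint64_t v)
{
	uint32_t lo = (uint32_t)v;          /* low 32 bits */
	uint32_t hi = (uint32_t)(v >> 32);  /* high 32 bits */
	return ((uint64_t)lo << 32) | hi;   /* halves exchanged */
}

The asm form makes the intent explicit; compilers of that era could emit an actual pair of 64-bit shifts for the portable expression above.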

View File

@@ -66,7 +66,7 @@ const uint64_t host_u512[16] =
 #define G(a,b,c,d,e) \
 	v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
-	v[d] = ROTR64( v[d] ^ v[a],32); \
+	v[d] = SWAPDWORDS( v[d] ^ v[a]); \
 	v[c] += v[d]; \
 	v[b] = ROTR64( v[b] ^ v[c],25); \
 	v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \
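
For reference, this macro is one column step of blake-512's G function, whose rotation amounts are 32, 25, 16 and 11; only the first reduces to a word swap. A standalone sketch of the whole step (hypothetical helper, not the committed macro; the 16/11 tail lies outside this hunk):

__device__ __forceinline__
void g_step(uint64_t *v, const uint64_t *m, const uint64_t *u512,
	const uint8_t *sig, int a, int b, int c, int d, int e)
{
	v[a] += (m[sig[e]] ^ u512[sig[e+1]]) + v[b];
	v[d] = SWAPDWORDS(v[d] ^ v[a]);   /* was ROTR64(v[d] ^ v[a], 32) */
	v[c] += v[d];
	v[b] = ROTR64(v[b] ^ v[c], 25);
	v[a] += (m[sig[e+1]] ^ u512[sig[e]]) + v[b];
	v[d] = ROTR64(v[d] ^ v[a], 16);
	v[c] += v[d];
	v[b] = ROTR64(v[b] ^ v[c], 11);
}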

View File

@@ -112,7 +112,7 @@ const uint64_t c_u512[16] =
 	uint32_t idx1 = c_sigma[i][x]; \
 	uint32_t idx2 = c_sigma[i][x+1]; \
 	v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
-	v[d] = ROTR64(v[d] ^ v[a], 32); \
+	v[d] = SWAPDWORDS(v[d] ^ v[a]); \
 	v[c] += v[d]; \
 	v[b] = ROTR64(v[b] ^ v[c], 25); \
 	v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \

View File

@@ -12,12 +12,14 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 #define SHL(x, n) ((x) << (n))
 #define SHR(x, n) ((x) >> (n))
-#define CONST_EXP2 q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \
-	q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + ROTL64(q[i+7], 32) + \
-	q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
-	q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
+#define CONST_EXP2 \
+	q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \
+	q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPDWORDS(q[i+7]) + \
+	q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
+	q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
-__device__ void Compression512(uint64_t *msg, uint64_t *hash)
+__device__
+void Compression512(uint64_t *msg, uint64_t *hash)
 {
 	// Compression ref. implementation
 	uint64_t tmp;
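
The same identity is applied inside this expansion macro: of all its rotation amounts, only the 32-bit one on q[i+7] collapses to a word swap; the others still need real rotates (funnel shifts on sm_32 and up). The macro written out as a function, for readability (hypothetical name, using the SHR/ROTL64/SWAPDWORDS definitions above):

__device__ __forceinline__
uint64_t const_exp2(const uint64_t *q, int i)
{
	return q[i+0]  + ROTL64(q[i+1], 5)  + q[i+2]  + ROTL64(q[i+3], 11) +
	       q[i+4]  + ROTL64(q[i+5], 27) + q[i+6]  + SWAPDWORDS(q[i+7]) +
	       q[i+8]  + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) +
	       q[i+12] + ROTL64(q[i+13], 53) +
	       (SHR(q[i+14], 1) ^ q[i+14]) + (SHR(q[i+15], 2) ^ q[i+15]);
}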

View File

@@ -7,7 +7,6 @@
 #define USE_SHUFFLE 0
-// the message with padding, for computation on the GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
@@ -51,7 +50,7 @@ const uint64_t c_u512[16] =
 	uint32_t idx1 = sigma[i][x]; \
 	uint32_t idx2 = sigma[i][x+1]; \
 	v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
-	v[d] = ROTR( v[d] ^ v[a], 32); \
+	v[d] = SWAPDWORDS(v[d] ^ v[a]); \
 	v[c] += v[d]; \
 	v[b] = ROTR( v[b] ^ v[c], 25); \
 	v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \