mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-10 14:57:53 +00:00
blake512: use a new SWAPDWORDS asm func (0.05ms)
small improvement, do it on pentablake and heavy variants too based on sp commit (but SWAP32 is already used for 32bit ints)
This commit is contained in:
parent
2d98d127f8
commit
a747e4ca0f
@ -36,7 +36,7 @@ extern const uint3 threadIdx;
|
|||||||
|
|
||||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||||
|
|
||||||
#if __CUDA_ARCH__ < 350
|
#if __CUDA_ARCH__ < 320
|
||||||
// Kepler (Compute 3.0)
|
// Kepler (Compute 3.0)
|
||||||
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||||
#else
|
#else
|
||||||
@ -253,7 +253,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 64-bit ROTATE RIGHT
|
// 64-bit ROTATE RIGHT
|
||||||
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
|
#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
|
||||||
/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
|
/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
uint64_t ROTR64(const uint64_t value, const int offset) {
|
uint64_t ROTR64(const uint64_t value, const int offset) {
|
||||||
@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 64-bit ROTATE LEFT
|
// 64-bit ROTATE LEFT
|
||||||
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
|
#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
uint64_t ROTL64(const uint64_t value, const int offset) {
|
uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||||
uint2 result;
|
uint2 result;
|
||||||
@ -342,4 +342,17 @@ uint64_t ROTL64(const uint64_t x, const int offset)
|
|||||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t SWAPDWORDS(const uint64_t value)
|
||||||
|
{
|
||||||
|
#if __CUDA_ARCH__ >= 320
|
||||||
|
uint2 temp;
|
||||||
|
asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
|
||||||
|
asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
|
||||||
|
return value;
|
||||||
|
#else
|
||||||
|
return ROTL64(value, 32);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
#endif // #ifndef CUDA_HELPER_H
|
#endif // #ifndef CUDA_HELPER_H
|
||||||
|
@ -66,7 +66,7 @@ const uint64_t host_u512[16] =
|
|||||||
|
|
||||||
#define G(a,b,c,d,e) \
|
#define G(a,b,c,d,e) \
|
||||||
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
|
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
|
||||||
v[d] = ROTR64( v[d] ^ v[a],32); \
|
v[d] = SWAPDWORDS( v[d] ^ v[a]); \
|
||||||
v[c] += v[d]; \
|
v[c] += v[d]; \
|
||||||
v[b] = ROTR64( v[b] ^ v[c],25); \
|
v[b] = ROTR64( v[b] ^ v[c],25); \
|
||||||
v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \
|
v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \
|
||||||
|
@ -112,7 +112,7 @@ const uint64_t c_u512[16] =
|
|||||||
uint32_t idx1 = c_sigma[i][x]; \
|
uint32_t idx1 = c_sigma[i][x]; \
|
||||||
uint32_t idx2 = c_sigma[i][x+1]; \
|
uint32_t idx2 = c_sigma[i][x+1]; \
|
||||||
v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
|
v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
|
||||||
v[d] = ROTR64(v[d] ^ v[a], 32); \
|
v[d] = SWAPDWORDS(v[d] ^ v[a]); \
|
||||||
v[c] += v[d]; \
|
v[c] += v[d]; \
|
||||||
v[b] = ROTR64(v[b] ^ v[c], 25); \
|
v[b] = ROTR64(v[b] ^ v[c], 25); \
|
||||||
v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \
|
v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \
|
||||||
|
@ -12,12 +12,14 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi
|
|||||||
#define SHL(x, n) ((x) << (n))
|
#define SHL(x, n) ((x) << (n))
|
||||||
#define SHR(x, n) ((x) >> (n))
|
#define SHR(x, n) ((x) >> (n))
|
||||||
|
|
||||||
#define CONST_EXP2 q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \
|
#define CONST_EXP2 \
|
||||||
q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + ROTL64(q[i+7], 32) + \
|
q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \
|
||||||
q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
|
q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPDWORDS(q[i+7]) + \
|
||||||
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
|
q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
|
||||||
|
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
|
||||||
|
|
||||||
__device__ void Compression512(uint64_t *msg, uint64_t *hash)
|
__device__
|
||||||
|
void Compression512(uint64_t *msg, uint64_t *hash)
|
||||||
{
|
{
|
||||||
// Compression ref. implementation
|
// Compression ref. implementation
|
||||||
uint64_t tmp;
|
uint64_t tmp;
|
||||||
|
@ -7,7 +7,6 @@
|
|||||||
|
|
||||||
#define USE_SHUFFLE 0
|
#define USE_SHUFFLE 0
|
||||||
|
|
||||||
// die Message it Padding zur Berechnung auf der GPU
|
|
||||||
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
|
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
|
||||||
|
|
||||||
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
|
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
|
||||||
@ -51,7 +50,7 @@ const uint64_t c_u512[16] =
|
|||||||
uint32_t idx1 = sigma[i][x]; \
|
uint32_t idx1 = sigma[i][x]; \
|
||||||
uint32_t idx2 = sigma[i][x+1]; \
|
uint32_t idx2 = sigma[i][x+1]; \
|
||||||
v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
|
v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
|
||||||
v[d] = ROTR( v[d] ^ v[a], 32); \
|
v[d] = SWAPDWORDS(v[d] ^ v[a]); \
|
||||||
v[c] += v[d]; \
|
v[c] += v[d]; \
|
||||||
v[b] = ROTR( v[b] ^ v[c], 25); \
|
v[b] = ROTR( v[b] ^ v[c], 25); \
|
||||||
v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \
|
v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \
|
||||||
|
Loading…
Reference in New Issue
Block a user