|
|
@@ -10,6 +10,8 @@
 #include <map>
 
 #include <cuda_runtime.h>
+#include <cuda_helper.h>
+
 #include "miner.h"
 
 #include "salsa_kernel.h"
|
|
@@ -18,6 +20,12 @@
 #define TEXWIDTH 32768
 #define THREADS_PER_WU 4  // four threads per hash
 
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#define __shfl2(var, srcLane)  __shfl_sync(0xFFFFFFFFu, var, srcLane)
+#else
+#define __shfl2 __shfl
+#endif
+
 typedef enum
 {
 	ANDERSEN,
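
Note on the hunk above: CUDA 9 replaces the implicit-mask warp shuffles with *_sync variants that take an explicit member mask (0xFFFFFFFFu selects all 32 lanes), and the __shfl2 macro lets the same source build on both toolkits. A minimal sanity check reusing the macro from the patch; the test kernel and harness are hypothetical, not part of this file:

    #include <cstdio>
    #include <cuda_runtime.h>

    #if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
    #define __shfl2(var, srcLane)  __shfl_sync(0xFFFFFFFFu, var, srcLane)
    #else
    #define __shfl2 __shfl
    #endif

    // Every lane of the warp reads lane 0's value through the wrapper.
    __global__ void broadcast_lane0(int *out)
    {
        int v = (int)threadIdx.x * 10;
        out[threadIdx.x] = __shfl2(v, 0);
    }

    int main()
    {
        int *d = NULL, h[32];
        cudaMalloc(&d, sizeof(h));
        broadcast_lane0<<<1, 32>>>(d);
        cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
        printf("lane 5 sees %d (expect 0)\n", h[5]);
        cudaFree(d);
        return 0;
    }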
|
|
@@ -57,12 +65,12 @@ static __host__ __device__ uint4& operator += (uint4& left, const uint4& right)
 	return left;
 }
 
-static __device__ uint4 __shfl(const uint4 bx, int target_thread) {
+static __device__ uint4 shfl4(const uint4 bx, int target_thread) {
 	return make_uint4(
-		__shfl((int)bx.x, target_thread),
-		__shfl((int)bx.y, target_thread),
-		__shfl((int)bx.z, target_thread),
-		__shfl((int)bx.w, target_thread)
+		__shfl2((int)bx.x, target_thread),
+		__shfl2((int)bx.y, target_thread),
+		__shfl2((int)bx.z, target_thread),
+		__shfl2((int)bx.w, target_thread)
 	);
 }
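
Note: the old overload was itself named __shfl, which collides with the CUDA 9 intrinsic; renaming it shfl4 sidesteps that and makes clear it shuffles a whole uint4 one 32-bit component at a time. A usage sketch, assuming the shfl4 and __shfl2 definitions above (the kernel itself is hypothetical):

    // Rotate each lane's uint4 down by one lane: lane i reads lane i+1,
    // wrapping at the warp boundary.
    __global__ void rotate_uint4(uint4 *data)
    {
        uint4 v = data[threadIdx.x];
        v = shfl4(v, (threadIdx.x + 1) % 32);
        data[threadIdx.x] = v;
    }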
|
|
|
|
|
|
|
|
|
|
@@ -97,8 +105,8 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)
 
 	if (SCHEME == ANDERSEN) {
 		int target_thread = (threadIdx.x + 4)%32;
-		uint4 t=b, t2=__shfl(bx, target_thread);
-		int t2_start = __shfl((int)start, target_thread) + 4;
+		uint4 t = b, t2 = shfl4(bx, target_thread);
+		int t2_start = __shfl2((int)start, target_thread) + 4;
 		bool c = (threadIdx.x & 0x4);
 		*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
 		*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);
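
Note on the ANDERSEN write path: lane i takes the bx half from lane (i+4)%32 and the c = threadIdx.x & 0x4 predicate decides which half each lane stores, so paired lanes emit two adjacent 16-byte stores. The read path below undoes the pairing with a (threadIdx.x + 28)%32 shuffle; 28 is -4 mod 32. A host-side check of that inverse (hypothetical standalone C++):

    #include <cassert>

    int main()
    {
        for (int lane = 0; lane < 32; lane++) {
            int paired = (lane + 4) % 32;    // write path source lane
            int undone = (paired + 28) % 32; // read path source lane
            assert(undone == lane);          // +28 inverts the +4 rotation
        }
        return 0;
    }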
|
|
@@ -115,7 +123,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
 
 	if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
 	if (SCHEME == ANDERSEN) {
-		int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
+		int t2_start = __shfl2((int)start, (threadIdx.x + 4)%32) + 4;
 		if (TEX_DIM > 0) { start /= 4; t2_start /= 4; }
 		bool c = (threadIdx.x & 0x4);
 		if (TEX_DIM == 0) {
|
|
@@ -129,7 +137,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
 			bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH));
 		}
 		uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
-		bx = __shfl(bx, (threadIdx.x + 28)%32);
+		bx = shfl4(bx, (threadIdx.x + 28)%32);
 	} else {
 		if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start]));
 		else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4);
|
|
@@ -149,14 +157,15 @@ void primary_order_shuffle(uint4 &b, uint4 &bx)
 	int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
 	int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
 
-	b.w = __shfl((int)b.w, x1);
-	b.z = __shfl((int)b.z, x2);
-	b.y = __shfl((int)b.y, x3);
+	b.w = __shfl2((int)b.w, x1);
+	b.z = __shfl2((int)b.z, x2);
+	b.y = __shfl2((int)b.y, x3);
 
 	uint32_t tmp = b.y; b.y = b.w; b.w = tmp;
 
-	bx.w = __shfl((int)bx.w, x1);
-	bx.z = __shfl((int)bx.z, x2);
-	bx.y = __shfl((int)bx.y, x3);
+	bx.w = __shfl2((int)bx.w, x1);
+	bx.z = __shfl2((int)bx.z, x2);
+	bx.y = __shfl2((int)bx.y, x3);
+
 	tmp = bx.y; bx.y = bx.w; bx.w = tmp;
 }
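
Note: threadIdx.x & 0x1c is the base of the lane's group of four (THREADS_PER_WU = 4), and ((threadIdx.x & 0x03)+k)&0x3 rotates the position within the group by k, so the shuffle targets x1, x2, x3 never cross work units. A host-side check (hypothetical standalone C++):

    #include <cassert>

    int main()
    {
        for (int lane = 0; lane < 32; lane++) {
            int x1 = (lane & 0x1c) + (((lane & 0x03) + 1) & 0x3);
            int x2 = (lane & 0x1c) + (((lane & 0x03) + 2) & 0x3);
            int x3 = (lane & 0x1c) + (((lane & 0x03) + 3) & 0x3);
            assert((x1 & 0x1c) == (lane & 0x1c)); // stays in the 4-lane group
            assert((x2 & 0x1c) == (lane & 0x1c));
            assert((x3 & 0x1c) == (lane & 0x1c));
        }
        return 0;
    }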
|
|
|
|
|
|
|
|
|
|
@@ -318,9 +327,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 		/* Unclear if this optimization is needed: These are ordered based
 		 * upon the dependencies needed in the later xors. Compiler should be
 		 * able to figure this out, but might as well give it a hand. */
-		x.y = __shfl((int)x.y, x3);
-		x.w = __shfl((int)x.w, x1);
-		x.z = __shfl((int)x.z, x2);
+		x.y = __shfl2((int)x.y, x3);
+		x.w = __shfl2((int)x.w, x1);
+		x.z = __shfl2((int)x.z, x2);
 
 		/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
 		 * but the register targets are rewritten here to swap x[1] and x[3] so that
|
|
@@ -333,9 +342,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
 		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
 
-		x.w = __shfl((int)x.w, x3);
-		x.y = __shfl((int)x.y, x1);
-		x.z = __shfl((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x3);
+		x.y = __shfl2((int)x.y, x1);
+		x.z = __shfl2((int)x.z, x2);
 	}
 
 	b += x;
|
|
@@ -352,18 +361,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
 		XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
 		XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
 
-		x.y = __shfl((int)x.y, x3);
-		x.w = __shfl((int)x.w, x1);
-		x.z = __shfl((int)x.z, x2);
+		x.y = __shfl2((int)x.y, x3);
+		x.w = __shfl2((int)x.w, x1);
+		x.z = __shfl2((int)x.z, x2);
 
 		XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
 		XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
 		XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
 		XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
 
-		x.w = __shfl((int)x.w, x3);
-		x.y = __shfl((int)x.y, x1);
-		x.z = __shfl((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x3);
+		x.y = __shfl2((int)x.y, x1);
+		x.z = __shfl2((int)x.z, x2);
 	}
 
 	// At the end of these iterations, the data is in primary order again.
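
Note: XOR_ROTATE_ADD(dst, s1, s2, amt) is the Salsa20 quarter-round step dst ^= rotl(s1 + s2, amt). Its definition lies outside the hunks shown here; a plausible form, for reference only:

    #define XOR_ROTATE_ADD(dst, s1, s2, amt) { \
        uint32_t tmp = (s1) + (s2); \
        (dst) ^= ((tmp << (amt)) | (tmp >> (32 - (amt)))); \
    }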
|
|
@@ -407,9 +416,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-		x.y = __shfl((int)x.y, x1);
-		x.z = __shfl((int)x.z, x2);
-		x.w = __shfl((int)x.w, x3);
+		x.y = __shfl2((int)x.y, x1);
+		x.z = __shfl2((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x3);
 
 		// Diagonal Mixing phase of chacha
 		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
|
|
@@ -417,9 +426,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-		x.y = __shfl((int)x.y, x3);
-		x.z = __shfl((int)x.z, x2);
-		x.w = __shfl((int)x.w, x1);
+		x.y = __shfl2((int)x.y, x3);
+		x.z = __shfl2((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x1);
 	}
 
 	b += x;
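
Note: CHACHA_PRIMITIVE(pt, rt, ps, amt) is the ChaCha quarter-round step pt += ps; rt = rotl(rt ^ pt, amt). The real macro is defined (and #undef'd below) outside these hunks; a plausible self-contained form:

    #define CHACHA_PRIMITIVE(pt, rt, ps, amt) { \
        (pt) += (ps); \
        uint32_t r = (rt) ^ (pt); \
        (rt) = (r << (amt)) | (r >> (32 - (amt))); \
    }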
|
|
@@ -436,9 +445,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-		x.y = __shfl((int)x.y, x1);
-		x.z = __shfl((int)x.z, x2);
-		x.w = __shfl((int)x.w, x3);
+		x.y = __shfl2((int)x.y, x1);
+		x.z = __shfl2((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x3);
 
 		// Diagonal Mixing phase of chacha
 		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
|
|
@@ -446,9 +455,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
 		CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
 		CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
 
-		x.y = __shfl((int)x.y, x3);
-		x.z = __shfl((int)x.z, x2);
-		x.w = __shfl((int)x.w, x1);
+		x.y = __shfl2((int)x.y, x3);
+		x.z = __shfl2((int)x.z, x2);
+		x.w = __shfl2((int)x.w, x1);
 	}
 
 	#undef CHACHA_PRIMITIVE
|
|
@@ -572,7 +581,7 @@ void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
 	} else load_key<ALGO>(d_odata, b, bx);
 
 	for (int i = begin; i < end; i++) {
-		int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+		int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 		uint4 t, tx; read_keys_direct<SCHEME, TEX_DIM>(t, tx, start+32*j);
 		b ^= t; bx ^= tx;
 		block_mixer<ALGO>(b, bx, x1, x2, x3);
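
Note: the shuffle source threadIdx.x & 0x1c clears the low two bits of the lane id, so all four threads of a work unit read bx.x from their group leader and agree on the same scratchpad index j; the & c_N_1 mask works because N is a power of two. The idiom in isolation (hypothetical device helper, assuming __shfl2 from above):

    // All four cooperating lanes obtain the group leader's value and
    // reduce it modulo N (n_minus_1 == N-1, N a power of two).
    __device__ int scratch_index(uint32_t bx_x, uint32_t n_minus_1)
    {
        return __shfl2((int)bx_x, (threadIdx.x & 0x1c)) & (int)n_minus_1;
    }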
|
|
@@ -604,7 +613,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 	{
 		// better divergent thread handling submitted by nVidia engineers, but
 		// supposedly this does not run with the ANDERSEN memory access scheme
-		int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+		int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 		int pos = j/LOOKUP_GAP;
 		int loop = -1;
 		uint4 t, tx;
|
|
@@ -612,7 +621,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 		int i = begin;
 		while(i < end) {
 			if (loop==-1) {
-				j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+				j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 				pos = j/LOOKUP_GAP;
 				loop = j-pos*LOOKUP_GAP;
 				read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
|
|
@@ -634,7 +643,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 		// this is my original implementation, now used with the ANDERSEN
 		// memory access scheme only.
 		for (int i = begin; i < end; i++) {
-			int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+			int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 			int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
 			uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
 			while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
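
Note: LOOKUP_GAP trades memory for recomputation: only every LOOKUP_GAP-th scratchpad entry is stored, pos = j/LOOKUP_GAP addresses the nearest stored entry at or below j, and loop = j - pos*LOOKUP_GAP block-mixer applications regenerate V[j]. A worked instance (hypothetical values, standalone C++):

    #include <cassert>

    int main()
    {
        const int LOOKUP_GAP = 2;        // hypothetical gap value
        int j = 7;
        int pos = j / LOOKUP_GAP;        // nearest stored entry: 3 (V index 6)
        int loop = j - pos * LOOKUP_GAP; // mixer steps to re-apply: 1
        assert(pos == 3 && loop == 1);
        return 0;
    }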
|
|
@@ -644,7 +653,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
 	}
 
 	//for (int i = begin; i < end; i++) {
-	//	int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
+	//	int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
 	//	int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
 	//	uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
 	//	while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
|
|
|