
nanashi r10 with proper utf8

1.7.6-r10-nanashi
Tanguy Pruvot, 8 years ago
commit f262850270
22 changed files (lines changed in parentheses):
  1. Algo256/cuda_blake256.cu (623)
  2. Algo256/cuda_bmw256.cu (226)
  3. Algo256/cuda_cubehash256.cu (472)
  4. Algo256/cuda_skein256.cu (427)
  5. ccminer.cpp (576)
  6. ccminer.vcxproj (33)
  7. ccminer.vcxproj.filters (6)
  8. configure.ac (2)
  9. cuda_helper.h (30)
  10. lyra2/cuda_lyra2.cu (599)
  11. lyra2/cuda_lyra2_sm2.cuh (7)
  12. lyra2/cuda_lyra2_sm5.cuh (701)
  13. lyra2/cuda_lyra2v2.cu (606)
  14. lyra2/cuda_lyra2v2_sm3.cuh (338)
  15. lyra2/lyra2RE.cu (63)
  16. lyra2/lyra2REv2.cu (178)
  17. miner.h (4)
  18. neoscrypt/cuda_neoscrypt.cu (1778)
  19. neoscrypt/cuda_vectors.h (4)
  20. neoscrypt/neoscrypt.cpp (76)
  21. quark/cuda_quark_blake512_sp.cuh (7)
  22. util.cpp (2)

Algo256/cuda_blake256.cu (623 lines changed)

@@ -8,17 +8,28 @@ extern "C" {
}
#include "cuda_helper.h"
#include <memory.h>
static __device__ uint64_t cuda_swab32ll(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
#define UINT2(x,y) make_uint2(x,y)
__device__ __inline__ uint2 ROR8(const uint2 a)
{
uint2 result;
result.x = __byte_perm(a.y, a.x, 0x0765);
result.y = __byte_perm(a.x, a.y, 0x0765);
return result;
}
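A note on ROR8: the uint2 holds one 64-bit Keccak lane with .x as the low word and .y as the high word, so the two byte permutes above amount to a single 64-bit rotate right by 8. A minimal reference sketch (helper name hypothetical; vectorize/devectorize assumed from cuda_helper.h):
__device__ __forceinline__ uint64_t ror64_ref(uint64_t v)
{
	return (v >> 8) | (v << 56); // devectorize(ROR8(vectorize(v))) == ror64_ref(v)
}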
__constant__ static uint32_t c_data[3+1];
__constant__ static uint32_t sigma[16][16];
static uint32_t c_sigma[16][16] = {
//static __device__ uint64_t cuda_swab32ll(uint64_t x) {
// return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
//}
__constant__ static uint32_t c_data[3];
//__constant__ static uint8_t sigma[16][16];
static uint8_t c_sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
@@ -46,7 +57,7 @@ static const uint32_t c_IV256[8] = {
__device__ __constant__ static uint32_t cpu_h[8];
__device__ __constant__ static uint32_t u256[16];
//__device__ __constant__ static uint32_t u256[16];
static const uint32_t c_u256[16] = {
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
@@ -59,24 +70,22 @@ static const uint32_t c_u256[16] = {
};
#define GS2(a,b,c,d,x) { \
const uint32_t idx1 = sigma[r][x]; \
const uint32_t idx2 = sigma[r][x+1]; \
const uint8_t idx1 = sigma[r][x]; \
const uint8_t idx2 = sigma[r][x+1]; \
v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
\
v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
}
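The two __byte_perm() selectors substituted into GS2 are plain byte rotations: with 0 as the second operand only the four bytes of the first word are selectable, so 0x1032 picks bytes 2,3,0,1 (a 16-bit rotation) and 0x0321 picks bytes 1,2,3,0 (rotate right by 8). Reference sketch, names hypothetical:
__device__ __forceinline__ uint32_t rotr16_ref(uint32_t x)
{
	return (x >> 16) | (x << 16); // == __byte_perm(x, 0, 0x1032)
}
__device__ __forceinline__ uint32_t rotr8_ref(uint32_t x)
{
	return (x >> 8) | (x << 24);  // == __byte_perm(x, 0, 0x0321)
}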
//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n)))
//#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define hostGS(a,b,c,d,x) { \
const uint32_t idx1 = c_sigma[r][x]; \
const uint32_t idx2 = c_sigma[r][x+1]; \
const uint8_t idx1 = c_sigma[r][x]; \
const uint8_t idx2 = c_sigma[r][x+1]; \
v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \
v[d] = ROTR32(v[d] ^ v[a], 16); \
v[c] += v[d]; \
@@ -88,12 +97,45 @@ static const uint32_t c_u256[16] = {
v[b] = ROTR32(v[b] ^ v[c], 7); \
}
/* Second part (64-80) msg never change, store it */
__device__ __constant__ static const uint32_t c_Padding[16] = {
0, 0, 0, 0,
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640,
#define GSPREC(a,b,c,d,x,y) { \
v[a] += (m[x] ^ u256[y]) + v[b]; \
v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
v[a] += (m[y] ^ u256[x]) + v[b]; \
v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
}
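GSPREC is GS2 with the sigma lookup constant-folded away, which is why the fully unrolled rounds below pass the BLAKE-256 permutation constants directly and the constant-memory sigma table can be dropped:
// equivalence (sketch, not part of the commit): for any round r and column x,
//   GS2(a, b, c, d, x) == GSPREC(a, b, c, d, sigma[r][x], sigma[r][x+1])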
__constant__ uint64_t keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
0x8000000080008081ull, 0x8000000000008009ull,
0x000000000000008aull, 0x0000000000000088ull,
0x0000000080008009ull, 0x000000008000000aull,
0x000000008000808bull, 0x800000000000008bull,
0x8000000000008089ull, 0x8000000000008003ull,
0x8000000000008002ull, 0x8000000000000080ull,
0x000000000000800aull, 0x800000008000000aull,
0x8000000080008081ull, 0x8000000000008080ull,
0x0000000080000001ull, 0x8000000080008008ull
};
__constant__ uint2 keccak_round_constants35[24] = {
{ 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 },
{ 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 },
{ 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 },
{ 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 },
{ 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 },
{ 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 },
{ 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 },
{ 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 },
{ 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 },
{ 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 },
{ 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 },
{ 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 }
};
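The uint2 table is just the 64-bit table split into (low word, high word) pairs. A host-side sketch of the relationship (helper name hypothetical):
#include <stdint.h>
// each uint2 round constant is { low 32 bits, high 32 bits } of the 64-bit one
static void split_constants_ref(const uint64_t k64[24], uint2 k2[24])
{
	for (int i = 0; i < 24; i++)
		k2[i] = make_uint2((uint32_t)k64[i], (uint32_t)(k64[i] >> 32));
}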
__host__ __forceinline__
@@ -132,27 +174,113 @@ static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint3
hostGS(3, 4, 0x9, 0xE, 0xE);
}
for (int i = 0; i < 16; i++) {
int j = i & 7;
h[j] ^= v[i];
h[0] ^= v[0] ^ v[8];
h[1] ^= v[1] ^ v[9];
h[2] ^= v[2] ^ v[10];
h[3] ^= v[3] ^ v[11];
h[4] ^= v[4] ^ v[12];
h[5] ^= v[5] ^ v[13];
h[6] ^= v[6] ^ v[14];
h[7] ^= v[7] ^ v[15];
}
#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
static void __forceinline__ __device__ keccak_block(uint2 *s)
{
uint2 bc[5], tmpxor[5], tmp1, tmp2;
// uint2 s[25];
#pragma unroll 1
for (int i = 0; i < 24; i++)
{
#pragma unroll
for (uint32_t x = 0; x < 5; x++)
tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1);
bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1);
bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1);
bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1);
bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1);
tmp1 = s[1] ^ bc[0];
s[0] ^= bc[4];
s[1] = ROL2(s[6] ^ bc[0], 44);
s[6] = ROL2(s[9] ^ bc[3], 20);
s[9] = ROL2(s[22] ^ bc[1], 61);
s[22] = ROL2(s[14] ^ bc[3], 39);
s[14] = ROL2(s[20] ^ bc[4], 18);
s[20] = ROL2(s[2] ^ bc[1], 62);
s[2] = ROL2(s[12] ^ bc[1], 43);
s[12] = ROL2(s[13] ^ bc[2], 25);
s[13] = ROL8(s[19] ^ bc[3]);
s[19] = ROR8(s[23] ^ bc[2]);
s[23] = ROL2(s[15] ^ bc[4], 41);
s[15] = ROL2(s[4] ^ bc[3], 27);
s[4] = ROL2(s[24] ^ bc[3], 14);
s[24] = ROL2(s[21] ^ bc[0], 2);
s[21] = ROL2(s[8] ^ bc[2], 55);
s[8] = ROL2(s[16] ^ bc[0], 45);
s[16] = ROL2(s[5] ^ bc[4], 36);
s[5] = ROL2(s[3] ^ bc[2], 28);
s[3] = ROL2(s[18] ^ bc[2], 21);
s[18] = ROL2(s[17] ^ bc[1], 15);
s[17] = ROL2(s[11] ^ bc[0], 10);
s[11] = ROL2(s[7] ^ bc[1], 6);
s[7] = ROL2(s[10] ^ bc[4], 3);
s[10] = ROL2(tmp1, 1);
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
s[0] ^= keccak_round_constants35[i];
}
}
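The bitselect() rows above are the Keccak chi step in disguise. With a = s0 ^ s2, b = s0, c = s1 the macro expands to (s0 ^ s2) ^ (s1 & s2), and since s2 ^ (s1 & s2) == ~s1 & s2 that is exactly chi. Reference form (a sketch; uint2 bitwise operators assumed from cuda_helper.h):
__device__ __forceinline__ uint2 chi_ref(const uint2 s0, const uint2 s1, const uint2 s2)
{
	return s0 ^ (~s1 & s2); // s'[x] = s[x] ^ (~s[x+1] & s[x+2])
}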
__device__ __forceinline__
static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0)
//__launch_bounds__(256)
__global__
void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash)
{
uint32_t m[16];
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t h[8];
// uint32_t input[4];
const uint32_t T0 = 640;
#pragma unroll 8
for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; }
uint32_t v[16];
m[0] = block[0];
m[1] = block[1];
m[2] = block[2];
m[3] = block[3];
const uint32_t c_Padding[12] = {
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640
};
#pragma unroll
for (int i = 4; i < 16; i++) {
m[i] = c_Padding[i];
}
const uint32_t u256[16] =
{
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
0xA4093822, 0x299F31D0,
0x082EFA98, 0xEC4E6C89,
0x452821E6, 0x38D01377,
0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD,
0x3F84D5B5, 0xB5470917
};
uint32_t m[16] =
{
c_data[0], c_data[1], c_data[2], nonce,
c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3],
c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7],
c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11]
};
#pragma unroll 8
for (int i = 0; i < 8; i++)
@@ -162,86 +290,429 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3
v[9] = u256[1];
v[10] = u256[2];
v[11] = u256[3];
v[12] = u256[4] ^ T0;
v[13] = u256[5] ^ T0;
v[14] = u256[6];
v[15] = u256[7];
#pragma unroll 14
for (int r = 0; r < 14; r++) {
/* column step */
GS2(0, 4, 0x8, 0xC, 0x0);
GS2(1, 5, 0x9, 0xD, 0x2);
GS2(2, 6, 0xA, 0xE, 0x4);
GS2(3, 7, 0xB, 0xF, 0x6);
/* diagonal step */
GS2(0, 5, 0xA, 0xF, 0x8);
GS2(1, 6, 0xB, 0xC, 0xA);
GS2(2, 7, 0x8, 0xD, 0xC);
GS2(3, 4, 0x9, 0xE, 0xE);
// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
GSPREC(0, 4, 0x8, 0xC, 0, 1);
GSPREC(1, 5, 0x9, 0xD, 2, 3);
GSPREC(2, 6, 0xA, 0xE, 4, 5);
GSPREC(3, 7, 0xB, 0xF, 6, 7);
GSPREC(0, 5, 0xA, 0xF, 8, 9);
GSPREC(1, 6, 0xB, 0xC, 10, 11);
GSPREC(2, 7, 0x8, 0xD, 12, 13);
GSPREC(3, 4, 0x9, 0xE, 14, 15);
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
GSPREC(0, 4, 0x8, 0xC, 14, 10);
GSPREC(1, 5, 0x9, 0xD, 4, 8);
GSPREC(2, 6, 0xA, 0xE, 9, 15);
GSPREC(3, 7, 0xB, 0xF, 13, 6);
GSPREC(0, 5, 0xA, 0xF, 1, 12);
GSPREC(1, 6, 0xB, 0xC, 0, 2);
GSPREC(2, 7, 0x8, 0xD, 11, 7);
GSPREC(3, 4, 0x9, 0xE, 5, 3);
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
GSPREC(0, 4, 0x8, 0xC, 11, 8);
GSPREC(1, 5, 0x9, 0xD, 12, 0);
GSPREC(2, 6, 0xA, 0xE, 5, 2);
GSPREC(3, 7, 0xB, 0xF, 15, 13);
GSPREC(0, 5, 0xA, 0xF, 10, 14);
GSPREC(1, 6, 0xB, 0xC, 3, 6);
GSPREC(2, 7, 0x8, 0xD, 7, 1);
GSPREC(3, 4, 0x9, 0xE, 9, 4);
// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
GSPREC(0, 4, 0x8, 0xC, 7, 9);
GSPREC(1, 5, 0x9, 0xD, 3, 1);
GSPREC(2, 6, 0xA, 0xE, 13, 12);
GSPREC(3, 7, 0xB, 0xF, 11, 14);
GSPREC(0, 5, 0xA, 0xF, 2, 6);
GSPREC(1, 6, 0xB, 0xC, 5, 10);
GSPREC(2, 7, 0x8, 0xD, 4, 0);
GSPREC(3, 4, 0x9, 0xE, 15, 8);
// { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
GSPREC(0, 4, 0x8, 0xC, 9, 0);
GSPREC(1, 5, 0x9, 0xD, 5, 7);
GSPREC(2, 6, 0xA, 0xE, 2, 4);
GSPREC(3, 7, 0xB, 0xF, 10, 15);
GSPREC(0, 5, 0xA, 0xF, 14, 1);
GSPREC(1, 6, 0xB, 0xC, 11, 12);
GSPREC(2, 7, 0x8, 0xD, 6, 8);
GSPREC(3, 4, 0x9, 0xE, 3, 13);
// { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
GSPREC(0, 4, 0x8, 0xC, 2, 12);
GSPREC(1, 5, 0x9, 0xD, 6, 10);
GSPREC(2, 6, 0xA, 0xE, 0, 11);
GSPREC(3, 7, 0xB, 0xF, 8, 3);
GSPREC(0, 5, 0xA, 0xF, 4, 13);
GSPREC(1, 6, 0xB, 0xC, 7, 5);
GSPREC(2, 7, 0x8, 0xD, 15, 14);
GSPREC(3, 4, 0x9, 0xE, 1, 9);
// { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
GSPREC(0, 4, 0x8, 0xC, 12, 5);
GSPREC(1, 5, 0x9, 0xD, 1, 15);
GSPREC(2, 6, 0xA, 0xE, 14, 13);
GSPREC(3, 7, 0xB, 0xF, 4, 10);
GSPREC(0, 5, 0xA, 0xF, 0, 7);
GSPREC(1, 6, 0xB, 0xC, 6, 3);
GSPREC(2, 7, 0x8, 0xD, 9, 2);
GSPREC(3, 4, 0x9, 0xE, 8, 11);
// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
GSPREC(0, 4, 0x8, 0xC, 13, 11);
GSPREC(1, 5, 0x9, 0xD, 7, 14);
GSPREC(2, 6, 0xA, 0xE, 12, 1);
GSPREC(3, 7, 0xB, 0xF, 3, 9);
GSPREC(0, 5, 0xA, 0xF, 5, 0);
GSPREC(1, 6, 0xB, 0xC, 15, 4);
GSPREC(2, 7, 0x8, 0xD, 8, 6);
GSPREC(3, 4, 0x9, 0xE, 2, 10);
// { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
GSPREC(0, 4, 0x8, 0xC, 6, 15);
GSPREC(1, 5, 0x9, 0xD, 14, 9);
GSPREC(2, 6, 0xA, 0xE, 11, 3);
GSPREC(3, 7, 0xB, 0xF, 0, 8);
GSPREC(0, 5, 0xA, 0xF, 12, 2);
GSPREC(1, 6, 0xB, 0xC, 13, 7);
GSPREC(2, 7, 0x8, 0xD, 1, 4);
GSPREC(3, 4, 0x9, 0xE, 10, 5);
// { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
GSPREC(0, 4, 0x8, 0xC, 10, 2);
GSPREC(1, 5, 0x9, 0xD, 8, 4);
GSPREC(2, 6, 0xA, 0xE, 7, 6);
GSPREC(3, 7, 0xB, 0xF, 1, 5);
GSPREC(0, 5, 0xA, 0xF, 15, 11);
GSPREC(1, 6, 0xB, 0xC, 9, 14);
GSPREC(2, 7, 0x8, 0xD, 3, 12);
GSPREC(3, 4, 0x9, 0xE, 13, 0);
// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
GSPREC(0, 4, 0x8, 0xC, 0, 1);
GSPREC(1, 5, 0x9, 0xD, 2, 3);
GSPREC(2, 6, 0xA, 0xE, 4, 5);
GSPREC(3, 7, 0xB, 0xF, 6, 7);
GSPREC(0, 5, 0xA, 0xF, 8, 9);
GSPREC(1, 6, 0xB, 0xC, 10, 11);
GSPREC(2, 7, 0x8, 0xD, 12, 13);
GSPREC(3, 4, 0x9, 0xE, 14, 15);
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
GSPREC(0, 4, 0x8, 0xC, 14, 10);
GSPREC(1, 5, 0x9, 0xD, 4, 8);
GSPREC(2, 6, 0xA, 0xE, 9, 15);
GSPREC(3, 7, 0xB, 0xF, 13, 6);
GSPREC(0, 5, 0xA, 0xF, 1, 12);
GSPREC(1, 6, 0xB, 0xC, 0, 2);
GSPREC(2, 7, 0x8, 0xD, 11, 7);
GSPREC(3, 4, 0x9, 0xE, 5, 3);
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
GSPREC(0, 4, 0x8, 0xC, 11, 8);
GSPREC(1, 5, 0x9, 0xD, 12, 0);
GSPREC(2, 6, 0xA, 0xE, 5, 2);
GSPREC(3, 7, 0xB, 0xF, 15, 13);
GSPREC(0, 5, 0xA, 0xF, 10, 14);
GSPREC(1, 6, 0xB, 0xC, 3, 6);
GSPREC(2, 7, 0x8, 0xD, 7, 1);
GSPREC(3, 4, 0x9, 0xE, 9, 4);
// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
GSPREC(0, 4, 0x8, 0xC, 7, 9);
GSPREC(1, 5, 0x9, 0xD, 3, 1);
GSPREC(2, 6, 0xA, 0xE, 13, 12);
GSPREC(3, 7, 0xB, 0xF, 11, 14);
GSPREC(0, 5, 0xA, 0xF, 2, 6);
GSPREC(1, 6, 0xB, 0xC, 5, 10);
GSPREC(2, 7, 0x8, 0xD, 4, 0);
GSPREC(3, 4, 0x9, 0xE, 15, 8);
h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]);
h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]);
h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]);
h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]);
h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]);
h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]);
h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]);
h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]);
uint2 keccak_gpu_state[25] = { 0 };
keccak_gpu_state[0].x = h[0];
keccak_gpu_state[0].y = h[1];
keccak_gpu_state[1].x = h[2];
keccak_gpu_state[1].y = h[3];
keccak_gpu_state[2].x = h[4];
keccak_gpu_state[2].y = h[5];
keccak_gpu_state[3].x = h[6];
keccak_gpu_state[3].y = h[7];
keccak_gpu_state[4] = UINT2(1, 0);
keccak_gpu_state[16] = UINT2(0, 0x80000000);
keccak_block(keccak_gpu_state);
uint64_t *outputHash = (uint64_t *)Hash;
#pragma unroll 4
for (int i = 0; i<4; i++)
outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]);
}
#pragma unroll 16
for (int i = 0; i < 16; i++) {
int j = i & 7;
h[j] ^= v[i];
}
}
__global__ __launch_bounds__(256,3)
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash)
__global__ __launch_bounds__(256, 4)
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t h[8];
uint32_t input[4];
// uint32_t input[4];
const uint32_t T0 = 640;
#pragma unroll 8
for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; }
#pragma unroll
for (int i = 0; i < 8; i++) h[i] = cpu_h[i];
uint32_t v[16];
#pragma unroll
for (int i = 0; i < 3; ++i) input[i] = c_data[i];
const uint32_t c_Padding[12] = {
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640
};
const uint32_t u256[16] =
{
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
0xA4093822, 0x299F31D0,
0x082EFA98, 0xEC4E6C89,
0x452821E6, 0x38D01377,
0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD,
0x3F84D5B5, 0xB5470917
};
uint32_t m[16] =
{
c_data[0], c_data[1], c_data[2], nonce,
c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3],
c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7],
c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11]
};
input[3] = startNonce + thread;
blake256_compress2nd(h, input, 640);
#pragma unroll 8
for (int i = 0; i < 8; i++)
v[i] = h[i];
#pragma unroll
for (int i = 0; i<4; i++) {
Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1]));
}
v[8] = u256[0];
v[9] = u256[1];
v[10] = u256[2];
v[11] = u256[3];
v[12] = u256[4] ^ T0;
v[13] = u256[5] ^ T0;
v[14] = u256[6];
v[15] = u256[7];
// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
GSPREC(0, 4, 0x8, 0xC, 0, 1);
GSPREC(1, 5, 0x9, 0xD, 2, 3);
GSPREC(2, 6, 0xA, 0xE, 4, 5);
GSPREC(3, 7, 0xB, 0xF, 6, 7);
GSPREC(0, 5, 0xA, 0xF, 8, 9);
GSPREC(1, 6, 0xB, 0xC, 10, 11);
GSPREC(2, 7, 0x8, 0xD, 12, 13);
GSPREC(3, 4, 0x9, 0xE, 14, 15);
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
GSPREC(0, 4, 0x8, 0xC, 14, 10);
GSPREC(1, 5, 0x9, 0xD, 4, 8);
GSPREC(2, 6, 0xA, 0xE, 9, 15);
GSPREC(3, 7, 0xB, 0xF, 13, 6);
GSPREC(0, 5, 0xA, 0xF, 1, 12);
GSPREC(1, 6, 0xB, 0xC, 0, 2);
GSPREC(2, 7, 0x8, 0xD, 11, 7);
GSPREC(3, 4, 0x9, 0xE, 5, 3);
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
GSPREC(0, 4, 0x8, 0xC, 11, 8);
GSPREC(1, 5, 0x9, 0xD, 12, 0);
GSPREC(2, 6, 0xA, 0xE, 5, 2);
GSPREC(3, 7, 0xB, 0xF, 15, 13);
GSPREC(0, 5, 0xA, 0xF, 10, 14);
GSPREC(1, 6, 0xB, 0xC, 3, 6);
GSPREC(2, 7, 0x8, 0xD, 7, 1);
GSPREC(3, 4, 0x9, 0xE, 9, 4);
// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
GSPREC(0, 4, 0x8, 0xC, 7, 9);
GSPREC(1, 5, 0x9, 0xD, 3, 1);
GSPREC(2, 6, 0xA, 0xE, 13, 12);
GSPREC(3, 7, 0xB, 0xF, 11, 14);
GSPREC(0, 5, 0xA, 0xF, 2, 6);
GSPREC(1, 6, 0xB, 0xC, 5, 10);
GSPREC(2, 7, 0x8, 0xD, 4, 0);
GSPREC(3, 4, 0x9, 0xE, 15, 8);
// { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
GSPREC(0, 4, 0x8, 0xC, 9, 0);
GSPREC(1, 5, 0x9, 0xD, 5, 7);
GSPREC(2, 6, 0xA, 0xE, 2, 4);
GSPREC(3, 7, 0xB, 0xF, 10, 15);
GSPREC(0, 5, 0xA, 0xF, 14, 1);
GSPREC(1, 6, 0xB, 0xC, 11, 12);
GSPREC(2, 7, 0x8, 0xD, 6, 8);
GSPREC(3, 4, 0x9, 0xE, 3, 13);
// { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
GSPREC(0, 4, 0x8, 0xC, 2, 12);
GSPREC(1, 5, 0x9, 0xD, 6, 10);
GSPREC(2, 6, 0xA, 0xE, 0, 11);
GSPREC(3, 7, 0xB, 0xF, 8, 3);
GSPREC(0, 5, 0xA, 0xF, 4, 13);
GSPREC(1, 6, 0xB, 0xC, 7, 5);
GSPREC(2, 7, 0x8, 0xD, 15, 14);
GSPREC(3, 4, 0x9, 0xE, 1, 9);
// { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
GSPREC(0, 4, 0x8, 0xC, 12, 5);
GSPREC(1, 5, 0x9, 0xD, 1, 15);
GSPREC(2, 6, 0xA, 0xE, 14, 13);
GSPREC(3, 7, 0xB, 0xF, 4, 10);
GSPREC(0, 5, 0xA, 0xF, 0, 7);
GSPREC(1, 6, 0xB, 0xC, 6, 3);
GSPREC(2, 7, 0x8, 0xD, 9, 2);
GSPREC(3, 4, 0x9, 0xE, 8, 11);
// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
GSPREC(0, 4, 0x8, 0xC, 13, 11);
GSPREC(1, 5, 0x9, 0xD, 7, 14);
GSPREC(2, 6, 0xA, 0xE, 12, 1);
GSPREC(3, 7, 0xB, 0xF, 3, 9);
GSPREC(0, 5, 0xA, 0xF, 5, 0);
GSPREC(1, 6, 0xB, 0xC, 15, 4);
GSPREC(2, 7, 0x8, 0xD, 8, 6);
GSPREC(3, 4, 0x9, 0xE, 2, 10);
// { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
GSPREC(0, 4, 0x8, 0xC, 6, 15);
GSPREC(1, 5, 0x9, 0xD, 14, 9);
GSPREC(2, 6, 0xA, 0xE, 11, 3);
GSPREC(3, 7, 0xB, 0xF, 0, 8);
GSPREC(0, 5, 0xA, 0xF, 12, 2);
GSPREC(1, 6, 0xB, 0xC, 13, 7);
GSPREC(2, 7, 0x8, 0xD, 1, 4);
GSPREC(3, 4, 0x9, 0xE, 10, 5);
// { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
GSPREC(0, 4, 0x8, 0xC, 10, 2);
GSPREC(1, 5, 0x9, 0xD, 8, 4);
GSPREC(2, 6, 0xA, 0xE, 7, 6);
GSPREC(3, 7, 0xB, 0xF, 1, 5);
GSPREC(0, 5, 0xA, 0xF, 15, 11);
GSPREC(1, 6, 0xB, 0xC, 9, 14);
GSPREC(2, 7, 0x8, 0xD, 3, 12);
GSPREC(3, 4, 0x9, 0xE, 13, 0);
// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
GSPREC(0, 4, 0x8, 0xC, 0, 1);
GSPREC(1, 5, 0x9, 0xD, 2, 3);
GSPREC(2, 6, 0xA, 0xE, 4, 5);
GSPREC(3, 7, 0xB, 0xF, 6, 7);
GSPREC(0, 5, 0xA, 0xF, 8, 9);
GSPREC(1, 6, 0xB, 0xC, 10, 11);
GSPREC(2, 7, 0x8, 0xD, 12, 13);
GSPREC(3, 4, 0x9, 0xE, 14, 15);
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
GSPREC(0, 4, 0x8, 0xC, 14, 10);
GSPREC(1, 5, 0x9, 0xD, 4, 8);
GSPREC(2, 6, 0xA, 0xE, 9, 15);
GSPREC(3, 7, 0xB, 0xF, 13, 6);
GSPREC(0, 5, 0xA, 0xF, 1, 12);
GSPREC(1, 6, 0xB, 0xC, 0, 2);
GSPREC(2, 7, 0x8, 0xD, 11, 7);
GSPREC(3, 4, 0x9, 0xE, 5, 3);
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
GSPREC(0, 4, 0x8, 0xC, 11, 8);
GSPREC(1, 5, 0x9, 0xD, 12, 0);
GSPREC(2, 6, 0xA, 0xE, 5, 2);
GSPREC(3, 7, 0xB, 0xF, 15, 13);
GSPREC(0, 5, 0xA, 0xF, 10, 14);
GSPREC(1, 6, 0xB, 0xC, 3, 6);
GSPREC(2, 7, 0x8, 0xD, 7, 1);
GSPREC(3, 4, 0x9, 0xE, 9, 4);
// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
GSPREC(0, 4, 0x8, 0xC, 7, 9);
GSPREC(1, 5, 0x9, 0xD, 3, 1);
GSPREC(2, 6, 0xA, 0xE, 13, 12);
GSPREC(3, 7, 0xB, 0xF, 11, 14);
GSPREC(0, 5, 0xA, 0xF, 2, 6);
GSPREC(1, 6, 0xB, 0xC, 5, 10);
GSPREC(2, 7, 0x8, 0xD, 4, 0);
GSPREC(3, 4, 0x9, 0xE, 15, 8);
h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]);
h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]);
h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]);
h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]);
h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]);
h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]);
h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]);
h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]);
Hash[((0 * threads) + thread) * 2] = (h[0]);
Hash[((0 * threads) + thread) * 2 + 1] = (h[1]);
Hash[((1 * threads) + thread) * 2] = (h[2]);
Hash[((1 * threads) + thread) * 2 + 1] = (h[3]);
Hash[((2 * threads) + thread) * 2] = (h[4]);
Hash[((2 * threads) + thread) * 2 + 1] = (h[5]);
Hash[((3 * threads) + thread) * 2] = (h[6]);
Hash[((3 * threads) + thread) * 2 + 1] = (h[7]);
}
}
__host__
void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
{
const uint32_t threadsperblock = 256;
const uint32_t threadsperblock = 64;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
blake256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, Hash);
MyStreamSynchronize(NULL, order, thr_id);
blake256_gpu_hash_80 << <grid, block >> > (threads, startNonce, (uint32_t *)Hash);
}
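The launch geometry is the usual ceil-division. A host-side sanity sketch (hypothetical helper, threads > 0 assumed) of what the formula guarantees:
#include <assert.h>
#include <stdint.h>
static void check_grid_ref(uint32_t threads, uint32_t tpb)
{
	uint32_t blocks = (threads + tpb - 1) / tpb; // same formula as dim3 grid(...)
	assert(blocks * tpb >= threads);             // every nonce gets a thread
	assert((blocks - 1) * tpb < threads);        // no fully idle trailing block
}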
__host__
void blake256_cpu_setBlock_80(uint32_t *pdata)
{
uint32_t h[8], data[20];
uint32_t h[8];
uint32_t data[20];
memcpy(data, pdata, 80);
memcpy(h, c_IV256, sizeof(c_IV256));
for (int i = 0; i<8; i++) {
h[i] = c_IV256[i];
}
blake256_compress1st(h, pdata, 512);
cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_data, &data[16], sizeof(c_data), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_data, &data[16], 3 * 4, 0, cudaMemcpyHostToDevice);
}
__host__
void blake256_cpu_init(int thr_id, uint32_t threads)
void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
{
cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
blakeKeccak256_gpu_hash_80 << <grid, block >> > (threads, startNonce, (uint32_t *)Hash);
}
__host__
void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order, cudaStream_t stream)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
blakeKeccak256_gpu_hash_80 << <grid, block, 0, stream >> > (threads, startNonce, (uint32_t *)Hash);
}
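Hypothetical usage sketch for the stream overload (surrounding names are illustrative, not from this commit): it lets the caller queue the fused BLAKE+Keccak kernel and later pipeline stages on one stream without a device-wide sync:
static void scan_round_ref(int thr_id, uint32_t throughput, uint32_t first_nonce, uint64_t *d_hash)
{
	cudaStream_t stream;
	cudaStreamCreate(&stream);
	blakeKeccak256_cpu_hash_80(thr_id, throughput, first_nonce, d_hash, 0, stream);
	// ...enqueue the cubehash/lyra2/bmw stages on the same stream here...
	cudaStreamSynchronize(stream);
	cudaStreamDestroy(stream);
}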

Algo256/cuda_bmw256.cu (226 lines changed)

@@ -15,58 +15,56 @@ __constant__ uint64_t pTarget[4];
#define shr(x, n) ((x) >> (n))
#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23))
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ __byte_perm(x,0,0x2103) ^ SPH_ROTL32((x), 23))
#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
#define ss4(x) (shr((x), 1) ^ (x))
#define ss5(x) (shr((x), 2) ^ (x))
#define rs1(x) SPH_ROTL32((x), 3)
#define rs2(x) SPH_ROTL32((x), 7)
#define rs3(x) SPH_ROTL32((x), 13)
#define rs4(x) SPH_ROTL32((x), 16)
#define rs4(x) __byte_perm(x,0,0x1032)
#define rs5(x) SPH_ROTL32((x), 19)
#define rs6(x) SPH_ROTL32((x), 23)
#define rs7(x) SPH_ROTL32((x), 27)
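Same byte-permute trick as in cuda_blake256.cu, this time for left rotations (ss1 uses selector 0x2103, rs4 uses 0x1032). Reference sketch, names hypothetical:
__device__ __forceinline__ uint32_t rotl8_ref(uint32_t x)
{
	return (x << 8) | (x >> 24);  // == __byte_perm(x, 0, 0x2103), used in ss1
}
__device__ __forceinline__ uint32_t rotl16_ref(uint32_t x)
{
	return (x << 16) | (x >> 16); // == __byte_perm(x, 0, 0x1032), used in rs4
}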
/* Message expansion function 1 */
__forceinline__ __device__
uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
__forceinline__ __device__ uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
{
return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13])
+ ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9])
+ ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5])
+ ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}
/* Message expansion function 2 */
__forceinline__ __device__
uint32_t expand32_2(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
__forceinline__ __device__ uint32_t expand32_2(const int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
{
return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13])
+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9])
+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5])
+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
return (
rs2(Q[i - 13]) + rs3(Q[i - 11]) + rs4(Q[i - 9]) + rs1(Q[i - 15]) +
+rs5(Q[i - 7]) + rs6(Q[i - 5]) + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]));
}
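For reference, the full expansion term that the trimmed expand32_2() above no longer computes itself (this is the removed version of the function): the Q[i-16] + Q[i-14] + ... running sum and the (i*K + rotations) ^ H[] term are hoisted into the precalc/precalc2 accumulators and the p1/p2 values in the compression functions below.
__forceinline__ __device__ uint32_t expand32_2_full_ref(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
{
	return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13])
		+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9])
		+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5])
		+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])
		+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
		+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
		- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}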
__forceinline__ __device__
void Compression256(uint32_t * M32)
__forceinline__ __device__ void Compression256(uint32_t M32[16])
{
uint32_t Q[32], XL32, XH32;
const uint32_t H[16] = {
0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F,
0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F,
0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F,
0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F
(0x40414243), (0x44454647),
(0x48494A4B), (0x4C4D4E4F),
(0x50515253), (0x54555657),
(0x58595A5B), (0x5C5D5E5F),
(0x60616263), (0x64656667),
(0x68696A6B), (0x6C6D6E6F),
(0x70717273), (0x74757677),
(0x78797A7B), (0x7C7D7E7F)
};
M32[8] = 0x80;
M32[14] = 0x100;
// int i;
uint32_t XL32, XH32, Q[32];
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
@@ -109,13 +107,91 @@ void Compression256(uint32_t * M32)
/* The following relation for these parameters should be satisfied: */
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
#pragma unroll
for (int i=16; i<18; i++)
Q[i] = expand32_1(i, M32, H, Q);
#pragma nounroll
for (int i=18; i<32; i++)
Q[i] = expand32_2(i, M32, H, Q);
// #pragma unroll
// for (i = 0; i<2; i++)
// Q[i + 16] = expand32_1(i + 16, M32, H, Q);
Q[16] = ss1(Q[16 - 16]) + ss2(Q[16 - 15]) + ss3(Q[16 - 14]) + ss0(Q[16 - 13])
+ ss1(Q[16 - 12]) + ss2(Q[16 - 11]) + ss3(Q[16 - 10]) + ss0(Q[16 - 9])
+ ss1(Q[16 - 8]) + ss2(Q[16 - 7]) + ss3(Q[16 - 6]) + ss0(Q[16 - 5])
+ ss1(Q[16 - 4]) + ss2(Q[16 - 3]) + ss3(Q[16 - 2]) + ss0(Q[16 - 1])
+ ((16 * (0x05555555ul) + SPH_ROTL32(M32[0], ((16 - 16) % 16) + 1) + SPH_ROTL32(M32[3], ((16 - 13) % 16) + 1)) ^ H[(16 - 16 + 7) % 16]);
Q[17] = ss1(Q[17 - 16]) + ss2(Q[17 - 15]) + ss3(Q[17 - 14]) + ss0(Q[17 - 13])
+ ss1(Q[17 - 12]) + ss2(Q[17 - 11]) + ss3(Q[17 - 10]) + ss0(Q[17 - 9])
+ ss1(Q[17 - 8]) + ss2(Q[17 - 7]) + ss3(Q[17 - 6]) + ss0(Q[17 - 5])
+ ss1(Q[17 - 4]) + ss2(Q[17 - 3]) + ss3(Q[17 - 2]) + ss0(Q[17 - 1])
+ ((17 * (0x05555555ul) + SPH_ROTL32(M32[(17 - 16) % 16], ((17 - 16) % 16) + 1) + SPH_ROTL32(M32[(17 - 13) % 16], ((17 - 13) % 16) + 1)) ^ H[(17 - 16 + 7) % 16]);
uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4]
uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4]
// #pragma unroll
// for (i = 2 + 16; i < 16 + 16; i+=2)
// {
precalc = precalc + Q[18 - 4];
precalc2 = precalc2 + Q[18 + 1 - 4];
uint32_t p1 = ((18 * (0x05555555ul) + SPH_ROTL32(M32[2], ((18 - 16) % 16) + 1) + SPH_ROTL32(M32[5], ((18 - 13) % 16) + 1)) ^ H[(18 - 16 + 7) % 16]);
uint32_t p2 = (((18 + 1)*(0x05555555ul) + SPH_ROTL32(M32[3], (((18 + 1) - 16) % 16) + 1) + SPH_ROTL32(M32[6], (((18 + 1) - 13) % 16) + 1)) ^ H[((18 + 1) - 16 + 7) % 16]);
Q[18] = precalc + expand32_2(18, M32, H, Q) + p1;
Q[18 + 1] = precalc2 + expand32_2(18 + 1, M32, H, Q) + p2;
precalc = precalc - Q[18 - 16];
precalc2 = precalc2 - Q[18 + 1 - 16];
precalc = precalc + Q[20 - 4];
precalc2 = precalc2 + Q[20 + 1 - 4];
p1 = ((20 * (0x05555555ul) + SPH_ROTL32(M32[4], ((20 - 16) % 16) + 1) + SPH_ROTL32(M32[7], ((20 - 13) % 16) + 1) - (0x100 << 15)) ^ H[(20 - 16 + 7) % 16]);
p2 = (((20 + 1)*(0x05555555ul) + SPH_ROTL32(M32[5], (((20 + 1) - 16) % 16) + 1) + (0x80 << 9)) ^ H[((20 + 1) - 16 + 7) % 16]);
Q[20] = precalc + expand32_2(20, M32, H, Q) + p1;
Q[20 + 1] = precalc2 + expand32_2(20 + 1, M32, H, Q) + p2;
precalc = precalc - Q[20 - 16];
precalc2 = precalc2 - Q[20 + 1 - 16];
precalc = precalc + Q[22 - 4];
precalc2 = precalc2 + Q[22 + 1 - 4];
p1 = ((22 * (0x05555555ul) + SPH_ROTL32(M32[6], ((22 - 16) % 16) + 1) - SPH_ROTL32(M32[0], ((22 - 6) % 16) + 1)) ^ H[(22 - 16 + 7) % 16]);
p2 = (((22 + 1)*(0x05555555ul) + SPH_ROTL32(M32[7], (((22 + 1) - 16) % 16) + 1) - SPH_ROTL32(M32[1], (((22 + 1) - 6) % 16) + 1)) ^ H[((22 + 1) - 16 + 7) % 16]);
Q[22] = precalc + expand32_2(22, M32, H, Q) + p1;
Q[22 + 1] = precalc2 + expand32_2(22 + 1, M32, H, Q) + p2;
precalc = precalc - Q[22 - 16];
precalc2 = precalc2 - Q[22 + 1 - 16];
precalc = precalc + Q[24 - 4];
precalc2 = precalc2 + Q[24 + 1 - 4];
p1 = ((24 * (0x05555555ul) + (0x80 << 9) - SPH_ROTL32(M32[2], ((24 - 6) % 16) + 1)) ^ H[(24 - 16 + 7) % 16]);
p2 = (((24 + 1)*(0x05555555ul) - SPH_ROTL32(M32[3], (((24 + 1) - 6) % 16) + 1)) ^ H[((24 + 1) - 16 + 7) % 16]);
Q[24] = precalc + expand32_2(24, M32, H, Q) + p1;
Q[24 + 1] = precalc2 + expand32_2(24 + 1, M32, H, Q) + p2;
precalc = precalc - Q[24 - 16];
precalc2 = precalc2 - Q[24 + 1 - 16];
precalc = precalc + Q[26 - 4];
precalc2 = precalc2 + Q[26 + 1 - 4];
p1 = ((26 * (0x05555555ul) - SPH_ROTL32(M32[4], ((26 - 6) % 16) + 1)) ^ H[(26 - 16 + 7) % 16]);
p2 = (((26 + 1)*(0x05555555ul) + (0x100 << 15) - SPH_ROTL32(M32[5], (((26 + 1) - 6) % 16) + 1)) ^ H[((26 + 1) - 16 + 7) % 16]);
Q[26] = precalc + expand32_2(26, M32, H, Q) + p1;
Q[26 + 1] = precalc2 + expand32_2(26 + 1, M32, H, Q) + p2;
precalc = precalc - Q[26 - 16];
precalc2 = precalc2 - Q[26 + 1 - 16];
precalc = precalc + Q[28 - 4];
precalc2 = precalc2 + Q[28 + 1 - 4];
p1 = ((28 * (0x05555555ul) - SPH_ROTL32(M32[6], ((28 - 6) % 16) + 1)) ^ H[(28 - 16 + 7) % 16]);
p2 = (((28 + 1)*(0x05555555ul) + SPH_ROTL32(M32[0], (((28 + 1) - 13) % 16) + 1) - SPH_ROTL32(M32[7], (((28 + 1) - 6) % 16) + 1)) ^ H[((28 + 1) - 16 + 7) % 16]);
Q[28] = precalc + expand32_2(28, M32, H, Q) + p1;
Q[28 + 1] = precalc2 + expand32_2(28 + 1, M32, H, Q) + p2;
precalc = precalc - Q[28 - 16];
precalc2 = precalc2 - Q[28 + 1 - 16];
precalc = precalc + Q[30 - 4];
precalc2 = precalc2 + Q[30 + 1 - 4];
p1 = ((30 * (0x05555555ul) + (0x100 << 15) + SPH_ROTL32(M32[1], ((30 - 13) % 16) + 1) - (0x80 << 9)) ^ H[(30 - 16 + 7) % 16]);
p2 = (((30 + 1)*(0x05555555ul) + SPH_ROTL32(M32[2], (((30 + 1) - 13) % 16) + 1)) ^ H[((30 + 1) - 16 + 7) % 16]);
Q[30] = precalc + expand32_2(30, M32, H, Q) + p1;
Q[30 + 1] = precalc2 + expand32_2(30 + 1, M32, H, Q) + p2;
precalc = precalc - Q[30 - 16];
precalc2 = precalc2 - Q[30 + 1 - 16];
/* Blue Midnight Wish has two temporary cumulative variables that accumulate via XORing */
/* 16 new variables that are produced in the Message Expansion part. */
@@ -145,17 +221,18 @@ void Compression256(uint32_t * M32)
M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
}
__forceinline__ __device__
void Compression256_2(uint32_t * M32)
__forceinline__ __device__ void Compression256_2(uint32_t M32[16])
{
uint32_t XL32, XH32, Q[32];
const uint32_t H[16] = {
0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3,
0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7,
0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab,
0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf
(0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2),
(0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5),
(0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8),
(0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab),
(0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae),
(0xaaaaaaaf)
};
int i;
uint32_t XL32, XH32, Q[32];
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
@@ -200,44 +277,68 @@ void Compression256_2(uint32_t * M32)
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
#pragma unroll
for (int i = 16; i<18; i++)
Q[i] = expand32_1(i, M32, H, Q);
for (i = 0; i<2; i++)
Q[i + 16] = expand32_1(i + 16, M32, H, Q);
/* #pragma unroll
for (i = 2; i<16; i++)
Q[i + 16] = expand32_2(i + 16, M32, H, Q);
*/
uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4]
uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4]
#pragma unroll
for (i = 2 + 16; i < 16 + 16; i += 2)
{
precalc = precalc + Q[i - 4];
precalc2 = precalc2 + Q[i + 1 - 4];
uint32_t p1 = ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]);
uint32_t p2 = (((i + 1)*(0x05555555ul) + SPH_ROTL32(M32[((i + 1) - 16) % 16], (((i + 1) - 16) % 16) + 1) + SPH_ROTL32(M32[((i + 1) - 13) % 16], (((i + 1) - 13) % 16) + 1) - SPH_ROTL32(M32[((i + 1) - 6) % 16], (((i + 1) - 6) % 16) + 1)) ^ H[((i + 1) - 16 + 7) % 16]);
Q[i] = precalc + expand32_2(i, M32, H, Q) + p1;
Q[i + 1] = precalc2 + expand32_2(i + 1, M32, H, Q) + p2;
precalc = precalc - Q[i - 16];
precalc2 = precalc2 - Q[i + 1 - 16];
}
#pragma nounroll
for (int i = 18; i<32; i++)
Q[i] = expand32_2(i, M32, H, Q);
/* Blue Midnight Wish has two temporary cumulative variables that accumulate via XORing */
/* 16 new variables that are produced in the Message Expansion part. */
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
}
#define TPB 512
__global__ __launch_bounds__(TPB, 2)
void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *const __restrict__ nonceVector)
void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *const __restrict__ nonceVector, uint32_t Target)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t message[16] = { 0 };
LOHI(message[0], message[1], __ldg(&g_hash[thread]));
LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads]));
LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads]));
LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads]));
message[8]=0x80;
message[14]=0x100;
Compression256(message);
Compression256_2(message);
if (((uint64_t*)message)[7] <= pTarget[3])
uint2 message[8] = { 0 };
message[0] = __ldg(&g_hash[thread + 0 * threads]);
message[1] = __ldg(&g_hash[thread + 1 * threads]);
message[2] = __ldg(&g_hash[thread + 2 * threads]);
message[3] = __ldg(&g_hash[thread + 3 * threads]);
//LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads]));
//LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads]));
//LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads]));
message[4].x = 0x80;
message[7].x = 0x100;
Compression256((uint32_t*)message);
Compression256_2((uint32_t*)message);
if (message[7].y <= Target)
{
uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread);
if (tmp != 0)
@@ -247,7 +348,7 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash
}
__host__
void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces)
void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target)
{
const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
@@ -255,13 +356,12 @@ void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint
cudaMemset(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t));
bmw256_gpu_hash_32 << <grid, block >> >(threads, startNounce, g_hash, d_GNonce[thr_id]);
bmw256_gpu_hash_32 << <grid, block >> >(threads, startNounce, (uint2*)g_hash, d_GNonce[thr_id], Target);
cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
resultnonces[0] = *(d_gnounce[thr_id]);
resultnonces[1] = *(d_gnounce[thr_id] + 1);
}
__host__
void bmw256_cpu_init(int thr_id, uint32_t threads)
{
@@ -276,8 +376,10 @@ void bmw256_cpu_free(int thr_id)
cudaFreeHost(d_gnounce[thr_id]);
}
/*
__host__
void bmw256_setTarget(const void *pTargetIn)
{
cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice);
}
*/
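Hypothetical caller sketch: the early-out in the kernel now compares only the most significant 32-bit word of the hash against Target, so the host passes a single word (assuming the usual ccminer layout, where ptarget[7] is the top word) instead of copying the whole 256-bit target to constant memory:
static void bmw_scan_ref(int thr_id, uint32_t threads, uint32_t startNounce,
	uint64_t *d_hash, uint32_t *resNonces, const uint32_t *ptarget)
{
	bmw256_cpu_hash_32(thr_id, threads, startNounce, d_hash, resNonces, ptarget[7]);
}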

Algo256/cuda_cubehash256.cu (472 lines changed)

@@ -3,179 +3,247 @@
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 520
#endif
#if __CUDA_ARCH__ < 350
#define LROT(x,bits) ((x << bits) | (x >> (32 - bits)))
#else
#define LROT(x, bits) __funnelshift_l(x, x, bits)
#endif
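__funnelshift_l(x, x, bits) shifts the 64-bit concatenation x:x left and returns the high word, which for equal operands is exactly the rotate the pre-sm_35 fallback spells out. Sketch (name hypothetical):
__device__ __forceinline__ uint32_t rotl_ref(uint32_t x, uint32_t n)
{
	return (x << n) | (x >> (32 - n)); // == __funnelshift_l(x, x, n) for 0 < n < 32
}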
#if __CUDA_ARCH__ < 500
#define TPB 576
#else
#define TPB 1024
#endif
#define TPB35 576
#define TPB50 1024
#define ROTATEUPWARDS7(a) LROT(a,7)
#define ROTATEUPWARDS11(a) LROT(a,11)
//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
#define SWAP(a,b) { a ^= b; b ^= a; a ^= b; }
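The XOR form of SWAP matches the commented temp-variable version whenever the two operands are distinct lvalues (SWAP(a, a) would zero a); rrounds() only ever swaps different state words, so the trick is safe here. Plain reference, name hypothetical:
__device__ __forceinline__ void swap_ref(uint32_t &a, uint32_t &b)
{
	uint32_t u = a; a = b; b = u;
}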
__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2])
{
int r;
int j;
int k;
int l;
int m;
#pragma unroll 2
for (r = 0; r < CUBEHASH_ROUNDS; ++r) {
/* "add x_0jklm into x_1jklm modulo 2^32" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
uint32_t x0[2][2][2][2];
uint32_t x1[2][2][2][2];
for (r = 0; r < CUBEHASH_ROUNDS; r += 2) {
/* "rotate x_0jklm upwards by 7 bits" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
/* "swap x_00klm with x_01klm" */
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[0][0][k][l][m], x[0][1][k][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jk0m with x_1jk1m" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[1][j][k][0][m], x[1][j][k][1][m])
x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]);
x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]);
x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]);
x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]);
x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]);
x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]);
x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]);
x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]);
x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]);
x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]);
x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]);
x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]);
x0[1][1][0][0] = ROTATEUPWARDS7(x[0][1][1][0][0]);
x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]);
x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]);
x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]);
/* "add x_0jklm into x_1jklm modulo 2^32" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
x1[0][0][0][0] = x[1][0][0][0][0] + x[0][0][0][0][0];
x1[0][0][0][1] = x[1][0][0][0][1] + x[0][0][0][0][1];
x1[0][0][1][0] = x[1][0][0][1][0] + x[0][0][0][1][0];
x1[0][0][1][1] = x[1][0][0][1][1] + x[0][0][0][1][1];
x1[0][1][0][0] = x[1][0][1][0][0] + x[0][0][1][0][0];
x1[0][1][0][1] = x[1][0][1][0][1] + x[0][0][1][0][1];
x1[0][1][1][0] = x[1][0][1][1][0] + x[0][0][1][1][0];
x1[0][1][1][1] = x[1][0][1][1][1] + x[0][0][1][1][1];
x1[1][0][0][0] = x[1][1][0][0][0] + x[0][1][0][0][0];
x1[1][0][0][1] = x[1][1][0][0][1] + x[0][1][0][0][1];
x1[1][0][1][0] = x[1][1][0][1][0] + x[0][1][0][1][0];
x1[1][0][1][1] = x[1][1][0][1][1] + x[0][1][0][1][1];
x1[1][1][0][0] = x[1][1][1][0][0] + x[0][1][1][0][0];
x1[1][1][0][1] = x[1][1][1][0][1] + x[0][1][1][0][1];
x1[1][1][1][0] = x[1][1][1][1][0] + x[0][1][1][1][0];
x1[1][1][1][1] = x[1][1][1][1][1] + x[0][1][1][1][1];
/* "xor x_1~jklm into x_0jklm" */
x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[1][0][0][0];
x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[1][0][0][1];
x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[1][0][1][0];
x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[1][0][1][1];
x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[1][1][0][0];
x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[1][1][0][1];
x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[1][1][1][0];
x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[1][1][1][1];
x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[0][0][0][0];
x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[0][0][0][1];
x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[0][0][1][0];
x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[0][0][1][1];
x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[0][1][0][0];
x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[0][1][0][1];
x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[0][1][1][0];
x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[0][1][1][1];
/* "rotate x_0jklm upwards by 11 bits" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
/* "swap x_0j0lm with x_0j1lm" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[0][j][0][l][m], x[0][j][1][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jkl0 with x_1jkl1" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
SWAP(x[1][j][k][l][0], x[1][j][k][l][1])
}
}
__device__ __forceinline__ void block_tox(const uint32_t *in, uint32_t x[2][2][2][2][2])
{
x[0][0][0][0][0] ^= in[0];
x[0][0][0][0][1] ^= in[1];
x[0][0][0][1][0] ^= in[2];
x[0][0][0][1][1] ^= in[3];
x[0][0][1][0][0] ^= in[4];
x[0][0][1][0][1] ^= in[5];
x[0][0][1][1][0] ^= in[6];
x[0][0][1][1][1] ^= in[7];
}
__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2])
{
out[0] = x[0][0][0][0][0];
out[1] = x[0][0][0][0][1];
out[2] = x[0][0][0][1][0];
out[3] = x[0][0][0][1][1];
out[4] = x[0][0][1][0][0];
out[5] = x[0][0][1][0][1];
out[6] = x[0][0][1][1][0];
out[7] = x[0][0][1][1][1];
x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]);
x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]);
x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]);
x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]);
x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]);
x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]);
x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]);
x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]);
x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]);
x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]);
x0[1][0][1][0] = ROTATEUPWARDS11(x[0][1][0][1][0]);
x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]);
x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]);
x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]);
x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]);
x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]);
/* "add x_0jklm into x_1~jk~lm modulo 2^32" */
x[1][1][0][1][0] = x1[1][0][1][0] + x[0][0][0][0][0];
x[1][1][0][1][1] = x1[1][0][1][1] + x[0][0][0][0][1];
x[1][1][0][0][0] = x1[1][0][0][0] + x[0][0][0][1][0];
x[1][1][0][0][1] = x1[1][0][0][1] + x[0][0][0][1][1];
x[1][1][1][1][0] = x1[1][1][1][0] + x[0][0][1][0][0];
x[1][1][1][1][1] = x1[1][1][1][1] + x[0][0][1][0][1];
x[1][1][1][0][0] = x1[1][1][0][0] + x[0][0][1][1][0];
x[1][1][1][0][1] = x1[1][1][0][1] + x[0][0][1][1][1];
x[1][0][0][1][0] = x1[0][0][1][0] + x[0][1][0][0][0];
x[1][0][0][1][1] = x1[0][0][1][1] + x[0][1][0][0][1];
x[1][0][0][0][0] = x1[0][0][0][0] + x[0][1][0][1][0];
x[1][0][0][0][1] = x1[0][0][0][1] + x[0][1][0][1][1];
x[1][0][1][1][0] = x1[0][1][1][0] + x[0][1][1][0][0];
x[1][0][1][1][1] = x1[0][1][1][1] + x[0][1][1][0][1];
x[1][0][1][0][0] = x1[0][1][0][0] + x[0][1][1][1][0];
x[1][0][1][0][1] = x1[0][1][0][1] + x[0][1][1][1][1];
/* "xor x_1~j~k~lm into x_0jklm" */
x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][1][1][1][0];
x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][1][1][1][1];
x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][1][1][0][0];
x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][1][1][0][1];
x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][1][0][1][0];
x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][1][0][1][1];
x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][1][0][0][0];
x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][1][0][0][1];
x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][0][1][1][0];
x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][0][1][1][1];
x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][0][1][0][0];
x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][0][1][0][1];
x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][0][0][1][0];
x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][0][0][1][1];
x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][0][0][0][0];
x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][0][0][0][1];
}
/* "rotate x_0jklm upwards by 7 bits" */
x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]);
x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]);
x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]);
x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]);
x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]);
x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]);
x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]);
x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]);
x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]);
x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]);
x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]);
x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]);
x0[1][1][0][0] = ROTATEUPWARDS7(x[0][1][1][0][0]);
x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]);
x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]);
x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]);
/* "add x_0jklm into x_1~j~k~l~m modulo 2^32" */
x1[1][1][1][1] = x[1][1][1][1][1] + x[0][0][0][0][0];
x1[1][1][1][0] = x[1][1][1][1][0] + x[0][0][0][0][1];
x1[1][1][0][1] = x[1][1][1][0][1] + x[0][0][0][1][0];
x1[1][1][0][0] = x[1][1][1][0][0] + x[0][0][0][1][1];
x1[1][0][1][1] = x[1][1][0][1][1] + x[0][0][1][0][0];
x1[1][0][1][0] = x[1][1][0][1][0] + x[0][0][1][0][1];
x1[1][0][0][1] = x[1][1][0][0][1] + x[0][0][1][1][0];
x1[1][0][0][0] = x[1][1][0][0][0] + x[0][0][1][1][1];
x1[0][1][1][1] = x[1][0][1][1][1] + x[0][1][0][0][0];
x1[0][1][1][0] = x[1][0][1][1][0] + x[0][1][0][0][1];
x1[0][1][0][1] = x[1][0][1][0][1] + x[0][1][0][1][0];
x1[0][1][0][0] = x[1][0][1][0][0] + x[0][1][0][1][1];
x1[0][0][1][1] = x[1][0][0][1][1] + x[0][1][1][0][0];
x1[0][0][1][0] = x[1][0][0][1][0] + x[0][1][1][0][1];
x1[0][0][0][1] = x[1][0][0][0][1] + x[0][1][1][1][0];
x1[0][0][0][0] = x[1][0][0][0][0] + x[0][1][1][1][1];
/* "xor x_1j~k~l~m into x_0jklm" */
x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[0][1][1][1];
x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[0][1][1][0];
x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[0][1][0][1];
x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[0][1][0][0];
x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[0][0][1][1];
x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[0][0][1][0];
x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[0][0][0][1];
x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[0][0][0][0];
x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[1][1][1][1];
x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[1][1][1][0];
x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[1][1][0][1];
x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[1][1][0][0];
x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[1][0][1][1];
x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[1][0][1][0];
x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[1][0][0][1];
x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[1][0][0][0];
__device__ __forceinline__
void Update32(uint32_t x[2][2][2][2][2], const uint32_t *data)
{
/* "xor the block into the first b bytes of the state" */
/* "and then transform the state invertibly through r identical rounds" */
block_tox(data, x);
rrounds(x);
/* "rotate x_0jklm upwards by 11 bits" */
x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]);
x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]);
x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]);
x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]);
x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]);
x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]);
x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]);
x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]);
x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]);
x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]);
x0[1][0][1][0] = ROTATEUPWARDS11(x[0][1][0][1][0]);
x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]);
x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]);
x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]);
x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]);
x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]);
/* "add x_0jklm into x_1j~kl~m modulo 2^32" */
x[1][0][1][0][1] = x1[0][1][0][1] + x[0][0][0][0][0];
x[1][0][1][0][0] = x1[0][1][0][0] + x[0][0][0][0][1];
x[1][0][1][1][1] = x1[0][1][1][1] + x[0][0][0][1][0];
x[1][0][1][1][0] = x1[0][1][1][0] + x[0][0][0][1][1];
x[1][0][0][0][1] = x1[0][0][0][1] + x[0][0][1][0][0];
x[1][0][0][0][0] = x1[0][0][0][0] + x[0][0][1][0][1];
x[1][0][0][1][1] = x1[0][0][1][1] + x[0][0][1][1][0];
x[1][0][0][1][0] = x1[0][0][1][0] + x[0][0][1][1][1];
x[1][1][1][0][1] = x1[1][1][0][1] + x[0][1][0][0][0];
x[1][1][1][0][0] = x1[1][1][0][0] + x[0][1][0][0][1];
x[1][1][1][1][1] = x1[1][1][1][1] + x[0][1][0][1][0];
x[1][1][1][1][0] = x1[1][1][1][0] + x[0][1][0][1][1];
x[1][1][0][0][1] = x1[1][0][0][1] + x[0][1][1][0][0];
x[1][1][0][0][0] = x1[1][0][0][0] + x[0][1][1][0][1];
x[1][1][0][1][1] = x1[1][0][1][1] + x[0][1][1][1][0];
x[1][1][0][1][0] = x1[1][0][1][0] + x[0][1][1][1][1];
/* "xor x_1jkl~m into x_0jklm" */
x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][0][0][0][1];
x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][0][0][0][0];
x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][0][0][1][1];
x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][0][0][1][0];
x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][0][1][0][1];
x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][0][1][0][0];
x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][0][1][1][1];
x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][0][1][1][0];
x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][1][0][0][1];
x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][1][0][0][0];
x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][1][0][1][1];
x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][1][0][1][0];
x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][1][1][0][1];
x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][1][1][0][0];
x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][1][1][1][1];
x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][1][1][1][0];
}
__device__ __forceinline__
void Update32_const(uint32_t x[2][2][2][2][2])
{
x[0][0][0][0][0] ^= 0x80;
rrounds(x);
}
__device__ __forceinline__
@@ -185,27 +253,44 @@ void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
x[1][1][1][1][1] ^= 1U;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 2
for (int i = 0; i < 10; ++i) rrounds(x);
/* "output the first h/8 bytes of the state" */
hash_fromx(hashval, x);
hashval[0] = x[0][0][0][0][0];
hashval[1] = x[0][0][0][0][1];
hashval[2] = x[0][0][0][1][0];
hashval[3] = x[0][0][0][1][1];
hashval[4] = x[0][0][1][0][0];
hashval[5] = x[0][0][1][0][1];
hashval[6] = x[0][0][1][1][0];
hashval[7] = x[0][0][1][1][1];
}
#if __CUDA_ARCH__ >= 500
__global__ __launch_bounds__(TPB, 1)
__global__ __launch_bounds__(TPB50, 1)
#else
__global__ __launch_bounds__(TPB35, 1)
#endif
void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 500
uint2 Hash[4];
Hash[0] = __ldg(&g_hash[thread]);
Hash[1] = __ldg(&g_hash[thread + 1 * threads]);
Hash[2] = __ldg(&g_hash[thread + 2 * threads]);
Hash[3] = __ldg(&g_hash[thread + 3 * threads]);
#else
uint32_t Hash[8];
LOHI(Hash[0], Hash[1], __ldg(&((uint64_t*)g_hash)[thread]));
LOHI(Hash[2], Hash[3], __ldg(&((uint64_t*)g_hash)[thread + 1 * threads]));
LOHI(Hash[4], Hash[5], __ldg(&((uint64_t*)g_hash)[thread + 2 * threads]));
LOHI(Hash[6], Hash[7], __ldg(&((uint64_t*)g_hash)[thread + 3 * threads]));
#endif
uint32_t x[2][2][2][2][2] =
{
@@ -219,6 +304,7 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_ha
0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A
};
#if __CUDA_ARCH__ >= 500
x[0][0][0][0][0] ^= Hash[0].x;
x[0][0][0][0][1] ^= Hash[0].y;
x[0][0][0][1][0] ^= Hash[1].x;
@@ -227,48 +313,7 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_ha
x[0][0][1][0][1] ^= Hash[2].y;
x[0][0][1][1][0] ^= Hash[3].x;
x[0][0][1][1][1] ^= Hash[3].y;
rrounds(x);
x[0][0][0][0][0] ^= 0x80U;
rrounds(x);
Final(x, (uint32_t*) Hash);
g_hash[thread] = Hash[0];
g_hash[1 * threads + thread] = Hash[1];
g_hash[2 * threads + thread] = Hash[2];
g_hash[3 * threads + thread] = Hash[3];
}
}
#else
__global__ __launch_bounds__(TPB, 1)
void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *d_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t Hash[8];
uint64_t* g_hash = (uint64_t*) d_hash;
LOHI(Hash[0], Hash[1], __ldg(&g_hash[thread]));
LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads]));
LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads]));
LOHI(Hash[6], Hash[7], __ldg(&g_hash[thread + 3 * threads]));
uint32_t x[2][2][2][2][2] =
{
0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, 0x35481EAE,
0x22512D5B, 0xE5D94E63, 0x7E624131, 0xF4CC12BE,
0xC2D0B696, 0x42AF2070, 0xD0720C35, 0x3361DA8C,
0x28CCECA4, 0x8EF8AD83, 0x4680AC00, 0x40E5FBAB,
0xD89041C3, 0x6107FBD5, 0x6C859D41, 0xF0B26679,
0x09392549, 0x5FA25603, 0x65C892FD, 0x93CB6285,
0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, 0x85254725,
0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A
};
x[0][0][0][0][0] ^= Hash[0];
x[0][0][0][0][1] ^= Hash[1];
x[0][0][0][1][0] ^= Hash[2];
@@ -277,29 +322,48 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *d_ha
x[0][0][1][0][1] ^= Hash[5];
x[0][0][1][1][0] ^= Hash[6];
x[0][0][1][1][1] ^= Hash[7];
#endif
rrounds(x);
x[0][0][0][0][0] ^= 0x80U;
rrounds(x);
#if __CUDA_ARCH__ >= 500
Final(x, (uint32_t*)Hash);
g_hash[thread] = Hash[0];
g_hash[1 * threads + thread] = Hash[1];
g_hash[2 * threads + thread] = Hash[2];
g_hash[3 * threads + thread] = Hash[3];
#else
Final(x, Hash);
g_hash[thread] = ((uint64_t*)Hash)[0];
g_hash[1 * threads + thread] = ((uint64_t*)Hash)[1];
g_hash[2 * threads + thread] = ((uint64_t*)Hash)[2];
g_hash[3 * threads + thread] = ((uint64_t*)Hash)[3];
((uint64_t*)g_hash)[thread] = ((uint64_t*)Hash)[0];
((uint64_t*)g_hash)[1 * threads + thread] = ((uint64_t*)Hash)[1];
((uint64_t*)g_hash)[2 * threads + thread] = ((uint64_t*)Hash)[2];
((uint64_t*)g_hash)[3 * threads + thread] = ((uint64_t*)Hash)[3];
#endif
}
}
#endif
__host__
void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order)
{
uint32_t tpb = TPB;
uint32_t tpb = TPB35;
if (cuda_arch[thr_id] >= 500) tpb = TPB50;
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
cubehash256_gpu_hash_32 << <grid, block >> > (threads, startNounce, (uint2*)d_hash);
}
__host__
void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream)
{
uint32_t tpb = TPB35;
if (cuda_arch[thr_id] >= 500) tpb = TPB50;
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
cubehash256_gpu_hash_32 << <grid, block, 0, stream >> > (threads, startNounce, (uint2*)d_hash);
}
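/* Note (illustrative usage, hedged): the new overload only differs by the
   caller-supplied stream, so a host loop could queue this stage
   asynchronously, e.g.:

   cudaStream_t s;
   cudaStreamCreate(&s);
   cubehash256_cpu_hash_32(thr_id, throughput, startNounce, d_hash, order, s);
   cudaStreamSynchronize(s);
*/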

427
Algo256/cuda_skein256.cu

@@ -13,40 +13,263 @@ void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p
}
__forceinline__ __device__
void Round_8_512v35(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts,
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R)
void Round_8_512v35_1(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[(R+0) % 9];
p1 += ks[(R+1) % 9];
p2 += ks[(R+2) % 9];
p3 += ks[(R+3) % 9];
p4 += ks[(R+4) % 9];
p5 += ks[(R+5) % 9] + ts[(R+0) % 3];
p6 += ks[(R+6) % 9] + ts[(R+1) % 3];
p7 += ks[(R+7) % 9] + make_uint2(R, 0);
p0 += ks[1];
p1 += ks[2];
p2 += ks[3];
p3 += ks[4];
p4 += ks[5];
p5 += ks[6] + ts[1];
p6 += ks[7] + ts[2];
p7 += ks[8] + make_uint2(1, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[(R+1) % 9];
p1 += ks[(R+2) % 9];
p2 += ks[(R+3) % 9];
p3 += ks[(R+4) % 9];
p4 += ks[(R+5) % 9];
p5 += ks[(R+6) % 9] + ts[(R+1) % 3];
p6 += ks[(R+7) % 9] + ts[(R+2) % 3];
p7 += ks[(R+8) % 9] + make_uint2(R+1, 0);
p0 += ks[2];
p1 += ks[3];
p2 += ks[4];
p3 += ks[5];
p4 += ks[6];
p5 += ks[7] + ts[2];
p6 += ks[8] + ts[0];
p7 += ks[0] + make_uint2(2, 0);
}
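/* Note (editor's summary of the removed generic helper above): the unrolled
   _1/_3/.../_17 variants hardcode the Skein key schedule that the old code
   computed as ks[(R+i) % 9], ts[(R+j) % 3] and make_uint2(R, 0). For subkey s:
     word i (i = 0..4): ks[(s+i) % 9]
     word 5:            ks[(s+5) % 9] + ts[ s    % 3]
     word 6:            ks[(s+6) % 9] + ts[(s+1) % 3]
     word 7:            ks[(s+7) % 9] + make_uint2(s, 0)
   Round_8_512v35_1 injects subkeys s = 1 and s = 2, _3 injects 3 and 4, and
   so on up to _final, which injects 17 and 18. */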
__forceinline__ __device__
void Round_8_512v35_3(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[3];
p1 += ks[4];
p2 += ks[5];
p3 += ks[6];
p4 += ks[7];
p5 += ks[8] + ts[0];
p6 += ks[0] + ts[1];
p7 += ks[1] + make_uint2(3, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[4];
p1 += ks[5];
p2 += ks[6];
p3 += ks[7];
p4 += ks[8];
p5 += ks[0] + ts[1];
p6 += ks[1] + ts[2];
p7 += ks[2] + make_uint2(4, 0);
}
__forceinline__ __device__
void Round_8_512v35_5(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[5];
p1 += ks[6];
p2 += ks[7];
p3 += ks[8];
p4 += ks[0];
p5 += ks[1] + ts[2];
p6 += ks[2] + ts[0];
p7 += ks[3] + make_uint2(5, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[6];
p1 += ks[7];
p2 += ks[8];
p3 += ks[0];
p4 += ks[1];
p5 += ks[2] + ts[0];
p6 += ks[3] + ts[1];
p7 += ks[4] + make_uint2(6, 0);
}
__forceinline__ __device__
void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts,
void Round_8_512v35_7(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[7];
p1 += ks[8];
p2 += ks[0];
p3 += ks[1];
p4 += ks[2];
p5 += ks[3] + ts[1];
p6 += ks[4] + ts[2];
p7 += ks[5] + make_uint2(7, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[8];
p1 += ks[0];
p2 += ks[1];
p3 += ks[2];
p4 += ks[3];
p5 += ks[4] + ts[2];
p6 += ks[5] + ts[0];
p7 += ks[6] + make_uint2(8, 0);
}
__forceinline__ __device__
void Round_8_512v35_9(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[0];
p1 += ks[1];
p2 += ks[2];
p3 += ks[3];
p4 += ks[4];
p5 += ks[5] + ts[0];
p6 += ks[6] + ts[1];
p7 += ks[7] + make_uint2(9, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[1];
p1 += ks[2];
p2 += ks[3];
p3 += ks[4];
p4 += ks[5];
p5 += ks[6] + ts[1];
p6 += ks[7] + ts[2];
p7 += ks[8] + make_uint2(10, 0);
}
__forceinline__ __device__
void Round_8_512v35_11(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[2];
p1 += ks[3];
p2 += ks[4];
p3 += ks[5];
p4 += ks[6];
p5 += ks[7] + ts[2];
p6 += ks[8] + ts[0];
p7 += ks[0] + make_uint2(11, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[3];
p1 += ks[4];
p2 += ks[5];
p3 += ks[6];
p4 += ks[7];
p5 += ks[8] + ts[0];
p6 += ks[0] + ts[1];
p7 += ks[1] + make_uint2(12, 0);
}
__forceinline__ __device__
void Round_8_512v35_13(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[4];
p1 += ks[5];
p2 += ks[6];
p3 += ks[7];
p4 += ks[8];
p5 += ks[0] + ts[1];
p6 += ks[1] + ts[2];
p7 += ks[2] + make_uint2(13, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[5];
p1 += ks[6];
p2 += ks[7];
p3 += ks[8];
p4 += ks[0];
p5 += ks[1] + ts[2];
p6 += ks[2] + ts[0];
p7 += ks[3] + make_uint2(14, 0);
}
__forceinline__ __device__
void Round_8_512v35_15(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[6];
p1 += ks[7];
p2 += ks[8];
p3 += ks[0];
p4 += ks[1];
p5 += ks[2] + ts[0];
p6 += ks[3] + ts[1];
p7 += ks[4] + make_uint2(15, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[7];
p1 += ks[8];
p2 += ks[0];
p3 += ks[1];
p4 += ks[2];
p5 += ks[3] + ts[1];
p6 += ks[4] + ts[2];
p7 += ks[5] + make_uint2(16, 0);
}
__forceinline__ __device__
void Round_8_512v35_17(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
@@ -72,26 +295,48 @@ void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const
p1 += ks[1];
p2 += ks[2];
p3 += ks[3];
p4 += ks[4];
p5 += ks[5] + ts[0];
p6 += ks[6] + ts[1];
p7 += ks[7] + make_uint2(18, 0);
}
__global__ __launch_bounds__(256,3)
void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
__forceinline__ __device__
void Round_8_512v35_final(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
if (thread < threads)
p0 += ks[8];
p1 += ks[0];
p2 += ks[1];
p3 += ks[2];
p4 += ks[3];
p5 += ks[4] + ts[2];
p6 += ks[5] + ts[0];
p7 += ks[6] + make_uint2(17, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[0];
p1 += ks[1];
p2 += ks[2];
p3 += ks[3];
}
__global__ __launch_bounds__(256, 4)
void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA };
const uint2 t12[6] = {
{ 0x20, 0 },
{ 0, 0xf0000000 },
{ 0x20, 0xf0000000 },
{ 0x08, 0 },
{ 0, 0xff000000 },
{ 0x08, 0xff000000 }
};
uint2 h[9] = {
const uint2 h2[9] = {
{ 0x2FDB3E13, 0xCCD044A1 },
{ 0x1A79A9EB, 0xE8359030 },
{ 0x4F816E6F, 0x55AEA061 },
@@ -102,68 +347,71 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp
{ 0x33EDFC13, 0x3EEDBA18 },
{ 0xC73A4E2A, 0xB69D3CFC }
};
const uint2 t12[2][3] = {
{ { 0x20, 0 },
{ 0, 0xf0000000 },
{ 0x20, 0xf0000000 } },
{ { 0x08, 0 },
{ 0, 0xff000000 },
{ 0x08, 0xff000000 } }
};
if (thread < threads)
{
uint2 dt0,dt1,dt2,dt3;
uint2 p0, p1, p2, p3, p4, p5, p6, p7;
LOHI(dt0.x,dt0.y,outputHash[thread]);
LOHI(dt1.x,dt1.y,outputHash[threads+thread]);
LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]);
LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]);
dt0 = __ldg(&outputHash[0 * threads + thread]);
dt1 = __ldg(&outputHash[1 * threads + thread]);
dt2 = __ldg(&outputHash[2 * threads + thread]);
dt3 = __ldg(&outputHash[3 * threads + thread]);
p0 = h[0] + dt0;
p1 = h[1] + dt1;
p2 = h[2] + dt2;
p3 = h[3] + dt3;
p4 = h[4];
p5 = h[5] + t12[0];
p6 = h[6] + t12[1];
p7 = h[7];
p0 = h2[0] + dt0;
p1 = h2[1] + dt1;
p2 = h2[2] + dt2;
p3 = h2[3] + dt3;
p4 = h2[4];
p5 = h2[5] + t12[0][0];
p6 = h2[6] + t12[0][1];
p7 = h2[7];
// forced unroll required
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17);
Round_8_512v35_1(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_3(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_5(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_7(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_9(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_11(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_13(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_15(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_17(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
p0 ^= dt0;
p1 ^= dt1;
p2 ^= dt2;
p3 ^= dt3;
h[0] = p0;
h[1] = p1;
h[2] = p2;
h[3] = p3;
h[4] = p4;
h[5] = p5;
h[6] = p6;
h[7] = p7;
h[8] = skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7];
const uint2 h[9] = { p0, p1, p2, p3, p4, p5, p6, p7, skein_ks_parity ^ p0 ^ p1 ^ p2 ^ p3 ^ p4 ^ p5 ^ p6 ^ p7 };
const uint2 *t = t12+3;
p5 += t12[3]; //p5 already equals h[5]
p6 += t12[4];
p5 += t12[1][0]; //p5 already equals h[5]
p6 += t12[1][1];
// forced unroll
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 7);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15);
Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7);
outputHash[thread] = devectorize(p0);
outputHash[threads+thread] = devectorize(p1);
outputHash[2*threads+thread] = devectorize(p2);
outputHash[3*threads+thread] = devectorize(p3);
Round_8_512v35_1(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_3(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_5(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_7(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_9(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_11(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_13(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_15(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_final(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
outputHash[0 * threads + thread] = p0;
outputHash[1 * threads + thread] = p1;
outputHash[2 * threads + thread] = p2;
outputHash[3 * threads + thread] = p3;
}
}
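/* Note (editor's reading of the tweak constants): the kernel performs the two
   UBI blocks of Skein-256 on a 32-byte input. t12[0] is the message-block
   tweak (position 0x20, high byte 0xf0 = first|final|message) and t12[1] the
   output-block tweak (position 0x08, high byte 0xff = first|final|output),
   which is why the second run of the unrolled rounds re-keys with the freshly
   built chaining value h[] and t12[1]. */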
@@ -304,10 +552,27 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui
// only 1kH/s perf change between kernels on a 960...
if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
skein256_gpu_hash_32 << <grid, block >> >(threads, startNounce, (uint2*)d_outputHash);
else
skein256_gpu_hash_32_v30 << <grid, block >> >(threads, startNounce, d_outputHash);
MyStreamSynchronize(NULL, order, thr_id);
//MyStreamSynchronize(NULL, order, thr_id);
}
__host__
void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order, cudaStream_t stream)
{
const uint32_t threadsperblock = 256;
int dev_id = device_map[thr_id];
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// only 1kH/s perf change between kernels on a 960...
if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
skein256_gpu_hash_32 << <grid, block, 0, stream >> >(threads, startNounce, (uint2*)d_outputHash);
else
skein256_gpu_hash_32_v30 << <grid, block,0, stream >> >(threads, startNounce, d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
}

576
ccminer.cpp

@@ -83,6 +83,7 @@ bool opt_debug_threads = false;
bool opt_protocol = false;
bool opt_benchmark = false;
bool opt_showdiff = false;
bool opt_eco_mode = false;
// todo: limit use of these flags,
// prefer the pools[] attributes
@@ -91,6 +92,7 @@ bool have_longpoll = false;
bool want_stratum = true;
bool have_stratum = false;
bool allow_gbt = true;
bool allow_getwork = true;
bool allow_mininginfo = true;
bool check_dups = false;
bool check_stratum_jobs = false;
@@ -165,6 +167,8 @@ char *short_url = NULL;
struct stratum_ctx stratum = { 0 };
pthread_mutex_t stratum_sock_lock;
pthread_mutex_t stratum_work_lock;
static unsigned char pk_script[25] = { 0 };
static size_t pk_script_size = 0;
char *opt_cert;
char *opt_proxy;
@@ -185,6 +189,7 @@ pthread_mutex_t stats_lock;
double thr_hashrates[MAX_GPUS] = { 0 };
uint64_t global_hashrate = 0;
double stratum_diff = 0.0;
static char *lp_id;
double net_diff = 0;
uint64_t net_hashrate = 0;
uint64_t net_blocks = 0;
@@ -226,8 +231,8 @@ Options:\n\
jackpot Jackpot\n\
keccak Keccak-256 (Maxcoin)\n\
luffa Joincoin\n\
lyra2 LyraBar\n\
lyra2v2 VertCoin\n\
lyra2 Lyra2RE(Crypto)\n\
lyra2v2 Lyra2REv2(VertCoin)\n\
mjollnir Mjollnircoin\n\
myr-gr Myriad-Groestl\n\
neoscrypt FeatherCoin, Phoenix, UFO...\n\
@@ -256,6 +261,8 @@ Options:\n\
(matching 2nd gt640 in the PC)\n\
-i --intensity=N[,N] GPU intensity 8.0-25.0 (default: auto) \n\
Decimals are allowed for fine tuning \n\
--eco Use Eco mode\n\
Auto tuning for low energy (Lyra2REv2 only)\n\
--cuda-schedule Set device threads scheduling mode (default: auto)\n\
-f, --diff-factor Divide difficulty by this factor (default 1.0) \n\
-m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\
@@ -278,6 +285,8 @@ Options:\n\
long polling is unavailable, in seconds (default: 10)\n\
-n, --ndevs list cuda devices\n\
-N, --statsavg number of samples used to compute hashrate (default: 30)\n\
--coinbase-addr=ADDR payout address for solo mining\n\
--no-getwork disable getwork support\n\
--no-gbt disable getblocktemplate support (height check in solo)\n\
--no-longpoll disable X-Long-Polling support\n\
--no-stratum disable X-Stratum support\n\
@@ -329,6 +338,7 @@ struct option options[] = {
{ "background", 0, NULL, 'B' },
{ "benchmark", 0, NULL, 1005 },
{ "cert", 1, NULL, 1001 },
{ "coinbase-addr", 1, NULL, 1016 },
{ "config", 1, NULL, 'c' },
{ "cputest", 0, NULL, 1006 },
{ "cpu-affinity", 1, NULL, 1020 },
@@ -341,6 +351,7 @@ struct option options[] = {
{ "no-color", 0, NULL, 1002 },
{ "no-extranonce", 0, NULL, 1012 },
{ "no-gbt", 0, NULL, 1011 },
{ "no-getwork", 0, NULL, 1010 },
{ "no-longpoll", 0, NULL, 1003 },
{ "no-stratum", 0, NULL, 1007 },
{ "no-autotune", 0, NULL, 1004 }, // scrypt
@@ -394,6 +405,7 @@ struct option options[] = {
{ "diff-multiplier", 1, NULL, 'm' },
{ "diff-factor", 1, NULL, 'f' },
{ "diff", 1, NULL, 'f' }, // compat
{ "eco", 0, NULL, 1080 },
{ 0, 0, 0, 0 }
};
@@ -892,7 +904,65 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
if (check_dups)
hashlog_remember_submit(work, nonce);
} else {
}
else if (work->txs2)
{
char data_str[2 * sizeof(work->data) + 1];
char *req;
for (int i = 0; i < ARRAY_SIZE(work->data); i++)
be32enc(work->data + i, work->data[i]);
cbin2hex(data_str, (char *)work->data, 80);
if (work->workid) {
char *params;
val = json_object();
json_object_set_new(val, "workid", json_string(work->workid));
params = json_dumps(val, 0);
json_decref(val);
req = (char*)malloc(128 + 2 * 80 + strlen(work->txs2) + strlen(params));
sprintf(req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":4}\r\n",
data_str, work->txs2, params);
free(params);
}
else {
req = (char*)malloc(128 + 2 * 80 + strlen(work->txs2));
sprintf(req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":4}\r\n",
data_str, work->txs2);
}
val = json_rpc_call_pool(curl, pool, req, false, false, NULL);
free(req);
if (unlikely(!val)) {
applog(LOG_ERR, "submit_upstream_work json_rpc_call failed");
return false;
}
res = json_object_get(val, "result");
if (json_is_object(res)) {
char *res_str;
bool sumres = false;
void *iter = json_object_iter(res);
while (iter) {
if (json_is_null(json_object_iter_value(iter))) {
sumres = true;
break;
}
iter = json_object_iter_next(res, iter);
}
res_str = json_dumps(res, 0);
share_result(sumres, work->pooln, work->sharediff, res_str);
free(res_str);
}
else
share_result(json_is_null(res), work->pooln, work->sharediff, json_string_value(res));
json_decref(val);
}
else {
int data_size = 128;
int adata_sz = data_size / sizeof(uint32_t);
@@ -924,6 +994,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
/* issue JSON-RPC request */
val = json_rpc_call_pool(curl, pool, s, false, false, NULL);
free(str);
if (unlikely(!val)) {
applog(LOG_ERR, "submit_upstream_work json_rpc_call failed");
return false;
@@ -940,12 +1011,15 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
json_decref(val);
free(str);
}
return true;
}
#ifndef ORG
#define BLOCK_VERSION_CURRENT 7
#endif
/* simplified method to only get some extra infos in solo mode */
static bool gbt_work_decode(const json_t *val, struct work *work)
{
@@ -985,8 +1059,311 @@ static bool gbt_work_decode(const json_t *val, struct work *work)
return true;
}
#ifndef ORG
int varint_encode(unsigned char *p, uint64_t n)
{
int i;
if (n < 0xfd) {
p[0] = (uchar)n;
return 1;
}
if (n <= 0xffff) {
p[0] = 0xfd;
p[1] = n & 0xff;
p[2] = (uchar)(n >> 8);
return 3;
}
if (n <= 0xffffffff) {
p[0] = 0xfe;
for (i = 1; i < 5; i++) {
p[i] = n & 0xff;
n >>= 8;
}
return 5;
}
p[0] = 0xff;
for (i = 1; i < 9; i++) {
p[i] = n & 0xff;
n >>= 8;
}
return 9;
}
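/* Worked example (illustrative; follows Bitcoin's CompactSize rule):
   unsigned char buf[9];
   varint_encode(buf, 0xfc);   // 1 byte:  fc
   varint_encode(buf, 300);    // 3 bytes: fd 2c 01
   varint_encode(buf, 70000);  // 5 bytes: fe 70 11 01 00
*/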
static bool gbt_work_decode_full(const json_t *val, struct work *work)
{
int i, n;
uint32_t version, curtime, bits;
uint32_t prevhash[8];
uint32_t target[8];
int cbtx_size;
uchar *cbtx = NULL;
int tx_count, tx_size;
uchar txc_vi[9];
uchar(*merkle_tree)[32] = NULL;
bool coinbase_append = false;
bool submit_coinbase = false;
bool version_force = false;
bool version_reduce = false;
json_t *tmp, *txa;
bool rc = false;
tmp = json_object_get(val, "mutable");
if (tmp && json_is_array(tmp)) {
n = (int)json_array_size(tmp);
for (i = 0; i < n; i++) {
const char *s = json_string_value(json_array_get(tmp, i));
if (!s)
continue;
if (!strcmp(s, "coinbase/append"))
coinbase_append = true;
else if (!strcmp(s, "submit/coinbase"))
submit_coinbase = true;
else if (!strcmp(s, "version/force"))
version_force = true;
else if (!strcmp(s, "version/reduce"))
version_reduce = true;
}
}
tmp = json_object_get(val, "height");
if (!tmp || !json_is_integer(tmp)) {
applog(LOG_ERR, "JSON invalid height");
goto out;
}
work->height = (int)json_integer_value(tmp);
applog(LOG_BLUE, "Current block is %d", work->height);
tmp = json_object_get(val, "version");
if (!tmp || !json_is_integer(tmp)) {
applog(LOG_ERR, "JSON invalid version");
goto out;
}
version = (uint32_t)json_integer_value(tmp);
if ((version & 0xffU) > BLOCK_VERSION_CURRENT) {
if (version_reduce) {
version = (version & ~0xffU) | BLOCK_VERSION_CURRENT;
}
else if (allow_gbt && allow_getwork && !version_force) {
applog(LOG_DEBUG, "Switching to getwork, gbt version %d", version);
allow_gbt = false;
goto out;
}
else if (!version_force) {
applog(LOG_ERR, "Unrecognized block version: %u", version);
goto out;
}
}
if (unlikely(!jobj_binary(val, "previousblockhash", prevhash, sizeof(prevhash)))) {
applog(LOG_ERR, "JSON invalid previousblockhash");
goto out;
}
tmp = json_object_get(val, "curtime");
if (!tmp || !json_is_integer(tmp)) {
applog(LOG_ERR, "JSON invalid curtime");
goto out;
}
curtime = (uint32_t)json_integer_value(tmp);
if (unlikely(!jobj_binary(val, "bits", &bits, sizeof(bits)))) {
applog(LOG_ERR, "JSON invalid bits");
goto out;
}
/* find count and size of transactions */
txa = json_object_get(val, "transactions");
if (!txa || !json_is_array(txa)) {
applog(LOG_ERR, "JSON invalid transactions");
goto out;
}
tx_count = (int)json_array_size(txa);
tx_size = 0;
for (i = 0; i < tx_count; i++) {
const json_t *tx = json_array_get(txa, i);
const char *tx_hex = json_string_value(json_object_get(tx, "data"));
if (!tx_hex) {
applog(LOG_ERR, "JSON invalid transactions");
goto out;
}
tx_size += (int)(strlen(tx_hex) / 2);
}
/* build coinbase transaction */
tmp = json_object_get(val, "coinbasetxn");
if (tmp) {
const char *cbtx_hex = json_string_value(json_object_get(tmp, "data"));
cbtx_size = cbtx_hex ? (int)strlen(cbtx_hex) / 2 : 0;
cbtx = (uchar*)malloc(cbtx_size + 100);
if (cbtx_size < 60 || !hex2bin(cbtx, cbtx_hex, cbtx_size)) {
applog(LOG_ERR, "JSON invalid coinbasetxn");
goto out;
}
}
else {
int64_t cbvalue;
if (!pk_script_size) {
if (allow_getwork) {
applog(LOG_INFO, "No payout address provided, switching to getwork");
allow_gbt = false;
}
else
applog(LOG_ERR, "No payout address provided");
goto out;
}
tmp = json_object_get(val, "coinbasevalue");
if (!tmp || !json_is_number(tmp)) {
applog(LOG_ERR, "JSON invalid coinbasevalue");
goto out;
}
cbvalue = (int64_t)(json_is_integer(tmp) ? json_integer_value(tmp) : json_number_value(tmp));
cbtx = (uchar*)malloc(256);
le32enc((uint32_t *)cbtx, 1); /* version */
cbtx[4] = 1; /* in-counter */
memset(cbtx + 5, 0x00, 32); /* prev txout hash */
le32enc((uint32_t *)(cbtx + 37), 0xffffffff); /* prev txout index */
cbtx_size = 43;
/* BIP 34: height in coinbase */
for (n = work->height; n; n >>= 8)
cbtx[cbtx_size++] = n & 0xff;
cbtx[42] = cbtx_size - 43;
cbtx[41] = cbtx_size - 42; /* scriptsig length */
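/* Worked example (illustrative): for height 400000 (0x061a80) the loop
 * emits 80 1a 06, so the scriptSig begins with the push 03 80 1a 06 and
 * cbtx[41] becomes 4. A height whose top byte has the sign bit set would
 * need a trailing 0x00 under strict BIP34 number encoding, a case the
 * loop above does not add. */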
le32enc((uint32_t *)(cbtx + cbtx_size), 0xffffffff); /* sequence */
cbtx_size += 4;
cbtx[cbtx_size++] = 1; /* out-counter */
le32enc((uint32_t *)(cbtx + cbtx_size), (uint32_t)cbvalue); /* value */
le32enc((uint32_t *)(cbtx + cbtx_size + 4), cbvalue >> 32);
cbtx_size += 8;
cbtx[cbtx_size++] = (uint8_t)pk_script_size; /* txout-script length */
memcpy(cbtx + cbtx_size, pk_script, pk_script_size);
cbtx_size += (int)pk_script_size;
le32enc((uint32_t *)(cbtx + cbtx_size), 0); /* lock time */
cbtx_size += 4;
coinbase_append = true;
}
if (coinbase_append) {
unsigned char xsig[100];
int xsig_len = 0;
tmp = json_object_get(val, "coinbaseaux");
if (tmp && json_is_object(tmp)) {
void *iter = json_object_iter(tmp);
while (iter) {
unsigned char buf[100];
const char *s = json_string_value(json_object_iter_value(iter));
n = s ? (int)(strlen(s) / 2) : 0;
if (!s || n > 100 || !hex2bin(buf, s, n)) {
applog(LOG_ERR, "JSON invalid coinbaseaux");
break;
}
if (cbtx[41] + xsig_len + n <= 100) {
memcpy(xsig + xsig_len, buf, n);
xsig_len += n;
}
iter = json_object_iter_next(tmp, iter);
}
}
if (xsig_len) {
unsigned char *ssig_end = cbtx + 42 + cbtx[41];
int push_len = cbtx[41] + xsig_len < 76 ? 1 :
cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
n = xsig_len + push_len;
memmove(ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41]);
cbtx[41] += n;
if (push_len == 2)
*(ssig_end++) = 0x4c; /* OP_PUSHDATA1 */
if (push_len)
*(ssig_end++) = xsig_len;
memcpy(ssig_end, xsig, xsig_len);
cbtx_size += n;
}
}
n = varint_encode(txc_vi, 1 + tx_count);
work->txs2 = (char*)malloc(2 * (n + cbtx_size + tx_size) + 1);
cbin2hex(work->txs2, (char *)txc_vi, n);
cbin2hex(work->txs2 + 2 * n, (char *)cbtx, cbtx_size);
/* generate merkle root */
merkle_tree = (uchar(*)[32]) calloc(((1 + tx_count + 1) & ~1), 32);
sha256d(merkle_tree[0], cbtx, cbtx_size);
for (i = 0; i < tx_count; i++) {
tmp = json_array_get(txa, i);
const char *tx_hex = json_string_value(json_object_get(tmp, "data"));
const int tx_size = tx_hex ? (int)(strlen(tx_hex) / 2) : 0;
unsigned char *tx = (uchar*)malloc(tx_size);
if (!tx_hex || !hex2bin(tx, tx_hex, tx_size)) {
applog(LOG_ERR, "JSON invalid transactions");
free(tx);
goto out;
}
sha256d(merkle_tree[1 + i], tx, tx_size);
if (!submit_coinbase)
strcat(work->txs2, tx_hex);
}
n = 1 + tx_count;
while (n > 1) {
if (n % 2) {
memcpy(merkle_tree[n], merkle_tree[n - 1], 32);
++n;
}
n /= 2;
for (i = 0; i < n; i++)
sha256d(merkle_tree[i], merkle_tree[2 * i], 64);
}
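/* Note (illustrative): classic pairwise fold, duplicating the last hash on
 * odd counts; e.g. 2 transactions give 3 leaves (coinbase + 2 tx), padded
 * to 4, then 2, then the single root in merkle_tree[0]. */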
/* assemble block header */
work->data[0] = swab32(version);
for (i = 0; i < 8; i++)
work->data[8 - i] = le32dec(prevhash + i);
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_tree[0] + i);
work->data[17] = swab32(curtime);
work->data[18] = le32dec(&bits);
memset(work->data + 19, 0x00, 52);
work->data[20] = 0x80000000;
work->data[31] = 0x00000280;
if (unlikely(!jobj_binary(val, "target", target, sizeof(target)))) {
applog(LOG_ERR, "JSON invalid target");
goto out;
}
for (i = 0; i < ARRAY_SIZE(work->target); i++)
work->target[7 - i] = be32dec(target + i);
tmp = json_object_get(val, "workid");
if (tmp) {
if (!json_is_string(tmp)) {
applog(LOG_ERR, "JSON invalid workid");
goto out;
}
work->workid = strdup(json_string_value(tmp));
}
rc = true;
out:
/* Long polling */
tmp = json_object_get(val, "longpollid");
if (want_longpoll && json_is_string(tmp)) {
free(lp_id);
lp_id = strdup(json_string_value(tmp));
if (!have_longpoll) {
char *lp_uri;
tmp = json_object_get(val, "longpolluri");
lp_uri = json_is_string(tmp) ? strdup(json_string_value(tmp)) : rpc_url;
have_longpoll = true;
tq_push(thr_info[longpoll_thr_id].q, lp_uri);
}
}
free(merkle_tree);
free(cbtx);
return rc;
}
#endif
#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"
static const char *gbt_req =
static const char *gbt_req_ =
"{\"method\": \"getblocktemplate\", \"params\": [{"
// "\"capabilities\": " GBT_CAPABILITIES ""
"}], \"id\":9}\r\n";
@@ -998,7 +1375,7 @@ static bool get_blocktemplate(CURL *curl, struct work *work)
return false;
int curl_err = 0;
json_t *val = json_rpc_call_pool(curl, pool, gbt_req, false, false, &curl_err);
json_t *val = json_rpc_call_pool(curl, pool, gbt_req_, false, false, &curl_err);
if (!val && curl_err == -1) {
// when getblocktemplate is not supported, disable it
@@ -1068,8 +1445,19 @@ static bool get_mininginfo(CURL *curl, struct work *work)
return true;
}
#ifdef ORG
static const char *rpc_req =
"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n";
#else
static const char *getwork_req =
"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n";
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES "}], \"id\":0}\r\n";
#endif
static const char *gbt_lp_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
static bool get_upstream_work(CURL *curl, struct work *work)
{
@@ -1082,9 +1470,18 @@ static bool get_upstream_work(CURL *curl, struct work *work)
applog(LOG_DEBUG, "%s: want_longpoll=%d have_longpoll=%d",
__func__, want_longpoll, have_longpoll);
#ifndef ORG
int err;
start:
#endif
gettimeofday(&tv_start, NULL);
/* want_longpoll/have_longpoll required here to init/unlock the lp thread */
#ifdef ORG
val = json_rpc_call_pool(curl, pool, rpc_req, want_longpoll, have_longpoll, NULL);
#else
val = json_rpc_call_pool(curl, pool, allow_gbt ? gbt_req : getwork_req, want_longpoll, have_longpoll, &err);
#endif
gettimeofday(&tv_end, NULL);
if (have_stratum || unlikely(work->pooln != cur_pooln)) {
@@ -1093,10 +1490,39 @@ static bool get_upstream_work(CURL *curl, struct work *work)
return false;
}
#ifndef ORG
if (!allow_gbt && !allow_getwork) {
applog(LOG_ERR, "No usable protocol");
if (val)
json_decref(val);
return false;
}
if (allow_gbt && allow_getwork && !val && err == CURLE_OK) {
applog(LOG_NOTICE, "getblocktemplate failed, falling back to getwork");
allow_gbt = false;
goto start;
}
#endif
if (!val)
return false;
#ifndef ORG
if (allow_gbt) {
rc = gbt_work_decode_full(json_object_get(val, "result"), work);
if (!allow_gbt) {
json_decref(val);
goto start;
}
}
else {
#endif
rc = work_decode(json_object_get(val, "result"), work);
#ifndef ORG
}
#endif
if (opt_protocol && rc) {
timeval_subtract(&diff, &tv_end, &tv_start);
@@ -1720,8 +2146,8 @@ static void *miner_thread(void *userdata)
#endif
memcpy(&work, &g_work, sizeof(struct work));
nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
} else
nonceptr[0]++; //??
}
//else nonceptr[0]++; //??
if (opt_algo == ALGO_DECRED) {
// suprnova job_id check without data/target/height change...
@@ -2136,10 +2562,15 @@ static void *miner_thread(void *userdata)
}
}
if (rc > 0)
/* if (rc > 0)
work.scanned_to = work.nonces[0];
if (rc > 1)
work.scanned_to = max(work.nonces[0], work.nonces[1]);
*/
if (rc > 0)
work.scanned_to = start_nonce + hashes_done;
else {
work.scanned_to = max_nonce;
if (opt_debug && opt_benchmark) {
@@ -2209,6 +2640,7 @@ static void *miner_thread(void *userdata)
break;
}
}
nonceptr[0] = start_nonce + hashes_done;
}
out:
@@ -2278,6 +2710,7 @@ longpoll_retry:
while (!abort_flag) {
json_t *val = NULL, *soval;
char *req = NULL;
int err = 0;
if (opt_debug_threads)
@@ -2288,7 +2721,12 @@ longpoll_retry:
if (switchn != pool_switch_count)
goto need_reinit;
val = json_rpc_longpoll(curl, lp_url, pool, rpc_req, &err);
if (allow_gbt) {
req = (char*)malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1);
sprintf(req, gbt_lp_req, lp_id);
}
val = json_rpc_longpoll(curl, lp_url, pool, req ? req : getwork_req, &err);
if (allow_gbt) free(req);
if (have_stratum || switchn != pool_switch_count) {
if (val)
json_decref(val);
@@ -2552,6 +2990,109 @@ static void show_usage_and_exit(int status)
}
proper_exit(status);
}
static const char b58digits[] = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";
static bool b58dec(unsigned char *bin, size_t binsz, const char *b58)
{
size_t i, j;
uint64_t t;
uint32_t c;
uint32_t *outi;
size_t outisz = (binsz + 3) / 4;
int rem = binsz % 4;
uint32_t remmask = 0xffffffff << (8 * rem);
size_t b58sz = strlen(b58);
bool rc = false;
outi = (uint32_t *)calloc(outisz, sizeof(*outi));
for (i = 0; i < b58sz; ++i) {
for (c = 0; b58digits[c] != b58[i]; c++)
if (!b58digits[c])
goto out;
for (j = outisz; j--;) {
t = (uint64_t)outi[j] * 58 + c;
c = t >> 32;
outi[j] = t & 0xffffffff;
}
if (c || outi[0] & remmask)
goto out;
}
j = 0;
switch (rem) {
case 3:
*(bin++) = (outi[0] >> 16) & 0xff;
case 2:
*(bin++) = (outi[0] >> 8) & 0xff;
case 1:
*(bin++) = outi[0] & 0xff;
++j;
default:
break;
}
for (; j < outisz; ++j) {
be32enc((uint32_t *)bin, outi[j]);
bin += sizeof(uint32_t);
}
rc = true;
out:
free(outi);
return rc;
}
static int b58check(unsigned char *bin, size_t binsz, const char *b58)
{
unsigned char buf[32];
int i;
sha256d(buf, bin, (int)(binsz - 4));
if (memcmp(&bin[binsz - 4], buf, 4))
return -1;
/* Check number of zeros is correct AFTER verifying checksum
* (to avoid possibility of accessing the string beyond the end) */
for (i = 0; bin[i] == '\0' && b58[i] == '1'; ++i);
if (bin[i] == '\0' || b58[i] == '1')
return -3;
return bin[0];
}
size_t address_to_script(unsigned char *out, size_t outsz, const char *addr)
{
unsigned char addrbin[25];
int addrver;
size_t rv;
if (!b58dec(addrbin, sizeof(addrbin), addr))
return 0;
addrver = b58check(addrbin, sizeof(addrbin), addr);
if (addrver < 0)
return 0;
switch (addrver) {
case 5: /* Bitcoin script hash */
case 196: /* Testnet script hash */
if (outsz < (rv = 23))
return rv;
out[0] = 0xa9; /* OP_HASH160 */
out[1] = 0x14; /* push 20 bytes */
memcpy(&out[2], &addrbin[1], 20);
out[22] = 0x87; /* OP_EQUAL */
return rv;
default:
if (outsz < (rv = 25))
return rv;
out[0] = 0x76; /* OP_DUP */
out[1] = 0xa9; /* OP_HASH160 */
out[2] = 0x14; /* push 20 bytes */
memcpy(&out[3], &addrbin[1], 20);
out[23] = 0x88; /* OP_EQUALVERIFY */
out[24] = 0xac; /* OP_CHECKSIG */
return rv;
}
}
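/* Usage sketch (illustrative): a version-0 (P2PKH) address takes the default
 * branch and yields the canonical 25-byte script
 *   76 a9 14 <20-byte hash160> 88 ac
 * while versions 5/196 (script hash) select the 23-byte P2SH form; the
 * result is what --coinbase-addr stores into pk_script below. */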
void parse_arg(int key, char *arg)
{
@@ -2611,6 +3152,9 @@ void parse_arg(int key, char *arg)
case 1030: /* --api-remote */
opt_api_remote = 1;
break;
case 1080:
opt_eco_mode = true;
break;
case 'B':
opt_background = true;
break;
@@ -2946,9 +3490,19 @@ void parse_arg(int key, char *arg)
case 1009:
opt_shares_limit = atoi(arg);
break;
case 1010:
allow_getwork = false;
break;
case 1011:
allow_gbt = false;
break;
case 1016: /* --coinbase-addr */
pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg);
if (!pk_script_size) {
fprintf(stderr, "invalid address -- '%s'\n", arg);
show_usage_and_exit(1);
}
break;
case 1012:
opt_extranonce = false;
break;
@@ -3186,7 +3740,7 @@ static void parse_cmdline(int argc, char *argv[])
show_usage_and_exit(1);
}
if (opt_algo == ALGO_DECRED && opt_vote == 9999) {
if (opt_vote == 9999) {
opt_vote = 0; // default, don't vote
}
}

33
ccminer.vcxproj

@@ -41,10 +41,7 @@
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='Win32'">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.props" />
</ImportGroup>
<ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='x64'">
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
@@ -83,10 +80,10 @@
<CudaCompile>
<CInterleavedPTX>false</CInterleavedPTX>
<GenerateLineInfo>true</GenerateLineInfo>
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>255</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20</CodeGeneration>
<Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
</CudaCompile>
</ItemDefinitionGroup>
@@ -115,15 +112,16 @@
<CudaCompile>
<CInterleavedPTX>false</CInterleavedPTX>
<GenerateLineInfo>true</GenerateLineInfo>
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>255</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20</CodeGeneration>
<Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
<CudaLink>
<PerformDeviceLink>false</PerformDeviceLink>
<Optimization>O3</Optimization>
</CudaLink>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -158,16 +156,16 @@
</Link>
<CudaCompile>
<CInterleavedPTX>false</CInterleavedPTX>
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>255</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Optimization>O2</Optimization>
<Optimization>O3</Optimization>
</CudaCompile>
<CudaLink>
<GPUDebugInfo>false</GPUDebugInfo>
<Optimization>O3</Optimization>
<Optimization>O2</Optimization>
</CudaLink>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -201,10 +199,10 @@
</Link>
<CudaCompile>
<CInterleavedPTX>false</CInterleavedPTX>
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>255</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_20,sm_21</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20</CodeGeneration>
<Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
<Optimization>O3</Optimization>
<TargetMachinePlatform>64</TargetMachinePlatform>
@@ -250,6 +248,7 @@
<ClCompile Include="lyra2\Lyra2.c" />
<ClCompile Include="lyra2\Sponge.c" />
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" />
<ClInclude Include="neoscrypt\neoscrypt.h" />
<ClCompile Include="neoscrypt\neoscrypt.cpp" />
<ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
@@ -347,7 +346,6 @@
<ClInclude Include="uint256.h" />
<ClInclude Include="lyra2\Lyra2.h" />
<ClInclude Include="lyra2\Sponge.h" />
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
<ClInclude Include="quark\groestl_transf_quad.h" />
<ClInclude Include="quark\groestl_functions_quad.h" />
<ClInclude Include="quark\cuda_quark.h" />
@@ -527,10 +525,7 @@
<Text Include="README.txt" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='Win32'">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
</ImportGroup>
<ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='x64'">
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
</ImportGroup>
<!-- Copy the required dlls -->

6
ccminer.vcxproj.filters

@@ -437,9 +437,6 @@
<ClInclude Include="bignum.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
@@ -455,6 +452,9 @@
<ClInclude Include="x11\cuda_x11_simd512_sm2.cuh">
<Filter>Source Files\CUDA\x11</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CudaCompile Include="cuda.cpp">

2
configure.ac

@@ -1,4 +1,4 @@
AC_INIT([ccminer], [1.7.6], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_INIT([ccminer], [1.7.6-r10], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

30
cuda_helper.h

@@ -96,7 +96,6 @@ __device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uin
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
// Endian swap for 32 bit types
#ifdef __CUDA_ARCH__
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
{
@@ -471,6 +470,15 @@ static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) {
#endif
}
static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v)
{
uint2 result;
result.y = u.x ^ v.x;
result.x = u.y ^ v.y;
return result;
}
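/* Note: XOR then swap the 32-bit halves, i.e.
 * devectorize(eorswap32(u, v)) == ROTR64(devectorize(u) ^ devectorize(v), 32);
 * used to fuse "d ^= a; d = SWAPUINT2(d)" in Lyra2's G function. */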
/**
* uint2 direct ops by c++ operator definitions
*/
@@ -561,11 +569,9 @@ uint2 ROR2(const uint2 a, const int offset)
return result;
}
__device__ __forceinline__
uint2 ROL2(const uint2 a, const int offset)
{
#if __CUDA_ARCH__ >= 350
__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
uint2 result;
#if __CUDA_ARCH__ > 300
if (offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
@@ -574,14 +580,20 @@ uint2 ROL2(const uint2 a, const int offset)
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
}
return result;
}
#else
if (!offset)
result = a;
__inline__ __device__ uint2 ROL2(const uint2 v, const int n)
{
uint2 result;
if (!n)
result = v;
else
result = ROR2(a, 64 - offset);
#endif
result = ROR2(v, 64 - n);
return result;
}
#endif
__device__ __forceinline__
uint2 SWAPUINT2(uint2 value)

599
lyra2/cuda_lyra2.cu

@@ -6,36 +6,206 @@
#include <stdio.h>
#include <memory.h>
#define TPB50 16
#define TPB52 8
#define TPB52 32
#include "cuda_lyra2_sm2.cuh"
#include "cuda_lyra2_sm5.cuh"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 500
#define __CUDA_ARCH__ 520
#endif
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500
#include "cuda_vector_uint2x4.h"
#include "cuda_lyra2_vectors.h"
#define memshift 3
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c);
#endif
#define Nrow 8
#define Ncol 8
#define NcolMask 0x7
#define memshift 3
#define BUF_COUNT 0
__device__ uint2 *DMatrix;
__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads)
{
#if BUF_COUNT != 8
extern __shared__ uint2 shared_mem[];
const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
#endif
#if BUF_COUNT != 0
const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
#endif
#if BUF_COUNT == 8
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = *(DMatrix + d0 + j * threads * blockDim.x);
#elif BUF_COUNT == 0
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
#else
if (row < BUF_COUNT)
{
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = *(DMatrix + d0 + j * threads * blockDim.x);
}
else
{
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
#endif
}
__device__ uint2x4* DMatrix;
__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads)
{
#if BUF_COUNT != 8
extern __shared__ uint2 shared_mem[];
const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
#endif
#if BUF_COUNT != 0
const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
#endif
#if BUF_COUNT == 8
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + d0 + j * threads * blockDim.x) = data[j];
#elif BUF_COUNT == 0
#pragma unroll
for (int j = 0; j < 3; j++)
shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j];
#else
if (row < BUF_COUNT)
{
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + d0 + j * threads * blockDim.x) = data[j];
}
else
{
#pragma unroll
for (int j = 0; j < 3; j++)
shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j];
}
#endif
}
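/* Note (editor's reading of the BUF_COUNT indexing): LD4S/ST4S split the
 * 8x8 Lyra2 matrix between memories. Rows below BUF_COUNT live in DMatrix
 * (global), the rest in dynamic shared memory; with BUF_COUNT == 0 the whole
 * matrix is shared-memory resident, so the launch must reserve roughly
 * memshift * Nrow * Ncol * sizeof(uint2) bytes per (threadIdx.x, threadIdx.y)
 * slot. This is an inference from the indexing above, not a statement from
 * the commit. */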
#if __CUDA_ARCH__ >= 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
a1 = WarpShuffle(a1, b1, c);
a2 = WarpShuffle(a2, b2, c);
a3 = WarpShuffle(a3, b3, c);
}
#else
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;
__threadfence_block();
uint32_t buf = _ptr[thread];
_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
_ptr[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}
#endif
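/* Note: the pre-sm_30 fallback emulates __shfl through shared memory.
 * __threadfence_block() only orders the stores and loads; correctness also
 * assumes warp-synchronous execution inside the c-lane group, which holds
 * for the sm_2x targets this path is compiled for. */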
static __device__ __forceinline__
void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = SWAPUINT2(d);
c += d; b ^= c; b = ROR2(b, 24);
a += b; d ^= a; d = ROR2(d, 16);
a += b; d = eorswap32(a, d);
c += d; b ^= c; b = ROR24(b);
a += b; d ^= a; d = ROR16(d);
c += d; b ^= c; b = ROR2(b, 63);
}
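/* Note: these are the Blake2b G rotations (32, 24, 16, 63); eorswap32,
 * ROR24 and ROR16 realize the 32/24/16-bit cases as half-word swaps and
 * byte permutations instead of generic funnel shifts. */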
__device__ __forceinline__ void round_lyra(uint2 s[4])
{
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
}
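/* Note: the 16-word sponge state is spread over 4 lanes, s[0..3] holding one
 * column per lane; the +1/+2/+3 shuffles align the diagonals for the second
 * Gfunc and the +3/+2/+1 shuffles rotate the rows back. */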
static __device__ __forceinline__
void round_lyra(uint2x4* s)
{
@@ -50,21 +220,24 @@ void round_lyra(uint2x4* s)
}
static __device__ __forceinline__
void reduceDuplex(uint2x4 state[4], uint32_t thread)
void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
{
uint2 state1[3];
#if __CUDA_ARCH__ > 500
#pragma unroll
#endif
for (int i = 0; i < Nrow; i++)
{
uint2x4 state1[3];
ST4S(0, Ncol - i - 1, state, thread, threads);
const uint32_t ps1 = (256 * thread);
const uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread);
round_lyra(state);
}
#pragma unroll 4
for (int i = 0; i < 8; i++)
for (int i = 0; i < Nrow; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 - i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix+s1)[j]);
LD4S(state1, 0, i, thread, threads);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
@@ -72,208 +245,342 @@ void reduceDuplex(uint2x4 state[4], uint32_t thread)
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state1[j];
ST4S(1, Ncol - i - 1, state1, thread, threads);
}
}
static __device__ __forceinline__
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], uint32_t thread)
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads)
{
uint2x4 state1[3], state2[3];
const uint32_t ps1 = ( memshift*8 * rowIn + 256 * thread);
const uint32_t ps2 = ( memshift*8 * rowInOut + 256 * thread);
const uint32_t ps3 = (memshift*7 + memshift*8 * rowOut + 256 * thread);
uint2 state1[3], state2[3];
#pragma unroll 1
for (int i = 0; i < 8; i++)
for (int i = 0; i < Nrow; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
for (int j = 0; j < 3; j++)
state1[j]= __ldg4(&(DMatrix + s1)[j]);
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
for (int j = 0; j < 3; j++)
state2[j]= __ldg4(&(DMatrix + s2)[j]);
for (int j = 0; j < 3; j++) {
uint2x4 tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
state[j] ^= state1[j] + state2[j];
round_lyra(state);
for (int j = 0; j < 3; j++) {
const uint32_t s3 = ps3 - i*memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
(DMatrix + s3)[j] = state1[j];
}
((uint2*)state2)[0] ^= ((uint2*)state)[11];
ST4S(rowOut, Ncol - i - 1, state1, thread, threads);
for (int j = 0; j < 11; j++)
((uint2*)state2)[j+1] ^= ((uint2*)state)[j];
//grab data from the previous thread (while passing our data on to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
ST4S(rowInOut, i, state2, thread, threads);
}
}
static __device__ __forceinline__
void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread)
void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
const uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread);
const uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread);
const uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread);
#pragma unroll 1
for (int i = 0; i < 8; i++)
for (int i = 0; i < Nrow; i++)
{
uint2x4 state1[3], state2[3];
uint2 state1[3], state2[3];
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
for (int j = 0; j < 3; j++) {
state1[j] = __ldg4(&(DMatrix + s1)[j]);
state2[j] = __ldg4(&(DMatrix + s2)[j]);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
ST4S(rowInOut, i, state2, thread, threads);
LD4S(state1, rowOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] += state2[j];
state[j] ^= state1[j];
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
ST4S(rowOut, i, state1, thread, threads);
}
}
round_lyra(state);
static __device__ __forceinline__
void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
{
uint2 state1[3], state2[3], last[3];
LD4S(state1, 2, 0, thread, threads);
LD4S(last, rowInOut, 0, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + last[j];
((uint2*)state2)[0] ^= ((uint2*)state)[11];
round_lyra(state);
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (rowInOut == rowOut) {
for (int j = 0; j < 3; j++) {
state2[j] ^= state[j];
(DMatrix + s2)[j]=state2[j];
if (threadIdx.x == 0)
{
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
}
} else {
const uint32_t s3 = ps3 + i*memshift;
for (int j = 0; j < 3; j++) {
(DMatrix + s2)[j] = state2[j];
(DMatrix + s3)[j] ^= state[j];
else
{
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == 5)
{
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (int i = 1; i < Nrow; i++)
{
LD4S(state1, 2, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
#if __CUDA_ARCH__ == 500
__global__ __launch_bounds__(TPB50, 1)
#else
__global__ __launch_bounds__(TPB52, 2)
#endif
void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
__constant__ uint2x4 blake2b_IV[2] = {
0xf3bcc908lu, 0x6a09e667lu,
0x84caa73blu, 0xbb67ae85lu,
0xfe94f82blu, 0x3c6ef372lu,
0x5f1d36f1lu, 0xa54ff53alu,
0xade682d1lu, 0x510e527flu,
0x2b3e6c1flu, 0x9b05688clu,
0xfb41bd6blu, 0x1f83d9ablu,
0x137e2179lu, 0x5be0cd19lu
};
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint2x4 blake2b_IV[2] = {
{{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }},
{{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }}
};
if (thread < threads)
{
uint2x4 state[4];
((uint2*)state)[0] = __ldg(&g_hash[thread]);
((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]);
((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]);
state[1] = state[0];
state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<24; i++)
round_lyra(state); // 24 rounds, because 12 are not enough here
const uint32_t ps1 = (memshift * 7 + 256 * thread);
for (int i = 0; i < 8; i++)
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
}
}
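// Phase 2: the memory-hard middle of Lyra2. Each hash is processed by four
// cooperating threads (threadIdx.x selects the state column), so this kernel
// is launched with blockDim.x == 4.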
#if __CUDA_ARCH__ < 300
__global__ __launch_bounds__(TPB20, 1)
#elif __CUDA_ARCH__ < 500
__global__ __launch_bounds__(TPB30, 1)
#elif __CUDA_ARCH__ == 500
__global__ __launch_bounds__(TPB50, 1)
#else
__global__ __launch_bounds__(TPB52, 1)
#endif
void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
const uint32_t s1 = ps1 - memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra(state);
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]);
state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]);
state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]);
state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]);
reduceDuplex(state, thread, threads);
reduceDuplexRowSetup(1, 0, 2, state, thread, threads);
reduceDuplexRowSetup(2, 1, 3, state, thread, threads);
reduceDuplexRowSetup(3, 0, 4, state, thread, threads);
reduceDuplexRowSetup(4, 3, 5, state, thread, threads);
reduceDuplexRowSetup(5, 2, 6, state, thread, threads);
reduceDuplexRowSetup(6, 1, 7, state, thread, threads);
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(7, rowa, 0, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(0, rowa, 3, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(3, rowa, 6, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(6, rowa, 1, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(1, rowa, 4, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(4, rowa, 7, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(7, rowa, 2, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt_8(rowa, state, thread, threads);
DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0];
DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1];
DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2];
DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3];
}
}
reduceDuplex(state, thread);
reduceDuplexRowSetup(1, 0, 2, state, thread);
reduceDuplexRowSetup(2, 1, 3, state, thread);
reduceDuplexRowSetup(3, 0, 4, state, thread);
reduceDuplexRowSetup(4, 3, 5, state, thread);
reduceDuplexRowSetup(5, 2, 6, state, thread);
reduceDuplexRowSetup(6, 1, 7, state, thread);
uint32_t rowa = state[0].x.x & 7;
reduceDuplexRowt(7, rowa, 0, state, thread);
rowa = state[0].x.x & 7;
reduceDuplexRowt(0, rowa, 3, state, thread);
rowa = state[0].x.x & 7;
reduceDuplexRowt(3, rowa, 6, state, thread);
rowa = state[0].x.x & 7;
reduceDuplexRowt(6, rowa, 1, state, thread);
rowa = state[0].x.x & 7;
reduceDuplexRowt(1, rowa, 4, state, thread);
rowa = state[0].x.x & 7;
reduceDuplexRowt(4, rowa, 7, state, thread);
rowa = state[0].x.x & 7;
reduceDuplexRowt(7, rowa, 2, state, thread);
rowa = state[0].x.x & 7;
reduceDuplexRowt(2, rowa, 5, state, thread);
const int32_t shift = (memshift * 8 * rowa + 256 * thread);
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
uint28 state[4];
if (thread < threads)
{
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
g_hash[thread] = ((uint2*)state)[0];
g_hash[thread + threads] = ((uint2*)state)[1];
g_hash[thread + threads*2] = ((uint2*)state)[2];
g_hash[thread + threads*3] = ((uint2*)state)[3];
}
g_hash[thread + threads * 0] = state[0].x;
g_hash[thread + threads * 1] = state[0].y;
g_hash[thread + threads * 2] = state[0].z;
g_hash[thread + threads * 3] = state[0].w;
} //thread
}
#else
#if __CUDA_ARCH__ < 500
/* for unsupported SM arch */
__device__ void* DMatrix;
__global__ void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
#endif
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
#endif
__host__
void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
cuda_get_arch(thr_id);
int dev_id = device_map[thr_id % MAX_GPUS];
// just assign the device pointer allocated in main loop
cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}
__host__
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order)
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti)
{
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
if (device_sm[dev_id] == 500) tpb = TPB50;
if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not)
if (device_sm[dev_id] <= 300) tpb = TPB30;
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
if (cuda_arch[dev_id] >= 520) tpb = TPB52;
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid1((threads * 4 + tpb - 1) / tpb);
dim3 block1(4, tpb >> 2);
dim3 grid2((threads + 64 - 1) / 64);
dim3 block2(64);
dim3 grid3((threads + tpb - 1) / tpb);
dim3 block3(tpb);
if (device_sm[dev_id] >= 500)
lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_hash);
size_t shared_mem = 0;
//if (cuda_arch[dev_id] < 500) cudaFuncSetCacheConfig(lyra2_gpu_hash_32_2, cudaFuncCachePreferShared);
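// On SM 5.2 the whole 8x8 matrix lives in shared memory:
// 24 * 8 = 192 uint2 per thread (8 rows x 8 cols x 3 uint2 per lane).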
if (cuda_arch[dev_id] >= 520)
{
lyra2_gpu_hash_32_1 << <grid2, block2 >> > (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2 << <grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >> > (threads, startNounce, d_hash);
lyra2_gpu_hash_32_3 << <grid2, block2 >> > (threads, startNounce, (uint2*)d_hash);
}
else if (cuda_arch[dev_id] >= 500)
{
if (gtx750ti)
// reserve 8192 bytes to tune occupancy to 8 warps
shared_mem = 8192;
else
lyra2_gpu_hash_32_sm2 <<< grid, block >>> (threads, startNounce, d_hash);
// reserve 6144 bytes to tune occupancy to 10 warps
shared_mem = 6144;
lyra2_gpu_hash_32_1_sm5 << <grid2, block2 >> > (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2_sm5 << <grid1, block1, shared_mem >> > (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_3_sm5 << <grid2, block2 >> > (threads, startNounce, (uint2*)d_hash);
}
else
lyra2_gpu_hash_32_sm2 << < grid3, block3 >> > (threads, startNounce, d_hash);
}

7
lyra2/cuda_lyra2_sm2.cuh

@@ -3,15 +3,16 @@
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 300
#define __CUDA_ARCH__ 500
#endif
#include "cuda_helper.h"
#define TPB30 160
#define TPB20 160
#if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__)
__constant__ static uint2 blake2b_IV[8] = {
__constant__ static uint2 blake2b_IV_sm2[8] = {
{ 0xf3bcc908, 0x6a09e667 },
{ 0x84caa73b, 0xbb67ae85 },
{ 0xfe94f82b, 0x3c6ef372 },
@@ -149,7 +150,7 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h
#pragma unroll
for (int i = 0; i<8; i++) {
state[i + 8] = blake2b_IV[i];
state[i + 8] = blake2b_IV_sm2[i];
}
// blake2blyra x2

701
lyra2/cuda_lyra2_sm5.cuh

@@ -0,0 +1,701 @@
#include <memory.h>
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 500
#endif
#include "cuda_helper.h"
#define TPB50 32
#if __CUDA_ARCH__ == 500
#include "cuda_lyra2_vectors.h"
#define Nrow 8
#define Ncol 8
#define memshift 3
__device__ uint2 *DMatrix;
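// LD4S/ST4S stage row 0 of the matrix in dynamic shared memory (indexed per
// lane and per warp row); the remaining rows stay in global DMatrix.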
__device__ __forceinline__ uint2 LD4S(const int index)
{
extern __shared__ uint2 shared_mem[];
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
extern __shared__ uint2 shared_mem[];
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}
#if __CUDA_ARCH__ >= 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
a1 = WarpShuffle(a1, b1, c);
a2 = WarpShuffle(a2, b2, c);
a3 = WarpShuffle(a3, b3, c);
}
#else
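// No __shfl below SM 3.0: emulate the warp shuffles by bouncing values
// through shared memory, with __threadfence_block() ordering each
// store/load pair.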
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;
__threadfence_block();
uint32_t buf = _ptr[thread];
_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
_ptr[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}
#endif
static __device__ __forceinline__
void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = SWAPUINT2(d);
c += d; b ^= c; b = ROR2(b, 24);
a += b; d ^= a; d = ROR2(d, 16);
c += d; b ^= c; b = ROR2(b, 63);
}
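// One Lyra2 (Blake2b) round spread over 4 lanes: Gfunc mixes the columns,
// the first shuffle rotates s[1..3] so the second Gfunc covers the
// diagonals, and the reverse shuffle restores the column layout.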
__device__ __forceinline__ void round_lyra(uint2 s[4])
{
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
}
static __device__ __forceinline__
void round_lyra(uint2x4* s)
{
Gfunc(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc(s[0].w, s[1].x, s[2].y, s[3].z);
}
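// Setup phase for SM 5.0: absorb into row 0 (kept in shared memory via
// LD4S/ST4S), then derive rows 1..7 in global DMatrix; each loop below is
// one reduceDuplexRowSetup step, with its (rowIn, rowInOut, rowOut) triple
// noted in the comments.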
static __device__ __forceinline__
void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads)
{
uint2 state1[3], state2[3];
const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x;
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s0 + j, state[j]);
round_lyra(state);
}
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = LD4S(s0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j];
}
// 1, 0, 2
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(s0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s0 + j, state2[j]);
}
// 2, 1, 3
for (int i = 0; i < 8; i++)
{
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s2 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
}
// 3, 0, 4
for (int i = 0; i < 8; i++)
{
const uint32_t ls0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x;
const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s3 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(ls0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s0 + j*threads*blockDim.x) = state2[j];
}
// 4, 3, 5
for (int i = 0; i < 8; i++)
{
const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x;
const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s4 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s3 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s3 + j*threads*blockDim.x) = state2[j];
}
// 5, 2, 6
for (int i = 0; i < 8; i++)
{
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x;
const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s5 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
}
// 6, 1, 7
for (int i = 0; i < 8; i++)
{
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x;
const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s6 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x;
#pragma unroll 1
for (int i = 0; i < 8; i++)
{
uint2 state1[3], state2[3];
const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
}
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] += state2[j];
state[j] ^= state1[j];
}
round_lyra(state);
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
{
*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
*(DMatrix + s3 + j*threads*blockDim.x) ^= state[j];
}
}
}
static __device__ __forceinline__
void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x;
uint2 state1[3], last[3];
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x);
last[j] = *(DMatrix + ps2 + j*threads*blockDim.x);
}
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] += last[j];
state[j] ^= state1[j];
}
round_lyra(state);
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
}
else
{
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == 5)
{
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (int i = 1; i < 8; i++)
{
const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x);
round_lyra(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
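// SM 5.0 pipeline: _1 absorbs the 256-bit input and runs the first 24
// rounds, _2 does the memory-hard matrix phase with 4 lanes per hash,
// _3 squeezes the final 12 rounds back into g_hash.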
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint2x4 blake2b_IV[2] = {
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
};
if (thread < threads)
{
uint2x4 state[4];
((uint2*)state)[0] = __ldg(&g_hash[thread]);
((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]);
((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]);
state[1] = state[0];
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i < 24; i++)
round_lyra(state); // 24 rounds, because 12 are not enough here
((uint2x4*)DMatrix)[0 * threads + thread] = state[0];
((uint2x4*)DMatrix)[1 * threads + thread] = state[1];
((uint2x4*)DMatrix)[2 * threads + thread] = state[2];
((uint2x4*)DMatrix)[3 * threads + thread] = state[3];
}
}
__global__ __launch_bounds__(TPB50, 1)
void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
if (thread < threads)
{
uint2 state[4];
state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]);
reduceDuplexV5(state, thread, threads);
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(7, rowa, 0, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(0, rowa, 3, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(3, rowa, 6, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(6, rowa, 1, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(1, rowa, 4, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(4, rowa, 7, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(7, rowa, 2, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50_8(rowa, state, thread, threads);
DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0];
DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1];
DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2];
DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3];
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
g_hash[thread] = ((uint2*)state)[0];
g_hash[thread + threads] = ((uint2*)state)[1];
g_hash[thread + threads * 2] = ((uint2*)state)[2];
g_hash[thread + threads * 3] = ((uint2*)state)[3];
}
}
#else
/* stubs for the host pass, or when __CUDA_ARCH__ != 500 */
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
#endif

606
lyra2/cuda_lyra2v2.cu

@@ -2,35 +2,152 @@
#include <stdint.h>
#include <memory.h>
#define TPB52 8
#define TPB50 16
#include "cuda_lyra2v2_sm3.cuh"
#define TPB52 32
#define TPB50 32
#define TPB30 32
#define TPB20 32
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 500
#define __CUDA_ARCH__ 200
#endif
#if __CUDA_ARCH__ >= 500
#include "cuda_lyra2_vectors.h"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#if __CUDA_ARCH__ >= 300
__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c);
#endif
#endif
#define Nrow 4
#define Ncol 4
#define memshift 3
__device__ uint2x4 *DMatrix;
__device__ uint2x4 *DState;
__device__ __forceinline__ uint2 LD4S(const int index)
{
extern __shared__ uint2 shared_mem[];
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
extern __shared__ uint2 shared_mem[];
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}
__device__ __forceinline__
void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = SWAPUINT2(d);
c += d; b ^= c; b = ROR2(b, 24);
a += b; d ^= a; d = ROR2(d, 16);
a += b; d = eorswap32(a, d);
c += d; b ^= c; b = ROR24(b);
a += b; d ^= a; d = ROR16(d);
c += d; b ^= c; b = ROR2(b, 63);
}
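// WarpShuffle helpers: native __shfl on SM 3.0+, shared-memory emulation
// otherwise.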
#if __CUDA_ARCH__ >= 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
a1 = WarpShuffle(a1, b1, c);
a2 = WarpShuffle(a2, b2, c);
a3 = WarpShuffle(a3, b3, c);
}
#else
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;
__threadfence_block();
uint32_t buf = _ptr[thread];
_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
_ptr[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}
#endif
__device__ __forceinline__ void round_lyra_v35(uint2 s[4])
{
Gfunc_v5(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
Gfunc_v5(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
}
__device__ __forceinline__
void round_lyra_v5(uint2x4* s)
{
@@ -45,145 +162,142 @@ void round_lyra_v5(uint2x4* s)
Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z);
}
__device__ __forceinline__
void reduceDuplex(uint2x4 state[4], const uint32_t thread)
__device__ __forceinline__ void reduceDuplexRowSetupV2(uint2 state[4])
{
uint2x4 state1[3];
const uint32_t ps1 = (Nrow * Ncol * memshift * thread);
const uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread);
int i, j;
uint2 state1[Ncol][3], state0[Ncol][3], state2[3];
#pragma unroll 4
#if __CUDA_ARCH__ > 500
#pragma unroll
#endif
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + i*memshift;
uint32_t s2 = ps2 - i*memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix+s1)[j]);
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] = state[j];
round_lyra_v35(state);
}
//#pragma unroll 4
for (i = 0; i < Ncol; i++)
{
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
for (j = 0; j < 3; j++)
state[j] ^= state0[i][j];
round_lyra_v5(state);
round_lyra_v35(state);
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] = state0[i][j];
#pragma unroll
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state1[j];
}
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] ^= state[j];
}
__device__ __forceinline__
void reduceDuplex50(uint2x4 state[4], const uint32_t thread)
for (i = 0; i < Ncol; i++)
{
const uint32_t ps1 = (Nrow * Ncol * memshift * thread);
const uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread);
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[i][j];
#pragma unroll 4
for (int i = 0; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const int32_t s2 = ps2 - i*memshift;
round_lyra_v35(state);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + s1)[j]);
round_lyra_v5(state);
for (j = 0; j < 3; j++)
state2[j] = state1[i][j];
#pragma unroll
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = __ldg4(&(DMatrix + s1)[j]) ^ state[j];
}
}
for (j = 0; j < 3; j++)
state2[j] ^= state[j];
__device__ __forceinline__
void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], const uint32_t thread)
{
uint2x4 state2[3], state1[3];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread);
const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread);
const uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread);
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
for (int i = 0; i < Ncol; i++)
if (threadIdx.x == 0)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
const uint32_t s3 = ps3 - i*memshift;
#if __CUDA_ARCH__ == 500
state0[i][0] ^= Data2;
state0[i][1] ^= Data0;
state0[i][2] ^= Data1;
}
else
{
state0[i][0] ^= Data0;
state0[i][1] ^= Data1;
state0[i][2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] = state[j] ^ (__ldg4(&(DMatrix + s1)[j]) + __ldg4(&(DMatrix + s2)[j]));
for (j = 0; j < 3; j++)
ST4S(s0 + j, state0[i][j]);
round_lyra_v5(state);
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (j = 0; j < 3; j++)
state0[i][j] = state2[j];
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2)[j]);
}
#pragma unroll
for (int j = 0; j < 3; j++)
for (i = 0; i < Ncol; i++)
{
state1[j] ^= state[j];
(DMatrix + s3)[j] = state1[j];
}
const uint32_t s1 = memshift * Ncol * 1 + i*memshift;
const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[Ncol - i - 1][j];
#else /* 5.2 */
round_lyra_v35(state);
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] ^= state[j];
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2)[j]);
#pragma unroll
for (int j = 0; j < 3; j++)
{
uint2x4 tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
for (j = 0; j < 3; j++)
ST4S(s3 + j, state0[Ncol - i - 1][j]);
round_lyra_v5(state);
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
#pragma unroll
for (int j = 0; j < 3; j++)
if (threadIdx.x == 0)
{
state1[j] ^= state[j];
(DMatrix + s3)[j] = state1[j];
state1[i][0] ^= Data2;
state1[i][1] ^= Data0;
state1[i][2] ^= Data1;
}
else
{
state1[i][0] ^= Data0;
state1[i][1] ^= Data1;
state1[i][2] ^= Data2;
}
#endif
((uint2*)state2)[0] ^= ((uint2*)state)[11];
#pragma unroll
for (int j = 0; j < 11; j++)
((uint2*)state2)[j+1] ^= ((uint2*)state)[j];
for (j = 0; j < 3; j++)
ST4S(s1 + j, state1[i][j]);
#pragma unroll
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
__device__ __forceinline__
void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread)
__device__ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4])
{
uint2x4 state1[3], state2[3];
const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread);
const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread);
const uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread);
uint2 state1[3], state2[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
const uint32_t ps3 = memshift * Ncol * rowOut;
for (int i = 0; i < Ncol; i++)
{
@@ -193,188 +307,266 @@ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, u
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
state1[j] = LD4S(s1 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2)[j]);
state2[j] = LD4S(s2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] += state2[j];
state[j] ^= state1[j] + state2[j];
round_lyra_v35(state);
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
ST4S(s3 + j, LD4S(s3 + j) ^ state[j]);
}
}
round_lyra_v5(state);
__device__ void reduceDuplexRowtV2_4(const int rowInOut, uint2 state[4])
{
const int rowIn = 2;
const int rowOut = 3;
((uint2*)state2)[0] ^= ((uint2*)state)[11];
int i, j;
uint2 state2[3], state1[3], last[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
const uint32_t ps3 = memshift * Ncol * rowOut;
#pragma unroll
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
for (int j = 0; j < 3; j++)
last[j] = LD4S(ps2 + j);
#if __CUDA_ARCH__ == 500
if (rowInOut != rowOut)
{
#pragma unroll
for (int j = 0; j < 3; j++)
(DMatrix + s3)[j] ^= state[j];
state[j] ^= LD4S(ps1 + j) + last[j];
round_lyra_v35(state);
//fetch data from the previous thread (and simultaneously send data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
}
else
{
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == rowOut)
{
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (j = 0; j < 3; j++)
last[j] ^= state[j];
}
#else
if (rowInOut != rowOut)
for (i = 1; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
(DMatrix + s3)[j] ^= state[j];
} else {
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (j = 0; j < 3; j++)
state[j] ^= LD4S(s1 + j) + LD4S(s2 + j);
round_lyra_v35(state);
}
#endif
#pragma unroll
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
state[j] ^= last[j];
}
#if __CUDA_ARCH__ == 500
__global__ __launch_bounds__(TPB50, 1)
#else
__global__ __launch_bounds__(TPB52, 1)
#endif
void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash)
__constant__ uint28 blake2b_IV[2] = {
0xf3bcc908lu, 0x6a09e667lu,
0x84caa73blu, 0xbb67ae85lu,
0xfe94f82blu, 0x3c6ef372lu,
0x5f1d36f1lu, 0xa54ff53alu,
0xade682d1lu, 0x510e527flu,
0x2b3e6c1flu, 0x9b05688clu,
0xfb41bd6blu, 0x1f83d9ablu,
0x137e2179lu, 0x5be0cd19lu
};
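// Lyra2v2 sponge padding: three 0x20 words (the 32-byte output, password
// and salt lengths), time cost 1, 4 rows, 4 columns, then the
// 0x80 / 0x01000000 end-of-message padding bits.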
__constant__ uint28 Mask[2] = {
0x00000020lu, 0x00000000lu,
0x00000020lu, 0x00000000lu,
0x00000020lu, 0x00000000lu,
0x00000001lu, 0x00000000lu,
0x00000004lu, 0x00000000lu,
0x00000004lu, 0x00000000lu,
0x00000080lu, 0x00000000lu,
0x00000000lu, 0x01000000lu
};
__global__ __launch_bounds__(64, 1)
void lyra2v2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint2x4 blake2b_IV[2];
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (threadIdx.x == 0) {
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
);
}
uint28 state[4];
if (thread < threads)
{
uint2x4 state[4];
((uint2*)state)[0] = __ldg(&g_hash[thread]);
((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]);
((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]);
state[1] = state[0];
state[2] = ((blake2b_IV)[0]);
state[3] = ((blake2b_IV)[1]);
state[0].x = state[1].x = __ldg(&outputHash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&outputHash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&outputHash[thread + threads * 2]);
state[0].w = state[1].w = __ldg(&outputHash[thread + threads * 3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
((uint2*)state)[0].x ^= 0x20;
((uint2*)state)[1].x ^= 0x20;
((uint2*)state)[2].x ^= 0x20;
((uint2*)state)[3].x ^= 0x01;
((uint2*)state)[4].x ^= 0x04;
((uint2*)state)[5].x ^= 0x04;
((uint2*)state)[6].x ^= 0x80;
((uint2*)state)[7].y ^= 0x01000000;
state[0] ^= Mask[0];
state[1] ^= Mask[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
const uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread);
DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x] = state[0];
DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x] = state[1];
DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x] = state[2];
DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x] = state[3];
for (int i = 0; i < Ncol; i++)
{
const uint32_t s1 = ps1 - memshift * i;
DMatrix[s1] = state[0];
DMatrix[s1+1] = state[1];
DMatrix[s1+2] = state[2];
round_lyra_v5(state);
} //thread
}
reduceDuplex50(state, thread);
#if __CUDA_ARCH__ < 300
__global__ __launch_bounds__(TPB20, 1)
#elif __CUDA_ARCH__ < 500
__global__ __launch_bounds__(TPB30, 1)
#elif __CUDA_ARCH__ == 500
__global__ __launch_bounds__(TPB50, 1)
#else
__global__ __launch_bounds__(TPB52, 1)
#endif
void lyra2v2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
state[0] = ((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[1] = ((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[2] = ((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[3] = ((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
reduceDuplexRowSetupV2(1, 0, 2, state, thread);
reduceDuplexRowSetupV2(2, 1, 3, state, thread);
reduceDuplexRowSetupV2(state);
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
for (int i = 0; i < 3; i++)
{
rowa = ((uint2*)state)[0].x & 3;
reduceDuplexRowtV2(prev, rowa, i, state, thread);
rowa = WarpShuffle(state[0].x, 0, 4) & 3;
reduceDuplexRowtV2(prev, rowa, i, state);
prev = i;
}
const uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread);
rowa = WarpShuffle(state[0].x, 0, 4) & 3;
reduceDuplexRowtV2_4(rowa, state);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0];
((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1];
((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2];
((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3];
} //thread
}
__global__ __launch_bounds__(64, 1)
void lyra2v2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint28 state[4];
if (thread < threads)
{
state[0] = __ldg4(&DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x]);
state[1] = __ldg4(&DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x]);
state[2] = __ldg4(&DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x]);
state[3] = __ldg4(&DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x]);
for (int i = 0; i < 12; i++)
round_lyra_v5(state);
g_hash[thread] = ((uint2*)state)[0];
g_hash[thread + threads] = ((uint2*)state)[1];
g_hash[thread + threads*2] = ((uint2*)state)[2];
g_hash[thread + threads*3] = ((uint2*)state)[3];
}
outputHash[thread + threads * 0] = state[0].x;
outputHash[thread + threads * 1] = state[0].y;
outputHash[thread + threads * 2] = state[0].z;
outputHash[thread + threads * 3] = state[0].w;
} //thread
}
#else
#include "cuda_helper.h"
#if __CUDA_ARCH__ < 200
__device__ void* DMatrix;
#endif
__global__ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
#endif
__host__
void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
cuda_get_arch(thr_id);
int dev_id = device_map[thr_id % MAX_GPUS];
// just assign the device pointer allocated in main loop
cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(DState, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}
__host__
void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order)
{
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
if (cuda_arch[dev_id] > 500) tpb = TPB52;
else if (cuda_arch[dev_id] == 500) tpb = TPB50;
else if (cuda_arch[dev_id] >= 350) tpb = TPB35;
else if (cuda_arch[dev_id] >= 300) tpb = TPB30;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
dim3 grid1((threads * 4 + tpb - 1) / tpb);
dim3 block1(4, tpb >> 2);
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
lyra2v2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, (uint2*)g_hash);
else
lyra2v2_gpu_hash_32_v3 <<<grid, block>>> (threads, startNounce, (uint2*)g_hash);
dim3 grid2((threads + 64 - 1) / 64);
dim3 block2(64);
if (cuda_arch[dev_id] < 500)
cudaFuncSetCacheConfig(lyra2v2_gpu_hash_32_2, cudaFuncCachePreferShared);
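// _2 runs 4 lanes per hash; 48 uint2 per thread (4 rows x 4 cols x
// 3 uint2) keep the whole Lyra2v2 matrix in shared memory.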
lyra2v2_gpu_hash_32_1 << <grid2, block2 >> > (threads, startNounce, (uint2*)g_hash);
lyra2v2_gpu_hash_32_2 << <grid1, block1, 48 * sizeof(uint2) * tpb >> > (threads, startNounce, g_hash);
lyra2v2_gpu_hash_32_3 << <grid2, block2 >> > (threads, startNounce, (uint2*)g_hash);
//MyStreamSynchronize(NULL, order, thr_id);
}

338
lyra2/cuda_lyra2v2_sm3.cuh

@@ -1,338 +0,0 @@
/* SM 2/3/3.5 Variant for lyra2REv2 */
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 350
#endif
#define TPB20 64
#define TPB30 64
#define TPB35 64
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500
#include "cuda_lyra2_vectors.h"
#define Nrow 4
#define Ncol 4
#define vectype ulonglong4
#define memshift 4
__device__ vectype *DMatrix;
static __device__ __forceinline__
void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
{
a += b; d ^= a; d = ROTR64(d, 32);
c += d; b ^= c; b = ROTR64(b, 24);
a += b; d ^= a; d = ROTR64(d, 16);
c += d; b ^= c; b = ROTR64(b, 63);
}
static __device__ __forceinline__
void round_lyra_v35(vectype* s)
{
Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
}
static __device__ __forceinline__
void reduceDuplexV3(vectype state[4], uint32_t thread)
{
vectype state1[3];
uint32_t ps1 = (Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread);
#pragma unroll 4
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i *memshift;
uint32_t s2 = ps2 - Nrow * i *memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state1[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread)
{
vectype state2[3], state1[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread);
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow*i*memshift;
uint32_t s2 = ps2 + Nrow*i*memshift;
uint32_t s3 = ps3 - Nrow*i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1 )[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2 )[j]);
for (int j = 0; j < 3; j++) {
vectype tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
round_lyra_v35(state);
for (int j = 0; j < 3; j++) {
state1[j] ^= state[j];
(DMatrix + s3)[j] = state1[j];
}
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread)
{
vectype state1[3], state2[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread);
#pragma nounroll
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i*memshift;
uint32_t s2 = ps2 + Nrow * i*memshift;
uint32_t s3 = ps3 + Nrow * i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2)[j]);
for (int j = 0; j < 3; j++)
state1[j] += state2[j];
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
if (rowInOut != rowOut) {
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
for (int j = 0; j < 3; j++)
(DMatrix + s3)[j] ^= state[j];
} else {
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
}
#if __CUDA_ARCH__ >= 300
__global__ __launch_bounds__(TPB35, 1)
void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
if (threadIdx.x == 0) {
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0,
0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000
);
}
if (thread < threads)
{
((uint2*)state)[0] = __ldg(&outputHash[thread]);
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);
state[1] = state[0];
state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0);
state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= shuffle4(((vectype*)padding)[0], 0);
state[1] ^= shuffle4(((vectype*)padding)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#elif __CUDA_ARCH__ >= 200
__global__ __launch_bounds__(TPB20, 1)
void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
);
if (thread < threads)
{
((uint2*)state)[0] = outputHash[thread];
((uint2*)state)[1] = outputHash[thread + threads];
((uint2*)state)[2] = outputHash[thread + 2 * threads];
((uint2*)state)[3] = outputHash[thread + 3 * threads];
state[1] = state[0];
state[2] = ((vectype*)blake2b_IV)[0];
state[3] = ((vectype*)blake2b_IV)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= ((vectype*)padding)[0];
state[1] ^= ((vectype*)padding)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
rowa = ((uint2*)state)[0].x & 3;
reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#endif
#else
/* host & sm5+ */
__global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
#endif
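/* Editor's sketch, not part of this commit: a minimal host-side launcher for
   the kernel above. The wrapper name and the grid shape (one GPU thread per
   candidate nonce) are assumptions for illustration only. */
#if 0
__host__ void lyra2v2_cpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *d_outputHash)
{
	const uint32_t tpb = TPB35; // matches __launch_bounds__(TPB35, 1) above
	dim3 grid((threads + tpb - 1) / tpb); // round up so every nonce is covered
	dim3 block(tpb);
	lyra2v2_gpu_hash_32_v3 <<<grid, block>>> (threads, startNounce, d_outputHash);
}
#endif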

63
lyra2/lyra2RE.cu

@@ -23,7 +23,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti);
extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
@@ -85,30 +85,49 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 17 : 16;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
ptarget[7] = 0x000f;
ptarget[7] = 0x00ff;
static bool gtx750ti;
static uint32_t throughput[MAX_GPUS];
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
int dev_id = device_map[thr_id];
cudaSetDevice(dev_id);
CUDA_LOG_ERROR();
blake256_cpu_init(thr_id, throughput);
keccak256_cpu_init(thr_id,throughput);
skein256_cpu_init(thr_id, throughput);
groestl256_cpu_init(thr_id, throughput);
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16;
if (device_sm[dev_id] == 500) intensity = 15;
int temp = intensity;
throughput[thr_id] = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
if (init[thr_id]) throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce);
// DMatrix
cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput);
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, dev_id);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
if (strstr(props.name, "750 Ti")) gtx750ti = true;
else gtx750ti = false;
//blake256_cpu_init(thr_id, throughput);
keccak256_cpu_init(thr_id, throughput[thr_id]);
skein256_cpu_init(thr_id, throughput[thr_id]);
groestl256_cpu_init(thr_id, throughput[thr_id]);
if (device_sm[dev_id] >= 500)
{
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
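// Per-thread scratch: 128 bytes (8*4*4) on SM 5.2+ versus 6 KiB (8*8*8*3*4)
// on SM 5.0, where the full 8x8 matrix stays in global memory.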
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id]));
lyra2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]);
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id]));
init[thr_id] = true;
if (throughput[thr_id] == (1U << temp)){
gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads",
intensity, throughput[thr_id]);
}
}
uint32_t _ALIGN(128) endiandata[20];
@@ -122,15 +141,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
int order = 0;
uint32_t foundNonce;
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blake256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
keccak256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], gtx750ti);
skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("S")
*hashes_done = pdata[19] - first_nonce + throughput;
*hashes_done = pdata[19] - first_nonce + throughput[thr_id];
foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
foundNonce = groestl256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
if (foundNonce != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash64[8];
@@ -162,11 +181,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
pdata[19] += throughput[thr_id];
} while (!work_restart[thr_id].restart);

178
lyra2/lyra2REv2.cu

@@ -10,6 +10,7 @@ extern "C" {
#include "miner.h"
#include "cuda_helper.h"
#include <math.h>
static uint64_t *d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];
@@ -20,6 +21,9 @@ extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_cpu_init(int thr_id, uint32_t threads);
extern void keccak256_cpu_free(int thr_id);
extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void blakeKeccakcube256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
@@ -27,10 +31,11 @@ extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t start
extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);
extern void bmw256_setTarget(const void *ptarget);
//extern void bmw256_setTarget(const void *ptarget);
extern void bmw256_cpu_init(int thr_id, uint32_t threads);
extern void bmw256_cpu_free(int thr_id);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target, uint32_t **result);
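// The target word now travels with each call; the old bmw256_setTarget()
// constant-memory setter (commented out above) is retired.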
void lyra2v2_hash(void *state, const void *input)
{
@@ -79,7 +84,7 @@ void lyra2v2_hash(void *state, const void *input)
uint32_t* debugbuf = NULL; \
cudaMallocHost(&debugbuf, 32); \
cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \
printf("lyra2 %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
printf("lyra2 %s %08x %08x %08x %08x...%08x... ¥n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \
cudaFreeHost(debugbuf); \
} \
@@ -89,23 +94,96 @@ void lyra2v2_hash(void *state, const void *input)
#endif
static bool init[MAX_GPUS] = { 0 };
static uint32_t throughput[MAX_GPUS] = { 0 };
extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18;
uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
ptarget[7] = 0x000f;
if (!init[thr_id])
{
size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
int dev_id = device_map[thr_id];
cudaDeviceProp props;
cudaGetDeviceProperties(&props, dev_id);
int intensity = 0;
// Pascal
if (strstr(props.name, "1080")) intensity = 22;
else if (strstr(props.name, "1070")) intensity = 21;
// Maxwell
else if (strstr(props.name, "TITAN X")) intensity = 21;
else if (strstr(props.name, "980")) intensity = 21;
else if (strstr(props.name, "970")) intensity = 20;
else if (strstr(props.name, "960")) intensity = 20;
else if (strstr(props.name, "950")) intensity = 19;
else if (strstr(props.name, "750 Ti")) intensity = 19;
else if (strstr(props.name, "750")) intensity = 18;
// Kepler to Fermi
else if (strstr(props.name, "TITAN Z")) intensity = 20;
else if (strstr(props.name, "TITAN")) intensity = 19;
else if (strstr(props.name, "780")) intensity = 19;
else if (strstr(props.name, "760")) intensity = 18;
else if (strstr(props.name, "730")) intensity = 16;
else if (strstr(props.name, "720")) intensity = 15;
else if (strstr(props.name, "710")) intensity = 16;
else if (strstr(props.name, "690")) intensity = 20;
else if (strstr(props.name, "680")) intensity = 19;
else if (strstr(props.name, "660")) intensity = 18;
else if (strstr(props.name, "650 Ti")) intensity = 18;
else if (strstr(props.name, "640")) intensity = 17;
else if (strstr(props.name, "630")) intensity = 16;
else if (strstr(props.name, "620")) intensity = 15;
else if (strstr(props.name, "90")) intensity = 18; //590
else if (strstr(props.name, "80")) intensity = 18; //480 580
else if (strstr(props.name, "70")) intensity = 18; //470 570 670 770
else if (strstr(props.name, "65")) intensity = 17; //465
else if (strstr(props.name, "60")) intensity = 17; //460 560
else if (strstr(props.name, "55")) intensity = 17; //555
else if (strstr(props.name, "50")) intensity = 17; //450 550Ti 650
else if (strstr(props.name, "45")) intensity = 16; //545
else if (strstr(props.name, "40")) intensity = 15; //440
else if (strstr(props.name, "30")) intensity = 15; //430 530
else if (strstr(props.name, "20")) intensity = 14; //420 520
else if (strstr(props.name, "10")) intensity = 14; //510 610
if (intensity != 0 && opt_eco_mode) intensity -= 3;
if (intensity == 0)
{
intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18;
throughput[thr_id] = cuda_default_throughput(dev_id, 1UL << (int)intensity);
}
else
{
//uint32_t adds = 0;
// double d = floor(intensity);
/* if ((intensity - d) > 0.0) {
adds = (uint32_t)floor((intensity - d) * (1 << (int)(d - 10.0)) * 1024;
throughput = (1 << (int)d) + adds;
gpulog(LOG_INFO, thr_id, "Adding %u threads to intensity %u, %u cuda threads",
adds, (int)d, throughput);
}
else if (gpus_intensity[n] != (1 << (int)intensity)) {
throughput = (1 << (int)intensity);
applog(LOG_INFO, "Intensity set to %u, %u cuda threads",
v, gpus_intensity[n]);
}
*/
uint32_t temp = 1UL << intensity;
throughput[thr_id] = cuda_default_throughput(dev_id, temp);
if (temp == throughput[thr_id])
{
gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads",
intensity, throughput[thr_id]);
}
}
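/* Editor's sketch, not part of this commit: the strstr() chain above could be
   table-driven so the name-to-intensity mapping lives in one place. Disabled
   with #if 0; the entries shown just mirror the first few branches above, and
   longer names must come before their prefixes ("750 Ti" before "750"). */
#if 0
static const struct { const char *name; int val; } intensity_map[] = {
	{ "1080", 22 }, { "1070", 21 }, { "TITAN X", 21 }, { "980", 21 },
	{ "970", 20 }, { "960", 20 }, { "950", 19 }, { "750 Ti", 19 },
};
for (size_t k = 0; k < sizeof(intensity_map) / sizeof(intensity_map[0]); k++) {
	if (strstr(props.name, intensity_map[k].name)) {
		intensity = intensity_map[k].val;
		break;
	}
}
#endif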
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
@@ -113,52 +191,84 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
blake256_cpu_init(thr_id, throughput);
keccak256_cpu_init(thr_id,throughput);
skein256_cpu_init(thr_id, throughput);
bmw256_cpu_init(thr_id, throughput);
//blake256_cpu_init(thr_id, throughput);
//keccak256_cpu_init(thr_id,throughput);
skein256_cpu_init(thr_id, throughput[thr_id]);
bmw256_cpu_init(thr_id, throughput[thr_id]);
// SM 3 implementation requires a bit more memory
if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
//if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300)
// matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
//else
size_t matrix_sz = sizeof(uint64_t) * 4 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id]));
lyra2v2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id]));
api_set_throughput(thr_id, throughput);
api_set_throughput(thr_id, throughput[thr_id]);
init[thr_id] = true;
}
else throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce);
uint32_t endiandata[20];
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
blake256_cpu_setBlock_80(pdata);
bmw256_setTarget(ptarget);
//bmw256_setTarget(ptarget);
//uint32_t *vhash64[2];
do {
int order = 0;
uint32_t foundNonces[2] = { 0, 0 };
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blakeKeccak256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
//blakeKeccakcube256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("blake :");
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
//keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("keccak :");
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("cube :");
lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("lyra2 :");
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("skein :");
cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("cube :");
bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces);
bmw256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], foundNonces, ptarget[7]);
//bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces, ptarget[7], vhash64);
*hashes_done = pdata[19] - first_nonce + throughput;
*hashes_done = pdata[19] - first_nonce + throughput[thr_id];
/*if (foundNonces[1] != 0)
{
if (fulltest(vhash64[0], ptarget))
{
gpulog(LOG_WARNING, thr_id, "result two foundNonces!");
pdata[19] = foundNonces[1];
pdata[21] = foundNonces[0];
work_set_target_ratio(work, vhash64[0]);
if (bn_hash_target_ratio(vhash64[1], ptarget) > work->shareratio) {
work_set_target_ratio(work, vhash64[1]);
}
return 2;
}
}
if (foundNonces[0] != 0)
{
if (fulltest(vhash64[0], ptarget))
{
gpulog(LOG_WARNING, thr_id, "result one foundNonce!");
pdata[19] = foundNonces[0];
work_set_target_ratio(work, vhash64[0]);
return 1;
}
}*/
if (foundNonces[0] != 0)
{
@@ -176,25 +286,25 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
be32enc(&endiandata[19], foundNonces[1]);
lyra2v2_hash(vhash64, endiandata);
pdata[21] = foundNonces[1];
xchg(pdata[19], pdata[21]);
if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) {
work_set_target_ratio(work, vhash64);
xchg(pdata[19], pdata[21]);
}
res++;
}
return res;
}
else
else if (vhash64[7] > ptarget[7])
{
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonces[0]);
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
pdata[19] += throughput[thr_id];
} while (!work_restart[thr_id].restart && !abort_flag);
@@ -214,7 +324,7 @@ extern "C" void free_lyra2v2(int thr_id)
cudaFree(d_matrix[thr_id]);
bmw256_cpu_free(thr_id);
keccak256_cpu_free(thr_id);
//keccak256_cpu_free(thr_id);
init[thr_id] = false;

4
miner.h

@@ -445,6 +445,7 @@ struct option {
#endif
extern int options_count();
extern bool opt_eco_mode;
extern bool opt_benchmark;
extern bool opt_debug;
extern bool opt_quiet;
@@ -646,6 +647,9 @@ struct work {
/* pok getwork txs */
uint32_t tx_count;
struct tx txs[POK_MAX_TXS];
char *txs2;
char *workid;
};
#define POK_BOOL_MASK 0x00008000

1778
neoscrypt/cuda_neoscrypt.cu

File diff suppressed because it is too large

4
neoscrypt/cuda_vectors.h

@@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
// require a uint32_t[9] ret array
// note: djm neoscrypt implementation is near the limits of gpu capabilities
// and weird behaviors can happen when tuning device functions code...
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
__device__ __forceinline__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint8_t *v = (uint8_t*) &vec4.s0;
uint8_t *r = (uint8_t*) ret;
@@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
#else
// same for SM 3.5+, really faster ?
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
__device__ __forceinline__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));

76
neoscrypt/neoscrypt.cpp

@@ -1,11 +1,14 @@
#include <cuda_runtime.h>
#include "miner.h"
#include "neoscrypt/neoscrypt.h"
#include <string.h>
#include <miner.h>
extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget);
extern void neoscrypt_cpu_init(int thr_id, uint32_t threads);
extern void neoscrypt_cpu_free(int thr_id);
extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order);
#include "neoscrypt.h"
extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);
extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);
extern void neoscrypt_free_2stream(int thr_id);
extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);
static bool init[MAX_GPUS] = { 0 };
@@ -18,6 +21,17 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
int dev_id = device_map[thr_id];
int intensity = is_windows() ? 18 : 19;
// Pascal
if (strstr(device_name[dev_id], "GTX 10")) intensity = 22;
// Maxwell
else if (strstr(device_name[dev_id], "TITAN X")) intensity = 21;
else if (strstr(device_name[dev_id], "980")) intensity = 21;
else if (strstr(device_name[dev_id], "970")) intensity = 20;
else if (strstr(device_name[dev_id], "960")) intensity = 20;
else if (strstr(device_name[dev_id], "950")) intensity = 19;
else if (strstr(device_name[dev_id], "750 Ti")) intensity = 19;
else if (strstr(device_name[dev_id], "750")) intensity = 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
throughput = throughput / 32; /* set for max intensity ~= 20 */
api_set_throughput(thr_id, throughput);
@@ -31,16 +45,20 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
{
cudaDeviceSynchronize();
cudaSetDevice(dev_id);
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
cudaGetLastError(); // reset errors if device is not "reset"
}
if (device_sm[dev_id] <= 300) {
applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices");
gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");
proper_exit(EXIT_CODE_CUDA_ERROR);
}
applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput);
neoscrypt_cpu_init(thr_id, throughput);
gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput);
neoscrypt_init_2stream(thr_id, throughput);
init[thr_id] = true;
}
@@ -48,7 +66,8 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
if (have_stratum) {
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
} else {
}
else {
for (int k = 0; k < 20; k++)
endiandata[k] = pdata[k];
}
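// Stratum delivers the block header byte-swapped relative to getwork, so only
// the stratum path runs it through be32enc before hashing.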
@@ -56,26 +75,30 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
neoscrypt_setBlockTarget(endiandata, ptarget);
do {
uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0);
if (foundNonce != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash64[8];
uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum);
*hashes_done = pdata[19] - first_nonce + 1;
*hashes_done = pdata[19] - first_nonce + throughput;
if (foundNonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
if (have_stratum) {
be32enc(&endiandata[19], foundNonce);
} else {
endiandata[19] = foundNonce;
be32enc(&endiandata[19], foundNonces[0]);
}
else {
endiandata[19] = foundNonces[0];
}
neoscrypt((uchar*)vhash64, (uchar*) endiandata, 0x80000620U);
neoscrypt((uchar*)vhash, (uchar*)endiandata, 0x80000620U);
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
work_set_target_ratio(work, vhash64);
pdata[19] = foundNonce;
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work_set_target_ratio(work, vhash);
pdata[19] = foundNonces[0];
return 1;
} else {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
}
else {
gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]);
}
}
@@ -100,8 +123,9 @@ void free_neoscrypt(int thr_id)
cudaThreadSynchronize();
neoscrypt_cpu_free(thr_id);
neoscrypt_free_2stream(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}

7
quark/cuda_quark_blake512_sp.cuh

@@ -21,12 +21,7 @@ static __device__ __forceinline__ uint2 cuda_swap(uint2 v) {
v.y = t;
return v;
}
static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) {
uint2 result;
result.y = u.x ^ v.x;
result.x = u.y ^ v.y;
return result;
}
__constant__ uint2 c_512_u2[16] =
{

2
util.cpp

@@ -559,7 +559,7 @@ static json_t *json_rpc_call(CURL *curl, const char *url,
res_val = json_object_get(val, "result");
err_val = json_object_get(val, "error");
if (!res_val || json_is_null(res_val) ||
if (!res_val || //json_is_null(res_val) ||
(err_val && !json_is_null(err_val))) {
char *s = NULL;
