GOSTCoin CUDA miner project, compatible with most nvidia cards, containing only gostd algo
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

683 lines
20 KiB

/* sp implementation of blake */
//#include <stdio.h>
//#include <memory.h>
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500
#include "cuda_vector_uint2x4.h"
#undef G
#define vectorizelow(/* uint32_t*/ v) make_uint2(v,0)
#define vectorizehigh(/*uint32_t*/ v) make_uint2(0,v)
static __device__ __forceinline__ uint2 cuda_swap(uint2 v) {
const uint32_t t = cuda_swab32(v.x);
v.x = cuda_swab32(v.y);
v.y = t;
return v;
}
static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) {
uint2 result;
result.y = u.x ^ v.x;
result.x = u.y ^ v.y;
return result;
}
static uint2* d_PaddedMessage80[MAX_GPUS]; // padded message (80 bytes + padding)
__constant__ uint2 c_PaddedM[16];
__constant__ uint2x4 Hostprecalc[4];
__constant__ uint2 c_512_u2[16] =
{
{ 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e },
{ 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 },
{ 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf },
{ 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 },
{ 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 },
{ 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed },
{ 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 },
{ 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 }
};
__constant__ uint8_t c_sigma[6][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
#define Gprecalc(a,b,c,d,idx1,idx2) { \
v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \
v[d] = eorswap32( v[d] , v[a]); \
v[c] += v[d]; \
v[b] = ROR2(v[b] ^ v[c], 25); \
v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \
v[d] = ROR16(v[d] ^ v[a]); \
v[c] += v[d]; \
v[b] = ROR2(v[b] ^ v[c], 11); \
}
#define GprecalcHost(a,b,c,d,idx1,idx2) { \
v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \
v[d] = ROTR64( v[d] ^ v[a],32); \
v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 25); \
v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \
v[d] = ROTR64(v[d] ^ v[a],16); \
v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 11); \
}
#define G(a,b,c,d,x) { \
uint32_t idx1 = c_sigma[i][x]; \
uint32_t idx2 = c_sigma[i][x+1]; \
v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \
v[d] = eorswap32(v[d] , v[a]); \
v[c] += v[d]; \
v[b] = ROR2( v[b] ^ v[c], 25); \
v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \
v[d] = ROR16( v[d] ^ v[a]); \
v[c] += v[d]; \
v[b] = ROR2( v[b] ^ v[c], 11); \
}
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(256, 1)
#endif
void quark_blake512_gpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2* g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
const uint32_t hashPosition = nounce - startNounce;
uint2 msg[16];
uint2x4 *phash = (uint2x4*)&g_hash[hashPosition * 8U];
uint2x4 *outpt = (uint2x4*)msg;
outpt[0] = phash[0];
outpt[1] = phash[1];
uint2 block[16];
block[0].x = cuda_swab32(msg[0].y);
block[0].y = cuda_swab32(msg[0].x);
block[1].x = cuda_swab32(msg[1].y);
block[1].y = cuda_swab32(msg[1].x);
block[2].x = cuda_swab32(msg[2].y);
block[2].y = cuda_swab32(msg[2].x);
block[3].x = cuda_swab32(msg[3].y);
block[3].y = cuda_swab32(msg[3].x);
block[4].x = cuda_swab32(msg[4].y);
block[4].y = cuda_swab32(msg[4].x);
block[5].x = cuda_swab32(msg[5].y);
block[5].y = cuda_swab32(msg[5].x);
block[6].x = cuda_swab32(msg[6].y);
block[6].y = cuda_swab32(msg[6].x);
block[7].x = cuda_swab32(msg[7].y);
block[7].y = cuda_swab32(msg[7].x);
block[8] = vectorizehigh(0x80000000);
block[9] = vectorizelow(0x0);
block[10] = vectorizelow(0x0);
block[11] = vectorizelow(0x0);
block[12] = vectorizelow(0x0);
block[13] = vectorizelow(0x1);
block[14] = vectorizelow(0x0);
block[15] = vectorizelow(0x200);
const uint2 h[8] = {
{ 0xf3bcc908UL, 0x6a09e667UL },
{ 0x84caa73bUL, 0xbb67ae85UL },
{ 0xfe94f82bUL, 0x3c6ef372UL },
{ 0x5f1d36f1UL, 0xa54ff53aUL },
{ 0xade682d1UL, 0x510e527fUL },
{ 0x2b3e6c1fUL, 0x9b05688cUL },
{ 0xfb41bd6bUL, 0x1f83d9abUL },
{ 0x137e2179UL, 0x5be0cd19UL }
};
uint2 v[16] = {
h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7],
c_512_u2[0], c_512_u2[1], c_512_u2[2], c_512_u2[3],
c_512_u2[4], c_512_u2[5], c_512_u2[6], c_512_u2[7]
};
v[12].x ^= 512U;
v[13].x ^= 512U;
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
Gprecalc(0, 4, 8, 12, 0x5, 0xc)
Gprecalc(1, 5, 9, 13, 0xf, 0x1)
Gprecalc(2, 6, 10, 14, 0xd, 0xe)
Gprecalc(3, 7, 11, 15, 0xa, 0x4)
Gprecalc(0, 5, 10, 15, 0x7, 0x0)
Gprecalc(1, 6, 11, 12, 0x3, 0x6)
Gprecalc(2, 7, 8, 13, 0x2, 0x9)
Gprecalc(3, 4, 9, 14, 0xb, 0x8)
Gprecalc(0, 4, 8, 12, 0xb, 0xd)
Gprecalc(1, 5, 9, 13, 0xe, 0x7)
Gprecalc(2, 6, 10, 14, 0x1, 0xc)
Gprecalc(3, 7, 11, 15, 0x9, 0x3)
Gprecalc(0, 5, 10, 15, 0x0, 0x5)
Gprecalc(1, 6, 11, 12, 0x4, 0xf)
Gprecalc(2, 7, 8, 13, 0x6, 0x8)
Gprecalc(3, 4, 9, 14, 0xa, 0x2)
Gprecalc(0, 4, 8, 12, 0xf, 0x6)
Gprecalc(1, 5, 9, 13, 0x9, 0xe)
Gprecalc(2, 6, 10, 14, 0x3, 0xb)
Gprecalc(3, 7, 11, 15, 0x8, 0x0)
Gprecalc(0, 5, 10, 15, 0x2, 0xc)
Gprecalc(1, 6, 11, 12, 0x7, 0xd)
Gprecalc(2, 7, 8, 13, 0x4, 0x1)
Gprecalc(3, 4, 9, 14, 0x5, 0xa)
Gprecalc(0, 4, 8, 12, 0x2, 0xa)
Gprecalc(1, 5, 9, 13, 0x4, 0x8)
Gprecalc(2, 6, 10, 14, 0x6, 0x7)
Gprecalc(3, 7, 11, 15, 0x5, 0x1)
Gprecalc(0, 5, 10, 15, 0xb, 0xf)
Gprecalc(1, 6, 11, 12, 0xe, 0x9)
Gprecalc(2, 7, 8, 13, 0xc, 0x3)
Gprecalc(3, 4, 9, 14, 0x0, 0xd)
#if __CUDA_ARCH__ == 500
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
#else
for (int i = 0; i < 6; i++)
{
/* column step */
G(0, 4, 8, 12, 0);
G(1, 5, 9, 13, 2);
G(2, 6, 10, 14, 4);
G(3, 7, 11, 15, 6);
/* diagonal step */
G(0, 5, 10, 15, 8);
G(1, 6, 11, 12, 10);
G(2, 7, 8, 13, 12);
G(3, 4, 9, 14, 14);
}
#endif
v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]);
v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]);
v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]);
v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]);
v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]);
v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]);
v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]);
v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]);
phash = (uint2x4*)v;
outpt = (uint2x4*)&g_hash[hashPosition * 8];
outpt[0] = phash[0];
outpt[1] = phash[1];
}
}
__global__
__launch_bounds__(128, 8)
void quark_blake512_gpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nounce = startNounce + thread;
uint2 block[16];
block[0] = c_PaddedM[0];
block[1] = c_PaddedM[1];
block[2] = c_PaddedM[2];
block[3] = c_PaddedM[3];
block[4] = c_PaddedM[4];
block[5] = c_PaddedM[5];
block[6] = c_PaddedM[6];
block[7] = c_PaddedM[7];
block[8] = c_PaddedM[8];
block[9].y = c_PaddedM[9].y;
block[10] = vectorizehigh(0x80000000);
block[11] = vectorizelow(0);
block[12] = vectorizelow(0);
block[13] = vectorizelow(0x1);
block[14] = vectorizelow(0);
block[15] = vectorizelow(0x280);
block[9].x = nounce;
const uint2 h[8] = {
{ 0xf3bcc908UL, 0x6a09e667UL },
{ 0x84caa73bUL, 0xbb67ae85UL },
{ 0xfe94f82bUL, 0x3c6ef372UL },
{ 0x5f1d36f1UL, 0xa54ff53aUL },
{ 0xade682d1UL, 0x510e527fUL },
{ 0x2b3e6c1fUL, 0x9b05688cUL },
{ 0xfb41bd6bUL, 0x1f83d9abUL },
{ 0x137e2179UL, 0x5be0cd19UL }
};
uint2 v[16];
uint2x4 *outpt = (uint2x4*)v;
outpt[0] = Hostprecalc[0];
outpt[1] = Hostprecalc[1];
outpt[2] = Hostprecalc[2];
outpt[3] = Hostprecalc[3];
v[0] += (block[9] ^ c_512_u2[8]);
v[15] = ROR16(v[15] ^ v[0]);
v[10] += v[15];
v[5] = ROR2(v[5] ^ v[10], 11);
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
// Gprecalc(1, 5, 9, 13, 0x8, 0x4)
v[1] += v[5];
v[13] = eorswap32(v[13], v[1]);
v[9] += v[13];
v[5] = ROR2(v[5] ^ v[9], 25);
v[1] += (block[8] ^ c_512_u2[4]) + v[5];
v[13] = ROR16(v[13] ^ v[1]);
v[9] += v[13];
v[5] = ROR2(v[5] ^ v[9], 11);
// Gprecalc(2, 6, 10, 14, 0xf, 0x9)
v[2] += (block[9] ^ c_512_u2[0xf]);
v[14] = eorswap32(v[14], v[2]);
v[10] += v[14];
v[6] = ROR2(v[6] ^ v[10], 25);
v[2] += (block[0xf] ^ c_512_u2[9]) + v[6];
v[14] = ROR16(v[14] ^ v[2]);
v[10] += v[14];
v[6] = ROR2(v[6] ^ v[10], 11);
// Gprecalc(3, 7, 11, 15, 0x6, 0xd)
v[15] = eorswap32( v[15] , v[3]);
v[11] += v[15];
v[7] = ROR2(v[7] ^ v[11], 25);
v[3] += (block[6] ^ c_512_u2[0xd]) + v[7];
v[15] = ROR16(v[15] ^ v[3]);
v[11] += v[15];
v[7] = ROR2(v[7] ^ v[11], 11);
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
Gprecalc(0, 4, 8, 12, 0x5, 0xc)
Gprecalc(1, 5, 9, 13, 0xf, 0x1)
Gprecalc(2, 6, 10, 14, 0xd, 0xe)
Gprecalc(3, 7, 11, 15, 0xa, 0x4)
Gprecalc(0, 5, 10, 15, 0x7, 0x0)
Gprecalc(1, 6, 11, 12, 0x3, 0x6)
Gprecalc(2, 7, 8, 13, 0x2, 0x9)
Gprecalc(3, 4, 9, 14, 0xb, 0x8)
Gprecalc(0, 4, 8, 12, 0xb, 0xd)
Gprecalc(1, 5, 9, 13, 0xe, 0x7)
Gprecalc(2, 6, 10, 14, 0x1, 0xc)
Gprecalc(3, 7, 11, 15, 0x9, 0x3)
Gprecalc(0, 5, 10, 15, 0x0, 0x5)
Gprecalc(1, 6, 11, 12, 0x4, 0xf)
Gprecalc(2, 7, 8, 13, 0x6, 0x8)
Gprecalc(3, 4, 9, 14, 0xa, 0x2)
Gprecalc(0, 4, 8, 12, 0xf, 0x6)
Gprecalc(1, 5, 9, 13, 0x9, 0xe)
Gprecalc(2, 6, 10, 14, 0x3, 0xb)
Gprecalc(3, 7, 11, 15, 0x8, 0x0)
Gprecalc(0, 5, 10, 15, 0x2, 0xc)
Gprecalc(1, 6, 11, 12, 0x7, 0xd)
Gprecalc(2, 7, 8, 13, 0x4, 0x1)
Gprecalc(3, 4, 9, 14, 0x5, 0xa)
Gprecalc(0, 4, 8, 12, 0x2, 0xa)
Gprecalc(1, 5, 9, 13, 0x4, 0x8)
Gprecalc(2, 6, 10, 14, 0x6, 0x7)
Gprecalc(3, 7, 11, 15, 0x5, 0x1)
Gprecalc(0, 5, 10, 15, 0xb, 0xf)
Gprecalc(1, 6, 11, 12, 0xe, 0x9)
Gprecalc(2, 7, 8, 13, 0xc, 0x3)
Gprecalc(3, 4, 9, 14, 0x0, 0xd)
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]);
v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]);
v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]);
v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]);
v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]);
v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]);
v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]);
v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]);
uint2x4 *phash = (uint2x4*)v;
outpt = (uint2x4*) &outputHash[thread * 8U];
outpt[0] = phash[0];
outpt[1] = phash[1];
}
}
// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
__host__ void quark_blake512_cpu_init_sp(int thr_id)
{
CUDA_SAFE_CALL(cudaMalloc(&d_PaddedMessage80[thr_id], 10 * sizeof(uint2)));
}
__host__ void quark_blake512_cpu_free_sp(int thr_id)
{
cudaFree(d_PaddedMessage80[thr_id]);
}
__host__ void quark_blake512_cpu_setBlock_80_sp(uint64_t *pdata)
{
uint64_t PaddedMessage[10];
for (int i = 0; i < 10; i++)
PaddedMessage[i] = cuda_swab64(pdata[i]);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedM, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
uint64_t block[16];
uint64_t *peker = (uint64_t *)&PaddedMessage[0];
block[0] = peker[0];
block[1] = peker[1];
block[2] = peker[2];
block[3] = peker[3];
block[4] = peker[4];
block[5] = peker[5];
block[6] = peker[6];
block[7] = peker[7];
block[8] = peker[8];
block[9] = peker[9];
block[10] = 0x8000000000000000;
block[11] = 0;
block[12] = 0;
block[13] = 1;
block[14] = 0;
block[15] = 0x280;
const uint64_t u512[16] = {
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
uint64_t h[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL
};
uint64_t v[16] = {
h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7],
u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640U, u512[5] ^ 640U, u512[6], u512[7]
};
GprecalcHost(0, 4, 8, 12, 0x1, 0x0)
GprecalcHost(1, 5, 9, 13, 0x3, 0x2)
GprecalcHost(2, 6, 10, 14, 0x5, 0x4)
GprecalcHost(3, 7, 11, 15, 0x7, 0x6)
GprecalcHost(1, 6, 11, 12, 0xb, 0xa)
GprecalcHost(2, 7, 8, 13, 0xd, 0xc)
v[0] += (block[8] ^ u512[9]) + v[5];
v[15] = ROTR64(v[15] ^ v[0], 32); \
v[10] += v[15];
v[5] = ROTR64(v[5] ^ v[10], 25);
v[0] += v[5];
GprecalcHost(3, 4, 9, 14, 0xf, 0xe);
v[1] += (block[0x4] ^ u512[0x8]);
v[2] += v[6];
v[3] += (block[0xd] ^ u512[6]) + v[7];
//applog_hash((unsigned char*) &v[0]);
//applog_hash((unsigned char*) &v[4]);
//applog_hash((unsigned char*) &v[8]);
//applog_hash((unsigned char*) &v[12]);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(Hostprecalc, v, 128, 0, cudaMemcpyHostToDevice));
}
#else
// __CUDA_ARCH__ < 500
__global__ void quark_blake512_gpu_hash_64_sp(uint32_t, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {}
__global__ void quark_blake512_gpu_hash_80_sp(uint32_t, uint32_t startNounce, uint2 *outputHash) {}
#endif
__host__
void quark_blake512_cpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash)
{
const uint32_t threadsperblock = 32;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
quark_blake512_gpu_hash_64_sp <<<grid, block>>>(threads, startNounce, d_nonceVector, (uint2*)d_outputHash);
}
__host__
void quark_blake512_cpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
{
const uint32_t threadsperblock = 64;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
quark_blake512_gpu_hash_80_sp <<<grid, block>>>(threads, startNounce, (uint2 *)d_outputHash);
}