use blake512 sp kernels on SM 5+ (80+64)

import and keep my code for older archs, like skein 64

reduce the gap between our versions...

+150kH x11   GTX 960 / +30kH  750Ti
+900kH quark GTX 960 / +230kH 750Ti
This commit is contained in:
Tanguy Pruvot 2015-10-24 10:35:38 +02:00
parent e12d666d36
commit d43dc9a021
14 changed files with 893 additions and 122 deletions

View File

@ -23,6 +23,7 @@ extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
@ -279,6 +280,7 @@ extern "C" void free_jackpot(int thr_id)
cudaFree(d_branch3Nonces[thr_id]);
cudaFree(d_jackpotNonces[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
jackpot_compactTest_cpu_free(thr_id);

View File

@ -314,6 +314,7 @@
<ClInclude Include="algos.h" />
<ClInclude Include="miner.h" />
<ClInclude Include="nvml.h" />
<ClInclude Include="quark\cuda_quark_blake512_sp.cuh" />
<ClInclude Include="quark\cuda_skein512_sp.cuh" />
<ClInclude Include="res\resource.h" />
<ClInclude Include="scrypt\salsa_kernel.h" />

View File

@ -428,6 +428,9 @@
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="quark\cuda_quark_blake512_sp.cuh">
<Filter>Source Files\CUDA\quark</Filter>
</ClInclude>
<ClInclude Include="quark\cuda_skein512_sp.cuh">
<Filter>Source Files\CUDA\quark</Filter>
</ClInclude>

View File

@ -14,6 +14,7 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
@ -175,6 +176,7 @@ extern "C" void free_nist5(int thr_id)
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;

View File

@ -52,7 +52,7 @@ static uint64_t __align__(32) c_data[32];
static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNounce[MAX_GPUS];
static uint32_t *h_resNounce[MAX_GPUS];
static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
static __thread uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
/* prefer uint32_t to prevent size conversions = speed +5/10 % */
__constant__
@ -375,16 +375,13 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000F;
ptarget[7] = 0x000F;
if (!init[thr_id]) {
if (active_gpus > 1) {
cudaSetDevice(device_map[thr_id]);
}
cudaSetDevice(device_map[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
init[thr_id] = true;
}

View File

@ -2,6 +2,7 @@
#include <memory.h>
#include <sys/types.h> // off_t
#include "miner.h"
#include "cuda_helper.h"
#define ROTR(x,n) ROTR64(x,n)
@ -14,23 +15,26 @@ static uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
__device__ __constant__
static const uint8_t c_sigma[16][16] = {
static const uint8_t c_sigma_big[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }};
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
__device__ __constant__
static const uint64_t c_u512[16] =
@ -59,7 +63,7 @@ static const uint64_t c_u512[16] =
}
__device__ __forceinline__
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
void quark_blake512_compress(uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
{
uint64_t v[16];
uint64_t m[16];
@ -157,7 +161,7 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
buf[i+8] = d_constHashPadding[i];
// Ending round
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
quark_blake512_compress(h, buf, c_sigma_big, c_u512, 512);
#if __CUDA_ARCH__ <= 350
uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U];
@ -201,7 +205,7 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
0x5be0cd19137e2179ULL
};
quark_blake512_compress(h, buf, c_sigma, c_u512, 640);
quark_blake512_compress(h, buf, c_sigma_big, c_u512, 640);
#if __CUDA_ARCH__ <= 350
uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U);
@ -219,52 +223,106 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
}
}
#define SP_KERNEL
#ifdef SP_KERNEL
void quark_blake512_cpu_setBlock_80_sp(uint64_t*);
void quark_blake512_cpu_init_sp(int thr_id);
void quark_blake512_cpu_free_sp(int thr_id);
#endif
#include "cuda_quark_blake512_sp.cuh"
__host__
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
{
#ifdef SP_KERNEL
int dev_id = device_map[thr_id];
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
quark_blake512_cpu_hash_64_sp(threads, startNounce, d_nonceVector, d_outputHash);
else
#endif
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
quark_blake512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
}
MyStreamSynchronize(NULL, order, thr_id);
}
__host__
void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
{
#ifdef SP_KERNEL
int dev_id = device_map[thr_id];
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
quark_blake512_cpu_hash_80_sp(threads, startNounce, d_outputHash);
else
#endif
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
quark_blake512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash);
}
}
// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
__host__
void quark_blake512_cpu_init(int thr_id, uint32_t threads)
{
// CUDA_SAFE_CALL(cudaGetLastError());
cuda_get_arch(thr_id);
#ifdef SP_KERNEL
int dev_id = device_map[thr_id];
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
quark_blake512_cpu_init_sp(thr_id);
#endif
}
__host__
void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata)
void quark_blake512_cpu_free(int thr_id)
{
uint64_t message[16];
#ifdef SP_KERNEL
int dev_id = device_map[thr_id];
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
quark_blake512_cpu_free_sp(thr_id);
#endif
}
memcpy(message, pdata, 80);
message[10] = 0x80;
message[11] = 0;
message[12] = 0;
message[13] = 0x0100000000000000ull;
message[14] = 0;
message[15] = 0x8002000000000000ull; // 0x280
// ----------------------------- Host midstate for 80-bytes input ------------------------------------
cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice);
CUDA_SAFE_CALL(cudaGetLastError());
#undef SPH_C32
#undef SPH_T32
#undef SPH_C64
#undef SPH_T64
extern "C" {
#include "sph/sph_blake.h"
}
__host__
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *endiandata)
{
const uint32_t threadsperblock = 256;
#ifdef SP_KERNEL
int dev_id = device_map[thr_id];
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
quark_blake512_cpu_setBlock_80_sp((uint64_t*) endiandata);
else
#endif
{
uint64_t message[16];
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
memcpy(message, endiandata, 80);
message[10] = 0x80;
message[11] = 0;
message[12] = 0;
message[13] = 0x0100000000000000ull;
message[14] = 0;
message[15] = 0x8002000000000000ull; // 0x280
quark_blake512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
}
__host__
void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
quark_blake512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash);
cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice);
}
CUDA_LOG_ERROR();
}

View File

@ -0,0 +1,682 @@
/* sp implementation of blake */
//#include <stdio.h>
//#include <memory.h>
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500
#include "cuda_vector_uint2x4.h"
#undef G
#define vectorizelow(/* uint32_t*/ v) make_uint2(v,0)
#define vectorizehigh(/*uint32_t*/ v) make_uint2(0,v)
static __device__ __forceinline__ uint2 cuda_swap(uint2 v) {
const uint32_t t = cuda_swab32(v.x);
v.x = cuda_swab32(v.y);
v.y = t;
return v;
}
static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) {
uint2 result;
result.y = u.x ^ v.x;
result.x = u.y ^ v.y;
return result;
}
static uint2* d_PaddedMessage80[MAX_GPUS]; // padded message (80 bytes + padding)
__constant__ uint2 c_PaddedM[16];
__constant__ uint2x4 Hostprecalc[4];
__constant__ uint2 c_512_u2[16] =
{
{ 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e },
{ 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 },
{ 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf },
{ 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 },
{ 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 },
{ 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed },
{ 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 },
{ 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 }
};
__constant__ uint8_t c_sigma[6][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
#define Gprecalc(a,b,c,d,idx1,idx2) { \
v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \
v[d] = eorswap32( v[d] , v[a]); \
v[c] += v[d]; \
v[b] = ROR2(v[b] ^ v[c], 25); \
v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \
v[d] = ROR16(v[d] ^ v[a]); \
v[c] += v[d]; \
v[b] = ROR2(v[b] ^ v[c], 11); \
}
#define GprecalcHost(a,b,c,d,idx1,idx2) { \
v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \
v[d] = ROTR64( v[d] ^ v[a],32); \
v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 25); \
v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \
v[d] = ROTR64(v[d] ^ v[a],16); \
v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 11); \
}
#define G(a,b,c,d,x) { \
uint32_t idx1 = c_sigma[i][x]; \
uint32_t idx2 = c_sigma[i][x+1]; \
v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \
v[d] = eorswap32(v[d] , v[a]); \
v[c] += v[d]; \
v[b] = ROR2( v[b] ^ v[c], 25); \
v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \
v[d] = ROR16( v[d] ^ v[a]); \
v[c] += v[d]; \
v[b] = ROR2( v[b] ^ v[c], 11); \
}
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(256, 1)
#endif
void quark_blake512_gpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2* g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
const uint32_t hashPosition = nounce - startNounce;
uint2 msg[16];
uint2x4 *phash = (uint2x4*)&g_hash[hashPosition * 8U];
uint2x4 *outpt = (uint2x4*)msg;
outpt[0] = phash[0];
outpt[1] = phash[1];
uint2 block[16];
block[0].x = cuda_swab32(msg[0].y);
block[0].y = cuda_swab32(msg[0].x);
block[1].x = cuda_swab32(msg[1].y);
block[1].y = cuda_swab32(msg[1].x);
block[2].x = cuda_swab32(msg[2].y);
block[2].y = cuda_swab32(msg[2].x);
block[3].x = cuda_swab32(msg[3].y);
block[3].y = cuda_swab32(msg[3].x);
block[4].x = cuda_swab32(msg[4].y);
block[4].y = cuda_swab32(msg[4].x);
block[5].x = cuda_swab32(msg[5].y);
block[5].y = cuda_swab32(msg[5].x);
block[6].x = cuda_swab32(msg[6].y);
block[6].y = cuda_swab32(msg[6].x);
block[7].x = cuda_swab32(msg[7].y);
block[7].y = cuda_swab32(msg[7].x);
block[8] = vectorizehigh(0x80000000);
block[9] = vectorizelow(0x0);
block[10] = vectorizelow(0x0);
block[11] = vectorizelow(0x0);
block[12] = vectorizelow(0x0);
block[13] = vectorizelow(0x1);
block[14] = vectorizelow(0x0);
block[15] = vectorizelow(0x200);
const uint2 h[8] = {
{ 0xf3bcc908UL, 0x6a09e667UL },
{ 0x84caa73bUL, 0xbb67ae85UL },
{ 0xfe94f82bUL, 0x3c6ef372UL },
{ 0x5f1d36f1UL, 0xa54ff53aUL },
{ 0xade682d1UL, 0x510e527fUL },
{ 0x2b3e6c1fUL, 0x9b05688cUL },
{ 0xfb41bd6bUL, 0x1f83d9abUL },
{ 0x137e2179UL, 0x5be0cd19UL }
};
uint2 v[16] = {
h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7],
c_512_u2[0], c_512_u2[1], c_512_u2[2], c_512_u2[3],
c_512_u2[4], c_512_u2[5], c_512_u2[6], c_512_u2[7]
};
v[12].x ^= 512U;
v[13].x ^= 512U;
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
Gprecalc(0, 4, 8, 12, 0x5, 0xc)
Gprecalc(1, 5, 9, 13, 0xf, 0x1)
Gprecalc(2, 6, 10, 14, 0xd, 0xe)
Gprecalc(3, 7, 11, 15, 0xa, 0x4)
Gprecalc(0, 5, 10, 15, 0x7, 0x0)
Gprecalc(1, 6, 11, 12, 0x3, 0x6)
Gprecalc(2, 7, 8, 13, 0x2, 0x9)
Gprecalc(3, 4, 9, 14, 0xb, 0x8)
Gprecalc(0, 4, 8, 12, 0xb, 0xd)
Gprecalc(1, 5, 9, 13, 0xe, 0x7)
Gprecalc(2, 6, 10, 14, 0x1, 0xc)
Gprecalc(3, 7, 11, 15, 0x9, 0x3)
Gprecalc(0, 5, 10, 15, 0x0, 0x5)
Gprecalc(1, 6, 11, 12, 0x4, 0xf)
Gprecalc(2, 7, 8, 13, 0x6, 0x8)
Gprecalc(3, 4, 9, 14, 0xa, 0x2)
Gprecalc(0, 4, 8, 12, 0xf, 0x6)
Gprecalc(1, 5, 9, 13, 0x9, 0xe)
Gprecalc(2, 6, 10, 14, 0x3, 0xb)
Gprecalc(3, 7, 11, 15, 0x8, 0x0)
Gprecalc(0, 5, 10, 15, 0x2, 0xc)
Gprecalc(1, 6, 11, 12, 0x7, 0xd)
Gprecalc(2, 7, 8, 13, 0x4, 0x1)
Gprecalc(3, 4, 9, 14, 0x5, 0xa)
Gprecalc(0, 4, 8, 12, 0x2, 0xa)
Gprecalc(1, 5, 9, 13, 0x4, 0x8)
Gprecalc(2, 6, 10, 14, 0x6, 0x7)
Gprecalc(3, 7, 11, 15, 0x5, 0x1)
Gprecalc(0, 5, 10, 15, 0xb, 0xf)
Gprecalc(1, 6, 11, 12, 0xe, 0x9)
Gprecalc(2, 7, 8, 13, 0xc, 0x3)
Gprecalc(3, 4, 9, 14, 0x0, 0xd)
#if __CUDA_ARCH__ == 500
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
#else
for (int i = 0; i < 6; i++)
{
/* column step */
G(0, 4, 8, 12, 0);
G(1, 5, 9, 13, 2);
G(2, 6, 10, 14, 4);
G(3, 7, 11, 15, 6);
/* diagonal step */
G(0, 5, 10, 15, 8);
G(1, 6, 11, 12, 10);
G(2, 7, 8, 13, 12);
G(3, 4, 9, 14, 14);
}
#endif
v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]);
v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]);
v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]);
v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]);
v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]);
v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]);
v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]);
v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]);
phash = (uint2x4*)v;
outpt = (uint2x4*)&g_hash[hashPosition * 8];
outpt[0] = phash[0];
outpt[1] = phash[1];
}
}
__global__
__launch_bounds__(128, 8)
void quark_blake512_gpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nounce = startNounce + thread;
uint2 block[16];
block[0] = c_PaddedM[0];
block[1] = c_PaddedM[1];
block[2] = c_PaddedM[2];
block[3] = c_PaddedM[3];
block[4] = c_PaddedM[4];
block[5] = c_PaddedM[5];
block[6] = c_PaddedM[6];
block[7] = c_PaddedM[7];
block[8] = c_PaddedM[8];
block[9].y = c_PaddedM[9].y;
block[10] = vectorizehigh(0x80000000);
block[11] = vectorizelow(0);
block[12] = vectorizelow(0);
block[13] = vectorizelow(0x1);
block[14] = vectorizelow(0);
block[15] = vectorizelow(0x280);
block[9].x = nounce;
const uint2 h[8] = {
{ 0xf3bcc908UL, 0x6a09e667UL },
{ 0x84caa73bUL, 0xbb67ae85UL },
{ 0xfe94f82bUL, 0x3c6ef372UL },
{ 0x5f1d36f1UL, 0xa54ff53aUL },
{ 0xade682d1UL, 0x510e527fUL },
{ 0x2b3e6c1fUL, 0x9b05688cUL },
{ 0xfb41bd6bUL, 0x1f83d9abUL },
{ 0x137e2179UL, 0x5be0cd19UL }
};
uint2 v[16];
uint2x4 *outpt = (uint2x4*)v;
outpt[0] = Hostprecalc[0];
outpt[1] = Hostprecalc[1];
outpt[2] = Hostprecalc[2];
outpt[3] = Hostprecalc[3];
v[0] += (block[9] ^ c_512_u2[8]);
v[15] = ROR16(v[15] ^ v[0]);
v[10] += v[15];
v[5] = ROR2(v[5] ^ v[10], 11);
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
// Gprecalc(1, 5, 9, 13, 0x8, 0x4)
v[1] += v[5];
v[13] = eorswap32(v[13], v[1]);
v[9] += v[13];
v[5] = ROR2(v[5] ^ v[9], 25);
v[1] += (block[8] ^ c_512_u2[4]) + v[5];
v[13] = ROR16(v[13] ^ v[1]);
v[9] += v[13];
v[5] = ROR2(v[5] ^ v[9], 11);
// Gprecalc(2, 6, 10, 14, 0xf, 0x9)
v[2] += (block[9] ^ c_512_u2[0xf]);
v[14] = eorswap32(v[14], v[2]);
v[10] += v[14];
v[6] = ROR2(v[6] ^ v[10], 25);
v[2] += (block[0xf] ^ c_512_u2[9]) + v[6];
v[14] = ROR16(v[14] ^ v[2]);
v[10] += v[14];
v[6] = ROR2(v[6] ^ v[10], 11);
// Gprecalc(3, 7, 11, 15, 0x6, 0xd)
v[15] = eorswap32( v[15] , v[3]);
v[11] += v[15];
v[7] = ROR2(v[7] ^ v[11], 25);
v[3] += (block[6] ^ c_512_u2[0xd]) + v[7];
v[15] = ROR16(v[15] ^ v[3]);
v[11] += v[15];
v[7] = ROR2(v[7] ^ v[11], 11);
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
Gprecalc(0, 4, 8, 12, 0x5, 0xc)
Gprecalc(1, 5, 9, 13, 0xf, 0x1)
Gprecalc(2, 6, 10, 14, 0xd, 0xe)
Gprecalc(3, 7, 11, 15, 0xa, 0x4)
Gprecalc(0, 5, 10, 15, 0x7, 0x0)
Gprecalc(1, 6, 11, 12, 0x3, 0x6)
Gprecalc(2, 7, 8, 13, 0x2, 0x9)
Gprecalc(3, 4, 9, 14, 0xb, 0x8)
Gprecalc(0, 4, 8, 12, 0xb, 0xd)
Gprecalc(1, 5, 9, 13, 0xe, 0x7)
Gprecalc(2, 6, 10, 14, 0x1, 0xc)
Gprecalc(3, 7, 11, 15, 0x9, 0x3)
Gprecalc(0, 5, 10, 15, 0x0, 0x5)
Gprecalc(1, 6, 11, 12, 0x4, 0xf)
Gprecalc(2, 7, 8, 13, 0x6, 0x8)
Gprecalc(3, 4, 9, 14, 0xa, 0x2)
Gprecalc(0, 4, 8, 12, 0xf, 0x6)
Gprecalc(1, 5, 9, 13, 0x9, 0xe)
Gprecalc(2, 6, 10, 14, 0x3, 0xb)
Gprecalc(3, 7, 11, 15, 0x8, 0x0)
Gprecalc(0, 5, 10, 15, 0x2, 0xc)
Gprecalc(1, 6, 11, 12, 0x7, 0xd)
Gprecalc(2, 7, 8, 13, 0x4, 0x1)
Gprecalc(3, 4, 9, 14, 0x5, 0xa)
Gprecalc(0, 4, 8, 12, 0x2, 0xa)
Gprecalc(1, 5, 9, 13, 0x4, 0x8)
Gprecalc(2, 6, 10, 14, 0x6, 0x7)
Gprecalc(3, 7, 11, 15, 0x5, 0x1)
Gprecalc(0, 5, 10, 15, 0xb, 0xf)
Gprecalc(1, 6, 11, 12, 0xe, 0x9)
Gprecalc(2, 7, 8, 13, 0xc, 0x3)
Gprecalc(3, 4, 9, 14, 0x0, 0xd)
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]);
v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]);
v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]);
v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]);
v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]);
v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]);
v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]);
v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]);
uint2x4 *phash = (uint2x4*)v;
outpt = (uint2x4*) &outputHash[thread * 8U];
outpt[0] = phash[0];
outpt[1] = phash[1];
}
}
// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
__host__ void quark_blake512_cpu_init_sp(int thr_id)
{
CUDA_SAFE_CALL(cudaMalloc(&d_PaddedMessage80[thr_id], 10 * sizeof(uint2)));
}
__host__ void quark_blake512_cpu_free_sp(int thr_id)
{
cudaFree(d_PaddedMessage80[thr_id]);
}
__host__ void quark_blake512_cpu_setBlock_80_sp(uint64_t *pdata)
{
uint64_t PaddedMessage[10];
for (int i = 0; i < 10; i++)
PaddedMessage[i] = cuda_swab64(pdata[i]);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedM, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
uint64_t block[16];
uint64_t *peker = (uint64_t *)&PaddedMessage[0];
block[0] = peker[0];
block[1] = peker[1];
block[2] = peker[2];
block[3] = peker[3];
block[4] = peker[4];
block[5] = peker[5];
block[6] = peker[6];
block[7] = peker[7];
block[8] = peker[8];
block[9] = peker[9];
block[10] = 0x8000000000000000;
block[11] = 0;
block[12] = 0;
block[13] = 1;
block[14] = 0;
block[15] = 0x280;
const uint64_t u512[16] = {
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
uint64_t h[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL
};
uint64_t v[16] = {
h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7],
u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640U, u512[5] ^ 640U, u512[6], u512[7]
};
GprecalcHost(0, 4, 8, 12, 0x1, 0x0)
GprecalcHost(1, 5, 9, 13, 0x3, 0x2)
GprecalcHost(2, 6, 10, 14, 0x5, 0x4)
GprecalcHost(3, 7, 11, 15, 0x7, 0x6)
GprecalcHost(1, 6, 11, 12, 0xb, 0xa)
GprecalcHost(2, 7, 8, 13, 0xd, 0xc)
v[0] += (block[8] ^ u512[9]) + v[5];
v[15] = ROTR64(v[15] ^ v[0], 32); \
v[10] += v[15];
v[5] = ROTR64(v[5] ^ v[10], 25);
v[0] += v[5];
GprecalcHost(3, 4, 9, 14, 0xf, 0xe);
v[1] += (block[0x4] ^ u512[0x8]);
v[2] += v[6];
v[3] += (block[0xd] ^ u512[6]) + v[7];
//applog_hash((unsigned char*) &v[0]);
//applog_hash((unsigned char*) &v[4]);
//applog_hash((unsigned char*) &v[8]);
//applog_hash((unsigned char*) &v[12]);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(Hostprecalc, v, 128, 0, cudaMemcpyHostToDevice));
}
#else
// __CUDA_ARCH__ < 500
__global__ void quark_blake512_gpu_hash_64_sp(uint32_t, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {}
__global__ void quark_blake512_gpu_hash_80_sp(uint32_t, uint32_t startNounce, uint2 *outputHash) {}
#endif
__host__
void quark_blake512_cpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash)
{
const uint32_t threadsperblock = 32;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
quark_blake512_gpu_hash_64_sp <<<grid, block>>>(threads, startNounce, d_nonceVector, (uint2*)d_outputHash);
}
__host__
void quark_blake512_cpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
{
const uint32_t threadsperblock = 64;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
quark_blake512_gpu_hash_80_sp <<<grid, block>>>(threads, startNounce, (uint2 *)d_outputHash);
}

View File

@ -20,6 +20,7 @@ static uint32_t *d_branch2Nonces[MAX_GPUS];
static uint32_t *d_branch3Nonces[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -57,81 +58,96 @@ extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t st
// Original Quarkhash Funktion aus einem miner Quelltext
extern "C" void quarkhash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
unsigned char hash[64];
unsigned char _ALIGN(128) hash[64];
sph_blake512_init(&ctx_blake);
sph_blake512 (&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*) hash);
sph_bmw512_init(&ctx_bmw);
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
if (hash[0] & 0x8)
{
sph_groestl512_init(&ctx_groestl);
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
}
else
{
sph_skein512_init(&ctx_skein);
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
}
sph_groestl512_init(&ctx_groestl);
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_blake512_init(&ctx_blake);
sph_blake512 (&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*) hash);
sph_jh512_init(&ctx_jh);
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
sph_bmw512_init(&ctx_bmw);
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
if (hash[0] & 0x8)
{
sph_blake512_init(&ctx_blake);
sph_blake512 (&ctx_blake, (const void*) hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
}
else
{
sph_bmw512_init(&ctx_bmw);
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
}
if (hash[0] & 0x8)
{
sph_groestl512_init(&ctx_groestl);
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
}
else
{
sph_skein512_init(&ctx_skein);
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
}
sph_keccak512_init(&ctx_keccak);
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_groestl512_init(&ctx_groestl);
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_skein512_init(&ctx_skein);
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
sph_jh512_init(&ctx_jh);
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
if (hash[0] & 0x8)
{
sph_keccak512_init(&ctx_keccak);
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
}
else
{
sph_jh512_init(&ctx_jh);
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
}
if (hash[0] & 0x8)
{
sph_blake512_init(&ctx_blake);
sph_blake512 (&ctx_blake, (const void*) hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
}
else
{
sph_bmw512_init(&ctx_bmw);
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
}
memcpy(state, hash, 32);
sph_keccak512_init(&ctx_keccak);
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_skein512_init(&ctx_skein);
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
if (hash[0] & 0x8)
{
sph_keccak512_init(&ctx_keccak);
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
}
else
{
sph_jh512_init(&ctx_jh);
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
}
memcpy(state, hash, 32);
}
#ifdef _DEBUG
#define TRACE(algo) { \
if (max_nonce == 1 && pdata[19] <= 1) { \
uint32_t* debugbuf = NULL; \
cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
printf("quark %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
swab32(debugbuf[2]), swab32(debugbuf[3])); \
cudaFreeHost(debugbuf); \
} \
}
#else
#define TRACE(algo) {}
#endif
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
@ -141,18 +157,17 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t throughput = cuda_default_throughput(thr_id, 1 << 20); // 256*4096
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00F;
ptarget[7] = 0x00F;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
// Konstanten kopieren, Speicher belegen
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
@ -180,8 +195,8 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
int order = 0;
uint32_t nrm1=0, nrm2=0, nrm3=0;
// erstes Blake512 Hash mit CUDA
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("blake :");
// das ist der unbedingte Branch für BMW512
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
@ -189,7 +204,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
d_branch3Nonces[thr_id], &nrm3,
order++);
// nur den Skein Branch weiterverfolgen
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
@ -223,28 +238,26 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
*hashes_done = pdata[19] - first_nonce + 1;
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
if (foundNonce != UINT32_MAX)
{
uint32_t vhash64[8];
uint32_t vhash[8];
be32enc(&endiandata[19], foundNonce);
quarkhash(vhash64, endiandata);
quarkhash(vhash, endiandata);
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
work_set_target_ratio(work, vhash64);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work_set_target_ratio(work, vhash);
pdata[19] = foundNonce;
return 1;
} else {
applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
applog_hash((uchar*) vhash);
applog_hash((uchar*) ptarget);
}
}
@ -270,6 +283,7 @@ extern "C" void free_quark(int thr_id)
cudaFree(d_branch2Nonces[thr_id]);
cudaFree(d_branch3Nonces[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
quark_compactTest_cpu_free(thr_id);

View File

@ -23,6 +23,7 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
@ -257,6 +258,7 @@ extern "C" void free_c11(int thr_id)
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);

View File

@ -23,6 +23,7 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
@ -255,6 +256,7 @@ extern "C" void free_x11(int thr_id)
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);

View File

@ -26,6 +26,7 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
@ -263,6 +264,7 @@ extern "C" void free_x13(int thr_id)
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);

View File

@ -30,6 +30,7 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS] = { 0 };
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
@ -269,6 +270,7 @@ extern "C" void free_x14(int thr_id)
cudaThreadSynchronize();
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);

View File

@ -31,6 +31,7 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS] = { 0 };
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
@ -277,6 +278,7 @@ extern "C" void free_x15(int thr_id)
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);

View File

@ -34,6 +34,7 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_free(int thr_id);
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
@ -301,6 +302,7 @@ extern "C" void free_x17(int thr_id)
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);