use blake512 sp kernels on SM 5+ (80+64)
import and keep my code for older archs, like skein 64 reduce the gap between our versions... +150kH x11 GTX 960 / +30kH 750Ti +900kH quark GTX 960 / +230kH 750Ti
This commit is contained in:
parent
e12d666d36
commit
d43dc9a021
@ -23,6 +23,7 @@ extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
|
||||
extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
|
||||
@ -279,6 +280,7 @@ extern "C" void free_jackpot(int thr_id)
|
||||
cudaFree(d_branch3Nonces[thr_id]);
|
||||
cudaFree(d_jackpotNonces[thr_id]);
|
||||
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
jackpot_compactTest_cpu_free(thr_id);
|
||||
|
||||
|
@ -314,6 +314,7 @@
|
||||
<ClInclude Include="algos.h" />
|
||||
<ClInclude Include="miner.h" />
|
||||
<ClInclude Include="nvml.h" />
|
||||
<ClInclude Include="quark\cuda_quark_blake512_sp.cuh" />
|
||||
<ClInclude Include="quark\cuda_skein512_sp.cuh" />
|
||||
<ClInclude Include="res\resource.h" />
|
||||
<ClInclude Include="scrypt\salsa_kernel.h" />
|
||||
|
@ -428,6 +428,9 @@
|
||||
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
|
||||
<Filter>Source Files\CUDA\lyra2</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="quark\cuda_quark_blake512_sp.cuh">
|
||||
<Filter>Source Files\CUDA\quark</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="quark\cuda_skein512_sp.cuh">
|
||||
<Filter>Source Files\CUDA\quark</Filter>
|
||||
</ClInclude>
|
||||
|
@ -14,6 +14,7 @@ extern "C"
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
|
||||
@ -175,6 +176,7 @@ extern "C" void free_nist5(int thr_id)
|
||||
|
||||
cudaFree(d_hash[thr_id]);
|
||||
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
cuda_check_cpu_free(thr_id);
|
||||
init[thr_id] = false;
|
||||
|
@ -52,7 +52,7 @@ static uint64_t __align__(32) c_data[32];
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
static uint32_t *d_resNounce[MAX_GPUS];
|
||||
static uint32_t *h_resNounce[MAX_GPUS];
|
||||
static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
|
||||
static __thread uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
|
||||
|
||||
/* prefer uint32_t to prevent size conversions = speed +5/10 % */
|
||||
__constant__
|
||||
@ -375,16 +375,13 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
|
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x000F;
|
||||
ptarget[7] = 0x000F;
|
||||
|
||||
if (!init[thr_id]) {
|
||||
if (active_gpus > 1) {
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
}
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
|
||||
CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
|
||||
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
#include <memory.h>
|
||||
#include <sys/types.h> // off_t
|
||||
|
||||
#include "miner.h"
|
||||
#include "cuda_helper.h"
|
||||
|
||||
#define ROTR(x,n) ROTR64(x,n)
|
||||
@ -14,23 +15,26 @@ static uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
|
||||
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
|
||||
|
||||
__device__ __constant__
|
||||
static const uint8_t c_sigma[16][16] = {
|
||||
static const uint8_t c_sigma_big[16][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
|
||||
|
||||
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
|
||||
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
|
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
|
||||
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
|
||||
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }};
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
__device__ __constant__
|
||||
static const uint64_t c_u512[16] =
|
||||
@ -59,7 +63,7 @@ static const uint64_t c_u512[16] =
|
||||
}
|
||||
|
||||
__device__ __forceinline__
|
||||
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
|
||||
void quark_blake512_compress(uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
|
||||
{
|
||||
uint64_t v[16];
|
||||
uint64_t m[16];
|
||||
@ -157,7 +161,7 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
|
||||
buf[i+8] = d_constHashPadding[i];
|
||||
|
||||
// Ending round
|
||||
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
|
||||
quark_blake512_compress(h, buf, c_sigma_big, c_u512, 512);
|
||||
|
||||
#if __CUDA_ARCH__ <= 350
|
||||
uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U];
|
||||
@ -201,7 +205,7 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
|
||||
0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
quark_blake512_compress(h, buf, c_sigma, c_u512, 640);
|
||||
quark_blake512_compress(h, buf, c_sigma_big, c_u512, 640);
|
||||
|
||||
#if __CUDA_ARCH__ <= 350
|
||||
uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U);
|
||||
@ -219,21 +223,98 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
|
||||
}
|
||||
}
|
||||
|
||||
#define SP_KERNEL
|
||||
#ifdef SP_KERNEL
|
||||
void quark_blake512_cpu_setBlock_80_sp(uint64_t*);
|
||||
void quark_blake512_cpu_init_sp(int thr_id);
|
||||
void quark_blake512_cpu_free_sp(int thr_id);
|
||||
#endif
|
||||
|
||||
#include "cuda_quark_blake512_sp.cuh"
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
|
||||
{
|
||||
#ifdef SP_KERNEL
|
||||
int dev_id = device_map[thr_id];
|
||||
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
|
||||
quark_blake512_cpu_hash_64_sp(threads, startNounce, d_nonceVector, d_outputHash);
|
||||
else
|
||||
#endif
|
||||
{
|
||||
const uint32_t threadsperblock = 256;
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
quark_blake512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
|
||||
}
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
|
||||
{
|
||||
#ifdef SP_KERNEL
|
||||
int dev_id = device_map[thr_id];
|
||||
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
|
||||
quark_blake512_cpu_hash_80_sp(threads, startNounce, d_outputHash);
|
||||
else
|
||||
#endif
|
||||
{
|
||||
const uint32_t threadsperblock = 256;
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
quark_blake512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash);
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_init(int thr_id, uint32_t threads)
|
||||
{
|
||||
// CUDA_SAFE_CALL(cudaGetLastError());
|
||||
cuda_get_arch(thr_id);
|
||||
#ifdef SP_KERNEL
|
||||
int dev_id = device_map[thr_id];
|
||||
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
|
||||
quark_blake512_cpu_init_sp(thr_id);
|
||||
#endif
|
||||
}
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata)
|
||||
void quark_blake512_cpu_free(int thr_id)
|
||||
{
|
||||
#ifdef SP_KERNEL
|
||||
int dev_id = device_map[thr_id];
|
||||
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
|
||||
quark_blake512_cpu_free_sp(thr_id);
|
||||
#endif
|
||||
}
|
||||
|
||||
// ----------------------------- Host midstate for 80-bytes input ------------------------------------
|
||||
|
||||
#undef SPH_C32
|
||||
#undef SPH_T32
|
||||
#undef SPH_C64
|
||||
#undef SPH_T64
|
||||
|
||||
extern "C" {
|
||||
#include "sph/sph_blake.h"
|
||||
}
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *endiandata)
|
||||
{
|
||||
#ifdef SP_KERNEL
|
||||
int dev_id = device_map[thr_id];
|
||||
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
|
||||
quark_blake512_cpu_setBlock_80_sp((uint64_t*) endiandata);
|
||||
else
|
||||
#endif
|
||||
{
|
||||
uint64_t message[16];
|
||||
|
||||
memcpy(message, pdata, 80);
|
||||
memcpy(message, endiandata, 80);
|
||||
message[10] = 0x80;
|
||||
message[11] = 0;
|
||||
message[12] = 0;
|
||||
@ -242,29 +323,6 @@ void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata)
|
||||
message[15] = 0x8002000000000000ull; // 0x280
|
||||
|
||||
cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice);
|
||||
CUDA_SAFE_CALL(cudaGetLastError());
|
||||
}
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
|
||||
{
|
||||
const uint32_t threadsperblock = 256;
|
||||
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
quark_blake512_gpu_hash_64<<<grid, block>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
|
||||
|
||||
//MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
|
||||
{
|
||||
const uint32_t threadsperblock = 256;
|
||||
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
quark_blake512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash);
|
||||
}
|
||||
CUDA_LOG_ERROR();
|
||||
}
|
||||
|
682
quark/cuda_quark_blake512_sp.cuh
Normal file
682
quark/cuda_quark_blake512_sp.cuh
Normal file
@ -0,0 +1,682 @@
|
||||
/* sp implementation of blake */
|
||||
|
||||
//#include <stdio.h>
|
||||
//#include <memory.h>
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500
|
||||
|
||||
#include "cuda_vector_uint2x4.h"
|
||||
|
||||
#undef G
|
||||
#define vectorizelow(/* uint32_t*/ v) make_uint2(v,0)
|
||||
#define vectorizehigh(/*uint32_t*/ v) make_uint2(0,v)
|
||||
|
||||
static __device__ __forceinline__ uint2 cuda_swap(uint2 v) {
|
||||
const uint32_t t = cuda_swab32(v.x);
|
||||
v.x = cuda_swab32(v.y);
|
||||
v.y = t;
|
||||
return v;
|
||||
}
|
||||
static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) {
|
||||
uint2 result;
|
||||
result.y = u.x ^ v.x;
|
||||
result.x = u.y ^ v.y;
|
||||
return result;
|
||||
}
|
||||
|
||||
static uint2* d_PaddedMessage80[MAX_GPUS]; // padded message (80 bytes + padding)
|
||||
__constant__ uint2 c_PaddedM[16];
|
||||
__constant__ uint2x4 Hostprecalc[4];
|
||||
|
||||
__constant__ uint2 c_512_u2[16] =
|
||||
{
|
||||
{ 0x85a308d3UL, 0x243f6a88 }, { 0x03707344UL, 0x13198a2e },
|
||||
{ 0x299f31d0UL, 0xa4093822 }, { 0xec4e6c89UL, 0x082efa98 },
|
||||
{ 0x38d01377UL, 0x452821e6 }, { 0x34e90c6cUL, 0xbe5466cf },
|
||||
{ 0xc97c50ddUL, 0xc0ac29b7 }, { 0xb5470917UL, 0x3f84d5b5 },
|
||||
{ 0x8979fb1bUL, 0x9216d5d9 }, { 0x98dfb5acUL, 0xd1310ba6 },
|
||||
{ 0xd01adfb7UL, 0x2ffd72db }, { 0x6a267e96UL, 0xb8e1afed },
|
||||
{ 0xf12c7f99UL, 0xba7c9045 }, { 0xb3916cf7UL, 0x24a19947 },
|
||||
{ 0x858efc16UL, 0x0801f2e2 }, { 0x71574e69UL, 0x636920d8 }
|
||||
};
|
||||
|
||||
__constant__ uint8_t c_sigma[6][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
|
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
|
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
|
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
|
||||
|
||||
#define Gprecalc(a,b,c,d,idx1,idx2) { \
|
||||
v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \
|
||||
v[d] = eorswap32( v[d] , v[a]); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROR2(v[b] ^ v[c], 25); \
|
||||
v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \
|
||||
v[d] = ROR16(v[d] ^ v[a]); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROR2(v[b] ^ v[c], 11); \
|
||||
}
|
||||
|
||||
#define GprecalcHost(a,b,c,d,idx1,idx2) { \
|
||||
v[a] += (block[idx2] ^ u512[idx1]) + v[b]; \
|
||||
v[d] = ROTR64( v[d] ^ v[a],32); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR64(v[b] ^ v[c], 25); \
|
||||
v[a] += (block[idx1] ^ u512[idx2]) + v[b]; \
|
||||
v[d] = ROTR64(v[d] ^ v[a],16); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROTR64(v[b] ^ v[c], 11); \
|
||||
}
|
||||
|
||||
#define G(a,b,c,d,x) { \
|
||||
uint32_t idx1 = c_sigma[i][x]; \
|
||||
uint32_t idx2 = c_sigma[i][x+1]; \
|
||||
v[a] += (block[idx1] ^ c_512_u2[idx2]) + v[b]; \
|
||||
v[d] = eorswap32(v[d] , v[a]); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROR2( v[b] ^ v[c], 25); \
|
||||
v[a] += (block[idx2] ^ c_512_u2[idx1]) + v[b]; \
|
||||
v[d] = ROR16( v[d] ^ v[a]); \
|
||||
v[c] += v[d]; \
|
||||
v[b] = ROR2( v[b] ^ v[c], 11); \
|
||||
}
|
||||
|
||||
__global__
|
||||
#if __CUDA_ARCH__ > 500
|
||||
__launch_bounds__(256, 1)
|
||||
#endif
|
||||
void quark_blake512_gpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2* g_hash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
|
||||
if (thread < threads)
|
||||
{
|
||||
const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
const uint32_t hashPosition = nounce - startNounce;
|
||||
|
||||
uint2 msg[16];
|
||||
|
||||
uint2x4 *phash = (uint2x4*)&g_hash[hashPosition * 8U];
|
||||
uint2x4 *outpt = (uint2x4*)msg;
|
||||
outpt[0] = phash[0];
|
||||
outpt[1] = phash[1];
|
||||
|
||||
uint2 block[16];
|
||||
block[0].x = cuda_swab32(msg[0].y);
|
||||
block[0].y = cuda_swab32(msg[0].x);
|
||||
block[1].x = cuda_swab32(msg[1].y);
|
||||
block[1].y = cuda_swab32(msg[1].x);
|
||||
block[2].x = cuda_swab32(msg[2].y);
|
||||
block[2].y = cuda_swab32(msg[2].x);
|
||||
block[3].x = cuda_swab32(msg[3].y);
|
||||
block[3].y = cuda_swab32(msg[3].x);
|
||||
block[4].x = cuda_swab32(msg[4].y);
|
||||
block[4].y = cuda_swab32(msg[4].x);
|
||||
block[5].x = cuda_swab32(msg[5].y);
|
||||
block[5].y = cuda_swab32(msg[5].x);
|
||||
block[6].x = cuda_swab32(msg[6].y);
|
||||
block[6].y = cuda_swab32(msg[6].x);
|
||||
block[7].x = cuda_swab32(msg[7].y);
|
||||
block[7].y = cuda_swab32(msg[7].x);
|
||||
|
||||
block[8] = vectorizehigh(0x80000000);
|
||||
block[9] = vectorizelow(0x0);
|
||||
block[10] = vectorizelow(0x0);
|
||||
block[11] = vectorizelow(0x0);
|
||||
block[12] = vectorizelow(0x0);
|
||||
block[13] = vectorizelow(0x1);
|
||||
block[14] = vectorizelow(0x0);
|
||||
block[15] = vectorizelow(0x200);
|
||||
|
||||
const uint2 h[8] = {
|
||||
{ 0xf3bcc908UL, 0x6a09e667UL },
|
||||
{ 0x84caa73bUL, 0xbb67ae85UL },
|
||||
{ 0xfe94f82bUL, 0x3c6ef372UL },
|
||||
{ 0x5f1d36f1UL, 0xa54ff53aUL },
|
||||
{ 0xade682d1UL, 0x510e527fUL },
|
||||
{ 0x2b3e6c1fUL, 0x9b05688cUL },
|
||||
{ 0xfb41bd6bUL, 0x1f83d9abUL },
|
||||
{ 0x137e2179UL, 0x5be0cd19UL }
|
||||
};
|
||||
|
||||
uint2 v[16] = {
|
||||
h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7],
|
||||
c_512_u2[0], c_512_u2[1], c_512_u2[2], c_512_u2[3],
|
||||
c_512_u2[4], c_512_u2[5], c_512_u2[6], c_512_u2[7]
|
||||
};
|
||||
v[12].x ^= 512U;
|
||||
v[13].x ^= 512U;
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
|
||||
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
|
||||
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
|
||||
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
|
||||
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
|
||||
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
|
||||
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
|
||||
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
|
||||
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
|
||||
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
|
||||
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
|
||||
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
|
||||
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
|
||||
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
|
||||
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
|
||||
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
|
||||
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
|
||||
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
|
||||
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
|
||||
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
|
||||
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
|
||||
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
|
||||
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
|
||||
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
|
||||
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
|
||||
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
|
||||
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
|
||||
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
|
||||
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
|
||||
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
|
||||
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
|
||||
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
|
||||
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
|
||||
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
|
||||
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
|
||||
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
|
||||
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
|
||||
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
|
||||
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
|
||||
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
|
||||
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
|
||||
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
|
||||
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x5, 0xc)
|
||||
Gprecalc(1, 5, 9, 13, 0xf, 0x1)
|
||||
Gprecalc(2, 6, 10, 14, 0xd, 0xe)
|
||||
Gprecalc(3, 7, 11, 15, 0xa, 0x4)
|
||||
Gprecalc(0, 5, 10, 15, 0x7, 0x0)
|
||||
Gprecalc(1, 6, 11, 12, 0x3, 0x6)
|
||||
Gprecalc(2, 7, 8, 13, 0x2, 0x9)
|
||||
Gprecalc(3, 4, 9, 14, 0xb, 0x8)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xb, 0xd)
|
||||
Gprecalc(1, 5, 9, 13, 0xe, 0x7)
|
||||
Gprecalc(2, 6, 10, 14, 0x1, 0xc)
|
||||
Gprecalc(3, 7, 11, 15, 0x9, 0x3)
|
||||
Gprecalc(0, 5, 10, 15, 0x0, 0x5)
|
||||
Gprecalc(1, 6, 11, 12, 0x4, 0xf)
|
||||
Gprecalc(2, 7, 8, 13, 0x6, 0x8)
|
||||
Gprecalc(3, 4, 9, 14, 0xa, 0x2)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xf, 0x6)
|
||||
Gprecalc(1, 5, 9, 13, 0x9, 0xe)
|
||||
Gprecalc(2, 6, 10, 14, 0x3, 0xb)
|
||||
Gprecalc(3, 7, 11, 15, 0x8, 0x0)
|
||||
Gprecalc(0, 5, 10, 15, 0x2, 0xc)
|
||||
Gprecalc(1, 6, 11, 12, 0x7, 0xd)
|
||||
Gprecalc(2, 7, 8, 13, 0x4, 0x1)
|
||||
Gprecalc(3, 4, 9, 14, 0x5, 0xa)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x2, 0xa)
|
||||
Gprecalc(1, 5, 9, 13, 0x4, 0x8)
|
||||
Gprecalc(2, 6, 10, 14, 0x6, 0x7)
|
||||
Gprecalc(3, 7, 11, 15, 0x5, 0x1)
|
||||
Gprecalc(0, 5, 10, 15, 0xb, 0xf)
|
||||
Gprecalc(1, 6, 11, 12, 0xe, 0x9)
|
||||
Gprecalc(2, 7, 8, 13, 0xc, 0x3)
|
||||
Gprecalc(3, 4, 9, 14, 0x0, 0xd)
|
||||
|
||||
#if __CUDA_ARCH__ == 500
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
|
||||
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
|
||||
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
|
||||
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
|
||||
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
|
||||
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
|
||||
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
|
||||
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
|
||||
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
|
||||
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
|
||||
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
|
||||
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
|
||||
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
|
||||
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
|
||||
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
|
||||
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
|
||||
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
|
||||
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
|
||||
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
|
||||
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
|
||||
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
|
||||
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
|
||||
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
|
||||
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
|
||||
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
|
||||
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
|
||||
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
|
||||
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
|
||||
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
|
||||
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
|
||||
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
|
||||
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
|
||||
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
|
||||
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
|
||||
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
|
||||
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
|
||||
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
|
||||
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
|
||||
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
|
||||
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
|
||||
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
|
||||
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
|
||||
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
|
||||
|
||||
#else
|
||||
|
||||
for (int i = 0; i < 6; i++)
|
||||
{
|
||||
/* column step */
|
||||
G(0, 4, 8, 12, 0);
|
||||
G(1, 5, 9, 13, 2);
|
||||
G(2, 6, 10, 14, 4);
|
||||
G(3, 7, 11, 15, 6);
|
||||
/* diagonal step */
|
||||
G(0, 5, 10, 15, 8);
|
||||
G(1, 6, 11, 12, 10);
|
||||
G(2, 7, 8, 13, 12);
|
||||
G(3, 4, 9, 14, 14);
|
||||
}
|
||||
#endif
|
||||
|
||||
v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]);
|
||||
v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]);
|
||||
v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]);
|
||||
v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]);
|
||||
v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]);
|
||||
v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]);
|
||||
v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]);
|
||||
v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]);
|
||||
|
||||
phash = (uint2x4*)v;
|
||||
outpt = (uint2x4*)&g_hash[hashPosition * 8];
|
||||
outpt[0] = phash[0];
|
||||
outpt[1] = phash[1];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__
|
||||
__launch_bounds__(128, 8)
|
||||
void quark_blake512_gpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
const uint32_t nounce = startNounce + thread;
|
||||
uint2 block[16];
|
||||
|
||||
block[0] = c_PaddedM[0];
|
||||
block[1] = c_PaddedM[1];
|
||||
block[2] = c_PaddedM[2];
|
||||
block[3] = c_PaddedM[3];
|
||||
block[4] = c_PaddedM[4];
|
||||
block[5] = c_PaddedM[5];
|
||||
block[6] = c_PaddedM[6];
|
||||
block[7] = c_PaddedM[7];
|
||||
block[8] = c_PaddedM[8];
|
||||
block[9].y = c_PaddedM[9].y;
|
||||
block[10] = vectorizehigh(0x80000000);
|
||||
block[11] = vectorizelow(0);
|
||||
block[12] = vectorizelow(0);
|
||||
block[13] = vectorizelow(0x1);
|
||||
block[14] = vectorizelow(0);
|
||||
block[15] = vectorizelow(0x280);
|
||||
block[9].x = nounce;
|
||||
|
||||
const uint2 h[8] = {
|
||||
{ 0xf3bcc908UL, 0x6a09e667UL },
|
||||
{ 0x84caa73bUL, 0xbb67ae85UL },
|
||||
{ 0xfe94f82bUL, 0x3c6ef372UL },
|
||||
{ 0x5f1d36f1UL, 0xa54ff53aUL },
|
||||
{ 0xade682d1UL, 0x510e527fUL },
|
||||
{ 0x2b3e6c1fUL, 0x9b05688cUL },
|
||||
{ 0xfb41bd6bUL, 0x1f83d9abUL },
|
||||
{ 0x137e2179UL, 0x5be0cd19UL }
|
||||
};
|
||||
|
||||
uint2 v[16];
|
||||
uint2x4 *outpt = (uint2x4*)v;
|
||||
outpt[0] = Hostprecalc[0];
|
||||
outpt[1] = Hostprecalc[1];
|
||||
outpt[2] = Hostprecalc[2];
|
||||
outpt[3] = Hostprecalc[3];
|
||||
|
||||
v[0] += (block[9] ^ c_512_u2[8]);
|
||||
v[15] = ROR16(v[15] ^ v[0]);
|
||||
v[10] += v[15];
|
||||
v[5] = ROR2(v[5] ^ v[10], 11);
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
|
||||
|
||||
// Gprecalc(1, 5, 9, 13, 0x8, 0x4)
|
||||
v[1] += v[5];
|
||||
v[13] = eorswap32(v[13], v[1]);
|
||||
v[9] += v[13];
|
||||
|
||||
v[5] = ROR2(v[5] ^ v[9], 25);
|
||||
v[1] += (block[8] ^ c_512_u2[4]) + v[5];
|
||||
v[13] = ROR16(v[13] ^ v[1]);
|
||||
v[9] += v[13];
|
||||
v[5] = ROR2(v[5] ^ v[9], 11);
|
||||
|
||||
// Gprecalc(2, 6, 10, 14, 0xf, 0x9)
|
||||
v[2] += (block[9] ^ c_512_u2[0xf]);
|
||||
v[14] = eorswap32(v[14], v[2]);
|
||||
v[10] += v[14];
|
||||
v[6] = ROR2(v[6] ^ v[10], 25);
|
||||
v[2] += (block[0xf] ^ c_512_u2[9]) + v[6];
|
||||
v[14] = ROR16(v[14] ^ v[2]);
|
||||
v[10] += v[14];
|
||||
v[6] = ROR2(v[6] ^ v[10], 11);
|
||||
|
||||
// Gprecalc(3, 7, 11, 15, 0x6, 0xd)
|
||||
v[15] = eorswap32( v[15] , v[3]);
|
||||
v[11] += v[15];
|
||||
v[7] = ROR2(v[7] ^ v[11], 25);
|
||||
v[3] += (block[6] ^ c_512_u2[0xd]) + v[7];
|
||||
v[15] = ROR16(v[15] ^ v[3]);
|
||||
v[11] += v[15];
|
||||
v[7] = ROR2(v[7] ^ v[11], 11);
|
||||
|
||||
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
|
||||
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
|
||||
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
|
||||
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
|
||||
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
|
||||
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
|
||||
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
|
||||
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
|
||||
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
|
||||
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
|
||||
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
|
||||
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
|
||||
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
|
||||
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
|
||||
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
|
||||
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
|
||||
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
|
||||
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
|
||||
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
|
||||
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
|
||||
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
|
||||
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
|
||||
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
|
||||
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
|
||||
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
|
||||
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
|
||||
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
|
||||
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
|
||||
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
|
||||
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
|
||||
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
|
||||
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x5, 0xc)
|
||||
Gprecalc(1, 5, 9, 13, 0xf, 0x1)
|
||||
Gprecalc(2, 6, 10, 14, 0xd, 0xe)
|
||||
Gprecalc(3, 7, 11, 15, 0xa, 0x4)
|
||||
Gprecalc(0, 5, 10, 15, 0x7, 0x0)
|
||||
Gprecalc(1, 6, 11, 12, 0x3, 0x6)
|
||||
Gprecalc(2, 7, 8, 13, 0x2, 0x9)
|
||||
Gprecalc(3, 4, 9, 14, 0xb, 0x8)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xb, 0xd)
|
||||
Gprecalc(1, 5, 9, 13, 0xe, 0x7)
|
||||
Gprecalc(2, 6, 10, 14, 0x1, 0xc)
|
||||
Gprecalc(3, 7, 11, 15, 0x9, 0x3)
|
||||
Gprecalc(0, 5, 10, 15, 0x0, 0x5)
|
||||
Gprecalc(1, 6, 11, 12, 0x4, 0xf)
|
||||
Gprecalc(2, 7, 8, 13, 0x6, 0x8)
|
||||
Gprecalc(3, 4, 9, 14, 0xa, 0x2)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xf, 0x6)
|
||||
Gprecalc(1, 5, 9, 13, 0x9, 0xe)
|
||||
Gprecalc(2, 6, 10, 14, 0x3, 0xb)
|
||||
Gprecalc(3, 7, 11, 15, 0x8, 0x0)
|
||||
Gprecalc(0, 5, 10, 15, 0x2, 0xc)
|
||||
Gprecalc(1, 6, 11, 12, 0x7, 0xd)
|
||||
Gprecalc(2, 7, 8, 13, 0x4, 0x1)
|
||||
Gprecalc(3, 4, 9, 14, 0x5, 0xa)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x2, 0xa)
|
||||
Gprecalc(1, 5, 9, 13, 0x4, 0x8)
|
||||
Gprecalc(2, 6, 10, 14, 0x6, 0x7)
|
||||
Gprecalc(3, 7, 11, 15, 0x5, 0x1)
|
||||
Gprecalc(0, 5, 10, 15, 0xb, 0xf)
|
||||
Gprecalc(1, 6, 11, 12, 0xe, 0x9)
|
||||
Gprecalc(2, 7, 8, 13, 0xc, 0x3)
|
||||
Gprecalc(3, 4, 9, 14, 0x0, 0xd)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x1, 0x0)
|
||||
Gprecalc(1, 5, 9, 13, 0x3, 0x2)
|
||||
Gprecalc(2, 6, 10, 14, 0x5, 0x4)
|
||||
Gprecalc(3, 7, 11, 15, 0x7, 0x6)
|
||||
Gprecalc(0, 5, 10, 15, 0x9, 0x8)
|
||||
Gprecalc(1, 6, 11, 12, 0xb, 0xa)
|
||||
Gprecalc(2, 7, 8, 13, 0xd, 0xc)
|
||||
Gprecalc(3, 4, 9, 14, 0xf, 0xe)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xa, 0xe)
|
||||
Gprecalc(1, 5, 9, 13, 0x8, 0x4)
|
||||
Gprecalc(2, 6, 10, 14, 0xf, 0x9)
|
||||
Gprecalc(3, 7, 11, 15, 0x6, 0xd)
|
||||
Gprecalc(0, 5, 10, 15, 0xc, 0x1)
|
||||
Gprecalc(1, 6, 11, 12, 0x2, 0x0)
|
||||
Gprecalc(2, 7, 8, 13, 0x7, 0xb)
|
||||
Gprecalc(3, 4, 9, 14, 0x3, 0x5)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x8, 0xb)
|
||||
Gprecalc(1, 5, 9, 13, 0x0, 0xc)
|
||||
Gprecalc(2, 6, 10, 14, 0x2, 0x5)
|
||||
Gprecalc(3, 7, 11, 15, 0xd, 0xf)
|
||||
Gprecalc(0, 5, 10, 15, 0xe, 0xa)
|
||||
Gprecalc(1, 6, 11, 12, 0x6, 0x3)
|
||||
Gprecalc(2, 7, 8, 13, 0x1, 0x7)
|
||||
Gprecalc(3, 4, 9, 14, 0x4, 0x9)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x9, 0x7)
|
||||
Gprecalc(1, 5, 9, 13, 0x1, 0x3)
|
||||
Gprecalc(2, 6, 10, 14, 0xc, 0xd)
|
||||
Gprecalc(3, 7, 11, 15, 0xe, 0xb)
|
||||
Gprecalc(0, 5, 10, 15, 0x6, 0x2)
|
||||
Gprecalc(1, 6, 11, 12, 0xa, 0x5)
|
||||
Gprecalc(2, 7, 8, 13, 0x0, 0x4)
|
||||
Gprecalc(3, 4, 9, 14, 0x8, 0xf)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0x0, 0x9)
|
||||
Gprecalc(1, 5, 9, 13, 0x7, 0x5)
|
||||
Gprecalc(2, 6, 10, 14, 0x4, 0x2)
|
||||
Gprecalc(3, 7, 11, 15, 0xf, 0xa)
|
||||
Gprecalc(0, 5, 10, 15, 0x1, 0xe)
|
||||
Gprecalc(1, 6, 11, 12, 0xc, 0xb)
|
||||
Gprecalc(2, 7, 8, 13, 0x8, 0x6)
|
||||
Gprecalc(3, 4, 9, 14, 0xd, 0x3)
|
||||
|
||||
Gprecalc(0, 4, 8, 12, 0xc, 0x2)
|
||||
Gprecalc(1, 5, 9, 13, 0xa, 0x6)
|
||||
Gprecalc(2, 6, 10, 14, 0xb, 0x0)
|
||||
Gprecalc(3, 7, 11, 15, 0x3, 0x8)
|
||||
Gprecalc(0, 5, 10, 15, 0xd, 0x4)
|
||||
Gprecalc(1, 6, 11, 12, 0x5, 0x7)
|
||||
Gprecalc(2, 7, 8, 13, 0xe, 0xf)
|
||||
Gprecalc(3, 4, 9, 14, 0x9, 0x1)
|
||||
|
||||
v[0] = cuda_swap(h[0] ^ v[0] ^ v[8]);
|
||||
v[1] = cuda_swap(h[1] ^ v[1] ^ v[9]);
|
||||
v[2] = cuda_swap(h[2] ^ v[2] ^ v[10]);
|
||||
v[3] = cuda_swap(h[3] ^ v[3] ^ v[11]);
|
||||
v[4] = cuda_swap(h[4] ^ v[4] ^ v[12]);
|
||||
v[5] = cuda_swap(h[5] ^ v[5] ^ v[13]);
|
||||
v[6] = cuda_swap(h[6] ^ v[6] ^ v[14]);
|
||||
v[7] = cuda_swap(h[7] ^ v[7] ^ v[15]);
|
||||
|
||||
uint2x4 *phash = (uint2x4*)v;
|
||||
outpt = (uint2x4*) &outputHash[thread * 8U];
|
||||
outpt[0] = phash[0];
|
||||
outpt[1] = phash[1];
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
|
||||
|
||||
__host__ void quark_blake512_cpu_init_sp(int thr_id)
|
||||
{
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_PaddedMessage80[thr_id], 10 * sizeof(uint2)));
|
||||
}
|
||||
|
||||
__host__ void quark_blake512_cpu_free_sp(int thr_id)
|
||||
{
|
||||
cudaFree(d_PaddedMessage80[thr_id]);
|
||||
}
|
||||
|
||||
__host__ void quark_blake512_cpu_setBlock_80_sp(uint64_t *pdata)
|
||||
{
|
||||
uint64_t PaddedMessage[10];
|
||||
for (int i = 0; i < 10; i++)
|
||||
PaddedMessage[i] = cuda_swab64(pdata[i]);
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedM, PaddedMessage, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
|
||||
|
||||
uint64_t block[16];
|
||||
|
||||
uint64_t *peker = (uint64_t *)&PaddedMessage[0];
|
||||
|
||||
block[0] = peker[0];
|
||||
block[1] = peker[1];
|
||||
block[2] = peker[2];
|
||||
block[3] = peker[3];
|
||||
block[4] = peker[4];
|
||||
block[5] = peker[5];
|
||||
block[6] = peker[6];
|
||||
block[7] = peker[7];
|
||||
block[8] = peker[8];
|
||||
block[9] = peker[9];
|
||||
block[10] = 0x8000000000000000;
|
||||
block[11] = 0;
|
||||
block[12] = 0;
|
||||
block[13] = 1;
|
||||
block[14] = 0;
|
||||
block[15] = 0x280;
|
||||
|
||||
const uint64_t u512[16] = {
|
||||
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
|
||||
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
|
||||
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
|
||||
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
|
||||
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
|
||||
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
|
||||
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
|
||||
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
|
||||
};
|
||||
|
||||
uint64_t h[8] = {
|
||||
0x6a09e667f3bcc908ULL,
|
||||
0xbb67ae8584caa73bULL,
|
||||
0x3c6ef372fe94f82bULL,
|
||||
0xa54ff53a5f1d36f1ULL,
|
||||
0x510e527fade682d1ULL,
|
||||
0x9b05688c2b3e6c1fULL,
|
||||
0x1f83d9abfb41bd6bULL,
|
||||
0x5be0cd19137e2179ULL
|
||||
};
|
||||
|
||||
uint64_t v[16] = {
|
||||
h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7],
|
||||
u512[0], u512[1], u512[2], u512[3], u512[4] ^ 640U, u512[5] ^ 640U, u512[6], u512[7]
|
||||
};
|
||||
|
||||
GprecalcHost(0, 4, 8, 12, 0x1, 0x0)
|
||||
GprecalcHost(1, 5, 9, 13, 0x3, 0x2)
|
||||
GprecalcHost(2, 6, 10, 14, 0x5, 0x4)
|
||||
GprecalcHost(3, 7, 11, 15, 0x7, 0x6)
|
||||
|
||||
GprecalcHost(1, 6, 11, 12, 0xb, 0xa)
|
||||
GprecalcHost(2, 7, 8, 13, 0xd, 0xc)
|
||||
|
||||
v[0] += (block[8] ^ u512[9]) + v[5];
|
||||
v[15] = ROTR64(v[15] ^ v[0], 32); \
|
||||
v[10] += v[15];
|
||||
v[5] = ROTR64(v[5] ^ v[10], 25);
|
||||
v[0] += v[5];
|
||||
|
||||
GprecalcHost(3, 4, 9, 14, 0xf, 0xe);
|
||||
|
||||
v[1] += (block[0x4] ^ u512[0x8]);
|
||||
v[2] += v[6];
|
||||
|
||||
v[3] += (block[0xd] ^ u512[6]) + v[7];
|
||||
|
||||
//applog_hash((unsigned char*) &v[0]);
|
||||
//applog_hash((unsigned char*) &v[4]);
|
||||
//applog_hash((unsigned char*) &v[8]);
|
||||
//applog_hash((unsigned char*) &v[12]);
|
||||
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(Hostprecalc, v, 128, 0, cudaMemcpyHostToDevice));
|
||||
}
|
||||
#else
|
||||
// __CUDA_ARCH__ < 500
|
||||
__global__ void quark_blake512_gpu_hash_64_sp(uint32_t, uint32_t startNounce, uint32_t *const __restrict__ g_nonceVector, uint2 *const __restrict__ g_hash) {}
|
||||
__global__ void quark_blake512_gpu_hash_80_sp(uint32_t, uint32_t startNounce, uint2 *outputHash) {}
|
||||
#endif
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_hash_64_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash)
|
||||
{
|
||||
const uint32_t threadsperblock = 32;
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
quark_blake512_gpu_hash_64_sp <<<grid, block>>>(threads, startNounce, d_nonceVector, (uint2*)d_outputHash);
|
||||
}
|
||||
|
||||
__host__
|
||||
void quark_blake512_cpu_hash_80_sp(uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
|
||||
{
|
||||
const uint32_t threadsperblock = 64;
|
||||
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
quark_blake512_gpu_hash_80_sp <<<grid, block>>>(threads, startNounce, (uint2 *)d_outputHash);
|
||||
}
|
@ -20,6 +20,7 @@ static uint32_t *d_branch2Nonces[MAX_GPUS];
|
||||
static uint32_t *d_branch3Nonces[MAX_GPUS];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
@ -57,6 +58,8 @@ extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t st
|
||||
// Original Quarkhash Funktion aus einem miner Quelltext
|
||||
extern "C" void quarkhash(void *state, const void *input)
|
||||
{
|
||||
unsigned char _ALIGN(128) hash[64];
|
||||
|
||||
sph_blake512_context ctx_blake;
|
||||
sph_bmw512_context ctx_bmw;
|
||||
sph_groestl512_context ctx_groestl;
|
||||
@ -64,8 +67,6 @@ extern "C" void quarkhash(void *state, const void *input)
|
||||
sph_keccak512_context ctx_keccak;
|
||||
sph_skein512_context ctx_skein;
|
||||
|
||||
unsigned char hash[64];
|
||||
|
||||
sph_blake512_init(&ctx_blake);
|
||||
sph_blake512 (&ctx_blake, input, 80);
|
||||
sph_blake512_close(&ctx_blake, (void*) hash);
|
||||
@ -132,6 +133,21 @@ extern "C" void quarkhash(void *state, const void *input)
|
||||
memcpy(state, hash, 32);
|
||||
}
|
||||
|
||||
#ifdef _DEBUG
|
||||
#define TRACE(algo) { \
|
||||
if (max_nonce == 1 && pdata[19] <= 1) { \
|
||||
uint32_t* debugbuf = NULL; \
|
||||
cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
|
||||
cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
|
||||
printf("quark %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
|
||||
swab32(debugbuf[2]), swab32(debugbuf[3])); \
|
||||
cudaFreeHost(debugbuf); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
#define TRACE(algo) {}
|
||||
#endif
|
||||
|
||||
static bool init[MAX_GPUS] = { 0 };
|
||||
|
||||
extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
||||
@ -141,18 +157,17 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1 << 20); // 256*4096
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096
|
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x00F;
|
||||
ptarget[7] = 0x00F;
|
||||
|
||||
if (!init[thr_id])
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
// Konstanten kopieren, Speicher belegen
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
|
||||
|
||||
quark_blake512_cpu_init(thr_id, throughput);
|
||||
quark_groestl512_cpu_init(thr_id, throughput);
|
||||
@ -180,8 +195,8 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
int order = 0;
|
||||
uint32_t nrm1=0, nrm2=0, nrm3=0;
|
||||
|
||||
// erstes Blake512 Hash mit CUDA
|
||||
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
|
||||
TRACE("blake :");
|
||||
|
||||
// das ist der unbedingte Branch für BMW512
|
||||
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
@ -223,28 +238,26 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
d_branch2Nonces[thr_id], &nrm2,
|
||||
order++);
|
||||
|
||||
// das ist der bedingte Branch für Keccak512
|
||||
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
// das ist der bedingte Branch für JH512
|
||||
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||
|
||||
*hashes_done = pdata[19] - first_nonce + 1;
|
||||
|
||||
// Scan nach Gewinner Hashes auf der GPU
|
||||
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
if (foundNonce != 0xffffffff)
|
||||
if (foundNonce != UINT32_MAX)
|
||||
{
|
||||
uint32_t vhash64[8];
|
||||
uint32_t vhash[8];
|
||||
be32enc(&endiandata[19], foundNonce);
|
||||
quarkhash(vhash64, endiandata);
|
||||
quarkhash(vhash, endiandata);
|
||||
|
||||
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
|
||||
work_set_target_ratio(work, vhash64);
|
||||
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
|
||||
work_set_target_ratio(work, vhash);
|
||||
pdata[19] = foundNonce;
|
||||
return 1;
|
||||
} else {
|
||||
applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
|
||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
||||
applog_hash((uchar*) vhash);
|
||||
applog_hash((uchar*) ptarget);
|
||||
}
|
||||
}
|
||||
|
||||
@ -270,6 +283,7 @@ extern "C" void free_quark(int thr_id)
|
||||
cudaFree(d_branch2Nonces[thr_id]);
|
||||
cudaFree(d_branch3Nonces[thr_id]);
|
||||
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
quark_compactTest_cpu_free(thr_id);
|
||||
|
||||
|
@ -23,6 +23,7 @@ extern "C"
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
|
||||
@ -257,6 +258,7 @@ extern "C" void free_c11(int thr_id)
|
||||
cudaThreadSynchronize();
|
||||
|
||||
cudaFree(d_hash[thr_id]);
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
x11_simd512_cpu_free(thr_id);
|
||||
|
||||
|
@ -23,6 +23,7 @@ extern "C"
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
|
||||
@ -255,6 +256,7 @@ extern "C" void free_x11(int thr_id)
|
||||
|
||||
cudaFree(d_hash[thr_id]);
|
||||
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
x11_simd512_cpu_free(thr_id);
|
||||
|
||||
|
@ -26,6 +26,7 @@ extern "C"
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
|
||||
@ -263,6 +264,7 @@ extern "C" void free_x13(int thr_id)
|
||||
|
||||
cudaFree(d_hash[thr_id]);
|
||||
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
x11_simd512_cpu_free(thr_id);
|
||||
x13_fugue512_cpu_free(thr_id);
|
||||
|
@ -30,6 +30,7 @@ extern "C" {
|
||||
static uint32_t *d_hash[MAX_GPUS] = { 0 };
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
|
||||
@ -269,6 +270,7 @@ extern "C" void free_x14(int thr_id)
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
x11_simd512_cpu_free(thr_id);
|
||||
x13_fugue512_cpu_free(thr_id);
|
||||
|
@ -31,6 +31,7 @@ extern "C" {
|
||||
static uint32_t *d_hash[MAX_GPUS] = { 0 };
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
|
||||
@ -277,6 +278,7 @@ extern "C" void free_x15(int thr_id)
|
||||
|
||||
cudaFree(d_hash[thr_id]);
|
||||
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
x11_simd512_cpu_free(thr_id);
|
||||
x13_fugue512_cpu_free(thr_id);
|
||||
|
@ -34,6 +34,7 @@ extern "C"
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
|
||||
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void quark_blake512_cpu_free(int thr_id);
|
||||
extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
|
||||
extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
|
||||
@ -301,6 +302,7 @@ extern "C" void free_x17(int thr_id)
|
||||
|
||||
cudaFree(d_hash[thr_id]);
|
||||
|
||||
quark_blake512_cpu_free(thr_id);
|
||||
quark_groestl512_cpu_free(thr_id);
|
||||
x11_simd512_cpu_free(thr_id);
|
||||
x13_fugue512_cpu_free(thr_id);
|
||||
|
Loading…
x
Reference in New Issue
Block a user