mirror of https://github.com/GOSTSec/ccminer
R4SAS, 6 years ago
64 changed files with 5413 additions and 488 deletions

@@ -0,0 +1,273 @@
/**
 * Blake2-B CUDA Implementation
 *
 * tpruvot@github July 2016
 *
 */

#include <miner.h>

#include <string.h>
#include <stdint.h>

#include <sph/blake2b.h>

#include <cuda_helper.h>
#include <cuda_vector_uint2x4.h>

#define TPB 512
#define NBN 2

static uint32_t *d_resNonces[MAX_GPUS];

__device__ uint64_t d_data[10];

static __constant__ const int8_t blake2b_sigma[12][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};

// host mem align
#define A 64

extern "C" void blake2b_hash(void *output, const void *input)
{
    uint8_t _ALIGN(A) hash[32];
    blake2b_ctx ctx;

    blake2b_init(&ctx, 32, NULL, 0);
    blake2b_update(&ctx, input, 80);
    blake2b_final(&ctx, hash);

    memcpy(output, hash, 32);
}

// ----------------------------------------------------------------

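// G is the standard BLAKE2b mixing function. The 64-bit rotations by
// 32, 24, 16 and 63 are performed on the two 32-bit halves of each word:
// SWAPUINT2 swaps the halves (rotate by 32), ROR24/ROR16 use byte/short
// permutes, and ROR2(x, 63U) covers the remaining odd rotation.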
__device__ __forceinline__
static void G(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16])
{
    a = a + b + m[ blake2b_sigma[r][2*i] ];
    ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
    c = c + d;
    ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
    a = a + b + m[ blake2b_sigma[r][2*i+1] ];
    ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
    c = c + d;
    ((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
}

#define ROUND(r) \
    G(r, 0, v[0], v[4], v[ 8], v[12], m); \
    G(r, 1, v[1], v[5], v[ 9], v[13], m); \
    G(r, 2, v[2], v[6], v[10], v[14], m); \
    G(r, 3, v[3], v[7], v[11], v[15], m); \
    G(r, 4, v[0], v[5], v[10], v[15], m); \
    G(r, 5, v[1], v[6], v[11], v[12], m); \
    G(r, 6, v[2], v[7], v[ 8], v[13], m); \
    G(r, 7, v[3], v[4], v[ 9], v[14], m);

__global__
//__launch_bounds__(128, 8) /* to force 64 regs */
void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
{
    const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce;

    uint64_t m[16];

    m[0] = d_data[0];
    m[1] = d_data[1];
    m[2] = d_data[2];
    m[3] = d_data[3];
    m[4] = d_data[4];
    m[5] = d_data[5];
    m[6] = d_data[6];
    m[7] = d_data[7];
    m[8] = d_data[8];
    ((uint32_t*)m)[18] = AS_U32(&d_data[9]);
    ((uint32_t*)m)[19] = nonce;

    m[10] = m[11] = 0;
    m[12] = m[13] = 0;
    m[14] = m[15] = 0;

    uint64_t v[16] = {
        0x6a09e667f2bdc928, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
        0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
        0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
        0x510e527fade68281, 0x9b05688c2b3e6c1f, 0xe07c265404be4294, 0x5be0cd19137e2179
    };
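
    // v[] above is the BLAKE2b compression state for the single 80-byte
    // final block, precomputed by hand: v[0] = IV0 ^ 0x01010020 (digest
    // length 32, fanout/depth 1); v[8..15] start from the plain IV with
    // v[12] = IV4 ^ 80 (byte counter t = 80) and v[14] = ~IV6 (final
    // block flag), so no per-thread init code is needed.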

    ROUND( 0);
    ROUND( 1);
    ROUND( 2);
    ROUND( 3);
    ROUND( 4);
    ROUND( 5);
    ROUND( 6);
    ROUND( 7);
    ROUND( 8);
    ROUND( 9);
    ROUND(10);
    ROUND(11);

    uint2 last = vectorize(v[3] ^ v[11] ^ 0xa54ff53a5f1d36f1);
    if (last.y <= target2.y && last.x <= target2.x) {
        resNonce[1] = resNonce[0];
        resNonce[0] = nonce;
    }
}
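
// blake2b_gpu_hash() only compares the top 64 bits of the 256-bit digest
// (v[3] ^ v[11] ^ h[3], i.e. hash words 6-7) against the packed target;
// candidates are fully re-checked on the CPU with blake2b_hash().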

__host__
uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce)
{
    uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX };
    uint32_t result = UINT32_MAX;

    dim3 grid((threads + TPB-1)/TPB);
    dim3 block(TPB);

    /* Check error on Ctrl+C or kill to prevent segfaults on exit */
    if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
        return result;

    blake2b_gpu_hash <<<grid, block, 8>>> (threads, startNonce, d_resNonces[thr_id], target2);
    cudaThreadSynchronize();

    if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
        result = resNonces[0];
        secNonce = resNonces[1];
        if (secNonce == result) secNonce = UINT32_MAX;
    }
    return result;
}
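
// NBN = 2 keeps the two most recent candidate nonces per launch.
// Usage sketch (illustrative only, not part of the original sources):
//
//   uint32_t sec = UINT32_MAX;
//   uint32_t n = blake2b_hash_cuda(thr_id, throughput, pdata[19], target, sec);
//   if (n != UINT32_MAX) { /* re-check n (and sec) on CPU via blake2b_hash() */ }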

__host__
void blake2b_setBlock(uint32_t *data)
{
    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice));
}
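
// d_data receives the 80-byte block header; each GPU thread then
// overwrites the last 32-bit word with its own nonce (see the m[]
// setup in blake2b_gpu_hash).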

static bool init[MAX_GPUS] = { 0 };

int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
    uint32_t _ALIGN(A) endiandata[20];
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;

    const uint32_t first_nonce = pdata[19];

    int dev_id = device_map[thr_id];
    int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 28 : 25;
    if (device_sm[dev_id] >= 520 && is_windows()) intensity = 26;
    if (device_sm[dev_id] < 350) intensity = 22;

    uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
    if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

    if (!init[thr_id])
    {
        cudaSetDevice(dev_id);
        if (opt_cudaschedule == -1 && gpu_threads == 1) {
            cudaDeviceReset();
            // reduce cpu usage (linux)
            cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
            CUDA_LOG_ERROR();
        }
        gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

        CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1);
        init[thr_id] = true;
    }

    for (int i=0; i < 20; i++)
        be32enc(&endiandata[i], pdata[i]);

    const uint2 target = make_uint2(ptarget[6], ptarget[7]);
    blake2b_setBlock(endiandata);

    do {
        work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[19], target, work->nonces[1]);

        *hashes_done = pdata[19] - first_nonce + throughput;

        if (work->nonces[0] != UINT32_MAX)
        {
            const uint32_t Htarg = ptarget[7];
            uint32_t _ALIGN(A) vhash[8];
            work->valid_nonces = 0;
            endiandata[19] = work->nonces[0];
            blake2b_hash(vhash, endiandata);
            if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
                work_set_target_ratio(work, vhash);
                work->valid_nonces++;
                pdata[19] = work->nonces[0] + 1;
            } else {
                gpu_increment_reject(thr_id);
            }

            if (work->nonces[1] != UINT32_MAX) {
                endiandata[19] = work->nonces[1];
                blake2b_hash(vhash, endiandata);
                if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
                    if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
                        work->sharediff[1] = work->sharediff[0];
                        work->shareratio[1] = work->shareratio[0];
                        xchg(work->nonces[1], work->nonces[0]);
                        work_set_target_ratio(work, vhash);
                    } else {
                        bn_set_target_ratio(work, vhash, 1);
                    }
                    work->valid_nonces++;
                    pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan start
                } else {
                    gpu_increment_reject(thr_id);
                }
            }

            if (work->valid_nonces) {
                work->nonces[0] = cuda_swab32(work->nonces[0]);
                work->nonces[1] = cuda_swab32(work->nonces[1]);
                return work->valid_nonces;
            }
        }

        if ((uint64_t) throughput + pdata[19] >= max_nonce) {
            pdata[19] = max_nonce;
            break;
        }

        pdata[19] += throughput;

    } while (!work_restart[thr_id].restart);

    *hashes_done = pdata[19] - first_nonce;

    return 0;
}

// cleanup
extern "C" void free_blake2b(int thr_id)
{
    if (!init[thr_id])
        return;

    //cudaThreadSynchronize();

    cudaFree(d_resNonces[thr_id]);

    init[thr_id] = false;

    cudaDeviceSynchronize();
}
@@ -0,0 +1,217 @@

extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_keccak.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_skein.h"
#include "sph/sph_groestl.h"
#include "lyra2/Lyra2.h"
}

#include <miner.h>
#include <cuda_helper.h>

static uint64_t* d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];

extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);

//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
//extern void keccak256_sm3_init(int thr_id, uint32_t threads);
//extern void keccak256_sm3_free(int thr_id);

extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);

extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);

extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);

extern void skein256_cpu_init(int thr_id, uint32_t threads);

extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti);

extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
extern void groestl256_setTarget(const void *ptarget);
extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
extern uint32_t groestl256_getSecNonce(int thr_id, int num);

extern "C" void allium_hash(void *state, const void *input) |
||||
{ |
||||
uint32_t hashA[8], hashB[8]; |
||||
|
||||
sph_blake256_context ctx_blake; |
||||
sph_keccak256_context ctx_keccak; |
||||
sph_cubehash256_context ctx_cube; |
||||
sph_skein256_context ctx_skein; |
||||
sph_groestl256_context ctx_groestl; |
||||
|
||||
sph_blake256_set_rounds(14); |
||||
|
||||
sph_blake256_init(&ctx_blake); |
||||
sph_blake256(&ctx_blake, input, 80); |
||||
sph_blake256_close(&ctx_blake, hashA); |
||||
|
||||
sph_keccak256_init(&ctx_keccak); |
||||
sph_keccak256(&ctx_keccak, hashA, 32); |
||||
sph_keccak256_close(&ctx_keccak, hashB); |
||||
|
||||
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); |
||||
|
||||
sph_cubehash256_init(&ctx_cube); |
||||
sph_cubehash256(&ctx_cube, hashA, 32); |
||||
sph_cubehash256_close(&ctx_cube, hashB); |
||||
|
||||
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); |
||||
|
||||
sph_skein256_init(&ctx_skein); |
||||
sph_skein256(&ctx_skein, hashA, 32); |
||||
sph_skein256_close(&ctx_skein, hashB); |
||||
|
||||
sph_groestl256_init(&ctx_groestl); |
||||
sph_groestl256(&ctx_groestl, hashB, 32); |
||||
sph_groestl256_close(&ctx_groestl, hashA); |
||||
|
||||
memcpy(state, hashA, 32); |
||||
} |
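
// CPU reference for the allium chain: blake256(80B) -> keccak256 ->
// Lyra2 (t=1, r=8, c=8) -> cubehash256 -> Lyra2 -> skein256 -> groestl256.
// The GPU loop in scanhash_allium() below runs the same sequence, with
// blake and keccak fused into one kernel (blakeKeccak256_cpu_hash_80).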

static bool init[MAX_GPUS] = { 0 };
static __thread uint32_t throughput = 0;

extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];

    if (opt_benchmark)
        ptarget[7] = 0x00ff;

    static __thread bool gtx750ti;
    if (!init[thr_id])
    {
        int dev_id = device_map[thr_id];
        cudaSetDevice(dev_id);
        if (opt_cudaschedule == -1 && gpu_threads == 1) {
            cudaDeviceReset();
            cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
            CUDA_LOG_ERROR();
        }

        int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16;
        if (device_sm[device_map[thr_id]] == 500) intensity = 15;
        throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
        if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

        cudaDeviceProp props;
        cudaGetDeviceProperties(&props, dev_id);

        if (strstr(props.name, "750 Ti")) gtx750ti = true;
        else gtx750ti = false;

        gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

        blake256_cpu_init(thr_id, throughput);
        //keccak256_sm3_init(thr_id, throughput);
        skein256_cpu_init(thr_id, throughput);
        groestl256_cpu_init(thr_id, throughput);

        //cuda_get_arch(thr_id);
        if (device_sm[dev_id] >= 500)
        {
            size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
            CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
            lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
        }

        CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));

        init[thr_id] = true;
    }

    uint32_t _ALIGN(128) endiandata[20];
    for (int k=0; k < 20; k++)
        be32enc(&endiandata[k], pdata[k]);

    blake256_cpu_setBlock_80(pdata);
    groestl256_setTarget(ptarget);

    do {
        int order = 0;

        //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
        cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
        skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);

        *hashes_done = pdata[19] - first_nonce + throughput;

        work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        if (work->nonces[0] != UINT32_MAX)
        {
            const uint32_t Htarg = ptarget[7];
            uint32_t _ALIGN(64) vhash[8];

            be32enc(&endiandata[19], work->nonces[0]);
            allium_hash(vhash, endiandata);

            if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
                work->valid_nonces = 1;
                work_set_target_ratio(work, vhash);
                work->nonces[1] = groestl256_getSecNonce(thr_id, 1);
                if (work->nonces[1] != UINT32_MAX) {
                    be32enc(&endiandata[19], work->nonces[1]);
                    allium_hash(vhash, endiandata);
                    bn_set_target_ratio(work, vhash, 1);
                    work->valid_nonces++;
                    pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
                } else {
                    pdata[19] = work->nonces[0] + 1; // cursor
                }
                return work->valid_nonces;
            }
            else if (vhash[7] > Htarg) {
                gpu_increment_reject(thr_id);
                if (!opt_quiet)
                    gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
                pdata[19] = work->nonces[0] + 1;
                continue;
            }
        }

        if ((uint64_t)throughput + pdata[19] >= max_nonce) {
            pdata[19] = max_nonce;
            break;
        }
        pdata[19] += throughput;

    } while (!work_restart[thr_id].restart);

    *hashes_done = pdata[19] - first_nonce;
    return 0;
}

// cleanup
extern "C" void free_allium(int thr_id)
{
    if (!init[thr_id])
        return;

    cudaThreadSynchronize();

    cudaFree(d_hash[thr_id]);
    cudaFree(d_matrix[thr_id]);

    //keccak256_sm3_free(thr_id);
    groestl256_cpu_free(thr_id);

    init[thr_id] = false;

    cudaDeviceSynchronize();
}
@@ -0,0 +1,481 @@

/**
 * Lyra2 (v3) CUDA Implementation
 *
 * Based on VTC sources
 */
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "cuda_helper.h"

#include "cuda_lyra2v3_sm3.cuh"

#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 500
#endif

#define TPB 32

#if __CUDA_ARCH__ >= 500

#include "cuda_lyra2_vectors.h"

#define Nrow 4
#define Ncol 4
#define memshift 3

__device__ uint2x4 *DMatrix;

__device__ __forceinline__ uint2 LD4S(const int index)
{
    extern __shared__ uint2 shared_mem[];
    return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}

__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
    extern __shared__ uint2 shared_mem[];
    shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}

__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c)
{
    return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}

__device__ __forceinline__
void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
    a += b; d ^= a; d = SWAPUINT2(d);
    c += d; b ^= c; b = ROR2(b, 24);
    a += b; d ^= a; d = ROR2(d, 16);
    c += d; b ^= c; b = ROR2(b, 63);
}
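
// Same BLAKE2b G function as used by the Lyra2 sponge: rotations by 32
// (SWAPUINT2), 24, 16 and 63 on 64-bit words stored as uint2 pairs.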

__device__ __forceinline__
void round_lyra_v5(uint2x4 s[4])
{
    Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x);
    Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y);
    Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z);
    Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w);

    Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w);
    Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x);
    Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y);
    Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z);
}
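
// One sponge round: G over the four columns of the 4x4 state, then G
// over the four diagonals, exactly as in the BLAKE2b round function.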

__device__ __forceinline__
void round_lyra_v5(uint2 s[4])
{
    Gfunc_v5(s[0], s[1], s[2], s[3]);
    s[1] = shuffle2(s[1], threadIdx.x + 1, 4);
    s[2] = shuffle2(s[2], threadIdx.x + 2, 4);
    s[3] = shuffle2(s[3], threadIdx.x + 3, 4);
    Gfunc_v5(s[0], s[1], s[2], s[3]);
    s[1] = shuffle2(s[1], threadIdx.x + 3, 4);
    s[2] = shuffle2(s[2], threadIdx.x + 2, 4);
    s[3] = shuffle2(s[3], threadIdx.x + 1, 4);
}
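
// Cooperative variant: four warp lanes each hold one column of the
// state (s[0..3]). The __shfl-based shuffle2() rotates rows 1-3 across
// the 4-lane group so the second Gfunc_v5 call operates on the
// diagonals, then rotates them back into column order.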

__device__ __forceinline__
void reduceDuplexRowSetup2(uint2 state[4])
{
    uint2 state1[Ncol][3], state0[Ncol][3], state2[3];
    int i, j;

    #pragma unroll
    for (int i = 0; i < Ncol; i++)
    {
        #pragma unroll
        for (j = 0; j < 3; j++)
            state0[Ncol - i - 1][j] = state[j];
        round_lyra_v5(state);
    }

    //#pragma unroll 4
    for (i = 0; i < Ncol; i++)
    {
        #pragma unroll
        for (j = 0; j < 3; j++)
            state[j] ^= state0[i][j];

        round_lyra_v5(state);

        #pragma unroll
        for (j = 0; j < 3; j++)
            state1[Ncol - i - 1][j] = state0[i][j];

        #pragma unroll
        for (j = 0; j < 3; j++)
            state1[Ncol - i - 1][j] ^= state[j];
    }

    for (i = 0; i < Ncol; i++)
    {
        const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
        const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift;

        #pragma unroll
        for (j = 0; j < 3; j++)
            state[j] ^= state1[i][j] + state0[i][j];

        round_lyra_v5(state);

        #pragma unroll
        for (j = 0; j < 3; j++)
            state2[j] = state1[i][j];

        #pragma unroll
        for (j = 0; j < 3; j++)
            state2[j] ^= state[j];

        #pragma unroll
        for (j = 0; j < 3; j++)
            ST4S(s2 + j, state2[j]);

        uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
        uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
        uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);

        if (threadIdx.x == 0) {
            state0[i][0] ^= Data2;
            state0[i][1] ^= Data0;
            state0[i][2] ^= Data1;
        } else {
            state0[i][0] ^= Data0;
            state0[i][1] ^= Data1;
            state0[i][2] ^= Data2;
        }

        #pragma unroll
        for (j = 0; j < 3; j++)
            ST4S(s0 + j, state0[i][j]);

        #pragma unroll
        for (j = 0; j < 3; j++)
            state0[i][j] = state2[j];

    }

    for (i = 0; i < Ncol; i++)
    {
        const uint32_t s1 = memshift * Ncol * 1 + i*memshift;
        const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift;

        #pragma unroll
        for (j = 0; j < 3; j++)
            state[j] ^= state1[i][j] + state0[Ncol - i - 1][j];

        round_lyra_v5(state);

        #pragma unroll
        for (j = 0; j < 3; j++)
            state0[Ncol - i - 1][j] ^= state[j];

        #pragma unroll
        for (j = 0; j < 3; j++)
            ST4S(s3 + j, state0[Ncol - i - 1][j]);

        uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
        uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
        uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);

        if (threadIdx.x == 0) {
            state1[i][0] ^= Data2;
            state1[i][1] ^= Data0;
            state1[i][2] ^= Data1;
        } else {
            state1[i][0] ^= Data0;
            state1[i][1] ^= Data1;
            state1[i][2] ^= Data2;
        }

        #pragma unroll
        for (j = 0; j < 3; j++)
            ST4S(s1 + j, state1[i][j]);
    }
}

__device__
void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4])
{
    uint2 state1[3], state2[3];
    const uint32_t ps1 = memshift * Ncol * rowIn;
    const uint32_t ps2 = memshift * Ncol * rowInOut;
    const uint32_t ps3 = memshift * Ncol * rowOut;

    for (int i = 0; i < Ncol; i++)
    {
        const uint32_t s1 = ps1 + i*memshift;
        const uint32_t s2 = ps2 + i*memshift;
        const uint32_t s3 = ps3 + i*memshift;

        #pragma unroll
        for (int j = 0; j < 3; j++)
            state1[j] = LD4S(s1 + j);

        #pragma unroll
        for (int j = 0; j < 3; j++)
            state2[j] = LD4S(s2 + j);

        #pragma unroll
        for (int j = 0; j < 3; j++)
            state[j] ^= state1[j] + state2[j];

        round_lyra_v5(state);

        uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
        uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
        uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);

        if (threadIdx.x == 0) {
            state2[0] ^= Data2;
            state2[1] ^= Data0;
            state2[2] ^= Data1;
        } else {
            state2[0] ^= Data0;
            state2[1] ^= Data1;
            state2[2] ^= Data2;
        }

        #pragma unroll
        for (int j = 0; j < 3; j++)
            ST4S(s2 + j, state2[j]);

        #pragma unroll
        for (int j = 0; j < 3; j++)
            ST4S(s3 + j, LD4S(s3 + j) ^ state[j]);
    }
}

__device__
void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4])
{
    const int rowIn = 2;
    const int rowOut = 3;

    int i, j;
    uint2 last[3];
    const uint32_t ps1 = memshift * Ncol * rowIn;
    const uint32_t ps2 = memshift * Ncol * rowInOut;

    #pragma unroll
    for (int j = 0; j < 3; j++)
        last[j] = LD4S(ps2 + j);

    #pragma unroll
    for (int j = 0; j < 3; j++)
        state[j] ^= LD4S(ps1 + j) + last[j];

    round_lyra_v5(state);

    uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
    uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
    uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);

    if (threadIdx.x == 0) {
        last[0] ^= Data2;
        last[1] ^= Data0;
        last[2] ^= Data1;
    } else {
        last[0] ^= Data0;
        last[1] ^= Data1;
        last[2] ^= Data2;
    }

    if (rowInOut == rowOut)
    {
        #pragma unroll
        for (j = 0; j < 3; j++)
            last[j] ^= state[j];
    }

    for (i = 1; i < Ncol; i++)
    {
        const uint32_t s1 = ps1 + i*memshift;
        const uint32_t s2 = ps2 + i*memshift;

        #pragma unroll
        for (j = 0; j < 3; j++)
            state[j] ^= LD4S(s1 + j) + LD4S(s2 + j);

        round_lyra_v5(state);
    }

    #pragma unroll
    for (int j = 0; j < 3; j++)
        state[j] ^= last[j];
}

__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
{
    const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;

    const uint2x4 blake2b_IV[2] = {
        0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL,
        0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL,
        0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL,
        0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL
    };

    const uint2x4 Mask[2] = {
        0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL,
        0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL,
        0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL,
        0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL
    };
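
    // Mask holds the Lyra2 parameter block absorbed after the first 12
    // rounds: output/key/salt lengths (0x20 = 32 bytes), time cost 1,
    // nRows/nCols (4, 4), then the 0x80 / 0x01 sponge padding marks of
    // the final block.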

    uint2x4 state[4];

    if (thread < threads)
    {
        state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]);
        state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]);
        state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]);
        state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]);
        state[2] = blake2b_IV[0];
        state[3] = blake2b_IV[1];

        for (int i = 0; i < 12; i++)
            round_lyra_v5(state);

        state[0] ^= Mask[0];
        state[1] ^= Mask[1];

        for (int i = 0; i < 12; i++)
            round_lyra_v5(state);

        DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0];
        DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1];
        DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2];
        DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3];
    }
}

__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_2(uint32_t threads)
{
    const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;

    if (thread < threads)
    {
        uint2 state[4];
        state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
        state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
        state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
        state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];

        reduceDuplexRowSetup2(state);

        uint32_t rowa;
        int prev = 3;
        unsigned int instance = 0;
        for (int i = 0; i < 3; i++)
        {
            instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4);
            rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3;

            //rowa = __shfl(state[0].x, 0, 4) & 3;
            reduceDuplexRowt2(prev, rowa, i, state);
            prev = i;
        }

        instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4);
        rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3;

        //rowa = __shfl(state[0].x, 0, 4) & 3;
        reduceDuplexRowt2x4(rowa, state);

        ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0];
        ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1];
        ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2];
        ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3];
    }
}
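
// The two-step "instance" lookup above is the Lyra2v3 tweak: the index
// of the row to revisit (rowa) comes from a state word that is itself
// selected by the previous instance value, instead of always reading
// state[0].x as in Lyra2v2 (the commented-out lines).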

__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
{
    const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;

    uint2x4 state[4];

    if (thread < threads)
    {
        state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]);
        state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]);
        state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]);
        state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]);

        for (int i = 0; i < 12; i++)
            round_lyra_v5(state);

        outputHash[thread + threads * 0] = state[0].x;
        outputHash[thread + threads * 1] = state[0].y;
        outputHash[thread + threads * 2] = state[0].z;
        outputHash[thread + threads * 3] = state[0].w;
    }
}

#else
#include "cuda_helper.h"
#if __CUDA_ARCH__ < 200
__device__ void* DMatrix;
#endif
__global__ void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {}
__global__ void lyra2v3_gpu_hash_32_2(uint32_t threads) {}
__global__ void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {}
#endif

__host__
void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
    cuda_get_arch(thr_id);
    // just assign the device pointer allocated in main loop
    cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}

__host__
void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order)
{
    int dev_id = device_map[thr_id % MAX_GPUS];

    if (device_sm[dev_id] >= 500) {

        const uint32_t tpb = TPB;

        dim3 grid2((threads + tpb - 1) / tpb);
        dim3 block2(tpb);
        dim3 grid4((threads * 4 + tpb - 1) / tpb);
        dim3 block4(4, tpb / 4);

        lyra2v3_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash);
        lyra2v3_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads);
        lyra2v3_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash);

    } else {

        uint32_t tpb = 16;
        if (cuda_arch[dev_id] >= 350) tpb = TPB35;
        else if (cuda_arch[dev_id] >= 300) tpb = TPB30;
        else if (cuda_arch[dev_id] >= 200) tpb = TPB20;

        dim3 grid((threads + tpb - 1) / tpb);
        dim3 block(tpb);
        lyra2v3_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash);

    }
}

@@ -0,0 +1,348 @@

/* SM 2/3/3.5 Variant for lyra2REv2 */

#ifdef __INTELLISENSE__
/* just for vstudio code colors, only uncomment temporarily, don't commit it */
//#undef __CUDA_ARCH__
//#define __CUDA_ARCH__ 500
#endif

#define TPB20 64
#define TPB30 64
#define TPB35 64

#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500

#include "cuda_lyra2_vectors.h"

#define Nrow 4
#define Ncol 4

#define vectype ulonglong4
#define memshift 4

__device__ vectype *DMatrix;

static __device__ __forceinline__
void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
{
    a += b; d ^= a; d = ROTR64(d, 32);
    c += d; b ^= c; b = ROTR64(b, 24);
    a += b; d ^= a; d = ROTR64(d, 16);
    c += d; b ^= c; b = ROTR64(b, 63);
}

static __device__ __forceinline__
void round_lyra_v35(vectype* s)
{
    Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x);
    Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y);
    Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z);
    Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w);

    Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w);
    Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x);
    Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y);
    Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
}

static __device__ __forceinline__
void reduceDuplexV3(vectype state[4], uint32_t thread)
{
    vectype state1[3];
    uint32_t ps1 = (Nrow * Ncol * memshift * thread);
    uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread);

    #pragma unroll 4
    for (int i = 0; i < Ncol; i++)
    {
        uint32_t s1 = ps1 + Nrow * i * memshift;
        uint32_t s2 = ps2 - Nrow * i * memshift;

        for (int j = 0; j < 3; j++)
            state1[j] = __ldg4(&(DMatrix + s1)[j]);

        for (int j = 0; j < 3; j++)
            state[j] ^= state1[j];
        round_lyra_v35(state);

        for (int j = 0; j < 3; j++)
            state1[j] ^= state[j];

        for (int j = 0; j < 3; j++)
            (DMatrix + s2)[j] = state1[j];
    }
}

static __device__ __forceinline__
void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread)
{
    vectype state2[3], state1[3];

    uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
    uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
    uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread);

    for (int i = 0; i < Ncol; i++)
    {
        uint32_t s1 = ps1 + Nrow*i*memshift;
        uint32_t s2 = ps2 + Nrow*i*memshift;
        uint32_t s3 = ps3 - Nrow*i*memshift;

        for (int j = 0; j < 3; j++)
            state1[j] = __ldg4(&(DMatrix + s1)[j]);
        for (int j = 0; j < 3; j++)
            state2[j] = __ldg4(&(DMatrix + s2)[j]);
        for (int j = 0; j < 3; j++) {
            vectype tmp = state1[j] + state2[j];
            state[j] ^= tmp;
        }

        round_lyra_v35(state);

        for (int j = 0; j < 3; j++) {
            state1[j] ^= state[j];
            (DMatrix + s3)[j] = state1[j];
        }

        ((uint2*)state2)[0] ^= ((uint2*)state)[11];
        for (int j = 0; j < 11; j++)
            ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];

        for (int j = 0; j < 3; j++)
            (DMatrix + s2)[j] = state2[j];
    }
}
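
// The uint2 loop above XORs the first 12 sponge output words into
// rowInOut rotated by one 64-bit word, i.e. Lyra2's "rotW" of the rand
// value, done in place on 32-bit register pairs.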

static __device__ __forceinline__
void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread)
{
    vectype state1[3], state2[3];
    uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
    uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
    uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread);

    #pragma nounroll
    for (int i = 0; i < Ncol; i++)
    {
        uint32_t s1 = ps1 + Nrow * i * memshift;
        uint32_t s2 = ps2 + Nrow * i * memshift;
        uint32_t s3 = ps3 + Nrow * i * memshift;

        for (int j = 0; j < 3; j++)
            state1[j] = __ldg4(&(DMatrix + s1)[j]);

        for (int j = 0; j < 3; j++)
            state2[j] = __ldg4(&(DMatrix + s2)[j]);

        for (int j = 0; j < 3; j++)
            state1[j] += state2[j];

        for (int j = 0; j < 3; j++)
            state[j] ^= state1[j];

        round_lyra_v35(state);

        ((uint2*)state2)[0] ^= ((uint2*)state)[11];

        for (int j = 0; j < 11; j++)
            ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];

        if (rowInOut != rowOut) {

            for (int j = 0; j < 3; j++)
                (DMatrix + s2)[j] = state2[j];

            for (int j = 0; j < 3; j++)
                (DMatrix + s3)[j] ^= state[j];

        } else {

            for (int j = 0; j < 3; j++)
                state2[j] ^= state[j];

            for (int j = 0; j < 3; j++)
                (DMatrix + s2)[j] = state2[j];
        }
    }
}

#if __CUDA_ARCH__ >= 300
__global__ __launch_bounds__(TPB35, 1)
void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

    vectype state[4];
    vectype blake2b_IV[2];
    vectype padding[2];

    if (threadIdx.x == 0) {

        ((uint16*)blake2b_IV)[0] = make_uint16(
            0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
            0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
            0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
            0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
        );
        ((uint16*)padding)[0] = make_uint16(
            0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
            0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
        );
    }

    if (thread < threads)
    {
        ((uint2*)state)[0] = __ldg(&outputHash[thread]);
        ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
        ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
        ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);

        state[1] = state[0];
        state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0);
        state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0);

        for (int i = 0; i < 12; i++)
            round_lyra_v35(state);

        state[0] ^= shuffle4(((vectype*)padding)[0], 0);
        state[1] ^= shuffle4(((vectype*)padding)[1], 0);

        for (int i = 0; i < 12; i++)
            round_lyra_v35(state);

        uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);

        //#pragma unroll 4
        for (int i = 0; i < 4; i++)
        {
            uint32_t s1 = ps1 - 4 * memshift * i;
            for (int j = 0; j < 3; j++)
                (DMatrix + s1)[j] = (state)[j];

            round_lyra_v35(state);
        }

        reduceDuplexV3(state, thread);
        reduceDuplexRowSetupV3(1, 0, 2, state, thread);
        reduceDuplexRowSetupV3(2, 1, 3, state, thread);

        unsigned int instance = 0;
        uint32_t rowa;
        int prev = 3;
        for (int i = 0; i < 4; i++)
        {
            //rowa = ((uint2*)state)[0].x & 3;

            instance = ((uint2*)state)[instance & 0xf].x;
            rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
            reduceDuplexRowtV3(prev, rowa, i, state, thread);
            prev = i;
        }

        uint32_t shift = (memshift * rowa + 16 * memshift * thread);

        for (int j = 0; j < 3; j++)
            state[j] ^= __ldg4(&(DMatrix + shift)[j]);

        for (int i = 0; i < 12; i++)
            round_lyra_v35(state);

        outputHash[thread] = ((uint2*)state)[0];
        outputHash[thread + threads] = ((uint2*)state)[1];
        outputHash[thread + 2 * threads] = ((uint2*)state)[2];
        outputHash[thread + 3 * threads] = ((uint2*)state)[3];

    } //thread
}
#elif __CUDA_ARCH__ >= 200
__global__ __launch_bounds__(TPB20, 1)
void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

    vectype state[4];
    vectype blake2b_IV[2];
    vectype padding[2];

    ((uint16*)blake2b_IV)[0] = make_uint16(
        0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
        0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
        0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
        0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
    );
    ((uint16*)padding)[0] = make_uint16(
        0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
        0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
    );

    if (thread < threads)
    {
        ((uint2*)state)[0] = outputHash[thread];
        ((uint2*)state)[1] = outputHash[thread + threads];
        ((uint2*)state)[2] = outputHash[thread + 2 * threads];
        ((uint2*)state)[3] = outputHash[thread + 3 * threads];

        state[1] = state[0];
        state[2] = ((vectype*)blake2b_IV)[0];
        state[3] = ((vectype*)blake2b_IV)[1];

        for (int i = 0; i < 12; i++)
            round_lyra_v35(state);

        state[0] ^= ((vectype*)padding)[0];
        state[1] ^= ((vectype*)padding)[1];

        for (int i = 0; i < 12; i++)
            round_lyra_v35(state);

        uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);

        //#pragma unroll 4
        for (int i = 0; i < 4; i++)
        {
            uint32_t s1 = ps1 - 4 * memshift * i;
            for (int j = 0; j < 3; j++)
                (DMatrix + s1)[j] = (state)[j];

            round_lyra_v35(state);
        }

        reduceDuplexV3(state, thread);
        reduceDuplexRowSetupV3(1, 0, 2, state, thread);
        reduceDuplexRowSetupV3(2, 1, 3, state, thread);

        uint instance = 0;
        uint32_t rowa;
        int prev = 3;
        for (int i = 0; i < 4; i++)
        {
            // rowa = ((uint2*)state)[0].x & 3;

            instance = ((uint2*)state)[instance & 0xf].x;
            rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
            reduceDuplexRowtV3(prev, rowa, i, state, thread);
            prev = i;
        }

        uint32_t shift = (memshift * rowa + 16 * memshift * thread);

        for (int j = 0; j < 3; j++)
            state[j] ^= __ldg4(&(DMatrix + shift)[j]);

        for (int i = 0; i < 12; i++)
            round_lyra_v35(state);

        outputHash[thread] = ((uint2*)state)[0];
        outputHash[thread + threads] = ((uint2*)state)[1];
        outputHash[thread + 2 * threads] = ((uint2*)state)[2];
        outputHash[thread + 3 * threads] = ((uint2*)state)[3];

    } //thread
}
#endif

#else
/* host & sm5+ */
__global__ void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
#endif
@@ -0,0 +1,183 @@

extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_cubehash.h"
#include "lyra2/Lyra2.h"
}

#include <miner.h>
#include <cuda_helper.h>

static uint64_t *d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];

extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);

extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);

extern void lyra2v3_setTarget(const void *pTargetIn);
extern void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);
extern void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);

extern void lyra2v3_cpu_hash_32_targ(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);

extern void bmw256_setTarget(const void *ptarget);
extern void bmw256_cpu_init(int thr_id, uint32_t threads);
extern void bmw256_cpu_free(int thr_id);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);

extern "C" void lyra2v3_hash(void *state, const void *input)
{
    uint32_t hashA[8], hashB[8];

    sph_blake256_context ctx_blake;
    sph_cubehash256_context ctx_cube;
    sph_bmw256_context ctx_bmw;

    sph_blake256_set_rounds(14);

    sph_blake256_init(&ctx_blake);
    sph_blake256(&ctx_blake, input, 80);
    sph_blake256_close(&ctx_blake, hashA);

    LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);

    sph_cubehash256_init(&ctx_cube);
    sph_cubehash256(&ctx_cube, hashB, 32);
    sph_cubehash256_close(&ctx_cube, hashA);

    LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);

    sph_bmw256_init(&ctx_bmw);
    sph_bmw256(&ctx_bmw, hashB, 32);
    sph_bmw256_close(&ctx_bmw, hashA);

    memcpy(state, hashA, 32);
}
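
// CPU reference for lyra2v3: blake256(80B) -> Lyra2v3 (t=1, 4 rows,
// 4 cols) -> cubehash256 -> Lyra2v3 -> bmw256. In the GPU scan loop
// below, bmw256_cpu_hash_32 also applies the share target and returns
// candidate nonces directly.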

static bool init[MAX_GPUS] = { 0 };

extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
    const uint32_t first_nonce = pdata[19];
    int dev_id = device_map[thr_id];
    int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
    if (strstr(device_name[dev_id], "GTX 1")) intensity = 20;
    if (strstr(device_name[dev_id], "RTX 20")) intensity = 20;
    uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
    if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

    if (opt_benchmark)
        ptarget[7] = 0x000f;

    if (!init[thr_id])
    {
        size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
        cudaSetDevice(dev_id);
        if (opt_cudaschedule == -1 && gpu_threads == 1) {
            cudaDeviceReset();
            // reduce cpu usage
            cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
            CUDA_LOG_ERROR();
        }
        gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

        blake256_cpu_init(thr_id, throughput);
        bmw256_cpu_init(thr_id, throughput);

        cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256

        // SM 3 implementation requires a bit more memory
        if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
            matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
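
        // Per thread: 16 matrix cells (4 rows x 4 cols) of 12 uint64
        // words on SM5+ (memshift 3), or 16 uint64 words on SM2/SM3
        // (memshift 4), hence the two matrix_sz values above.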

        CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
        lyra2v3_cpu_init(thr_id, throughput, d_matrix[thr_id]);

        CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));

        api_set_throughput(thr_id, throughput);
        init[thr_id] = true;
    }

    uint32_t endiandata[20];
    for (int k=0; k < 20; k++)
        be32enc(&endiandata[k], pdata[k]);

    blake256_cpu_setBlock_80(pdata);
    bmw256_setTarget(ptarget);

    do {
        int order = 0;

        blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
        memset(work->nonces, 0, sizeof(work->nonces));
        bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces);

        *hashes_done = pdata[19] - first_nonce + throughput;

        if (work->nonces[0] != 0)
        {
            const uint32_t Htarg = ptarget[7];
            uint32_t _ALIGN(64) vhash[8];
            be32enc(&endiandata[19], work->nonces[0]);
            lyra2v3_hash(vhash, endiandata);

            if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
                work->valid_nonces = 1;
                work_set_target_ratio(work, vhash);
                if (work->nonces[1] != 0) {
                    be32enc(&endiandata[19], work->nonces[1]);
                    lyra2v3_hash(vhash, endiandata);
                    bn_set_target_ratio(work, vhash, 1);
                    work->valid_nonces++;
                    pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
                } else {
                    pdata[19] = work->nonces[0] + 1; // cursor
                }
                return work->valid_nonces;
            }
            else if (vhash[7] > Htarg) {
                gpu_increment_reject(thr_id);
                if (!opt_quiet)
                    gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
                pdata[19] = work->nonces[0] + 1;
                continue;
            }
        }

        if ((uint64_t)throughput + pdata[19] >= max_nonce) {
            pdata[19] = max_nonce;
            break;
        }
        pdata[19] += throughput;

    } while (!work_restart[thr_id].restart && !abort_flag);

    *hashes_done = pdata[19] - first_nonce;
    return 0;
}

// cleanup
extern "C" void free_lyra2v3(int thr_id)
{
    if (!init[thr_id])
        return;

    cudaThreadSynchronize();

    cudaFree(d_hash[thr_id]);
    cudaFree(d_matrix[thr_id]);

    init[thr_id] = false;

    cudaDeviceSynchronize();
}
@@ -0,0 +1,89 @@

#include <stdio.h>
#include <memory.h>

#include "cuda_helper.h"

__global__ __launch_bounds__(128, 8)
void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch)
{
    const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t)
        uint4 *psrc = (uint4*) (&d_hash[offset]);
        d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1;
        if (d_NonceBranch[thread]) return;
        if (d_branch2) {
            uint4 *pdst = (uint4*)(&d_branch2[offset]);
            uint4 data;
            data = psrc[0]; pdst[0] = data;
            data = psrc[1]; pdst[1] = data;
            data = psrc[2]; pdst[2] = data;
            data = psrc[3]; pdst[3] = data;
        }
    }
}
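
// Branch split: the low bit of each 64-byte hash decides which of the
// two sub-chains processes it next. Odd hashes are only flagged in
// d_NonceBranch; even ones are also copied into the d_branch2 buffer so
// the alternate kernel can run on them separately.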

__global__ __launch_bounds__(128, 8)
void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
{
    const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads && !d_NonceBranch[thread])
    {
        const uint32_t offset = thread * 16U;
        uint4 *psrc = (uint4*) (&d_branch2[offset]);
        uint4 *pdst = (uint4*) (&d_hash[offset]);
        uint4 data;
        data = psrc[0]; pdst[0] = data;
        data = psrc[1]; pdst[1] = data;
        data = psrc[2]; pdst[2] = data;
        data = psrc[3]; pdst[3] = data;
    }
}

__global__
void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash)
{
    const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        const uint32_t offset = thread * 16U;
        uint2 *psrc = (uint2*) (&d_hash[offset]);
        uint2 *pdst = (uint2*) (&d_hash[offset]);
        uint2 data;
        data = psrc[4]; pdst[0] ^= data;
        data = psrc[5]; pdst[1] ^= data;
        data = psrc[6]; pdst[2] ^= data;
        data = psrc[7]; pdst[3] ^= data;
    }
}
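
// Folds each 512-bit hash to 256 bits in place: the upper four uint2
// words (bytes 32..63) are XORed into the lower four (bytes 0..31),
// the final compression step of the phi2 chain.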

__host__
uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces)
{
    const uint32_t threadsperblock = 128;
    dim3 grid((threads + threadsperblock - 1) / threadsperblock);
    dim3 block(threadsperblock);
    // extract algo permutation hashes to a second branch buffer
    phi_filter_gpu <<<grid, block>>> (threads, inpHashes, d_br2, d_nonces);
    return threads;
}

__host__
void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces)
{
    const uint32_t threadsperblock = 128;
    dim3 grid((threads + threadsperblock - 1) / threadsperblock);
    dim3 block(threadsperblock);
    // put back second branch hashes to the common buffer d_hash
    phi_merge_gpu <<<grid, block>>> (threads, outpHashes, d_br2, d_nonces);
}

__host__
void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes)
{
    const uint32_t threadsperblock = 128;
    dim3 grid((threads + threadsperblock - 1) / threadsperblock);
    dim3 block(threadsperblock);
    phi_final_compress_gpu <<<grid, block>>> (threads, d_hashes);
}
@@ -0,0 +1,319 @@

/* phi2 cubehash-512 144-bytes input (80 + 64) */

#include <cuda_helper.h>
#include <cuda_vectors.h>

#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */

#if __CUDA_ARCH__ < 350
#define LROT(x, bits) ((x << bits) | (x >> (32 - bits)))
#else
#define LROT(x, bits) __funnelshift_l(x, x, bits)
#endif

#define ROTATEUPWARDS7(a)  LROT(a, 7)
#define ROTATEUPWARDS11(a) LROT(a, 11)

#define SWAP(a, b) { uint32_t u = a; a = b; b = u; }

#ifdef NO_MIDSTATE

__device__ __constant__
static const uint32_t c_IV_512[32] = {
    0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
    0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
    0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
    0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
    0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
    0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
    0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
    0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
};

#endif
||||
|
||||
__device__ __forceinline__ |
||||
static void rrounds(uint32_t x[2][2][2][2][2]) |
||||
{ |
||||
int r; |
||||
int j; |
||||
int k; |
||||
int l; |
||||
int m; |
||||
|
||||
//#pragma unroll 16 |
||||
for (r = 0;r < CUBEHASH_ROUNDS;++r) { |
||||
|
||||
/* "add x_0jklm into x_1jklmn modulo 2^32" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
x[1][j][k][l][m] += x[0][j][k][l][m]; |
||||
|
||||
/* "rotate x_0jklm upwards by 7 bits" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); |
||||
|
||||
/* "swap x_00klm with x_01klm" */ |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
SWAP(x[0][0][k][l][m],x[0][1][k][l][m]) |
||||
|
||||
/* "xor x_1jklm into x_0jklm" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
x[0][j][k][l][m] ^= x[1][j][k][l][m]; |
||||
|
||||
/* "swap x_1jk0m with x_1jk1m" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
SWAP(x[1][j][k][0][m],x[1][j][k][1][m]) |
||||
|
||||
/* "add x_0jklm into x_1jklm modulo 2^32" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
x[1][j][k][l][m] += x[0][j][k][l][m]; |
||||
|
||||
/* "rotate x_0jklm upwards by 11 bits" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); |
||||
|
||||
/* "swap x_0j0lm with x_0j1lm" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
SWAP(x[0][j][0][l][m],x[0][j][1][l][m]) |
||||
|
||||
/* "xor x_1jklm into x_0jklm" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
#pragma unroll 2 |
||||
for (m = 0;m < 2;++m) |
||||
x[0][j][k][l][m] ^= x[1][j][k][l][m]; |
||||
|
||||
/* "swap x_1jkl0 with x_1jkl1" */ |
||||
#pragma unroll 2 |
||||
for (j = 0;j < 2;++j) |
||||
#pragma unroll 2 |
||||
for (k = 0;k < 2;++k) |
||||
#pragma unroll 2 |
||||
for (l = 0;l < 2;++l) |
||||
SWAP(x[1][j][k][l][0],x[1][j][k][l][1]) |
||||
|
||||
} |
||||
} |
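// State layout note: x[i][j][k][l][m] is the CubeHash state word x_ijklm, i.e.
// the five index bits select one of the 32 words (word number 16i+8j+4k+2l+m).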
||||
|
||||
__device__ __forceinline__ |
||||
static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2]) |
||||
{ |
||||
	// xor 32 bytes of input from global mem into the state, read in uint2 chunks
||||
AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]); |
||||
AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]); |
||||
AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]); |
||||
AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]); |
||||
} |
||||
|
||||
__device__ __forceinline__ |
||||
static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2]) |
||||
{ |
||||
// used to write final hash to global mem |
||||
AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]); |
||||
AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]); |
||||
AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]); |
||||
AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]); |
||||
AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]); |
||||
AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]); |
||||
AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]); |
||||
AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]); |
||||
} |
||||
|
||||
#define Init(x) \ |
||||
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \ |
||||
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \ |
||||
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \ |
||||
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \ |
||||
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \ |
||||
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \ |
||||
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \ |
||||
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \ |
||||
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \ |
||||
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \ |
||||
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \ |
||||
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \ |
||||
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \ |
||||
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \ |
||||
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \ |
||||
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]); |
||||
|
||||
__device__ __forceinline__ |
||||
static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data) |
||||
{ |
||||
/* "xor the block into the first b bytes of the state" */ |
||||
block_tox(data, x); |
||||
/* "and then transform the state invertibly through r identical rounds" */ |
||||
rrounds(x); |
||||
} |
||||
|
||||
__device__ __forceinline__ |
||||
static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) |
||||
{ |
||||
/* "the integer 1 is xored into the last state word x_11111" */ |
||||
x[1][1][1][1][1] ^= 1; |
||||
|
||||
/* "the state is then transformed invertibly through 10r identical rounds" */ |
||||
#pragma unroll 10 |
||||
for (int i = 0; i < 10; i++) rrounds(x); |
||||
|
||||
/* "output the first h/8 bytes of the state" */ |
||||
hash_fromx(hashval, x); |
||||
} |
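// h = 512 here, so "the first h/8 bytes" is the 64-byte half-state written by hash_fromx().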
||||
|
||||
__host__ void phi2_cubehash512_cpu_init(int thr_id, uint32_t threads) { } |
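// intentionally empty: the cubehash constants live in __constant__ memory; the
// stub presumably only exists to keep the per-algo init API uniform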
||||
|
||||
/***************************************************/ |
||||
|
||||
/** |
||||
* Timetravel and x16 CUBEHASH-80 CUDA implementation |
||||
* by tpruvot@github - Jan 2017 / May 2018 |
||||
*/ |
||||
|
||||
__constant__ static uint32_t c_midstate128[32]; |
||||
__constant__ static uint32_t c_PaddedMessage_144[36]; |
||||
|
||||
#undef SPH_C32 |
||||
#undef SPH_C64 |
||||
#undef SPH_T32 |
||||
#undef SPH_T64 |
||||
#include "sph/sph_cubehash.h" |
||||
|
||||
__host__ |
||||
void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata) |
||||
{ |
||||
sph_cubehash512_context ctx_cubehash; |
||||
sph_cubehash512_init(&ctx_cubehash); |
||||
sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64); |
||||
#ifndef NO_MIDSTATE |
||||
cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice); |
||||
#endif |
||||
cudaMemcpyToSymbol(c_PaddedMessage_144, endiandata, sizeof(c_PaddedMessage_144), 0, cudaMemcpyHostToDevice); |
||||
} |
||||
|
||||
__global__ |
||||
void cubehash512_gpu_hash_144(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) |
||||
{ |
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||
if (thread < threads) |
||||
{ |
||||
const uint32_t nonce = startNounce + thread; |
||||
uint32_t message[8]; |
||||
uint32_t x[2][2][2][2][2]; |
||||
#ifdef NO_MIDSTATE |
||||
Init(x); |
||||
|
||||
// first 32 bytes |
||||
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[0]); |
||||
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[4]); |
||||
Update32(x, message); |
||||
|
||||
// second 32 bytes |
||||
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[8]); |
||||
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[12]); |
||||
Update32(x, message); |
||||
#else |
||||
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]); |
||||
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]); |
||||
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]); |
||||
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]); |
||||
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]); |
||||
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]); |
||||
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]); |
||||
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]); |
||||
|
||||
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]); |
||||
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]); |
||||
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]); |
||||
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]); |
||||
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]); |
||||
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]); |
||||
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]); |
||||
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]); |
||||
#endif |
||||
// nonce + state root |
||||
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[16]); |
||||
message[3] = cuda_swab32(nonce); |
||||
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[20]); // state |
||||
Update32(x, message); |
||||
|
||||
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[24]); // state |
||||
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[28]); // utxo |
||||
Update32(x, message); |
||||
|
||||
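		// final block: 144 = 4*32 + 16, so the fifth 32-byte block carries the
		// last 16 message bytes plus the 0x80 padding byte and zeros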
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[32]); // utxo |
||||
message[4] = 0x80; |
||||
message[5] = 0; |
||||
message[6] = 0; |
||||
message[7] = 0; |
||||
Update32(x, message); |
||||
|
||||
uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]); |
||||
Final(x, output); |
||||
} |
||||
} |
||||
|
||||
__host__ |
||||
void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) |
||||
{ |
||||
const uint32_t threadsperblock = 256; |
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
||||
dim3 block(threadsperblock); |
||||
|
||||
cubehash512_gpu_hash_144 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash); |
||||
} |
||||
|
@@ -0,0 +1,268 @@
|
||||
// |
||||
// PHI2 algo (with smart contracts header) |
||||
// CubeHash + Lyra2 x2 + JH + (Gost or Echo x2) + Skein
||||
// |
||||
// Implemented by tpruvot in May 2018 |
||||
// |
||||
|
||||
extern "C" { |
||||
#include "sph/sph_skein.h" |
||||
#include "sph/sph_jh.h" |
||||
#include "sph/sph_cubehash.h" |
||||
#include "sph/sph_streebog.h" |
||||
#include "sph/sph_echo.h" |
||||
#include "lyra2/Lyra2.h" |
||||
} |
||||
|
||||
#include "miner.h" |
||||
#include "cuda_helper.h" |
||||
#include "x11/cuda_x11.h" |
||||
|
||||
#include <stdio.h> |
||||
#include <memory.h> |
||||
|
||||
extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); |
||||
extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); |
||||
|
||||
extern void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata); |
||||
extern void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); |
||||
|
||||
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); |
||||
extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti); |
||||
|
||||
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); |
||||
extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter); |
||||
extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter); |
||||
|
||||
extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces); |
||||
extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces); |
||||
extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes); |
||||
|
||||
static uint64_t* d_matrix[MAX_GPUS]; |
||||
static uint32_t* d_hash_512[MAX_GPUS]; |
||||
static uint64_t* d_hash_256[MAX_GPUS]; |
||||
static uint32_t* d_hash_br2[MAX_GPUS]; |
||||
static uint32_t* d_nonce_br[MAX_GPUS]; |
||||
|
||||
static bool has_roots; |
||||
|
||||
extern "C" void phi2_hash(void *output, const void *input) |
||||
{ |
||||
unsigned char _ALIGN(128) hash[64]; |
||||
unsigned char _ALIGN(128) hashA[64]; |
||||
unsigned char _ALIGN(128) hashB[64]; |
||||
|
||||
sph_cubehash512_context ctx_cubehash; |
||||
sph_jh512_context ctx_jh; |
||||
sph_gost512_context ctx_gost; |
||||
sph_echo512_context ctx_echo; |
||||
sph_skein512_context ctx_skein; |
||||
|
||||
sph_cubehash512_init(&ctx_cubehash); |
||||
sph_cubehash512(&ctx_cubehash, input, has_roots ? 144 : 80); |
||||
sph_cubehash512_close(&ctx_cubehash, (void*)hashB); |
||||
|
||||
LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8); |
||||
LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8); |
||||
|
||||
sph_jh512_init(&ctx_jh); |
||||
sph_jh512(&ctx_jh, (const void*)hashA, 64); |
||||
sph_jh512_close(&ctx_jh, (void*)hash); |
||||
|
||||
if (hash[0] & 1) { |
||||
sph_gost512_init(&ctx_gost); |
||||
sph_gost512(&ctx_gost, (const void*)hash, 64); |
||||
sph_gost512_close(&ctx_gost, (void*)hash); |
||||
} else { |
||||
sph_echo512_init(&ctx_echo); |
||||
sph_echo512(&ctx_echo, (const void*)hash, 64); |
||||
sph_echo512_close(&ctx_echo, (void*)hash); |
||||
|
||||
sph_echo512_init(&ctx_echo); |
||||
sph_echo512(&ctx_echo, (const void*)hash, 64); |
||||
sph_echo512_close(&ctx_echo, (void*)hash); |
||||
} |
||||
|
||||
sph_skein512_init(&ctx_skein); |
||||
sph_skein512(&ctx_skein, (const void*)hash, 64); |
||||
sph_skein512_close(&ctx_skein, (void*)hash); |
||||
|
||||
for (int i=0; i<32; i++) |
||||
hash[i] ^= hash[i+32]; |
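	// fold the 64-byte skein result down to 32 bytes; the GPU pipeline does the
	// same in phi_final_compress_cuda()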
||||
|
||||
memcpy(output, hash, 32); |
||||
} |
||||
|
||||
//#define _DEBUG |
||||
#define _DEBUG_PREFIX "phi-" |
||||
#include "cuda_debug.cuh" |
||||
|
||||
static bool init[MAX_GPUS] = { 0 }; |
||||
static bool use_compat_kernels[MAX_GPUS] = { 0 }; |
||||
static __thread bool gtx750ti = false; |
||||
|
||||
extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) |
||||
{ |
||||
uint32_t *pdata = work->data; |
||||
uint32_t *ptarget = work->target; |
||||
|
||||
const uint32_t first_nonce = pdata[19]; |
||||
const int dev_id = device_map[thr_id]; |
||||
|
||||
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; |
||||
if (device_sm[dev_id] == 500) intensity = 15; |
||||
if (device_sm[dev_id] == 600) intensity = 17; |
||||
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); |
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); |
||||
	if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // round down to a multiple of 128 (min 128) for the shared-mem kernels
||||
|
||||
if (opt_benchmark) |
||||
ptarget[7] = 0xff; |
||||
|
||||
if (!init[thr_id]) |
||||
{ |
||||
cudaSetDevice(dev_id); |
||||
if (opt_cudaschedule == -1 && gpu_threads == 1) { |
||||
cudaDeviceReset(); |
||||
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); |
||||
} |
||||
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); |
||||
|
||||
cuda_get_arch(thr_id); |
||||
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); |
||||
gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL); |
||||
|
||||
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4; |
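		// on sm > 5.0 the lyra2 kernels appear to keep the working matrix mostly in
		// shared memory (only 16 u64 of global scratch per thread); older devices
		// allocate the full 8*8*3*4 u64 matrix per thread in global memory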
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1); |
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1); |
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1); |
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1); |
||||
if (use_compat_kernels[thr_id]) { |
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1); |
||||
} |
||||
|
||||
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); |
||||
quark_jh512_cpu_init(thr_id, throughput); |
||||
quark_skein512_cpu_init(thr_id, throughput); |
||||
if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput); |
||||
|
||||
cuda_check_cpu_init(thr_id, throughput); |
||||
init[thr_id] = true; |
||||
} |
||||
|
||||
has_roots = false; |
||||
uint32_t endiandata[36]; |
||||
for (int k = 0; k < 36; k++) { |
||||
be32enc(&endiandata[k], pdata[k]); |
||||
if (k >= 20 && pdata[k]) has_roots = true; |
||||
} |
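	// words 20..35 hold the phi2 smart-contract roots; any non-zero word selects
	// the 144-byte header path below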
||||
|
||||
cuda_check_cpu_setTarget(ptarget); |
||||
if (has_roots) |
||||
cubehash512_setBlock_144(thr_id, endiandata); |
||||
else |
||||
cubehash512_setBlock_80(thr_id, endiandata); |
||||
|
||||
do { |
||||
int order = 0; |
||||
if (has_roots) |
||||
cubehash512_cuda_hash_144(thr_id, throughput, pdata[19], d_hash_512[thr_id]); |
||||
else |
||||
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); |
||||
order++; |
||||
TRACE("cube "); |
||||
|
||||
lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti); |
||||
order++; |
||||
TRACE("lyra "); |
||||
|
||||
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); |
||||
TRACE("jh "); |
||||
|
||||
order++; |
||||
if (!use_compat_kernels[thr_id]) { |
||||
phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]); |
||||
phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); |
||||
phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); |
||||
phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); |
||||
} else { |
||||
			// todo: use the nonces vector to reduce the number of hashes to compute
||||
phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); |
||||
streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]); |
||||
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); |
||||
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); |
||||
phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); |
||||
} |
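		// sm_50+ path above: d_nonce_br marks which branch (gost or echo) each
		// nonce takes and the filtered kernels skip the other one; the compat path
		// appears to run gost on every hash and echo on a separate copy, with
		// phi_merge_cuda() keeping the right result per the filter mask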
||||
TRACE("mix "); |
||||
|
||||
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); |
||||
TRACE("skein "); |
||||
|
||||
phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]); |
||||
TRACE("xor "); |
||||
|
||||
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]); |
||||
if (work->nonces[0] != UINT32_MAX) |
||||
{ |
||||
const uint32_t Htarg = ptarget[7]; |
||||
uint32_t _ALIGN(64) vhash[8]; |
||||
be32enc(&endiandata[19], work->nonces[0]); |
||||
phi2_hash(vhash, endiandata); |
||||
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { |
||||
work->valid_nonces = 1; |
||||
work_set_target_ratio(work, vhash); |
||||
*hashes_done = pdata[19] - first_nonce + throughput; |
||||
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1); |
||||
if (work->nonces[1] != 0) { |
||||
be32enc(&endiandata[19], work->nonces[1]); |
||||
phi2_hash(vhash, endiandata); |
||||
bn_set_target_ratio(work, vhash, 1); |
||||
work->valid_nonces++; |
||||
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; |
||||
} else { |
||||
pdata[19] = work->nonces[0] + 1; // cursor |
||||
} |
||||
if (pdata[19] > max_nonce) pdata[19] = max_nonce; |
||||
return work->valid_nonces; |
||||
} |
||||
else if (vhash[7] > Htarg) { |
||||
gpu_increment_reject(thr_id); |
||||
if (!opt_quiet) |
||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput); |
||||
pdata[19] = work->nonces[0] + 1; |
||||
continue; |
||||
} |
||||
} |
||||
|
||||
if ((uint64_t)throughput + pdata[19] >= max_nonce) { |
||||
pdata[19] = max_nonce; |
||||
break; |
||||
} |
||||
pdata[19] += throughput; |
||||
|
||||
} while (!work_restart[thr_id].restart); |
||||
|
||||
*hashes_done = pdata[19] - first_nonce; |
||||
return 0; |
||||
} |
||||
|
||||
// cleanup |
||||
extern "C" void free_phi2(int thr_id) |
||||
{ |
||||
if (!init[thr_id]) |
||||
return; |
||||
|
||||
cudaThreadSynchronize(); |
||||
cudaFree(d_matrix[thr_id]); |
||||
cudaFree(d_hash_512[thr_id]); |
||||
cudaFree(d_hash_256[thr_id]); |
||||
cudaFree(d_nonce_br[thr_id]); |
||||
if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]); |
||||
|
||||
cuda_check_cpu_free(thr_id); |
||||
init[thr_id] = false; |
||||
|
||||
cudaDeviceSynchronize(); |
||||
} |
@@ -0,0 +1,507 @@
|
||||
/* |
||||
* sha256(-q) CUDA implementation. |
||||
* pyritepirate 2018 |
||||
* tpruvot 2017 |
||||
*/ |
||||
|
||||
#include <stdio.h> |
||||
#include <stdint.h> |
||||
#include <memory.h> |
||||
|
||||
#include <cuda_helper.h> |
||||
#include <miner.h> |
||||
|
||||
__constant__ static uint32_t __align__(8) c_midstate76[8]; |
||||
__constant__ static uint32_t __align__(8) c_dataEnd80[4]; |
||||
|
||||
const __constant__ uint32_t __align__(8) c_H256[8] = { |
||||
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, |
||||
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U |
||||
}; |
||||
__constant__ static uint32_t __align__(8) c_K[64]; |
||||
__constant__ static uint32_t __align__(8) c_target[2]; |
||||
__device__ uint64_t d_target[1]; |
||||
|
||||
static uint32_t* d_resNonces[MAX_GPUS] = { 0 }; |
||||
|
||||
// ------------------------------------------------------------------------------------------------ |
||||
|
||||
static const uint32_t cpu_H256[8] = { |
||||
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, |
||||
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U |
||||
}; |
||||
|
||||
static const uint32_t cpu_K[64] = { |
||||
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, |
||||
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, |
||||
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, |
||||
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, |
||||
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, |
||||
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, |
||||
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, |
||||
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 |
||||
}; |
||||
|
||||
#define ROTR ROTR32 |
||||
|
||||
__host__ |
||||
static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, |
||||
uint32_t e, uint32_t f, uint32_t g, uint32_t &h, |
||||
uint32_t in, const uint32_t Kshared) |
||||
{ |
||||
uint32_t t1,t2; |
||||
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); |
||||
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); |
||||
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); |
||||
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); |
||||
|
||||
t1 = h + bsg21 + vxandx + Kshared + in; |
||||
t2 = bsg20 + andorv; |
||||
d = d + t1; |
||||
h = t1 + t2; |
||||
} |
||||
|
||||
__host__ |
||||
static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, |
||||
uint32_t e, uint32_t f, uint32_t g, uint32_t &h, |
||||
uint32_t* in, uint32_t pc, const uint32_t Kshared) |
||||
{ |
||||
uint32_t t1,t2; |
||||
|
||||
int pcidx1 = (pc-2) & 0xF; |
||||
int pcidx2 = (pc-7) & 0xF; |
||||
int pcidx3 = (pc-15) & 0xF; |
||||
|
||||
uint32_t inx0 = in[pc]; |
||||
uint32_t inx1 = in[pcidx1]; |
||||
uint32_t inx2 = in[pcidx2]; |
||||
uint32_t inx3 = in[pcidx3]; |
||||
|
||||
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); |
||||
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); |
||||
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); |
||||
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); |
||||
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); |
||||
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); |
||||
|
||||
in[pc] = ssg21 + inx2 + ssg20 + inx0; |
||||
|
||||
t1 = h + bsg21 + vxandx + Kshared + in[pc]; |
||||
t2 = bsg20 + andorv; |
||||
d = d + t1; |
||||
h = t1 + t2; |
||||
} |
||||
|
||||
__host__ |
||||
static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) |
||||
{ |
||||
uint32_t a = state[0]; |
||||
uint32_t b = state[1]; |
||||
uint32_t c = state[2]; |
||||
uint32_t d = state[3]; |
||||
uint32_t e = state[4]; |
||||
uint32_t f = state[5]; |
||||
uint32_t g = state[6]; |
||||
uint32_t h = state[7]; |
||||
|
||||
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); |
||||
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); |
||||
sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); |
||||
sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); |
||||
sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); |
||||
sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); |
||||
sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); |
||||
sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); |
||||
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); |
||||
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); |
||||
sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]); |
||||
sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]); |
||||
sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]); |
||||
sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]); |
||||
sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]); |
||||
sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]); |
||||
|
||||
for (int i=0; i<3; i++) |
||||
{ |
||||
sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); |
||||
sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); |
||||
sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); |
||||
sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); |
||||
sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); |
||||
sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); |
||||
sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); |
||||
sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); |
||||
sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); |
||||
sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); |
||||
sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); |
||||
sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); |
||||
sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); |
||||
sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); |
||||
sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); |
||||
sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); |
||||
} |
||||
|
||||
state[0] += a; |
||||
state[1] += b; |
||||
state[2] += c; |
||||
state[3] += d; |
||||
state[4] += e; |
||||
state[5] += f; |
||||
state[6] += g; |
||||
state[7] += h; |
||||
} |
||||
|
||||
#define xor3b(a,b,c) (a ^ b ^ c) |
||||
|
||||
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) |
||||
{ |
||||
return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22)); |
||||
} |
||||
|
||||
__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) |
||||
{ |
||||
return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25)); |
||||
} |
||||
|
||||
__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x) |
||||
{ |
||||
return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3)); |
||||
} |
||||
|
||||
__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) |
||||
{ |
||||
return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10)); |
||||
} |
||||
|
||||
__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c) |
||||
{ |
||||
uint32_t result; |
||||
asm("{\n\t" |
||||
".reg .u32 m,n,o;\n\t" |
||||
"and.b32 m, %1, %2;\n\t" |
||||
" or.b32 n, %1, %2;\n\t" |
||||
"and.b32 o, n, %3;\n\t" |
||||
" or.b32 %0, m, o ;\n\t" |
||||
"}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c) |
||||
); |
||||
return result; |
||||
} |
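// majority function MAJ(a,b,c) = (b & c) | ((b | c) & a), hand-written in PTX,
// presumably to steer instruction selection on older compilers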
||||
|
||||
__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { |
||||
uint2 result; |
||||
asm("mov.b64 {%0,%1},%2; \n\t" |
||||
: "=r"(result.y), "=r"(result.x) : "l"(v)); |
||||
return result; |
||||
} |
||||
|
||||
__device__ |
||||
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, |
||||
uint32_t in, const uint32_t Kshared) |
||||
{ |
||||
uint32_t t1,t2; |
||||
uint32_t vxandx = xandx(e, f, g); |
||||
uint32_t bsg21 = bsg2_1(e); |
||||
uint32_t bsg20 = bsg2_0(a); |
||||
uint32_t andorv = andor32(a,b,c); |
||||
|
||||
t1 = h + bsg21 + vxandx + Kshared + in; |
||||
t2 = bsg20 + andorv; |
||||
d = d + t1; |
||||
h = t1 + t2; |
||||
} |
||||
|
||||
__device__ |
||||
static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, |
||||
uint32_t* in, uint32_t pc, const uint32_t Kshared) |
||||
{ |
||||
uint32_t t1,t2; |
||||
|
||||
int pcidx1 = (pc-2) & 0xF; |
||||
int pcidx2 = (pc-7) & 0xF; |
||||
int pcidx3 = (pc-15) & 0xF; |
||||
|
||||
uint32_t inx0 = in[pc]; |
||||
uint32_t inx1 = in[pcidx1]; |
||||
uint32_t inx2 = in[pcidx2]; |
||||
uint32_t inx3 = in[pcidx3]; |
||||
|
||||
uint32_t ssg21 = ssg2_1(inx1); |
||||
uint32_t ssg20 = ssg2_0(inx3); |
||||
uint32_t vxandx = xandx(e, f, g); |
||||
uint32_t bsg21 = bsg2_1(e); |
||||
uint32_t bsg20 = bsg2_0(a); |
||||
uint32_t andorv = andor32(a,b,c); |
||||
|
||||
in[pc] = ssg21 + inx2 + ssg20 + inx0; |
||||
|
||||
t1 = h + bsg21 + vxandx + Kshared + in[pc]; |
||||
t2 = bsg20 + andorv; |
||||
d = d + t1; |
||||
h = t1 + t2; |
||||
} |
||||
|
||||
__device__ |
||||
static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared) |
||||
{ |
||||
uint32_t a = state[0]; |
||||
uint32_t b = state[1]; |
||||
uint32_t c = state[2]; |
||||
uint32_t d = state[3]; |
||||
uint32_t e = state[4]; |
||||
uint32_t f = state[5]; |
||||
uint32_t g = state[6]; |
||||
uint32_t h = state[7]; |
||||
|
||||
sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); |
||||
sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); |
||||
sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); |
||||
sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); |
||||
sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); |
||||
sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); |
||||
sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); |
||||
sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); |
||||
sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); |
||||
sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); |
||||
sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]); |
||||
sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]); |
||||
sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]); |
||||
sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]); |
||||
sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]); |
||||
sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]); |
||||
|
||||
#pragma unroll |
||||
for (int i=0; i<3; i++) |
||||
{ |
||||
sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); |
||||
sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); |
||||
sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); |
||||
sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); |
||||
sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); |
||||
sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); |
||||
sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); |
||||
sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); |
||||
sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); |
||||
sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); |
||||
sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); |
||||
sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); |
||||
sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); |
||||
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); |
||||
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); |
||||
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); |
||||
} |
||||
|
||||
state[0] += a; |
||||
state[1] += b; |
||||
state[2] += c; |
||||
state[3] += d; |
||||
state[4] += e; |
||||
state[5] += f; |
||||
state[6] += g; |
||||
state[7] += h; |
||||
} |
||||
|
||||
__device__ |
||||
static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared) |
||||
{ |
||||
uint32_t a = state[0]; |
||||
uint32_t b = state[1]; |
||||
uint32_t c = state[2]; |
||||
uint32_t d = state[3]; |
||||
uint32_t e = state[4]; |
||||
uint32_t f = state[5]; |
||||
uint32_t g = state[6]; |
||||
uint32_t h = state[7]; |
||||
|
||||
sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]); |
||||
sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]); |
||||
sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]); |
||||
sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]); |
||||
sha2_step1(e,f,g,h, a,b,c,d, in[ 4], Kshared[ 4]); |
||||
sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]); |
||||
sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]); |
||||
sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]); |
||||
sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]); |
||||
sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]); |
||||
sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]); |
||||
sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]); |
||||
sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]); |
||||
sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]); |
||||
sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]); |
||||
sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]); |
||||
|
||||
#pragma unroll |
||||
for (int i=0; i<2; i++) |
||||
{ |
||||
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]); |
||||
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]); |
||||
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]); |
||||
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]); |
||||
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]); |
||||
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]); |
||||
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]); |
||||
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]); |
||||
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]); |
||||
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]); |
||||
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]); |
||||
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]); |
||||
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]); |
||||
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]); |
||||
sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]); |
||||
sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]); |
||||
} |
||||
|
||||
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]); |
||||
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]); |
||||
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]); |
||||
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]); |
||||
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]); |
||||
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]); |
||||
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]); |
||||
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]); |
||||
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]); |
||||
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]); |
||||
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]); |
||||
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]); |
||||
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]); |
||||
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]); |
||||
|
||||
state[6] += g; |
||||
state[7] += h; |
||||
} |
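// the last round is truncated: only 62 of the 64 steps run and only g/h are
// accumulated, since the target pre-check only reads state[6..7]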
||||
|
||||
__device__ __forceinline__ |
||||
uint64_t cuda_swab32ll(uint64_t x) { |
||||
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); |
||||
} |
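// swaps the byte order of each 32-bit half independently; used below to turn
// buf[6..7] into a comparable value for the target pre-check (final validation
// happens on the CPU anyway)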
||||
|
||||
__global__ |
||||
/*__launch_bounds__(256,3)*/ |
||||
void sha256q_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces) |
||||
{ |
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||
|
||||
__shared__ uint32_t s_K[64*4]; |
||||
//s_K[thread & 63] = c_K[thread & 63]; |
||||
if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x]; |
||||
|
||||
if (thread < threads) |
||||
{ |
||||
const uint32_t nonce = startNonce + thread; |
||||
|
||||
uint32_t dat[16]; |
||||
AS_UINT2(dat) = AS_UINT2(c_dataEnd80); |
||||
dat[ 2] = c_dataEnd80[2]; |
||||
dat[ 3] = nonce; |
||||
dat[ 4] = 0x80000000; |
||||
dat[15] = 0x280; |
||||
#pragma unroll |
||||
for (int i=5; i<15; i++) dat[i] = 0; |
||||
|
||||
uint32_t buf[8]; |
||||
#pragma unroll |
||||
for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]); |
||||
//for (int i=0; i<8; i++) buf[i] = c_midstate76[i]; |
||||
|
||||
sha256_round_body(dat, buf, s_K); |
||||
|
||||
// second sha256 |
||||
|
||||
#pragma unroll |
||||
for (int i=0; i<8; i++) dat[i] = buf[i]; |
||||
dat[8] = 0x80000000; |
||||
#pragma unroll |
||||
for (int i=9; i<15; i++) dat[i] = 0; |
||||
dat[15] = 0x100; |
||||
|
||||
#pragma unroll |
||||
for (int i=0; i<8; i++) buf[i] = c_H256[i]; |
||||
|
||||
sha256_round_body(dat, buf, s_K); |
||||
|
||||
// third sha256 |
||||
|
||||
#pragma unroll |
||||
for (int i=0; i<8; i++) dat[i] = buf[i]; |
||||
dat[8] = 0x80000000; |
||||
#pragma unroll |
||||
for (int i=9; i<15; i++) dat[i] = 0; |
||||
dat[15] = 0x100; |
||||
|
||||
#pragma unroll |
||||
for (int i=0; i<8; i++) buf[i] = c_H256[i]; |
||||
|
||||
sha256_round_body(dat, buf, s_K); |
||||
|
||||
// last sha256 |
||||
|
||||
#pragma unroll |
||||
for (int i=0; i<8; i++) dat[i] = buf[i]; |
||||
dat[8] = 0x80000000; |
||||
#pragma unroll |
||||
for (int i=9; i<15; i++) dat[i] = 0; |
||||
dat[15] = 0x100; |
||||
|
||||
#pragma unroll |
||||
for (int i=0; i<8; i++) buf[i] = c_H256[i]; |
||||
|
||||
sha256_round_last(dat, buf, s_K); |
||||
|
||||
|
||||
// valid nonces |
||||
uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]); |
||||
if (high <= c_target[0]) { |
||||
//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]); |
||||
resNonces[1] = atomicExch(resNonces, nonce); |
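			// atomicExch publishes this nonce and keeps the previously stored one
			// (or the 0xFF fill) as a second candidate; the host treats an equal
			// pair as "no second nonce"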
||||
//d_target[0] = high; |
||||
} |
||||
} |
||||
} |
||||
|
||||
__host__ |
||||
void sha256q_init(int thr_id) |
||||
{ |
||||
cuda_get_arch(thr_id); |
||||
cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice); |
||||
CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t))); |
||||
} |
||||
|
||||
__host__ |
||||
void sha256q_free(int thr_id) |
||||
{ |
||||
if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]); |
||||
d_resNonces[thr_id] = NULL; |
||||
} |
||||
|
||||
__host__ |
||||
void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget) |
||||
{ |
||||
uint32_t _ALIGN(64) in[16], buf[8], end[4]; |
||||
for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]); |
||||
for (int i=0;i<8;i++) buf[i] = cpu_H256[i]; |
||||
for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]); |
||||
sha256_round_body_host(in, buf, cpu_K); |
||||
|
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice)); |
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice)); |
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); |
||||
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); |
||||
} |
||||
|
||||
__host__ |
||||
void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces) |
||||
{ |
||||
const uint32_t threadsperblock = 128; |
||||
|
||||
dim3 grid(threads/threadsperblock); |
||||
dim3 block(threadsperblock); |
||||
|
||||
CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t))); |
||||
cudaThreadSynchronize(); |
||||
sha256q_gpu_hash_shared <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]); |
||||
cudaThreadSynchronize(); |
||||
|
||||
CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); |
||||
if (resNonces[0] == resNonces[1]) { |
||||
resNonces[1] = UINT32_MAX; |
||||
} |
||||
} |
@@ -0,0 +1,136 @@
|
||||
/** |
||||
* SHA256 4x |
||||
* by pyritepirate - 2018 |
||||
* by tpruvot@github - 2017 |
||||
*/ |
||||
|
||||
#include <miner.h> |
||||
#include <cuda_helper.h> |
||||
#include <openssl/sha.h> |
||||
|
||||
// CPU Check |
||||
extern "C" void sha256q_hash(void *output, const void *input) |
||||
{ |
||||
unsigned char _ALIGN(64) hash[64]; |
||||
SHA256_CTX sha256; |
||||
|
||||
SHA256_Init(&sha256); |
||||
SHA256_Update(&sha256, (unsigned char *)input, 80); |
||||
SHA256_Final(hash, &sha256); |
||||
|
||||
SHA256_Init(&sha256); |
||||
SHA256_Update(&sha256, hash, 32); |
||||
SHA256_Final(hash, &sha256); |
||||
|
||||
SHA256_Init(&sha256); |
||||
SHA256_Update(&sha256, hash, 32); |
||||
SHA256_Final(hash, &sha256); |
||||
|
||||
SHA256_Init(&sha256); |
||||
SHA256_Update(&sha256, hash, 32); |
||||
SHA256_Final((unsigned char *)output, &sha256); |
||||
} |
||||
|
||||
static bool init[MAX_GPUS] = { 0 }; |
||||
extern void sha256q_init(int thr_id); |
||||
extern void sha256q_free(int thr_id); |
||||
extern void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget); |
||||
extern void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces); |
||||
|
||||
extern "C" int scanhash_sha256q(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) |
||||
{ |
||||
uint32_t _ALIGN(64) endiandata[20]; |
||||
uint32_t *pdata = work->data; |
||||
uint32_t *ptarget = work->target; |
||||
const uint32_t first_nonce = pdata[19]; |
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 23); |
||||
if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce)); |
||||
|
||||
if (opt_benchmark) |
||||
((uint32_t*)ptarget)[7] = 0x03; |
||||
|
||||
if (!init[thr_id]) |
||||
{ |
||||
cudaSetDevice(device_map[thr_id]); |
||||
if (opt_cudaschedule == -1 && gpu_threads == 1) { |
||||
cudaDeviceReset(); |
||||
// reduce cpu usage |
||||
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); |
||||
CUDA_LOG_ERROR(); |
||||
} |
||||
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); |
||||
|
||||
sha256q_init(thr_id); |
||||
|
||||
init[thr_id] = true; |
||||
} |
||||
|
||||
for (int k=0; k < 19; k++) |
||||
be32enc(&endiandata[k], pdata[k]); |
||||
|
||||
sha256q_setBlock_80(endiandata, ptarget); |
||||
|
||||
do { |
||||
// Hash with CUDA |
||||
*hashes_done = pdata[19] - first_nonce + throughput; |
||||
|
||||
sha256q_hash_80(thr_id, throughput, pdata[19], work->nonces); |
||||
if (work->nonces[0] != UINT32_MAX) |
||||
{ |
||||
uint32_t _ALIGN(64) vhash[8]; |
||||
|
||||
endiandata[19] = swab32(work->nonces[0]); |
||||
sha256q_hash(vhash, endiandata); |
||||
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { |
||||
work->valid_nonces = 1; |
||||
work_set_target_ratio(work, vhash); |
||||
if (work->nonces[1] != UINT32_MAX) { |
||||
endiandata[19] = swab32(work->nonces[1]); |
||||
sha256q_hash(vhash, endiandata); |
||||
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { |
||||
work->valid_nonces++; |
||||
bn_set_target_ratio(work, vhash, 1); |
||||
} |
||||
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; |
||||
} else { |
||||
pdata[19] = work->nonces[0] + 1; |
||||
} |
||||
return work->valid_nonces; |
||||
} |
||||
else if (vhash[7] > ptarget[7]) { |
||||
gpu_increment_reject(thr_id); |
||||
if (!opt_quiet) |
||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); |
||||
pdata[19] = work->nonces[0] + 1; |
||||
continue; |
||||
} |
||||
} |
||||
|
||||
if ((uint64_t) throughput + pdata[19] >= max_nonce) { |
||||
pdata[19] = max_nonce; |
||||
break; |
||||
} |
||||
|
||||
pdata[19] += throughput; |
||||
|
||||
} while (!work_restart[thr_id].restart); |
||||
|
||||
*hashes_done = pdata[19] - first_nonce; |
||||
|
||||
return 0; |
||||
} |
||||
|
||||
// cleanup |
||||
extern "C" void free_sha256q(int thr_id) |
||||
{ |
||||
if (!init[thr_id]) |
||||
return; |
||||
|
||||
cudaThreadSynchronize(); |
||||
|
||||
sha256q_free(thr_id); |
||||
|
||||
init[thr_id] = false; |
||||
|
||||
cudaDeviceSynchronize(); |
||||
} |
@@ -0,0 +1,497 @@
|
||||
/** |
||||
* Timetravel (exosis) CUDA implementation |
||||
* by tpruvot@github, exosis |
||||
*/ |
||||
|
||||
#include <stdio.h> |
||||
#include <memory.h> |
||||
#include <unistd.h> |
||||
|
||||
#define HASH_FUNC_BASE_TIMESTAMP 1538556426U |
||||
#define HASH_FUNC_COUNT 8 |
||||
#define HASH_FUNC_COUNT_PERMUTATIONS 40320U |
||||
|
||||
extern "C" { |
||||
#include "sph/sph_blake.h" |
||||
#include "sph/sph_bmw.h" |
||||
#include "sph/sph_groestl.h" |
||||
#include "sph/sph_skein.h" |
||||
#include "sph/sph_jh.h" |
||||
#include "sph/sph_keccak.h" |
||||
#include "sph/sph_luffa.h" |
||||
#include "sph/sph_cubehash.h" |
||||
} |
||||
|
||||
#include "miner.h" |
||||
#include "cuda_helper.h" |
||||
#include "cuda_x11.h" |
||||
|
||||
static uint32_t *d_hash[MAX_GPUS]; |
||||
|
||||
enum Algo { |
||||
BLAKE = 0, |
||||
BMW, |
||||
GROESTL, |
||||
SKEIN, |
||||
JH, |
||||
KECCAK, |
||||
LUFFA, |
||||
CUBEHASH, |
||||
MAX_ALGOS_COUNT |
||||
}; |
||||
|
||||
static const char* algo_strings[] = { |
||||
"blake", |
||||
"bmw512", |
||||
"groestl", |
||||
"skein", |
||||
"jh512", |
||||
"keccak", |
||||
"luffa", |
||||
"cube", |
||||
NULL |
||||
}; |
||||
|
||||
inline void swap8(uint8_t *a, uint8_t *b) |
||||
{ |
||||
uint8_t t = *a; |
||||
*a = *b; |
||||
*b = t; |
||||
} |
||||
|
||||
inline void initPerm(uint8_t n[], int count) |
||||
{ |
||||
for (int i = 0; i < count; i++) |
||||
n[i] = i; |
||||
} |
||||
|
||||
static int nextPerm(uint8_t n[], int count) |
||||
{ |
||||
int tail, i, j; |
||||
|
||||
if (count <= 1) |
||||
return 0; |
||||
|
||||
for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--); |
||||
tail = i; |
||||
|
||||
if (tail > 0) { |
||||
for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--); |
||||
swap8(&n[tail - 1], &n[j]); |
||||
} |
||||
|
||||
for (i = tail, j = count - 1; i<j; i++, j--) |
||||
swap8(&n[i], &n[j]); |
||||
|
||||
return (tail != 0); |
||||
} |
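// standard next-lexicographic-permutation step; returns 0 once the sequence has
// wrapped back to ascending order. E.g. {0,1,2,3,4,5,6,7} advances to
// {0,1,2,3,4,5,7,6}.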
||||
|
||||
static void getAlgoString(char *str, int seq) |
||||
{ |
||||
uint8_t algoList[HASH_FUNC_COUNT]; |
||||
char *sptr; |
||||
|
||||
initPerm(algoList, HASH_FUNC_COUNT); |
||||
|
||||
for (int k = 0; k < seq; k++) { |
||||
nextPerm(algoList, HASH_FUNC_COUNT); |
||||
} |
||||
|
||||
sptr = str; |
||||
for (int j = 0; j < HASH_FUNC_COUNT; j++) { |
||||
if (algoList[j] >= 10) |
||||
sprintf(sptr, "%c", 'A' + (algoList[j] - 10)); |
||||
else |
||||
sprintf(sptr, "%u", (uint32_t) algoList[j]); |
||||
sptr++; |
||||
} |
||||
*sptr = '\0'; |
||||
} |
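// worked example: seq 0 yields "01234567" (blake..cube in enum order), seq 1
// yields "01234576"; digits index the Algo enum, letters would only appear with
// more than 10 hash functions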
||||
|
||||
static __thread uint32_t s_ntime = 0; |
||||
static uint32_t s_sequence = UINT32_MAX; |
||||
static uint8_t s_firstalgo = 0xFF; |
||||
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; |
||||
|
||||
#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP |
||||
static inline uint32_t getCurrentAlgoSeq(uint32_t ntime) |
||||
{ |
||||
// unlike x11evo, the permutation changes often (with ntime) |
||||
return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS; |
||||
} |
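// e.g. ntime = HASH_FUNC_BASE_TIMESTAMP + 40321 gives 40321 % 40320 = seq 1;
// the full 8! = 40320 permutation cycle repeats roughly every 11.2 hours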
||||
|
||||
// To finish... |
||||
static void get_travel_order(uint32_t ntime, char *permstr) |
||||
{ |
||||
uint32_t seq = getCurrentAlgoSeq(ntime); |
||||
if (s_sequence != seq) { |
||||
getAlgoString(permstr, seq); |
||||
s_sequence = seq; |
||||
} |
||||
} |
||||
|
||||
// CPU Hash |
||||
extern "C" void exosis_hash(void *output, const void *input) |
||||
{ |
||||
uint32_t _ALIGN(64) hash[64/4] = { 0 }; |
||||
|
||||
sph_blake512_context ctx_blake; |
||||
sph_bmw512_context ctx_bmw; |
||||
sph_groestl512_context ctx_groestl; |
||||
sph_skein512_context ctx_skein; |
||||
sph_jh512_context ctx_jh; |
||||
sph_keccak512_context ctx_keccak; |
||||
sph_luffa512_context ctx_luffa1; |
||||
sph_cubehash512_context ctx_cubehash1; |
||||
|
||||
if (s_sequence == UINT32_MAX) { |
||||
uint32_t *data = (uint32_t*) input; |
||||
const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17]; |
||||
get_travel_order(ntime, hashOrder); |
||||
} |
||||
|
||||
void *in = (void*) input; |
||||
int size = 80; |
||||
|
||||
const int hashes = (int) strlen(hashOrder); |
||||
|
||||
for (int i = 0; i < hashes; i++) |
||||
{ |
||||
const char elem = hashOrder[i]; |
||||
uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; |
||||
|
||||
switch (algo) { |
||||
case BLAKE: |
||||
sph_blake512_init(&ctx_blake); |
||||
sph_blake512(&ctx_blake, in, size); |
||||
sph_blake512_close(&ctx_blake, hash); |
||||
break; |
||||
case BMW: |
||||
sph_bmw512_init(&ctx_bmw); |
||||
sph_bmw512(&ctx_bmw, in, size); |
||||
sph_bmw512_close(&ctx_bmw, hash); |
||||
break; |
||||
case GROESTL: |
||||
sph_groestl512_init(&ctx_groestl); |
||||
sph_groestl512(&ctx_groestl, in, size); |
||||
sph_groestl512_close(&ctx_groestl, hash); |
||||
break; |
||||
case SKEIN: |
||||
sph_skein512_init(&ctx_skein); |
||||
sph_skein512(&ctx_skein, in, size); |
||||
sph_skein512_close(&ctx_skein, hash); |
||||
break; |
||||
case JH: |
||||
sph_jh512_init(&ctx_jh); |
||||
sph_jh512(&ctx_jh, in, size); |
||||
sph_jh512_close(&ctx_jh, hash); |
||||
break; |
||||
case KECCAK: |
||||
sph_keccak512_init(&ctx_keccak); |
||||
sph_keccak512(&ctx_keccak, in, size); |
||||
sph_keccak512_close(&ctx_keccak, hash); |
||||
break; |
||||
case LUFFA: |
||||
sph_luffa512_init(&ctx_luffa1); |
||||
sph_luffa512(&ctx_luffa1, in, size); |
||||
sph_luffa512_close(&ctx_luffa1, hash); |
||||
break; |
||||
case CUBEHASH: |
||||
sph_cubehash512_init(&ctx_cubehash1); |
||||
sph_cubehash512(&ctx_cubehash1, in, size); |
||||
sph_cubehash512_close(&ctx_cubehash1, hash); |
||||
break; |
||||
} |
||||
|
||||
in = (void*) hash; |
||||
size = 64; |
||||
} |
||||
|
||||
memcpy(output, hash, 32); |
||||
} |
||||
|
||||
static uint32_t get_next_time(uint32_t ntime, char* curOrder) |
||||
{ |
||||
char nextOrder[HASH_FUNC_COUNT + 1] = { 0 }; |
||||
uint32_t secs = 15; |
||||
do { |
||||
uint32_t nseq = getCurrentAlgoSeq(ntime+secs); |
||||
getAlgoString(nextOrder, nseq); |
||||
secs += 15; |
||||
} while (curOrder[0] == nextOrder[0]); |
||||
return secs; |
||||
} |
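// scans forward in 15s steps until the leading (80-byte) algo changes; only
// used by the unimplemented-kernel fallback in scanhash_exosis()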
||||
|
||||
//#define _DEBUG |
||||
#define _DEBUG_PREFIX "tt-" |
||||
#include "cuda_debug.cuh" |
||||
|
||||
void quark_bmw512_cpu_setBlock_80(void *pdata); |
||||
void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); |
||||
|
||||
void groestl512_setBlock_80(int thr_id, uint32_t *endiandata); |
||||
void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); |
||||
|
||||
void skein512_cpu_setBlock_80(void *pdata); |
||||
void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); |
||||
|
||||
void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); |
||||
void qubit_luffa512_cpu_setBlock_80(void *pdata); |
||||
void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); |
||||
|
||||
void jh512_setBlock_80(int thr_id, uint32_t *endiandata); |
||||
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); |
||||
|
||||
void keccak512_setBlock_80(int thr_id, uint32_t *endiandata); |
||||
void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); |
||||
|
||||
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); |
||||
void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); |
||||
|
||||
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); |
||||
|
||||
static bool init[MAX_GPUS] = { 0 }; |
||||
|
||||
extern "C" int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) |
||||
{ |
||||
uint32_t *pdata = work->data; |
||||
uint32_t *ptarget = work->target; |
||||
const uint32_t first_nonce = pdata[19]; |
||||
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; |
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; |
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); |
||||
|
||||
// if (opt_benchmark) pdata[17] = swab32(0x5886a4be); // TO DEBUG GROESTL 80 |
||||
|
||||
if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) { |
||||
uint32_t ntime = swab32(work->data[17]); |
||||
get_travel_order(ntime, hashOrder); |
||||
s_ntime = pdata[17]; |
||||
if (opt_debug && !thr_id) { |
||||
applog(LOG_DEBUG, "exosis hash order %s (%08x)", hashOrder, ntime); |
||||
} |
||||
} |
||||
|
||||
if (opt_benchmark) |
||||
ptarget[7] = 0x5; |
||||
|
||||
if (!init[thr_id]) |
||||
{ |
||||
cudaSetDevice(device_map[thr_id]); |
||||
if (opt_cudaschedule == -1 && gpu_threads == 1) { |
||||
cudaDeviceReset(); |
||||
// reduce cpu usage |
||||
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); |
||||
CUDA_LOG_ERROR(); |
||||
} |
||||
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); |
||||
|
||||
quark_blake512_cpu_init(thr_id, throughput); |
||||
quark_bmw512_cpu_init(thr_id, throughput); |
||||
quark_groestl512_cpu_init(thr_id, throughput); |
||||
quark_skein512_cpu_init(thr_id, throughput); |
||||
quark_keccak512_cpu_init(thr_id, throughput); |
||||
quark_jh512_cpu_init(thr_id, throughput); |
||||
qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes) |
||||
x11_luffa512_cpu_init(thr_id, throughput); |
||||
x11_cubehash512_cpu_init(thr_id, throughput); |
||||
|
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); |
||||
CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1); |
||||
|
||||
cuda_check_cpu_init(thr_id, throughput); |
||||
|
||||
init[thr_id] = true; |
||||
} |
||||
|
||||
uint32_t endiandata[20]; |
||||
for (int k=0; k < 19; k++) |
||||
be32enc(&endiandata[k], pdata[k]); |
||||
|
||||
cuda_check_cpu_setTarget(ptarget); |
||||
|
||||
const int hashes = (int) strlen(hashOrder); |
||||
const char first = hashOrder[0]; |
||||
const uint8_t algo80 = first >= 'A' ? first - 'A' + 10 : first - '0'; |
||||
if (algo80 != s_firstalgo) { |
||||
s_firstalgo = algo80; |
||||
applog(LOG_INFO, "Exosis first algo is now %s", algo_strings[algo80 % HASH_FUNC_COUNT]); |
||||
} |
||||
|
||||
switch (algo80) { |
||||
case BLAKE: |
||||
quark_blake512_cpu_setBlock_80(thr_id, endiandata); |
||||
break; |
||||
case BMW: |
||||
quark_bmw512_cpu_setBlock_80(endiandata); |
||||
break; |
||||
case GROESTL: |
||||
groestl512_setBlock_80(thr_id, endiandata); |
||||
break; |
||||
case SKEIN: |
||||
skein512_cpu_setBlock_80((void*)endiandata); |
||||
break; |
||||
case JH: |
||||
jh512_setBlock_80(thr_id, endiandata); |
||||
break; |
||||
case KECCAK: |
||||
keccak512_setBlock_80(thr_id, endiandata); |
||||
break; |
||||
case LUFFA: |
||||
qubit_luffa512_cpu_setBlock_80((void*)endiandata); |
||||
break; |
||||
case CUBEHASH: |
||||
cubehash512_setBlock_80(thr_id, endiandata); |
||||
break; |
||||
default: { |
||||
uint32_t next = get_next_time(swab32(s_ntime), hashOrder); |
||||
if (!thr_id) |
||||
applog(LOG_WARNING, "kernel %c unimplemented, next in %u mn", first, next/60); |
||||
sleep(next > 30 ? 60 : 10); |
||||
return -1; |
||||
} |
||||
} |
||||
|
||||
do { |
||||
int order = 0; |
||||
|
||||
// Hash with CUDA |
||||
|
||||
switch (algo80) { |
||||
case BLAKE: |
||||
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; |
||||
TRACE("blake80:"); |
||||
break; |
||||
case BMW: |
||||
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
TRACE("bmw80 :"); |
||||
break; |
||||
case GROESTL: |
||||
groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; |
||||
TRACE("grstl80:"); |
||||
break; |
||||
case SKEIN: |
||||
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; |
||||
TRACE("skein80:"); |
||||
break; |
||||
case JH: |
||||
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; |
||||
TRACE("jh51280:"); |
||||
break; |
||||
case KECCAK: |
||||
keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; |
||||
TRACE("kecck80:"); |
||||
break; |
||||
case LUFFA: |
||||
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
TRACE("luffa80:"); |
||||
break; |
||||
case CUBEHASH: |
||||
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; |
||||
TRACE("cube 80:"); |
||||
break; |
||||
} |

        for (int i = 1; i < hashes; i++)
        {
            const char elem = hashOrder[i];
            const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

            switch (algo64) {
            case BLAKE:
                quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
                TRACE("blake :");
                break;
            case BMW:
                quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
                TRACE("bmw :");
                break;
            case GROESTL:
                quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
                TRACE("groestl:");
                break;
            case SKEIN:
                quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
                TRACE("skein :");
                break;
            case JH:
                quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
                TRACE("jh512 :");
                break;
            case KECCAK:
                quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
                TRACE("keccak :");
                break;
            case LUFFA:
                x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
                TRACE("luffa :");
                break;
            case CUBEHASH:
                x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
                TRACE("cube :");
                break;
            }
        }

        *hashes_done = pdata[19] - first_nonce + throughput;

        work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
        if (work->nonces[0] != UINT32_MAX)
        {
            uint32_t _ALIGN(64) vhash[8];
            const uint32_t Htarg = ptarget[7];
            be32enc(&endiandata[19], work->nonces[0]);
            exosis_hash(vhash, endiandata);

            if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
                work->valid_nonces = 1;
                work_set_target_ratio(work, vhash);
                work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
                pdata[19] = work->nonces[0];
                if (work->nonces[1] != 0) {
                    be32enc(&endiandata[19], work->nonces[1]);
                    exosis_hash(vhash, endiandata);
                    if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
                        bn_set_target_ratio(work, vhash, 1);
                        work->valid_nonces++;
                    }
                    pdata[19] = max(pdata[19], work->nonces[1]) + 1;
                }
                return work->valid_nonces;
            } else if (vhash[7] > Htarg) {
                gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
                pdata[19] = work->nonces[0] + 1;
                continue;
            }
        }
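
        // every GPU candidate above is re-hashed on the CPU (exosis_hash)
        // and checked with fulltest() before being reported, so a faulty
        // kernel result is never submitted as a share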

        if ((uint64_t) throughput + pdata[19] >= max_nonce) { // 64-bit sum avoids nonce wraparound
            pdata[19] = max_nonce;
            break;
        }
        pdata[19] += throughput;

    } while (!work_restart[thr_id].restart);

    *hashes_done = pdata[19] - first_nonce;
    return 0;
}

// cleanup
extern "C" void free_exosis(int thr_id)
{
    if (!init[thr_id])
        return;

    cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated

    cudaFree(d_hash[thr_id]);

    quark_blake512_cpu_free(thr_id);
    quark_groestl512_cpu_free(thr_id);

    cuda_check_cpu_free(thr_id);
    init[thr_id] = false;

    cudaDeviceSynchronize();
}
@@ -0,0 +1,632 @@

/**
 * x97 SONOA
 **/

extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
#include "sph/sph_whirlpool.h"
#include "sph/sph_sha2.h"
#include "sph/sph_haval.h"
}
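
// the sph-sphlib headers above provide the CPU reference implementations
// used for host-side validation; the matching CUDA kernels are implemented
// elsewhere in the tree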

#include "miner.h"
#include "cuda_helper.h"
#include "x11/cuda_x11.h"

#define NBN 2

static uint32_t *d_hash[MAX_GPUS];

extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);

extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_free(int thr_id);

extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_free(int thr_id);

extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);

extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen);

// CPU Hash Validation
extern "C" void sonoa_hash(void *output, const void *input)
{
    unsigned char _ALIGN(128) hash[64];

    sph_blake512_context ctx_blake;
    sph_bmw512_context ctx_bmw;
    sph_groestl512_context ctx_groestl;
    sph_jh512_context ctx_jh;
    sph_keccak512_context ctx_keccak;
    sph_skein512_context ctx_skein;
    sph_luffa512_context ctx_luffa;
    sph_cubehash512_context ctx_cubehash;
    sph_shavite512_context ctx_shavite;
    sph_simd512_context ctx_simd;
    sph_echo512_context ctx_echo;
    sph_hamsi512_context ctx_hamsi;
    sph_fugue512_context ctx_fugue;
    sph_shabal512_context ctx_shabal;
    sph_whirlpool_context ctx_whirlpool;
    sph_sha512_context ctx_sha512;
    sph_haval256_5_context ctx_haval;

    // pass 1 (x11)
    sph_blake512_init(&ctx_blake);
    sph_blake512(&ctx_blake, input, 80);
    sph_blake512_close(&ctx_blake, (void*)hash);

    sph_bmw512_init(&ctx_bmw);
    sph_bmw512(&ctx_bmw, (const void*)hash, 64);
    sph_bmw512_close(&ctx_bmw, (void*)hash);

    sph_groestl512_init(&ctx_groestl);
    sph_groestl512(&ctx_groestl, (const void*)hash, 64);
    sph_groestl512_close(&ctx_groestl, (void*)hash);

    sph_skein512_init(&ctx_skein);
    sph_skein512(&ctx_skein, (const void*)hash, 64);
    sph_skein512_close(&ctx_skein, (void*)hash);

    sph_jh512_init(&ctx_jh);
    sph_jh512(&ctx_jh, (const void*)hash, 64);
    sph_jh512_close(&ctx_jh, (void*)hash);

    sph_keccak512_init(&ctx_keccak);
    sph_keccak512(&ctx_keccak, (const void*)hash, 64);
    sph_keccak512_close(&ctx_keccak, (void*)hash);

    sph_luffa512_init(&ctx_luffa);
    sph_luffa512(&ctx_luffa, (const void*)hash, 64);
    sph_luffa512_close(&ctx_luffa, (void*)hash);

    sph_cubehash512_init(&ctx_cubehash);
    sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
    sph_cubehash512_close(&ctx_cubehash, (void*)hash);

    sph_shavite512_init(&ctx_shavite);
    sph_shavite512(&ctx_shavite, (const void*)hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*)hash);

    sph_simd512_init(&ctx_simd);
    sph_simd512(&ctx_simd, (const void*)hash, 64);
    sph_simd512_close(&ctx_simd, (void*)hash);

    sph_echo512_init(&ctx_echo);
    sph_echo512(&ctx_echo, (const void*)hash, 64);
    sph_echo512_close(&ctx_echo, (void*)hash);
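
    // sph_*_close() re-initializes its context, so the later passes can
    // reuse the contexts above without calling sph_*_init() again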

    // pass 2 (+ hamsi)
    sph_bmw512(&ctx_bmw, (const void*)hash, 64);
    sph_bmw512_close(&ctx_bmw, (void*)hash);

    sph_groestl512(&ctx_groestl, (const void*)hash, 64);
    sph_groestl512_close(&ctx_groestl, (void*)hash);

    sph_skein512(&ctx_skein, (const void*)hash, 64);
    sph_skein512_close(&ctx_skein, (void*)hash);

    sph_jh512(&ctx_jh, (const void*)hash, 64);
    sph_jh512_close(&ctx_jh, (void*)hash);

    sph_keccak512(&ctx_keccak, (const void*)hash, 64);
    sph_keccak512_close(&ctx_keccak, (void*)hash);

    sph_luffa512(&ctx_luffa, (const void*)hash, 64);
    sph_luffa512_close(&ctx_luffa, (void*)hash);

    sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
    sph_cubehash512_close(&ctx_cubehash, (void*)hash);

    sph_shavite512(&ctx_shavite, (const void*)hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*)hash);

    sph_simd512(&ctx_simd, (const void*)hash, 64);
    sph_simd512_close(&ctx_simd, (void*)hash);

    sph_echo512(&ctx_echo, (const void*)hash, 64);
    sph_echo512_close(&ctx_echo, (void*)hash);

    sph_hamsi512_init(&ctx_hamsi);
    sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
    sph_hamsi512_close(&ctx_hamsi, (void*)hash);

    // pass 3 (+ fugue)
    sph_bmw512(&ctx_bmw, (const void*)hash, 64);
    sph_bmw512_close(&ctx_bmw, (void*)hash);

    sph_groestl512(&ctx_groestl, (const void*)hash, 64);
    sph_groestl512_close(&ctx_groestl, (void*)hash);

    sph_skein512(&ctx_skein, (const void*)hash, 64);
    sph_skein512_close(&ctx_skein, (void*)hash);

    sph_jh512(&ctx_jh, (const void*)hash, 64);
    sph_jh512_close(&ctx_jh, (void*)hash);

    sph_keccak512(&ctx_keccak, (const void*)hash, 64);
    sph_keccak512_close(&ctx_keccak, (void*)hash);

    sph_luffa512(&ctx_luffa, (const void*)hash, 64);
    sph_luffa512_close(&ctx_luffa, (void*)hash);

    sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
    sph_cubehash512_close(&ctx_cubehash, (void*)hash);

    sph_shavite512(&ctx_shavite, (const void*)hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*)hash);

    sph_simd512(&ctx_simd, (const void*)hash, 64);
    sph_simd512_close(&ctx_simd, (void*)hash);

    sph_echo512(&ctx_echo, (const void*)hash, 64);
    sph_echo512_close(&ctx_echo, (void*)hash);

    sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
    sph_hamsi512_close(&ctx_hamsi, (void*)hash);

    sph_fugue512_init(&ctx_fugue);
    sph_fugue512(&ctx_fugue, (const void*)hash, 64);
    sph_fugue512_close(&ctx_fugue, (void*)hash);

    // pass 4 (+ shabal, then hamsi/echo/shavite again)
    sph_bmw512(&ctx_bmw, (const void*)hash, 64);
    sph_bmw512_close(&ctx_bmw, (void*)hash);

    sph_groestl512(&ctx_groestl, (const void*)hash, 64);
    sph_groestl512_close(&ctx_groestl, (void*)hash);

    sph_skein512(&ctx_skein, (const void*)hash, 64);
    sph_skein512_close(&ctx_skein, (void*)hash);

    sph_jh512(&ctx_jh, (const void*)hash, 64);
    sph_jh512_close(&ctx_jh, (void*)hash);

    sph_keccak512(&ctx_keccak, (const void*)hash, 64);
    sph_keccak512_close(&ctx_keccak, (void*)hash);

    sph_luffa512(&ctx_luffa, (const void*)hash, 64);
    sph_luffa512_close(&ctx_luffa, (void*)hash);

    sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
    sph_cubehash512_close(&ctx_cubehash, (void*)hash);

    sph_shavite512(&ctx_shavite, (const void*)hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*)hash);

    sph_simd512(&ctx_simd, (const void*)hash, 64);
    sph_simd512_close(&ctx_simd, (void*)hash);

    sph_echo512(&ctx_echo, (const void*)hash, 64);
    sph_echo512_close(&ctx_echo, (void*)hash);

    sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
    sph_hamsi512_close(&ctx_hamsi, (void*)hash);

    sph_fugue512(&ctx_fugue, (const void*)hash, 64);
    sph_fugue512_close(&ctx_fugue, (void*)hash);

    sph_shabal512_init(&ctx_shabal);
    sph_shabal512(&ctx_shabal, (const void*)hash, 64);
    sph_shabal512_close(&ctx_shabal, (void*)hash);

    sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
    sph_hamsi512_close(&ctx_hamsi, (void*)hash);

    sph_echo512(&ctx_echo, (const void*)hash, 64);
    sph_echo512_close(&ctx_echo, (void*)hash);

    sph_shavite512(&ctx_shavite, (const void*)hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*)hash);

    // pass 5 (+ whirlpool; shabal follows bmw here)
    sph_bmw512(&ctx_bmw, (const void*)hash, 64);
    sph_bmw512_close(&ctx_bmw, (void*)hash);

    sph_shabal512(&ctx_shabal, (const void*)hash, 64);
    sph_shabal512_close(&ctx_shabal, (void*)hash);

    sph_groestl512(&ctx_groestl, (const void*)hash, 64);
    sph_groestl512_close(&ctx_groestl, (void*)hash);

    sph_skein512(&ctx_skein, (const void*)hash, 64);
    sph_skein512_close(&ctx_skein, (void*)hash);

    sph_jh512(&ctx_jh, (const void*)hash, 64);
    sph_jh512_close(&ctx_jh, (void*)hash);

    sph_keccak512(&ctx_keccak, (const void*)hash, 64);
    sph_keccak512_close(&ctx_keccak, (void*)hash);

    sph_luffa512(&ctx_luffa, (const void*)hash, 64);
    sph_luffa512_close(&ctx_luffa, (void*)hash);

    sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
    sph_cubehash512_close(&ctx_cubehash, (void*)hash);

    sph_shavite512(&ctx_shavite, (const void*)hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*)hash);

    sph_simd512(&ctx_simd, (const void*)hash, 64);
    sph_simd512_close(&ctx_simd, (void*)hash);

    sph_echo512(&ctx_echo, (const void*)hash, 64);
    sph_echo512_close(&ctx_echo, (void*)hash);

    sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
    sph_hamsi512_close(&ctx_hamsi, (void*)hash);

    sph_fugue512(&ctx_fugue, (const void*)hash, 64);
    sph_fugue512_close(&ctx_fugue, (void*)hash);

    sph_shabal512(&ctx_shabal, (const void*)hash, 64);
    sph_shabal512_close(&ctx_shabal, (void*)hash);

    sph_whirlpool_init(&ctx_whirlpool);
    sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
    sph_whirlpool_close(&ctx_whirlpool, (void*)hash);

    // pass 6 (+ sha512, then whirlpool again)
    sph_bmw512(&ctx_bmw, (const void*)hash, 64);
    sph_bmw512_close(&ctx_bmw, (void*)hash);

    sph_groestl512(&ctx_groestl, (const void*)hash, 64);
    sph_groestl512_close(&ctx_groestl, (void*)hash);

    sph_skein512(&ctx_skein, (const void*)hash, 64);
    sph_skein512_close(&ctx_skein, (void*)hash);

    sph_jh512(&ctx_jh, (const void*)hash, 64);
    sph_jh512_close(&ctx_jh, (void*)hash);

    sph_keccak512(&ctx_keccak, (const void*)hash, 64);
    sph_keccak512_close(&ctx_keccak, (void*)hash);

    sph_luffa512(&ctx_luffa, (const void*)hash, 64);
    sph_luffa512_close(&ctx_luffa, (void*)hash);

    sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
    sph_cubehash512_close(&ctx_cubehash, (void*)hash);

    sph_shavite512(&ctx_shavite, (const void*)hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*)hash);

    sph_simd512(&ctx_simd, (const void*)hash, 64);
    sph_simd512_close(&ctx_simd, (void*)hash);

    sph_echo512(&ctx_echo, (const void*)hash, 64);
    sph_echo512_close(&ctx_echo, (void*)hash);

    sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
    sph_hamsi512_close(&ctx_hamsi, (void*)hash);

    sph_fugue512(&ctx_fugue, (const void*)hash, 64);
    sph_fugue512_close(&ctx_fugue, (void*)hash);

    sph_shabal512(&ctx_shabal, (const void*)hash, 64);
    sph_shabal512_close(&ctx_shabal, (void*)hash);

    sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
    sph_whirlpool_close(&ctx_whirlpool, (void*)hash);

    sph_sha512_init(&ctx_sha512);
    sph_sha512(&ctx_sha512, (const void*)hash, 64);
    sph_sha512_close(&ctx_sha512, (void*)hash);

    sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
    sph_whirlpool_close(&ctx_whirlpool, (void*)hash);

    // pass 7 (+ sha512, haval256)
    sph_bmw512(&ctx_bmw, (const void*)hash, 64);
    sph_bmw512_close(&ctx_bmw, (void*)hash);

    sph_groestl512(&ctx_groestl, (const void*)hash, 64);
    sph_groestl512_close(&ctx_groestl, (void*)hash);

    sph_skein512(&ctx_skein, (const void*)hash, 64);
    sph_skein512_close(&ctx_skein, (void*)hash);

    sph_jh512(&ctx_jh, (const void*)hash, 64);
    sph_jh512_close(&ctx_jh, (void*)hash);

    sph_keccak512(&ctx_keccak, (const void*)hash, 64);
    sph_keccak512_close(&ctx_keccak, (void*)hash);

    sph_luffa512(&ctx_luffa, (const void*)hash, 64);
    sph_luffa512_close(&ctx_luffa, (void*)hash);

    sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
    sph_cubehash512_close(&ctx_cubehash, (void*)hash);

    sph_shavite512(&ctx_shavite, (const void*)hash, 64);
    sph_shavite512_close(&ctx_shavite, (void*)hash);

    sph_simd512(&ctx_simd, (const void*)hash, 64);
    sph_simd512_close(&ctx_simd, (void*)hash);

    sph_echo512(&ctx_echo, (const void*)hash, 64);
    sph_echo512_close(&ctx_echo, (void*)hash);

    sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
    sph_hamsi512_close(&ctx_hamsi, (void*)hash);

    sph_fugue512(&ctx_fugue, (const void*)hash, 64);
    sph_fugue512_close(&ctx_fugue, (void*)hash);

    sph_shabal512(&ctx_shabal, (const void*)hash, 64);
    sph_shabal512_close(&ctx_shabal, (void*)hash);

    sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
    sph_whirlpool_close(&ctx_whirlpool, (void*)hash);

    sph_sha512(&ctx_sha512, (const void*)hash, 64);
    sph_sha512_close(&ctx_sha512, (void*)hash);

    sph_haval256_5_init(&ctx_haval);
    sph_haval256_5(&ctx_haval, (const void*)hash, 64);
    sph_haval256_5_close(&ctx_haval, (void*)hash);

    memcpy(output, hash, 32);
}
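
/*
 * Quick sanity check for the CPU routine above. This is a minimal sketch
 * added for illustration only: SONOA_SELFTEST and this main() are not part
 * of the original tree. Compile this file with -DSONOA_SELFTEST and link
 * against sph-sphlib to print the sonoa hash of an all-zero 80-byte header.
 */
#ifdef SONOA_SELFTEST
#include <stdio.h>
int main(void)
{
    unsigned char header[80] = { 0 }; /* zeroed 80-byte block header */
    unsigned char out[32];
    sonoa_hash(out, header);
    for (int i = 0; i < 32; i++)
        printf("%02x", out[i]);
    printf("\n");
    return 0;
}
#endif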

#define x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash) \
    x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \
    if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \
    else x16_echo512_cpu_hash_64(thr_id, throughput, d_hash)
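
// the macro above is textual: it assumes `pdata`, `order` and
// `use_compat_kernels` are in scope at the expansion site (they are, inside
// scanhash_sonoa below), and routes echo through the legacy x11 kernel on
// pre-SM-5.0 cards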

static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };

extern "C" int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) |
||||
{ |
||||
uint32_t *pdata = work->data; |
||||
uint32_t *ptarget = work->target; |
||||
const uint32_t first_nonce = pdata[19]; |
||||
const int dev_id = device_map[thr_id]; |
||||
|
||||
uint32_t default_throughput = 1 << 18; |
||||
if (device_sm[dev_id] <= 500) default_throughput = 1 << 18; |
||||
else if (device_sm[dev_id] <= 520) default_throughput = 1 << 18; |
||||
else if (device_sm[dev_id] > 520) default_throughput = (1 << 19) + (1 << 18); |
||||
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, default_throughput); |
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); |
||||
|
||||
throughput &= 0xFFFFFF00; |

    if (opt_benchmark)
        ((uint32_t*)ptarget)[7] = 0x00ff;

    if (!init[thr_id])
    {
        cudaSetDevice(dev_id);
        if (opt_cudaschedule == -1 && gpu_threads == 1) {
            cudaDeviceReset();
            cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
        }
        gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

        cuda_get_arch(thr_id);
        use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
        if (use_compat_kernels[thr_id])
            x11_echo512_cpu_init(thr_id, throughput);

        quark_blake512_cpu_init(thr_id, throughput);
        quark_groestl512_cpu_init(thr_id, throughput);
        quark_skein512_cpu_init(thr_id, throughput);
        quark_bmw512_cpu_init(thr_id, throughput);
        quark_keccak512_cpu_init(thr_id, throughput);
        quark_jh512_cpu_init(thr_id, throughput);
        x11_luffaCubehash512_cpu_init(thr_id, throughput);
        x11_shavite512_cpu_init(thr_id, throughput);
        x11_simd512_cpu_init(thr_id, throughput);
        x13_hamsi512_cpu_init(thr_id, throughput);
        x13_fugue512_cpu_init(thr_id, throughput);
        x14_shabal512_cpu_init(thr_id, throughput);
        x15_whirlpool_cpu_init(thr_id, throughput, 0);
        x17_sha512_cpu_init(thr_id, throughput);
        x17_haval256_cpu_init(thr_id, throughput);

        CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput));

        cuda_check_cpu_init(thr_id, throughput);

        init[thr_id] = true;
    }
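
    // scan loop: d_hash holds one 512-bit state (8 x uint64_t = 64 bytes)
    // per scanned nonce, rewritten in place by every kernel of the chain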
    int warn = 0;
    uint32_t _ALIGN(64) endiandata[20];
    for (int k=0; k < 20; k++)
        be32enc(&endiandata[k], pdata[k]);

    quark_blake512_cpu_setBlock_80(thr_id, endiandata);
    cuda_check_cpu_setTarget(ptarget);
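
    // endiandata is the 80-byte header in big-endian word order; only the
    // nonce word [19] changes per candidate when re-testing on the CPU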

    do {
        int order = 0;

        // pass 1 (x11)
        quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
        quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
        x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);

        // pass 2 (+ hamsi)
        quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
        x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
        x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

        // pass 3 (+ fugue)
        quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
        x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
        x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

        // pass 4 (+ shabal, then hamsi/echo/shavite again)
        quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
        x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
        x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); // direct x16 echo, no compat fallback here
        x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

        // pass 5 (+ whirlpool; shabal follows bmw, matching sonoa_hash)
        quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
        x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
        x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

        // pass 6 (+ sha512, then whirlpool again)
        quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
        x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
        x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
        x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

        // pass 7 (+ sha512, haval256)
        quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
        x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
        x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
        x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
        x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 256); order++;

        *hashes_done = pdata[19] - first_nonce + throughput;

        work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
        if (work->nonces[0] != UINT32_MAX)
        {
            const uint32_t Htarg = ptarget[7];
            uint32_t _ALIGN(64) vhash[8];
            be32enc(&endiandata[19], work->nonces[0]);
            sonoa_hash(vhash, endiandata);

            if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
            {
                work->valid_nonces = 1;
                work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
                work_set_target_ratio(work, vhash);
                if (work->nonces[1] != 0) {
                    be32enc(&endiandata[19], work->nonces[1]);
                    sonoa_hash(vhash, endiandata);
                    bn_set_target_ratio(work, vhash, 1);
                    work->valid_nonces++;
                    pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
                } else {
                    pdata[19] = work->nonces[0] + 1; // cursor
                }
                return work->valid_nonces;
            }
            else if (vhash[7] > Htarg) {
                // GPU candidate failed the CPU re-check: count the reject,
                // retry once silently from the next nonce, then log if it
                // happens again
                gpu_increment_reject(thr_id);
                if (!warn) {
                    warn++;
                    pdata[19] = work->nonces[0] + 1;
                    continue;
                } else {
                    if (!opt_quiet)
                        gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
                    warn = 0;
                }
            }
        }
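
        // no share in this window: advance the nonce cursor, clamping at
        // max_nonce to avoid scanning past the assigned range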
        if ((uint64_t)throughput + pdata[19] >= max_nonce) {
            pdata[19] = max_nonce;
            break;
        }

        pdata[19] += throughput;

    } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

    *hashes_done = pdata[19] - first_nonce;
    return 0;
}

extern "C" void free_sonoa(int thr_id)
{
    if (!init[thr_id])
        return;

    cudaDeviceSynchronize();

    cudaFree(d_hash[thr_id]);

    quark_blake512_cpu_free(thr_id);
    quark_groestl512_cpu_free(thr_id);
    x11_simd512_cpu_free(thr_id);
    x13_fugue512_cpu_free(thr_id);
    x15_whirlpool_cpu_free(thr_id);

    cudaDeviceSynchronize();
    init[thr_id] = false;
}