mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-22 04:24:29 +00:00
veltor update, 10x faster :p
From Alexis work, sib hash rate 200% also..
This commit is contained in:
parent
3eba451d4c
commit
36aedbb48e
@ -246,16 +246,16 @@ Options:\n\
|
||||
skein Skein SHA2 (Skeincoin)\n\
|
||||
skein2 Double Skein (Woodcoin)\n\
|
||||
s3 S3 (1Coin)\n\
|
||||
vanilla Blake256-8 (VNL)\n\
|
||||
veltor Thorsriddle streebog\n\
|
||||
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
|
||||
whirlpool Whirlpool algo\n\
|
||||
x11evo Permuted x11 (Revolver)\n\
|
||||
x11 X11 (DarkCoin)\n\
|
||||
x13 X13 (MaruCoin)\n\
|
||||
x14 X14\n\
|
||||
x15 X15\n\
|
||||
x17 X17\n\
|
||||
vanilla Blake256-8 (VNL)\n\
|
||||
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
|
||||
whirlpool Whirlpool algo\n\
|
||||
zr5 ZR5 (ZiftrCoin)\n\
|
||||
-d, --devices Comma separated list of CUDA devices to use.\n\
|
||||
Device IDs start counting from 0! Alternatively takes\n\
|
||||
|
@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
|
||||
// require a uint32_t[9] ret array
|
||||
// note: djm neoscrypt implementation is near the limits of gpu capabilities
|
||||
// and weird behaviors can happen when tuning device functions code...
|
||||
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
{
|
||||
uint8_t *v = (uint8_t*) &vec4.s0;
|
||||
uint8_t *r = (uint8_t*) ret;
|
||||
@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
#else
|
||||
|
||||
// same for SM 3.5+, really faster ?
|
||||
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
{
|
||||
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
|
||||
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
|
||||
|
1976
x11/cuda_streebog.cu
1976
x11/cuda_streebog.cu
File diff suppressed because it is too large
Load Diff
19
x11/sib.cu
19
x11/sib.cu
@ -17,7 +17,7 @@ extern "C" {
|
||||
#include "cuda_helper.h"
|
||||
#include "cuda_x11.h"
|
||||
|
||||
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
@ -104,9 +104,11 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 19 : 18;
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
|
||||
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
const int dev_id = device_map[thr_id];
|
||||
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 19 : 18; // 2^18 = 262144 cuda threads
|
||||
if (device_sm[dev_id] >= 600) intensity = 20;
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
|
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0xf;
|
||||
@ -132,9 +134,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
||||
x11_shavite512_cpu_init(thr_id, throughput);
|
||||
x11_echo512_cpu_init(thr_id, throughput);
|
||||
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
|
||||
return 0;
|
||||
return -1;
|
||||
}
|
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
|
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
|
||||
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
|
||||
@ -165,7 +167,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
||||
TRACE("jh512 :");
|
||||
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
TRACE("keccak :");
|
||||
streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
||||
streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
|
||||
TRACE("gost :");
|
||||
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
|
||||
TRACE("luffa+c:");
|
||||
@ -186,7 +188,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
||||
|
||||
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
|
||||
int res = 1;
|
||||
// check if there was some other ones...
|
||||
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
|
||||
work_set_target_ratio(work, vhash64);
|
||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||
@ -200,7 +201,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
||||
}
|
||||
pdata[19] = foundNonce;
|
||||
return res;
|
||||
} else {
|
||||
} else if (vhash64[7] > Htarg && !opt_quiet) {
|
||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
||||
pdata[19] = foundNonce + 1;
|
||||
continue;
|
||||
|
@ -9,29 +9,29 @@ extern "C" {
|
||||
#include "cuda_helper.h"
|
||||
#include "cuda_x11.h"
|
||||
|
||||
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
||||
|
||||
extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
|
||||
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void skein512_cpu_setBlock_80(void *pdata);
|
||||
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
|
||||
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
|
||||
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||
extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
|
||||
extern void streebog_set_target(const uint32_t* ptarget);
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#define NBN 2
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
static uint32_t *d_resNonce[MAX_GPUS];
|
||||
|
||||
// Veltor CPU Hash
|
||||
// veltorcoin CPU Hash
|
||||
extern "C" void veltorhash(void *output, const void *input)
|
||||
{
|
||||
uint8_t _ALIGN(64) hash[128] = { 0 };
|
||||
unsigned char _ALIGN(128) hash[128] = { 0 };
|
||||
|
||||
sph_skein512_context ctx_skein;
|
||||
sph_shavite512_context ctx_shavite;
|
||||
sph_shabal512_context ctx_shabal;
|
||||
sph_gost512_context ctx_gost;
|
||||
sph_shabal512_context ctx_shabal;
|
||||
sph_shavite512_context ctx_shavite;
|
||||
|
||||
sph_skein512_init(&ctx_skein);
|
||||
sph_skein512(&ctx_skein, input, 80);
|
||||
@ -52,19 +52,18 @@ extern "C" void veltorhash(void *output, const void *input)
|
||||
memcpy(output, hash, 32);
|
||||
}
|
||||
|
||||
//#define _DEBUG
|
||||
#define _DEBUG_PREFIX "veltor"
|
||||
#include "cuda_debug.cuh"
|
||||
|
||||
static bool init[MAX_GPUS] = { 0 };
|
||||
|
||||
extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
int dev_id = device_map[thr_id];
|
||||
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 18;
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
|
||||
int intensity = (device_sm[device_map[thr_id]] > 500) ? 20 : 18;
|
||||
if (strstr(device_name[dev_id], "GTX 10")) intensity = 21;
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
|
||||
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
|
||||
if (opt_benchmark)
|
||||
@ -79,58 +78,59 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
|
||||
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
|
||||
CUDA_LOG_ERROR();
|
||||
}
|
||||
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
|
||||
|
||||
quark_skein512_cpu_init(thr_id, throughput);
|
||||
x11_shavite512_cpu_init(thr_id, throughput);
|
||||
x14_shabal512_cpu_init(thr_id, throughput);
|
||||
|
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
|
||||
|
||||
cuda_check_cpu_init(thr_id, throughput);
|
||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
|
||||
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
uint32_t endiandata[20];
|
||||
uint32_t _ALIGN(64) h_resNonce[NBN];
|
||||
uint32_t _ALIGN(64) endiandata[20];
|
||||
for (int k=0; k < 20; k++)
|
||||
be32enc(&endiandata[k], pdata[k]);
|
||||
|
||||
skein512_cpu_setBlock_80(endiandata);
|
||||
cuda_check_cpu_setTarget(ptarget);
|
||||
|
||||
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
|
||||
streebog_set_target(ptarget);
|
||||
|
||||
do {
|
||||
int order = 0;
|
||||
uint32_t foundNonce;
|
||||
|
||||
// Hash with CUDA
|
||||
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
|
||||
TRACE("blake :");
|
||||
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
TRACE("shavite:");
|
||||
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||
TRACE("shabal :");
|
||||
streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
||||
TRACE("gost :");
|
||||
streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
|
||||
|
||||
cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
|
||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||
|
||||
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
||||
if (foundNonce != UINT32_MAX)
|
||||
if (h_resNonce[0] != UINT32_MAX)
|
||||
{
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
uint32_t _ALIGN(64) vhash[8];
|
||||
be32enc(&endiandata[19], foundNonce);
|
||||
veltorhash(vhash, endiandata);
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t startNounce = pdata[19];
|
||||
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
|
||||
be32enc(&endiandata[19], startNounce + h_resNonce[0]);
|
||||
veltorhash(vhash, endiandata);
|
||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
|
||||
{
|
||||
int res = 1;
|
||||
// check if there was another one...
|
||||
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], res);
|
||||
work_set_target_ratio(work, vhash);
|
||||
if (secNonce != 0) {
|
||||
work->nonces[0] = startNounce + h_resNonce[0];
|
||||
if (h_resNonce[1] != UINT32_MAX)
|
||||
{
|
||||
uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1];
|
||||
gpulog(LOG_DEBUG, thr_id, "Found 2nd nonce: %08x", secNonce);
|
||||
be32enc(&endiandata[19], secNonce);
|
||||
veltorhash(vhash, endiandata);
|
||||
work->nonces[1] = secNonce;
|
||||
|
||||
if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
|
||||
work_set_target_ratio(work, vhash);
|
||||
xchg(work->nonces[1], work->nonces[0]);
|
||||
@ -139,24 +139,25 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
|
||||
}
|
||||
res++;
|
||||
}
|
||||
pdata[19] = work->nonces[0] = foundNonce;
|
||||
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan
|
||||
return res;
|
||||
} else {
|
||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
||||
pdata[19] = foundNonce + 1;
|
||||
continue;
|
||||
}
|
||||
else if (vhash[7] > Htarg && !opt_quiet) {
|
||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[0]);
|
||||
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
|
||||
}
|
||||
}
|
||||
|
||||
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
|
||||
pdata[19] = max_nonce;
|
||||
break;
|
||||
}
|
||||
|
||||
pdata[19] += throughput;
|
||||
|
||||
} while (!work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = pdata[19] - first_nonce;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user