mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-13 00:08:00 +00:00
veltor update, 10x faster :p
From Alexis work, sib hash rate 200% also..
This commit is contained in:
parent
3eba451d4c
commit
36aedbb48e
@ -246,16 +246,16 @@ Options:\n\
|
|||||||
skein Skein SHA2 (Skeincoin)\n\
|
skein Skein SHA2 (Skeincoin)\n\
|
||||||
skein2 Double Skein (Woodcoin)\n\
|
skein2 Double Skein (Woodcoin)\n\
|
||||||
s3 S3 (1Coin)\n\
|
s3 S3 (1Coin)\n\
|
||||||
|
vanilla Blake256-8 (VNL)\n\
|
||||||
veltor Thorsriddle streebog\n\
|
veltor Thorsriddle streebog\n\
|
||||||
|
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
|
||||||
|
whirlpool Whirlpool algo\n\
|
||||||
x11evo Permuted x11 (Revolver)\n\
|
x11evo Permuted x11 (Revolver)\n\
|
||||||
x11 X11 (DarkCoin)\n\
|
x11 X11 (DarkCoin)\n\
|
||||||
x13 X13 (MaruCoin)\n\
|
x13 X13 (MaruCoin)\n\
|
||||||
x14 X14\n\
|
x14 X14\n\
|
||||||
x15 X15\n\
|
x15 X15\n\
|
||||||
x17 X17\n\
|
x17 X17\n\
|
||||||
vanilla Blake256-8 (VNL)\n\
|
|
||||||
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
|
|
||||||
whirlpool Whirlpool algo\n\
|
|
||||||
zr5 ZR5 (ZiftrCoin)\n\
|
zr5 ZR5 (ZiftrCoin)\n\
|
||||||
-d, --devices Comma separated list of CUDA devices to use.\n\
|
-d, --devices Comma separated list of CUDA devices to use.\n\
|
||||||
Device IDs start counting from 0! Alternatively takes\n\
|
Device IDs start counting from 0! Alternatively takes\n\
|
||||||
|
@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
|
|||||||
// require a uint32_t[9] ret array
|
// require a uint32_t[9] ret array
|
||||||
// note: djm neoscrypt implementation is near the limits of gpu capabilities
|
// note: djm neoscrypt implementation is near the limits of gpu capabilities
|
||||||
// and weird behaviors can happen when tuning device functions code...
|
// and weird behaviors can happen when tuning device functions code...
|
||||||
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||||
{
|
{
|
||||||
uint8_t *v = (uint8_t*) &vec4.s0;
|
uint8_t *v = (uint8_t*) &vec4.s0;
|
||||||
uint8_t *r = (uint8_t*) ret;
|
uint8_t *r = (uint8_t*) ret;
|
||||||
@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
|||||||
#else
|
#else
|
||||||
|
|
||||||
// same for SM 3.5+, really faster ?
|
// same for SM 3.5+, really faster ?
|
||||||
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||||
{
|
{
|
||||||
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
|
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
|
||||||
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
|
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
|
||||||
|
1980
x11/cuda_streebog.cu
1980
x11/cuda_streebog.cu
File diff suppressed because it is too large
Load Diff
19
x11/sib.cu
19
x11/sib.cu
@ -17,7 +17,7 @@ extern "C" {
|
|||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
#include "cuda_x11.h"
|
#include "cuda_x11.h"
|
||||||
|
|
||||||
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
@ -104,9 +104,11 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 19 : 18;
|
const int dev_id = device_map[thr_id];
|
||||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
|
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 19 : 18; // 2^18 = 262144 cuda threads
|
||||||
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
if (device_sm[dev_id] >= 600) intensity = 20;
|
||||||
|
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
|
||||||
|
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||||
|
|
||||||
if (opt_benchmark)
|
if (opt_benchmark)
|
||||||
ptarget[7] = 0xf;
|
ptarget[7] = 0xf;
|
||||||
@ -132,9 +134,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
x11_shavite512_cpu_init(thr_id, throughput);
|
x11_shavite512_cpu_init(thr_id, throughput);
|
||||||
x11_echo512_cpu_init(thr_id, throughput);
|
x11_echo512_cpu_init(thr_id, throughput);
|
||||||
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
|
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
|
||||||
return 0;
|
return -1;
|
||||||
}
|
}
|
||||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
|
||||||
|
|
||||||
cuda_check_cpu_init(thr_id, throughput);
|
cuda_check_cpu_init(thr_id, throughput);
|
||||||
|
|
||||||
@ -165,7 +167,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
TRACE("jh512 :");
|
TRACE("jh512 :");
|
||||||
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
TRACE("keccak :");
|
TRACE("keccak :");
|
||||||
streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
|
||||||
TRACE("gost :");
|
TRACE("gost :");
|
||||||
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
|
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
|
||||||
TRACE("luffa+c:");
|
TRACE("luffa+c:");
|
||||||
@ -186,7 +188,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
|
|
||||||
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
|
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
|
||||||
int res = 1;
|
int res = 1;
|
||||||
// check if there was some other ones...
|
|
||||||
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
|
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
|
||||||
work_set_target_ratio(work, vhash64);
|
work_set_target_ratio(work, vhash64);
|
||||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||||
@ -200,7 +201,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
}
|
}
|
||||||
pdata[19] = foundNonce;
|
pdata[19] = foundNonce;
|
||||||
return res;
|
return res;
|
||||||
} else {
|
} else if (vhash64[7] > Htarg && !opt_quiet) {
|
||||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
||||||
pdata[19] = foundNonce + 1;
|
pdata[19] = foundNonce + 1;
|
||||||
continue;
|
continue;
|
||||||
|
@ -9,29 +9,29 @@ extern "C" {
|
|||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
#include "cuda_x11.h"
|
#include "cuda_x11.h"
|
||||||
|
|
||||||
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
|
|
||||||
|
|
||||||
extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
|
|
||||||
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
|
||||||
|
|
||||||
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
|
|
||||||
extern void skein512_cpu_setBlock_80(void *pdata);
|
extern void skein512_cpu_setBlock_80(void *pdata);
|
||||||
|
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
|
||||||
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
|
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
|
||||||
|
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
|
||||||
|
extern void streebog_set_target(const uint32_t* ptarget);
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
|
|
||||||
|
#define NBN 2
|
||||||
static uint32_t *d_hash[MAX_GPUS];
|
static uint32_t *d_hash[MAX_GPUS];
|
||||||
|
static uint32_t *d_resNonce[MAX_GPUS];
|
||||||
|
|
||||||
// Veltor CPU Hash
|
// veltorcoin CPU Hash
|
||||||
extern "C" void veltorhash(void *output, const void *input)
|
extern "C" void veltorhash(void *output, const void *input)
|
||||||
{
|
{
|
||||||
uint8_t _ALIGN(64) hash[128] = { 0 };
|
unsigned char _ALIGN(128) hash[128] = { 0 };
|
||||||
|
|
||||||
sph_skein512_context ctx_skein;
|
sph_skein512_context ctx_skein;
|
||||||
sph_shavite512_context ctx_shavite;
|
|
||||||
sph_shabal512_context ctx_shabal;
|
|
||||||
sph_gost512_context ctx_gost;
|
sph_gost512_context ctx_gost;
|
||||||
|
sph_shabal512_context ctx_shabal;
|
||||||
|
sph_shavite512_context ctx_shavite;
|
||||||
|
|
||||||
sph_skein512_init(&ctx_skein);
|
sph_skein512_init(&ctx_skein);
|
||||||
sph_skein512(&ctx_skein, input, 80);
|
sph_skein512(&ctx_skein, input, 80);
|
||||||
@ -52,19 +52,18 @@ extern "C" void veltorhash(void *output, const void *input)
|
|||||||
memcpy(output, hash, 32);
|
memcpy(output, hash, 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
//#define _DEBUG
|
|
||||||
#define _DEBUG_PREFIX "veltor"
|
|
||||||
#include "cuda_debug.cuh"
|
|
||||||
|
|
||||||
static bool init[MAX_GPUS] = { 0 };
|
static bool init[MAX_GPUS] = { 0 };
|
||||||
|
|
||||||
extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
||||||
{
|
{
|
||||||
|
int dev_id = device_map[thr_id];
|
||||||
|
|
||||||
uint32_t *pdata = work->data;
|
uint32_t *pdata = work->data;
|
||||||
uint32_t *ptarget = work->target;
|
uint32_t *ptarget = work->target;
|
||||||
const uint32_t first_nonce = pdata[19];
|
const uint32_t first_nonce = pdata[19];
|
||||||
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 18;
|
int intensity = (device_sm[device_map[thr_id]] > 500) ? 20 : 18;
|
||||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
|
if (strstr(device_name[dev_id], "GTX 10")) intensity = 21;
|
||||||
|
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
|
||||||
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||||
|
|
||||||
if (opt_benchmark)
|
if (opt_benchmark)
|
||||||
@ -79,58 +78,59 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
|
|||||||
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
|
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
|
||||||
CUDA_LOG_ERROR();
|
CUDA_LOG_ERROR();
|
||||||
}
|
}
|
||||||
|
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
|
||||||
|
|
||||||
quark_skein512_cpu_init(thr_id, throughput);
|
quark_skein512_cpu_init(thr_id, throughput);
|
||||||
x11_shavite512_cpu_init(thr_id, throughput);
|
x11_shavite512_cpu_init(thr_id, throughput);
|
||||||
x14_shabal512_cpu_init(thr_id, throughput);
|
|
||||||
|
|
||||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
|
||||||
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
|
||||||
cuda_check_cpu_init(thr_id, throughput);
|
|
||||||
|
|
||||||
init[thr_id] = true;
|
init[thr_id] = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t endiandata[20];
|
uint32_t _ALIGN(64) h_resNonce[NBN];
|
||||||
|
uint32_t _ALIGN(64) endiandata[20];
|
||||||
for (int k=0; k < 20; k++)
|
for (int k=0; k < 20; k++)
|
||||||
be32enc(&endiandata[k], pdata[k]);
|
be32enc(&endiandata[k], pdata[k]);
|
||||||
|
|
||||||
skein512_cpu_setBlock_80(endiandata);
|
skein512_cpu_setBlock_80(endiandata);
|
||||||
cuda_check_cpu_setTarget(ptarget);
|
|
||||||
|
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
|
||||||
|
streebog_set_target(ptarget);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
int order = 0;
|
int order = 0;
|
||||||
uint32_t foundNonce;
|
|
||||||
|
|
||||||
// Hash with CUDA
|
|
||||||
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
|
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
|
||||||
TRACE("blake :");
|
|
||||||
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
TRACE("shavite:");
|
|
||||||
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
TRACE("shabal :");
|
streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
|
||||||
streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
|
||||||
TRACE("gost :");
|
cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||||
|
|
||||||
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
|
if (h_resNonce[0] != UINT32_MAX)
|
||||||
if (foundNonce != UINT32_MAX)
|
|
||||||
{
|
{
|
||||||
const uint32_t Htarg = ptarget[7];
|
|
||||||
uint32_t _ALIGN(64) vhash[8];
|
uint32_t _ALIGN(64) vhash[8];
|
||||||
be32enc(&endiandata[19], foundNonce);
|
const uint32_t Htarg = ptarget[7];
|
||||||
veltorhash(vhash, endiandata);
|
const uint32_t startNounce = pdata[19];
|
||||||
|
|
||||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
|
be32enc(&endiandata[19], startNounce + h_resNonce[0]);
|
||||||
|
veltorhash(vhash, endiandata);
|
||||||
|
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
|
||||||
|
{
|
||||||
int res = 1;
|
int res = 1;
|
||||||
// check if there was another one...
|
|
||||||
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], res);
|
|
||||||
work_set_target_ratio(work, vhash);
|
work_set_target_ratio(work, vhash);
|
||||||
if (secNonce != 0) {
|
work->nonces[0] = startNounce + h_resNonce[0];
|
||||||
|
if (h_resNonce[1] != UINT32_MAX)
|
||||||
|
{
|
||||||
|
uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1];
|
||||||
|
gpulog(LOG_DEBUG, thr_id, "Found 2nd nonce: %08x", secNonce);
|
||||||
be32enc(&endiandata[19], secNonce);
|
be32enc(&endiandata[19], secNonce);
|
||||||
veltorhash(vhash, endiandata);
|
veltorhash(vhash, endiandata);
|
||||||
work->nonces[1] = secNonce;
|
work->nonces[1] = secNonce;
|
||||||
|
|
||||||
if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
|
if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
|
||||||
work_set_target_ratio(work, vhash);
|
work_set_target_ratio(work, vhash);
|
||||||
xchg(work->nonces[1], work->nonces[0]);
|
xchg(work->nonces[1], work->nonces[0]);
|
||||||
@ -139,24 +139,25 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
|
|||||||
}
|
}
|
||||||
res++;
|
res++;
|
||||||
}
|
}
|
||||||
pdata[19] = work->nonces[0] = foundNonce;
|
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan
|
||||||
return res;
|
return res;
|
||||||
} else {
|
}
|
||||||
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
|
else if (vhash[7] > Htarg && !opt_quiet) {
|
||||||
pdata[19] = foundNonce + 1;
|
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[0]);
|
||||||
continue;
|
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
|
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
|
||||||
pdata[19] = max_nonce;
|
pdata[19] = max_nonce;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
pdata[19] += throughput;
|
pdata[19] += throughput;
|
||||||
|
|
||||||
} while (!work_restart[thr_id].restart);
|
} while (!work_restart[thr_id].restart);
|
||||||
|
|
||||||
*hashes_done = pdata[19] - first_nonce;
|
*hashes_done = pdata[19] - first_nonce;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user