Browse Source

veltor update, 10x faster :p

From Alexis work, sib hash rate 200% also..
2upstream
Tanguy Pruvot 8 years ago
parent
commit
36aedbb48e
  1. 6
      ccminer.cpp
  2. 4
      neoscrypt/cuda_vectors.h
  3. 1924
      x11/cuda_streebog.cu
  4. 19
      x11/sib.cu
  5. 89
      x11/veltor.cu

6
ccminer.cpp

@ -246,16 +246,16 @@ Options:\n\ @@ -246,16 +246,16 @@ Options:\n\
skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
s3 S3 (1Coin)\n\
vanilla Blake256-8 (VNL)\n\
veltor Thorsriddle streebog\n\
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
whirlpool Whirlpool algo\n\
x11evo Permuted x11 (Revolver)\n\
x11 X11 (DarkCoin)\n\
x13 X13 (MaruCoin)\n\
x14 X14\n\
x15 X15\n\
x17 X17\n\
vanilla Blake256-8 (VNL)\n\
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
whirlpool Whirlpool algo\n\
zr5 ZR5 (ZiftrCoin)\n\
-d, --devices Comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\

4
neoscrypt/cuda_vectors.h

@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift @@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
// require a uint32_t[9] ret array
// note: djm neoscrypt implementation is near the limits of gpu capabilities
// and weird behaviors can happen when tuning device functions code...
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint8_t *v = (uint8_t*) &vec4.s0;
uint8_t *r = (uint8_t*) ret;
@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) @@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
#else
// same for SM 3.5+, really faster ?
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));

1924
x11/cuda_streebog.cu

File diff suppressed because it is too large Load Diff

19
x11/sib.cu

@ -17,7 +17,7 @@ extern "C" { @@ -17,7 +17,7 @@ extern "C" {
#include "cuda_helper.h"
#include "cuda_x11.h"
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
#include <stdio.h>
#include <memory.h>
@ -104,9 +104,11 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u @@ -104,9 +104,11 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 19 : 18;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 19 : 18; // 2^18 = 262144 cuda threads
if (device_sm[dev_id] >= 600) intensity = 20;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
ptarget[7] = 0xf;
@ -132,9 +134,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u @@ -132,9 +134,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
x11_shavite512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
return -1;
}
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
cuda_check_cpu_init(thr_id, throughput);
@ -165,7 +167,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u @@ -165,7 +167,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
TRACE("jh512 :");
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :");
streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
TRACE("gost :");
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
TRACE("luffa+c:");
@ -186,7 +188,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u @@ -186,7 +188,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
int res = 1;
// check if there was some other ones...
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
work_set_target_ratio(work, vhash64);
*hashes_done = pdata[19] - first_nonce + throughput;
@ -200,7 +201,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u @@ -200,7 +201,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
}
pdata[19] = foundNonce;
return res;
} else {
} else if (vhash64[7] > Htarg && !opt_quiet) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
pdata[19] = foundNonce + 1;
continue;

89
x11/veltor.cu

@ -9,29 +9,29 @@ extern "C" { @@ -9,29 +9,29 @@ extern "C" {
#include "cuda_helper.h"
#include "cuda_x11.h"
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void skein512_cpu_setBlock_80(void *pdata);
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
extern void streebog_set_target(const uint32_t* ptarget);
#include <stdio.h>
#include <memory.h>
#define NBN 2
static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS];
// Veltor CPU Hash
// veltorcoin CPU Hash
extern "C" void veltorhash(void *output, const void *input)
{
uint8_t _ALIGN(64) hash[128] = { 0 };
unsigned char _ALIGN(128) hash[128] = { 0 };
sph_skein512_context ctx_skein;
sph_shavite512_context ctx_shavite;
sph_shabal512_context ctx_shabal;
sph_gost512_context ctx_gost;
sph_shabal512_context ctx_shabal;
sph_shavite512_context ctx_shavite;
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, input, 80);
@ -52,19 +52,18 @@ extern "C" void veltorhash(void *output, const void *input) @@ -52,19 +52,18 @@ extern "C" void veltorhash(void *output, const void *input)
memcpy(output, hash, 32);
}
//#define _DEBUG
#define _DEBUG_PREFIX "veltor"
#include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
int dev_id = device_map[thr_id];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 18;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
int intensity = (device_sm[device_map[thr_id]] > 500) ? 20 : 18;
if (strstr(device_name[dev_id], "GTX 10")) intensity = 21;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
@ -79,58 +78,59 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce @@ -79,58 +78,59 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
quark_skein512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
cuda_check_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
init[thr_id] = true;
}
uint32_t endiandata[20];
uint32_t _ALIGN(64) h_resNonce[NBN];
uint32_t _ALIGN(64) endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
skein512_cpu_setBlock_80(endiandata);
cuda_check_cpu_setTarget(ptarget);
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
streebog_set_target(ptarget);
do {
int order = 0;
uint32_t foundNonce;
// Hash with CUDA
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
TRACE("blake :");
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shavite:");
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shabal :");
streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
TRACE("gost :");
streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
*hashes_done = pdata[19] - first_nonce + throughput;
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != UINT32_MAX)
if (h_resNonce[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], foundNonce);
veltorhash(vhash, endiandata);
const uint32_t Htarg = ptarget[7];
const uint32_t startNounce = pdata[19];
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
be32enc(&endiandata[19], startNounce + h_resNonce[0]);
veltorhash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
int res = 1;
// check if there was another one...
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], res);
work_set_target_ratio(work, vhash);
if (secNonce != 0) {
work->nonces[0] = startNounce + h_resNonce[0];
if (h_resNonce[1] != UINT32_MAX)
{
uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1];
gpulog(LOG_DEBUG, thr_id, "Found 2nd nonce: %08x", secNonce);
be32enc(&endiandata[19], secNonce);
veltorhash(vhash, endiandata);
work->nonces[1] = secNonce;
if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
work_set_target_ratio(work, vhash);
xchg(work->nonces[1], work->nonces[0]);
@ -139,24 +139,25 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce @@ -139,24 +139,25 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
}
res++;
}
pdata[19] = work->nonces[0] = foundNonce;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan
return res;
} else {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
pdata[19] = foundNonce + 1;
continue;
}
else if (vhash[7] > Htarg && !opt_quiet) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[0]);
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}

Loading…
Cancel
Save