Browse Source

veltor update, 10x faster :p

From Alexis work, sib hash rate 200% also..
2upstream
Tanguy Pruvot 8 years ago
parent
commit
36aedbb48e
  1. 6
      ccminer.cpp
  2. 4
      neoscrypt/cuda_vectors.h
  3. 1924
      x11/cuda_streebog.cu
  4. 19
      x11/sib.cu
  5. 89
      x11/veltor.cu

6
ccminer.cpp

@ -246,16 +246,16 @@ Options:\n\
skein Skein SHA2 (Skeincoin)\n\ skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\ skein2 Double Skein (Woodcoin)\n\
s3 S3 (1Coin)\n\ s3 S3 (1Coin)\n\
vanilla Blake256-8 (VNL)\n\
veltor Thorsriddle streebog\n\ veltor Thorsriddle streebog\n\
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
whirlpool Whirlpool algo\n\
x11evo Permuted x11 (Revolver)\n\ x11evo Permuted x11 (Revolver)\n\
x11 X11 (DarkCoin)\n\ x11 X11 (DarkCoin)\n\
x13 X13 (MaruCoin)\n\ x13 X13 (MaruCoin)\n\
x14 X14\n\ x14 X14\n\
x15 X15\n\ x15 X15\n\
x17 X17\n\ x17 X17\n\
vanilla Blake256-8 (VNL)\n\
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
whirlpool Whirlpool algo\n\
zr5 ZR5 (ZiftrCoin)\n\ zr5 ZR5 (ZiftrCoin)\n\
-d, --devices Comma separated list of CUDA devices to use.\n\ -d, --devices Comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\ Device IDs start counting from 0! Alternatively takes\n\

4
neoscrypt/cuda_vectors.h

@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
// require a uint32_t[9] ret array // require a uint32_t[9] ret array
// note: djm neoscrypt implementation is near the limits of gpu capabilities // note: djm neoscrypt implementation is near the limits of gpu capabilities
// and weird behaviors can happen when tuning device functions code... // and weird behaviors can happen when tuning device functions code...
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) __device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{ {
uint8_t *v = (uint8_t*) &vec4.s0; uint8_t *v = (uint8_t*) &vec4.s0;
uint8_t *r = (uint8_t*) ret; uint8_t *r = (uint8_t*) ret;
@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
#else #else
// same for SM 3.5+, really faster ? // same for SM 3.5+, really faster ?
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) __device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{ {
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));

1924
x11/cuda_streebog.cu

File diff suppressed because it is too large Load Diff

19
x11/sib.cu

@ -17,7 +17,7 @@ extern "C" {
#include "cuda_helper.h" #include "cuda_helper.h"
#include "cuda_x11.h" #include "cuda_x11.h"
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
@ -104,9 +104,11 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 19 : 18; const int dev_id = device_map[thr_id];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 19 : 18; // 2^18 = 262144 cuda threads
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (device_sm[dev_id] >= 600) intensity = 20;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
ptarget[7] = 0xf; ptarget[7] = 0xf;
@ -132,9 +134,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
x11_shavite512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) { if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0; return -1;
} }
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
cuda_check_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput);
@ -165,7 +167,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
TRACE("jh512 :"); TRACE("jh512 :");
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :"); TRACE("keccak :");
streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
TRACE("gost :"); TRACE("gost :");
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
TRACE("luffa+c:"); TRACE("luffa+c:");
@ -186,7 +188,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
int res = 1; int res = 1;
// check if there was some other ones...
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
work_set_target_ratio(work, vhash64); work_set_target_ratio(work, vhash64);
*hashes_done = pdata[19] - first_nonce + throughput; *hashes_done = pdata[19] - first_nonce + throughput;
@ -200,7 +201,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
} }
pdata[19] = foundNonce; pdata[19] = foundNonce;
return res; return res;
} else { } else if (vhash64[7] > Htarg && !opt_quiet) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
pdata[19] = foundNonce + 1; pdata[19] = foundNonce + 1;
continue; continue;

89
x11/veltor.cu

@ -9,29 +9,29 @@ extern "C" {
#include "cuda_helper.h" #include "cuda_helper.h"
#include "cuda_x11.h" #include "cuda_x11.h"
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void skein512_cpu_setBlock_80(void *pdata); extern void skein512_cpu_setBlock_80(void *pdata);
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
extern void streebog_set_target(const uint32_t* ptarget);
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#define NBN 2
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS];
// Veltor CPU Hash // veltorcoin CPU Hash
extern "C" void veltorhash(void *output, const void *input) extern "C" void veltorhash(void *output, const void *input)
{ {
uint8_t _ALIGN(64) hash[128] = { 0 }; unsigned char _ALIGN(128) hash[128] = { 0 };
sph_skein512_context ctx_skein; sph_skein512_context ctx_skein;
sph_shavite512_context ctx_shavite;
sph_shabal512_context ctx_shabal;
sph_gost512_context ctx_gost; sph_gost512_context ctx_gost;
sph_shabal512_context ctx_shabal;
sph_shavite512_context ctx_shavite;
sph_skein512_init(&ctx_skein); sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, input, 80); sph_skein512(&ctx_skein, input, 80);
@ -52,19 +52,18 @@ extern "C" void veltorhash(void *output, const void *input)
memcpy(output, hash, 32); memcpy(output, hash, 32);
} }
//#define _DEBUG
#define _DEBUG_PREFIX "veltor"
#include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 }; static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{ {
int dev_id = device_map[thr_id];
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 18; int intensity = (device_sm[device_map[thr_id]] > 500) ? 20 : 18;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; if (strstr(device_name[dev_id], "GTX 10")) intensity = 21;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
@ -79,58 +78,59 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR(); CUDA_LOG_ERROR();
} }
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
quark_skein512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }
uint32_t endiandata[20]; uint32_t _ALIGN(64) h_resNonce[NBN];
uint32_t _ALIGN(64) endiandata[20];
for (int k=0; k < 20; k++) for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]); be32enc(&endiandata[k], pdata[k]);
skein512_cpu_setBlock_80(endiandata); skein512_cpu_setBlock_80(endiandata);
cuda_check_cpu_setTarget(ptarget);
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
streebog_set_target(ptarget);
do { do {
int order = 0; int order = 0;
uint32_t foundNonce;
// Hash with CUDA
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
TRACE("blake :");
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shavite:");
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shabal :"); streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
TRACE("gost :"); cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
*hashes_done = pdata[19] - first_nonce + throughput; *hashes_done = pdata[19] - first_nonce + throughput;
foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (h_resNonce[0] != UINT32_MAX)
if (foundNonce != UINT32_MAX)
{ {
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8]; uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], foundNonce); const uint32_t Htarg = ptarget[7];
veltorhash(vhash, endiandata); const uint32_t startNounce = pdata[19];
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { be32enc(&endiandata[19], startNounce + h_resNonce[0]);
veltorhash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
int res = 1; int res = 1;
// check if there was another one...
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], res);
work_set_target_ratio(work, vhash); work_set_target_ratio(work, vhash);
if (secNonce != 0) { work->nonces[0] = startNounce + h_resNonce[0];
if (h_resNonce[1] != UINT32_MAX)
{
uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1];
gpulog(LOG_DEBUG, thr_id, "Found 2nd nonce: %08x", secNonce);
be32enc(&endiandata[19], secNonce); be32enc(&endiandata[19], secNonce);
veltorhash(vhash, endiandata); veltorhash(vhash, endiandata);
work->nonces[1] = secNonce; work->nonces[1] = secNonce;
if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) { if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
work_set_target_ratio(work, vhash); work_set_target_ratio(work, vhash);
xchg(work->nonces[1], work->nonces[0]); xchg(work->nonces[1], work->nonces[0]);
@ -139,24 +139,25 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
} }
res++; res++;
} }
pdata[19] = work->nonces[0] = foundNonce; pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan
return res; return res;
} else { }
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); else if (vhash[7] > Htarg && !opt_quiet) {
pdata[19] = foundNonce + 1; gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[0]);
continue; cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
} }
} }
if ((uint64_t) throughput + pdata[19] >= max_nonce) { if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce; pdata[19] = max_nonce;
break; break;
} }
pdata[19] += throughput; pdata[19] += throughput;
} while (!work_restart[thr_id].restart); } while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce; *hashes_done = pdata[19] - first_nonce;
return 0; return 0;
} }

Loading…
Cancel
Save