Browse Source

streebog: apply skunk improvements to veltor

pull/2/head
Tanguy Pruvot 7 years ago
parent
commit
1e71dc5782
  1. 2
      configure.ac
  2. 2
      skunk/cuda_skunk_streebog.cu
  3. 12
      skunk/skunk.cu
  4. 5
      x11/cuda_streebog.cu
  5. 29
      x11/veltor.cu

2
configure.ac

@ -1,4 +1,4 @@
AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_INIT([ccminer], [2.2.1], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

2
skunk/cuda_skunk_streebog.cu

@ -204,7 +204,7 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin
__constant__ uint64_t target64[4]; __constant__ uint64_t target64[4];
__host__ __host__
void skunk_set_target(uint32_t* ptarget) void skunk_streebog_set_target(uint32_t* ptarget)
{ {
cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
} }

12
skunk/skunk.cu

@ -23,12 +23,12 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s
extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_free(int thr_id); extern void x13_fugue512_cpu_free(int thr_id);
extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); extern void streebog_sm3_set_target(uint32_t* ptarget);
extern void streebog_set_target(const uint32_t* ptarget); extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
// krnlx merged kernel (for high-end cards only) // krnlx merged kernel (for high-end cards only)
extern void skunk_cpu_init(int thr_id, uint32_t threads); extern void skunk_cpu_init(int thr_id, uint32_t threads);
extern void skunk_set_target(uint32_t* ptarget); extern void skunk_streebog_set_target(uint32_t* ptarget);
extern void skunk_setBlock_80(int thr_id, void *pdata); extern void skunk_setBlock_80(int thr_id, void *pdata);
extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
@ -117,10 +117,10 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
if (use_compat_kernels[thr_id]) { if (use_compat_kernels[thr_id]) {
skein512_cpu_setBlock_80(endiandata); skein512_cpu_setBlock_80(endiandata);
streebog_set_target(ptarget); streebog_sm3_set_target(ptarget);
} else { } else {
skunk_setBlock_80(thr_id, endiandata); skunk_setBlock_80(thr_id, endiandata);
skunk_set_target(ptarget); skunk_streebog_set_target(ptarget);
} }
do { do {
@ -129,7 +129,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
} else { } else {
skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);

5
x11/cuda_streebog.cu

@ -806,10 +806,11 @@ void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
#define T6(x) shared[6][x] #define T6(x) shared[6][x]
#define T7(x) shared[7][x] #define T7(x) shared[7][x]
// Streebog final for Veltor and skunk on SM 3.x
__constant__ uint64_t target64[4]; __constant__ uint64_t target64[4];
__host__ __host__
void streebog_set_target(const uint32_t* ptarget) void streebog_sm3_set_target(uint32_t* ptarget)
{ {
cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice); cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice);
} }
@ -995,7 +996,7 @@ void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce)
} }
__host__ __host__
void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce) void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce)
{ {
dim3 grid((threads + TPB-1) / TPB); dim3 grid((threads + TPB-1) / TPB);
dim3 block(TPB); dim3 block(TPB);

29
x11/veltor.cu

@ -10,11 +10,17 @@ extern "C" {
#include "cuda_x11.h" #include "cuda_x11.h"
extern void skein512_cpu_setBlock_80(void *pdata); extern void skein512_cpu_setBlock_80(void *pdata);
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
extern void streebog_set_target(const uint32_t* ptarget); // for SM3.x
extern void streebog_sm3_set_target(uint32_t* ptarget);
extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
// for latest cards only
extern void skunk_cpu_init(int thr_id, uint32_t threads);
extern void skunk_streebog_set_target(uint32_t* ptarget);
extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
@ -23,7 +29,7 @@ extern void streebog_set_target(const uint32_t* ptarget);
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS]; static uint32_t *d_resNonce[MAX_GPUS];
// veltorcoin CPU Hash // veltor CPU Hash
extern "C" void veltorhash(void *output, const void *input) extern "C" void veltorhash(void *output, const void *input)
{ {
unsigned char _ALIGN(128) hash[128] = { 0 }; unsigned char _ALIGN(128) hash[128] = { 0 };
@ -53,6 +59,7 @@ extern "C" void veltorhash(void *output, const void *input)
} }
static bool init[MAX_GPUS] = { 0 }; static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{ {
@ -80,7 +87,9 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
} }
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
quark_skein512_cpu_init(thr_id, throughput); skunk_cpu_init(thr_id, throughput);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
x11_shavite512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
@ -97,14 +106,20 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
skein512_cpu_setBlock_80(endiandata); skein512_cpu_setBlock_80(endiandata);
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
streebog_set_target(ptarget); if(use_compat_kernels[thr_id])
streebog_sm3_set_target(ptarget);
else
skunk_streebog_set_target(ptarget);
do { do {
int order = 0; int order = 0;
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); if(use_compat_kernels[thr_id])
streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
else
skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost); cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);

Loading…
Cancel
Save