Browse Source

Add whirlcoin and optimize x11 luffa (maxrregcount)

2upstream
Tanguy Pruvot 10 years ago
parent
commit
1fbcbbacc4
  1. 17
      Makefile.am
  2. 1
      config.sh
  3. 8
      cpu-miner.c
  4. 5
      cuda_helper.h
  5. 5
      miner.h
  6. 6
      quark/cuda_bmw512.cu
  7. 12
      quark/cuda_quark_keccak512.cu
  8. 2
      quark/cuda_skein512.cu
  9. 4
      util.c
  10. 10
      x11/cuda_x11_luffa512.cu
  11. 117
      x15/whirlcoin.cu

17
Makefile.am

@ -40,23 +40,30 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlcoin.cu
ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME
nvcc_FLAGS = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" -I . -Xptxas "-v" --ptxas-options=-v
nvcc_FLAGS += $(JANSSON_INCLUDES)
# we're now targeting all major compute architectures within one binary. # we're now targeting all major compute architectures within one binary.
.cu.o: .cu.o:
$(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $<
# Luffa is faster with 80 registers than 128
x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
$(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $<
# Shavite compiles faster with 128 regs # Shavite compiles faster with 128 regs
x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ --maxrregcount=128 -o $@ -c $<
# ABI requiring code modules # ABI requiring code modules
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_50,code=\"sm_50,compute_50\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $<

1
config.sh

@ -7,6 +7,7 @@
make clean || echo clean make clean || echo clean
rm -f Makefile.in
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done

8
cpu-miner.c

@ -134,6 +134,7 @@ typedef enum {
ALGO_ANIME, ALGO_ANIME,
ALGO_FRESH, ALGO_FRESH,
ALGO_NIST5, ALGO_NIST5,
ALGO_WHC,
ALGO_X11, ALGO_X11,
ALGO_X13, ALGO_X13,
ALGO_X14, ALGO_X14,
@ -152,6 +153,7 @@ static const char *algo_names[] = {
"anime", "anime",
"fresh", "fresh",
"nist5", "nist5",
"whirlcoin",
"x11", "x11",
"x13", "x13",
"x14", "x14",
@ -229,6 +231,7 @@ Options:\n\
anime Animecoin hash\n\ anime Animecoin hash\n\
fresh Freshcoin hash (shavite 80)\n\ fresh Freshcoin hash (shavite 80)\n\
nist5 NIST5 (TalkCoin) hash\n\ nist5 NIST5 (TalkCoin) hash\n\
whirlcoin Whirlcoin hash\n\
x11 X11 (DarkCoin) hash\n\ x11 X11 (DarkCoin) hash\n\
x13 X13 (MaruCoin) hash\n\ x13 X13 (MaruCoin) hash\n\
x14 X14 hash\n\ x14 X14 hash\n\
@ -931,6 +934,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done); max_nonce, &hashes_done);
break; break;
case ALGO_WHC:
rc = scanhash_whc(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_X11: case ALGO_X11:
rc = scanhash_x11(thr_id, work.data, work.target, rc = scanhash_x11(thr_id, work.data, work.target,
max_nonce, &hashes_done); max_nonce, &hashes_done);

5
cuda_helper.h

@ -238,7 +238,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
// 64-bit ROTATE RIGHT // 64-bit ROTATE RIGHT
#ifdef DJM_SM35_ROT64 #if __CUDA_ARCH__ >= 350
/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
__device__ __forceinline__ __device__ __forceinline__
uint64_t ROTR64(const uint64_t value, const int offset) { uint64_t ROTR64(const uint64_t value, const int offset) {
@ -274,8 +274,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
#endif #endif
// 64-bit ROTATE LEFT // 64-bit ROTATE LEFT
#ifdef DJM_SM35_ROT64 #if __CUDA_ARCH__ >= 350
/* complicated sm >= 3.5 one, to bench */
__device__ __forceinline__ __device__ __forceinline__
uint64_t ROTL64(const uint64_t value, const int offset) { uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result; uint2 result;

5
miner.h

@ -242,6 +242,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
extern int scanhash_whc(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_x11(int thr_id, uint32_t *pdata, extern int scanhash_x11(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
@ -362,6 +366,7 @@ void myriadhash(void *state, const void *input);
void fresh_hash(void *state, const void *input); void fresh_hash(void *state, const void *input);
void nist5hash(void *state, const void *input); void nist5hash(void *state, const void *input);
void quarkhash(void *state, const void *input); void quarkhash(void *state, const void *input);
void wcoinhash(void *state, const void *input);
void x11hash(void *output, const void *input); void x11hash(void *output, const void *input);
void x13hash(void *output, const void *input); void x13hash(void *output, const void *input);
void x14hash(void *output, const void *input); void x14hash(void *output, const void *input);

6
quark/cuda_bmw512.cu

@ -1,5 +1,3 @@
#if 1
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
@ -11,9 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// die Message it Padding zur Berechnung auf der GPU // die Message it Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define SHL(x, n) ((x) << (n)) #define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n)) #define SHR(x, n) ((x) >> (n))
@ -320,4 +315,3 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
#endif

12
quark/cuda_quark_keccak512.cu

@ -3,7 +3,7 @@
#include "cuda_helper.h" #include "cuda_helper.h"
// aus heavy.cu // heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#define U32TO64_LE(p) \ #define U32TO64_LE(p) \
@ -12,8 +12,7 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
#define U64TO32_LE(p, v) \ #define U64TO32_LE(p, v) \
*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32); *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
__device__ __constant__ static const uint64_t host_keccak_round_constants[24] = {
static const uint64_t c_keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull, 0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull, 0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull, 0x000000000000808bull, 0x0000000080000001ull,
@ -28,6 +27,8 @@ static const uint64_t c_keccak_round_constants[24] = {
0x0000000080000001ull, 0x8000000080008008ull 0x0000000080000001ull, 0x8000000080008008ull
}; };
__constant__ uint64_t c_keccak_round_constants[24];
static __device__ __forceinline__ void static __device__ __forceinline__ void
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) { keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
size_t i; size_t i;
@ -147,6 +148,11 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
// Setup-Funktionen // Setup-Funktionen
__host__ void quark_keccak512_cpu_init(int thr_id, int threads) __host__ void quark_keccak512_cpu_init(int thr_id, int threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants,
host_keccak_round_constants,
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice);
} }
__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

2
quark/cuda_skein512.cu

@ -289,7 +289,7 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
} }
static __constant__ uint64_t d_constMem[8]; static __constant__ uint64_t d_constMem[8];
static uint64_t h_constMem[8] = { static const uint64_t h_constMem[8] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x4903ADFF749C51CE),
SPH_C64(0x0D95DE399746DF03), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x8FD1934127C79BCE),

4
util.c

@ -1364,6 +1364,10 @@ void print_hash_tests(void)
fresh_hash(&hash[0], &buf[0]); fresh_hash(&hash[0], &buf[0]);
printf("\nfresh: "); print_hash(hash); printf("\nfresh: "); print_hash(hash);
memset(hash, 0, sizeof hash);
wcoinhash(&hash[0], &buf[0]);
printf("\nwhirlc: "); print_hash(hash);
memset(hash, 0, sizeof hash); memset(hash, 0, sizeof hash);
x11hash(&hash[0], &buf[0]); x11hash(&hash[0], &buf[0]);
printf("\nX11: "); print_hash(hash); printf("\nX11: "); print_hash(hash);

10
x11/cuda_x11_luffa512.cu

@ -90,8 +90,8 @@ typedef struct {
b0 ^= c1; b0 ^= c1;
/* initial values of chaining variables */ /* initial values of chaining variables */
__device__ __constant__ __device__ __constant__ uint32_t c_IV[40];
const uint32_t c_IV[40] = { const uint32_t h_IV[40] = {
0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465,
0x6e292011,0x90152df4,0xee058139,0xdef610bb, 0x6e292011,0x90152df4,0xee058139,0xdef610bb,
0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3,
@ -103,8 +103,8 @@ const uint32_t c_IV[40] = {
0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363,
0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea};
__device__ __constant__ __device__ __constant__ uint32_t c_CNS[80];
uint32_t c_CNS[80] = { const uint32_t h_CNS[80] = {
0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
@ -356,6 +356,8 @@ __global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint
// Setup-Funktionen // Setup-Funktionen
__host__ void x11_luffa512_cpu_init(int thr_id, int threads) __host__ void x11_luffa512_cpu_init(int thr_id, int threads)
{ {
cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice);
} }
__host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)

117
x15/whirlcoin.cu

@ -0,0 +1,117 @@
/*
* whirlpool routine (djm)
*/
extern "C"
{
#include "sph/sph_whirlpool.h"
#include "miner.h"
}
// from cpu-miner.c
extern int device_map[8];
extern bool opt_benchmark;
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);
extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget);
extern void whirlpool512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void cuda_check_cpu_init(int thr_id, int threads);
extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
// CPU Hash function
extern "C" void wcoinhash(void *state, const void *input)
{
sph_whirlpool_context ctx_whirlpool;
uint32_t hash[16];
// shavite 1
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, input, 80);
sph_whirlpool1_close(&ctx_whirlpool, (void*) hash);
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64);
sph_whirlpool1_close(&ctx_whirlpool, (void*) hash);
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64);
sph_whirlpool1_close(&ctx_whirlpool, (void*) hash);
sph_whirlpool1_init(&ctx_whirlpool);
sph_whirlpool1(&ctx_whirlpool, (const void*) hash, 64);
sph_whirlpool1_close(&ctx_whirlpool, (void*) hash);
memcpy(state, hash, 32);
}
extern "C" int scanhash_whc(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
const int throughput = 256*256*8;
static bool init[8] = {0,0,0,0,0,0,0,0};
uint32_t endiandata[20];
uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
if (!init[thr_id]) {
cudaSetDevice(device_map[thr_id]);
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
x15_whirlpool_cpu_init(thr_id, throughput,1);
init[thr_id] = true;
}
for (int k=0; k < 20; k++) {
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
}
whirlpool512_setBlock_80((void*)endiandata, ptarget);
do {
uint32_t foundNonce;
int order = 0;
whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
wcoinhash(vhash64, endiandata);
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget))
{
pdata[19] = foundNonce;
*hashes_done = foundNonce - first_nonce + 1;
return 1;
}
else if (vhash64[7] > Htarg) {
applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
}
else {
applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
Loading…
Cancel
Save