From c3eb66683a613f10d7e61b9de739485c39377db5 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 9 Sep 2014 21:56:25 +0200 Subject: [PATCH] Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot (Which is tired to remove these german copy/pasted comments) --- Makefile.am | 9 +- README.txt | 11 +- configure.ac | 2 +- cpu-miner.c | 39 +++- cpuminer-config.h | 6 +- cuda_helper.h | 4 + miner.h | 25 +- qubit/deep.cu | 120 ++++++++++ qubit/doom.cu | 93 ++++++++ qubit/qubit.cu | 146 ++++++++++++ qubit/qubit_luffa512.cu | 496 ++++++++++++++++++++++++++++++++++++++++ util.c | 12 + 12 files changed, 943 insertions(+), 20 deletions(-) create mode 100644 qubit/deep.cu create mode 100644 qubit/doom.cu create mode 100644 qubit/qubit.cu create mode 100644 qubit/qubit_luffa512.cu diff --git a/Makefile.am b/Makefile.am index c2fa11d..2067074 100644 --- a/Makefile.am +++ b/Makefile.am @@ -31,17 +31,20 @@ ccminer_SOURCES = elist.h miner.h compat.h \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/cuda_jha_compactionTest.cu quark/cuda_checkhash.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ - quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \ - quark/cuda_quark_compactionTest.cu \ + quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \ + quark/quarkcoin.cu quark/animecoin.cu \ + quark/cuda_quark_compactionTest.cu \ cuda_nist5.cu blake32.cu pentablake.cu \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ + qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/doom.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ - x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlpool.cu \ + x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ + x15/whirlpool.cu \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu diff --git a/README.txt b/README.txt index 001ff3f..9a49a0a 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.4.2-tpruvot (Sep 09th 2014) - "Pentablake" +ccMiner release 1.4.3-tpruvot (Sep 11th 2014) - "DJM34 Algos" --------------------------------------------------------------- *************************************************************** @@ -35,6 +35,7 @@ TalkCoin DarkCoin and other X11 coins NEOS blake (256 14-rounds) BlakeCoin (256 8-rounds) +Deep, Doom, Goalcoin and Qubit Pentablake (Blake 512 x5) where some of these coins have a VERY NOTABLE nVidia advantage @@ -56,12 +57,15 @@ its command line interface and options. -a, --algo=ALGO specify the algorithm to use heavy use to mine Heavycoin mjollnir use to mine Mjollnircoin + deep use to mine Deepcoin + doom use to mine Doomcoin fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin dmd-gr use to mine Diamond-Groestl myr-gr use to mine Myriad-Groest jackpot use to mine Jackpotcoin quark use to mine Quarkcoin + qubit use to mine Qubit Algo anime use to mine Animecoin blake use to mine NEOS (Blake 256) blakecoin use to mine Old Blake 256 @@ -103,7 +107,7 @@ its command line interface and options. --benchmark run in offline benchmark mode --cputest debug hashes from cpu algorithms -c, --config=FILE load a JSON-format configuration file - -C, --color display colored output in a linux Terminal + -K, --nocolor disable colored console output -V, --version display version information and exit -h, --help display this help text and exit @@ -154,6 +158,9 @@ features. >>> RELEASE HISTORY <<< + Sep 11th 2O14 + add algos from djm34 (deep,doom,qubit) + Sep. 1st 2014 add X17, optimized x15 and whirl add blake (256 variant) color support on Windows, diff --git a/configure.ac b/configure.ac index a4ef290..3710824 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.09.09]) +AC_INIT([ccminer], [2014.09.11]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index cb6e790..287fdc9 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -129,6 +129,8 @@ typedef enum { ALGO_ANIME, ALGO_BLAKE, ALGO_BLAKECOIN, + ALGO_DEEP, + ALGO_DOOM, ALGO_FRESH, ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, @@ -139,6 +141,7 @@ typedef enum { ALGO_NIST5, ALGO_PENTABLAKE, ALGO_QUARK, + ALGO_QUBIT, ALGO_WHC, ALGO_X11, ALGO_X13, @@ -152,6 +155,8 @@ static const char *algo_names[] = { "anime", "blake", "blakecoin", + "deep", + "doom", "fresh", "fugue256", "groestl", @@ -162,6 +167,7 @@ static const char *algo_names[] = { "nist5", "penta", "quark", + "qubit", "whirl", "x11", "x13", @@ -235,6 +241,8 @@ Options:\n\ anime Animecoin hash\n\ blake Blake 256 (like NEOS blake)\n\ blakecoin Old Blake 256 (8 rounds)\n\ + deep Deepcoin hash\n\ + doom Doomcoin hash\n\ fresh Freshcoin hash (shavite 80)\n\ fugue256 Fuguecoin hash\n\ groestl Groestlcoin hash\n\ @@ -245,6 +253,7 @@ Options:\n\ nist5 NIST5 (TalkCoin) hash\n\ penta Pentablake hash (5x Blake 512)\n\ quark Quark hash\n\ + qubit Qubit hash\n\ whirl Whirlcoin (old whirlpool)\n\ x11 X11 (DarkCoin) hash\n\ x13 X13 (MaruCoin) hash\n\ @@ -275,7 +284,7 @@ Options:\n\ --no-longpoll disable X-Long-Polling support\n\ --no-stratum disable X-Stratum support\n\ -q, --quiet disable per-thread hashmeter output\n\ - -C, --color enable colored output\n\ + -K, --nocolor disable colored output\n\ -D, --debug enable debug output\n\ -P, --protocol-dump verbose dump of protocol-level activities\n" #ifdef HAVE_SYSLOG_H @@ -301,7 +310,7 @@ static char const short_options[] = #ifdef HAVE_SYSLOG_H "S" #endif - "a:c:CDhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:"; + "a:c:CKDhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:"; static struct option const options[] = { { "algo", 1, NULL, 'a' }, @@ -312,7 +321,7 @@ static struct option const options[] = { { "cputest", 0, NULL, 1006 }, { "cert", 1, NULL, 1001 }, { "config", 1, NULL, 'c' }, - { "color", 0, NULL, 'C' }, + { "nocolor", 0, NULL, 'K' }, { "debug", 0, NULL, 'D' }, { "help", 0, NULL, 'h' }, { "no-longpoll", 0, NULL, 1003 }, @@ -1083,6 +1092,16 @@ continue_scan: max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); break; + case ALGO_DEEP: + rc = scanhash_deep(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_DOOM: + rc = scanhash_doom(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_FUGUE256: rc = scanhash_fugue256(thr_id, work.data, work.target, max_nonce, &hashes_done); @@ -1109,6 +1128,11 @@ continue_scan: max_nonce, &hashes_done); break; + case ALGO_QUBIT: + rc = scanhash_qubit(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_ANIME: rc = scanhash_anime(thr_id, work.data, work.target, max_nonce, &hashes_done); @@ -1427,7 +1451,7 @@ out: return NULL; } -#define PROGRAM_VERSION "1.4.2" +#define PROGRAM_VERSION "1.4.3" static void show_version_and_exit(void) { printf("%s v%s\n" @@ -1489,8 +1513,12 @@ static void parse_arg (int key, char *arg) break; } case 'C': + /* color for compat */ use_colors = true; break; + case 'K': + use_colors = false; + break; case 'D': opt_debug = true; break; @@ -1801,8 +1829,7 @@ int main(int argc, char *argv[]) #endif printf("\t based on pooler-cpuminer 2.3.2 (c) 2010 Jeff Garzik, 2012 pooler\n"); printf("\t and HVC extension from http://hvc.1gh.com/" "\n\n"); - printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n"); - printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n"); + printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n\n"); printf("\tInclude some of djm34 additions, cleaned by Tanguy Pruvot\n"); printf("\t BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n"); diff --git a/cpuminer-config.h b/cpuminer-config.h index 11edf82..4681e46 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -156,7 +156,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 2014.09.06" +#define PACKAGE_STRING "ccminer 2014.09.11" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -165,7 +165,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2014.09.06" +#define PACKAGE_VERSION "2014.09.11" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -188,7 +188,7 @@ #define USE_XOP 1 /* Version number of package */ -#define VERSION "2014.09.06" +#define VERSION "2014.09.11" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ diff --git a/cuda_helper.h b/cuda_helper.h index 66c8e7d..9e10968 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -12,6 +12,10 @@ #include +// common functions +extern void cuda_check_cpu_init(int thr_id, int threads); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern __device__ __device_builtin__ void __syncthreads(void); diff --git a/miner.h b/miner.h index d9d29a8..d6127e2 100644 --- a/miner.h +++ b/miner.h @@ -205,13 +205,13 @@ extern int scanhash_sha256d(int thr_id, uint32_t *pdata, extern unsigned char *scrypt_buffer_alloc(); -extern int scanhash_scrypt(int thr_id, uint32_t *pdata, - unsigned char *scratchbuf, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_deep(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); -extern int scanhash_heavy(int thr_id, uint32_t *pdata, +extern int scanhash_doom(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote, int blocklen); + unsigned long *hashes_done); extern int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, @@ -221,6 +221,10 @@ extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_heavy(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done, uint32_t maxvote, int blocklen); + extern int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -253,6 +257,14 @@ extern int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_qubit(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + +extern int scanhash_scrypt(int thr_id, uint32_t *pdata, + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); + extern int scanhash_whc(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -426,6 +438,8 @@ void applog_compare_hash(unsigned char *hash, unsigned char *hash2); void print_hash_tests(void); void animehash(void *state, const void *input); void blake256hash(void *output, const void *input, int rounds); +void deephash(void *state, const void *input); +void doomhash(void *state, const void *input); void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); @@ -435,6 +449,7 @@ void myriadhash(void *state, const void *input); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); void quarkhash(void *state, const void *input); +void qubithash(void *state, const void *input); void wcoinhash(void *state, const void *input); void x11hash(void *output, const void *input); void x13hash(void *output, const void *input); diff --git a/qubit/deep.cu b/qubit/deep.cu new file mode 100644 index 0000000..8e9a8be --- /dev/null +++ b/qubit/deep.cu @@ -0,0 +1,120 @@ +/* + * deepcoin algorithm + * + */ +extern "C" { +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" + +#include "cuda_helper.h" + +extern int device_map[8]; + +static uint32_t *d_hash[8]; + +extern void qubit_luffa512_cpu_init(int thr_id, int threads); +extern void qubit_luffa512_cpu_setBlock_80(void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget); +extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern void x11_cubehash512_cpu_init(int thr_id, int threads); +extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x11_echo512_cpu_init(int thr_id, int threads); +extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern "C" void deephash(void *state, const void *input) +{ + // luffa1-cubehash2-shavite3-simd4-echo5 + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_echo512_context ctx_echo; + + uint8_t hash[64]; + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, input, 80); + sph_luffa512_close(&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512 (&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + memcpy(state, hash, 32); +} + + +extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + const int throughput = 256*256*8*8; + static bool init[8] = {0,0,0,0,0,0,0,0}; + uint32_t endiandata[20]; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + + qubit_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); + cuda_check_cpu_setTarget(ptarget); + + do { + const uint32_t Htarg = ptarget[7]; + int order = 0; + + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + deephash(vhash64, endiandata); + + if (vhash64[7]<=Htarg && fulltest(vhash64, ptarget) ) + { + pdata[19] = foundNonce; + *hashes_done = foundNonce - first_nonce + 1; + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} diff --git a/qubit/doom.cu b/qubit/doom.cu new file mode 100644 index 0000000..30719c7 --- /dev/null +++ b/qubit/doom.cu @@ -0,0 +1,93 @@ +/* + * qubit algorithm + * + */ +extern "C" { +#include "sph/sph_luffa.h" +} + +#include "miner.h" + +#include "cuda_helper.h" + +extern int device_map[8]; + +static uint32_t *d_hash[8]; + +extern void qubit_luffa512_cpu_init(int thr_id, int threads); +extern void qubit_luffa512_cpu_setBlock_80(void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget); +extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern void doomhash(void *state, const void *input) +{ + // luffa512 + sph_luffa512_context ctx_luffa; + + uint8_t hash[64]; + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, input, 80); + sph_luffa512_close(&ctx_luffa, (void*) hash); + + memcpy(state, hash, 32); +} + + +extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + const int throughput = 256*256*8*8; + static bool init[8] = {0,0,0,0,0,0,0,0}; + uint32_t endiandata[20]; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + + qubit_luffa512_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); + + do { + const uint32_t Htarg = ptarget[7]; + int order = 0; + + uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + doomhash(vhash64, endiandata); + + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget) ) + { + pdata[19] = foundNonce; + *hashes_done = foundNonce - first_nonce + 1; + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} diff --git a/qubit/qubit.cu b/qubit/qubit.cu new file mode 100644 index 0000000..11eb0d1 --- /dev/null +++ b/qubit/qubit.cu @@ -0,0 +1,146 @@ +/* + * qubit algorithm + * + */ +extern "C" { +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" + +#include "cuda_helper.h" + +extern int device_map[8]; + +static uint32_t *d_hash[8]; + +extern void qubit_luffa512_cpu_init(int thr_id, int threads); +extern void qubit_luffa512_cpu_setBlock_80(void *pdata); +extern void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern void x11_cubehash512_cpu_init(int thr_id, int threads); +extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x11_shavite512_cpu_init(int thr_id, int threads); +extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x11_simd512_cpu_init(int thr_id, int threads); +extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x11_echo512_cpu_init(int thr_id, int threads); +extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_compactTest_cpu_init(int thr_id, int threads); +extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, + uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, + int order); + +extern "C" void qubithash(void *state, const void *input) +{ + // luffa1-cubehash2-shavite3-simd4-echo5 + + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + + uint8_t hash[64]; + + sph_luffa512_init(&ctx_luffa); + sph_luffa512 (&ctx_luffa, input, 80); + sph_luffa512_close(&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512 (&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512 (&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512 (&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + memcpy(state, hash, 32); +} + +extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + const int throughput = 256*256*8; + static bool init[8] = {0,0,0,0,0,0,0,0}; + uint32_t endiandata[20]; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0000ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + + qubit_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + const uint32_t Htarg = ptarget[7]; + int order = 0; + + // Hash with CUDA + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + qubithash(vhash64, endiandata); + + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget) ) + { + pdata[19] = foundNonce; + *hashes_done = foundNonce - first_nonce + 1; + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); + } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu new file mode 100644 index 0000000..1d827d5 --- /dev/null +++ b/qubit/qubit_luffa512.cu @@ -0,0 +1,496 @@ +/* + * luffa_for_32.c + * Version 2.0 (Sep 15th 2009) + * + * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. + * + * Hitachi, Ltd. is the owner of this software and hereby grant + * the U.S. Government and any interested party the right to use + * this software for the purposes of the SHA-3 evaluation process, + * notwithstanding that this software is copyrighted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include +#include + +#include "cuda_helper.h" + +typedef unsigned char BitSequence; + +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +__constant__ uint32_t pTarget[8]; + +uint32_t *d_lnounce[8]; +uint32_t *d_LNonce[8]; + +typedef struct { + uint32_t buffer[8]; /* Buffer to be hashed */ + uint32_t chainv[40]; /* Chaining values */ +} hashState; + +#define BYTES_SWAP32(x) cuda_swab32(x) + +#define MULT2(a,j)\ + tmp = a[7+(8*j)];\ + a[7+(8*j)] = a[6+(8*j)];\ + a[6+(8*j)] = a[5+(8*j)];\ + a[5+(8*j)] = a[4+(8*j)];\ + a[4+(8*j)] = a[3+(8*j)] ^ tmp;\ + a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ + a[2+(8*j)] = a[1+(8*j)];\ + a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ + a[0+(8*j)] = tmp; + +#define TWEAK(a0,a1,a2,a3,j)\ + a0 = (a0<<(j))|(a0>>(32-j));\ + a1 = (a1<<(j))|(a1>>(32-j));\ + a2 = (a2<<(j))|(a2>>(32-j));\ + a3 = (a3<<(j))|(a3>>(32-j)); + +#define STEP(c0,c1)\ + SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ + SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ + MIXWORD(chainv[0],chainv[4]);\ + MIXWORD(chainv[1],chainv[5]);\ + MIXWORD(chainv[2],chainv[6]);\ + MIXWORD(chainv[3],chainv[7]);\ + ADD_CONSTANT(chainv[0],chainv[4],c0,c1); + +#define SUBCRUMB(a0,a1,a2,a3,a4)\ + a4 = a0;\ + a0 |= a1;\ + a2 ^= a3;\ + a1 = ~a1;\ + a0 ^= a3;\ + a3 &= a4;\ + a1 ^= a3;\ + a3 ^= a2;\ + a2 &= a0;\ + a0 = ~a0;\ + a2 ^= a1;\ + a1 |= a3;\ + a4 ^= a1;\ + a3 ^= a2;\ + a2 &= a1;\ + a1 ^= a0;\ + a0 = a4; + +#define MIXWORD(a0,a4)\ + a4 ^= a0;\ + a0 = (a0<<2) | (a0>>(30));\ + a0 ^= a4;\ + a4 = (a4<<14) | (a4>>(18));\ + a4 ^= a0;\ + a0 = (a0<<10) | (a0>>(22));\ + a0 ^= a4;\ + a4 = (a4<<1) | (a4>>(31)); + +#define ADD_CONSTANT(a0,b0,c0,c1)\ + a0 ^= c0;\ + b0 ^= c1; + +/* initial values of chaining variables */ +__constant__ uint32_t c_IV[40]; +const uint32_t h2_IV[40] = { + 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, + 0x6e292011,0x90152df4,0xee058139,0xdef610bb, + 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, + 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, + 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, + 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, + 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, + 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, + 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, + 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; + +__constant__ uint32_t c_CNS[80]; +uint32_t h2_CNS[80] = { + 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, + 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, + 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, + 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d, + 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4, + 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28, + 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b, + 0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704, + 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72, + 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7, + 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719, + 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7, + 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91, + 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be, + 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5, + 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355, + 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, + 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, + 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, + 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; + + +/***************************************************/ +__device__ __forceinline__ +void rnd512(hashState *state) +{ + int i,j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + +#pragma unroll 8 + for(i=0;i<8;i++) { + t[i]=0; +#pragma unroll 5 + for(j=0;j<5;j++) { + t[i] ^= state->chainv[i+8*j]; + } + } + + MULT2(t, 0); + +#pragma unroll 5 + for(j=0;j<5;j++) { +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[i+8*j] ^= t[i]; + } + } + +#pragma unroll 5 + for(j=0;j<5;j++) { +#pragma unroll 8 + for(i=0;i<8;i++) { + t[i+8*j] = state->chainv[i+8*j]; + } + } + +#pragma unroll 5 + for(j=0;j<5;j++) { + MULT2(state->chainv, j); + } + +#pragma unroll 5 + for(j=0;j<5;j++) { +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; + } + } + +#pragma unroll 5 + for(j=0;j<5;j++) { +#pragma unroll 8 + for(i=0;i<8;i++) { + t[i+8*j] = state->chainv[i+8*j]; + } + } + +#pragma unroll 5 + for(j=0;j<5;j++) { + MULT2(state->chainv, j); + } + +#pragma unroll 5 + for(j=0;j<5;j++) { +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; + } + } + +#pragma unroll 5 + for(j=0;j<5;j++) { +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[i+8*j] ^= state->buffer[i]; + } + MULT2(state->buffer, 0); + } + +#pragma unroll 8 + for(i=0;i<8;i++) { + chainv[i] = state->chainv[i]; + } + +#pragma unroll 8 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); + } + +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[i] = chainv[i]; + chainv[i] = state->chainv[i+8]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); + +#pragma unroll 8 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); + } + +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[i+8] = chainv[i]; + chainv[i] = state->chainv[i+16]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); + +#pragma unroll 8 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); + } + +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[i+16] = chainv[i]; + chainv[i] = state->chainv[i+24]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); + +#pragma unroll 8 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); + } + +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[i+24] = chainv[i]; + chainv[i] = state->chainv[i+32]; + } + + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); + +#pragma unroll 8 + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); + } + +#pragma unroll 8 + for(i=0;i<8;i++) { + state->chainv[i+32] = chainv[i]; + } +} + + +__device__ __forceinline__ +void Update512(hashState *state, const BitSequence *data) +{ +#pragma unroll 8 + for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); + rnd512(state); + +#pragma unroll 8 + for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); + rnd512(state); +#pragma unroll 4 + for(int i=0;i<4;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+64))[i]); +} + + +/***************************************************/ +__device__ __forceinline__ +void finalization512(hashState *state, uint32_t *b) +{ + int i,j; + + state->buffer[4] = 0x80000000; +#pragma unroll 3 + for(int i=5;i<8;i++) state->buffer[i] = 0; + rnd512(state); + + /*---- blank round with m=0 ----*/ +#pragma unroll 8 + for(i=0;i<8;i++) state->buffer[i] =0; + rnd512(state); + +#pragma unroll 8 + for(i=0;i<8;i++) { + b[i] = 0; +#pragma unroll 5 + for(j=0;j<5;j++) { + b[i] ^= state->chainv[i+8*j]; + } + b[i] = BYTES_SWAP32((b[i])); + } + +#pragma unroll 8 + for(i=0;i<8;i++) state->buffer[i]=0; + rnd512(state); + +#pragma unroll 8 + for(i=0;i<8;i++) { + b[8+i] = 0; +#pragma unroll 5 + for(j=0;j<5;j++) { + b[8+i] ^= state->chainv[i+8*j]; + } + b[8+i] = BYTES_SWAP32((b[8+i])); + } +} + + +/***************************************************/ +// Die Hash-Funktion +__global__ +void qubit_luffa512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + union { + uint64_t buf64[16]; + uint32_t buf32[32]; + } buff; + +#pragma unroll 16 + for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; + + // die Nounce durch die thread-spezifische ersetzen + buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce)); + + + hashState state; +#pragma unroll 40 + for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; +#pragma unroll 8 + for(int i=0;i<8;i++) state.buffer[i] = 0; + Update512(&state, (BitSequence*)buff.buf32); + uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; + finalization512(&state, (uint32_t*)outHash); + } +} + +__global__ +void qubit_luffa512_gpu_finalhash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + union { + uint64_t buf64[16]; + uint32_t buf32[32]; + } buff; + uint32_t Hash[16]; + +#pragma unroll 16 + for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; + + // die Nounce durch die thread-spezifische ersetzen + buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce)); + + + hashState state; +#pragma unroll 40 + for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; +#pragma unroll 8 + for(int i=0;i<8;i++) state.buffer[i] = 0; + Update512(&state, (BitSequence*)buff.buf32); + finalization512(&state, Hash); + + bool rc = true; + int position = -1; +#pragma unroll 8 + for (int i = 7; i >= 0; i--) { + if (Hash[i] > pTarget[i]) { + if(position < i) { + position = i; + rc = false; + } + + } + if (Hash[i] < pTarget[i]) { + if(position < i) { + position = i; + rc = true; + } + } + } + + if(rc && resNounce[0] > nounce) + resNounce[0] = nounce; + } +} + +__host__ +void qubit_luffa512_cpu_init(int thr_id, int threads) +{ + cudaMemcpyToSymbol( c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice ); + cudaMemcpyToSymbol( c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice ); + cudaMalloc(&d_LNonce[thr_id], sizeof(uint32_t)); + cudaMallocHost(&d_lnounce[thr_id], 1*sizeof(uint32_t)); +} + +__host__ +uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash,int order) +{ + uint32_t result = 0xffffffff; + cudaMemset(d_LNonce[thr_id], 0xffffffff, sizeof(uint32_t)); + const int threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + qubit_luffa512_gpu_finalhash_80<<>>(threads, startNounce, d_outputHash, d_LNonce[thr_id]); + MyStreamSynchronize(NULL, order, thr_id); + cudaMemcpy(d_lnounce[thr_id], d_LNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + //cudaThreadSynchronize(); + result = *d_lnounce[thr_id]; + return result; +} + +__host__ +void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash,int order) +{ + const int threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + size_t shared_size = 0; + + qubit_luffa512_gpu_hash_80<<>>(threads, startNounce, d_outputHash); + MyStreamSynchronize(NULL, order, thr_id); +} + +__host__ +void qubit_luffa512_cpu_setBlock_80(void *pdata) +{ + unsigned char PaddedMessage[128]; + memcpy(PaddedMessage, pdata, 80); + memset(PaddedMessage+80, 0, 48); + PaddedMessage[80] = 0x80; + PaddedMessage[111] = 1; + PaddedMessage[126] = 0x02; + PaddedMessage[127] = 0x80; + + cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + +__host__ +void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) +{ + unsigned char PaddedMessage[128]; + memcpy(PaddedMessage, pdata, 80); + memset(PaddedMessage+80, 0, 48); + PaddedMessage[80] = 0x80; + PaddedMessage[111] = 1; + PaddedMessage[126] = 0x02; + PaddedMessage[127] = 0x80; + cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + + cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} \ No newline at end of file diff --git a/util.c b/util.c index fe733c0..f8c8192 100644 --- a/util.c +++ b/util.c @@ -1440,6 +1440,14 @@ void print_hash_tests(void) blake256hash(&hash[0], &buf[0], 14); printpfx("blake", hash); + memset(hash, 0, sizeof hash); + deephash(&hash[0], &buf[0]); + printpfx("deep", hash); + + memset(hash, 0, sizeof hash); + doomhash(&hash[0], &buf[0]); + printpfx("doom", hash); + memset(hash, 0, sizeof hash); fresh_hash(&hash[0], &buf[0]); printpfx("fresh", hash); @@ -1476,6 +1484,10 @@ void print_hash_tests(void) quarkhash(&hash[0], &buf[0]); printpfx("quark", hash); + memset(hash, 0, sizeof hash); + qubithash(&hash[0], &buf[0]); + printpfx("qubit", hash); + memset(hash, 0, sizeof hash); wcoinhash(&hash[0], &buf[0]); printpfx("whirl", hash);