From 0720797f1b165f108cbfd99b90b54492a0979977 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 17 Oct 2014 06:22:16 +0200 Subject: [PATCH] Add proper keccak-256 (maxcoin) Cleaned from djm34 repo, tuned for the 750 Ti --- Makefile.am | 4 + README.txt | 6 +- ccminer.vcxproj | 4 + ccminer.vcxproj.filters | 9 +++ cpu-miner.c | 11 +++ cpuminer-config.h | 2 +- keccak/cuda_keccak256.cu | 171 +++++++++++++++++++++++++++++++++++++++ keccak/keccak256.cu | 103 +++++++++++++++++++++++ miner.h | 5 ++ util.c | 4 + 10 files changed, 316 insertions(+), 3 deletions(-) create mode 100644 keccak/cuda_keccak256.cu create mode 100644 keccak/keccak256.cu diff --git a/Makefile.am b/Makefile.am index 72a7bec..431c260 100644 --- a/Makefile.am +++ b/Makefile.am @@ -25,6 +25,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \ heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \ heavy/cuda_sha256.cu heavy/cuda_sha256.h \ + keccak/cuda_keccak256.cu keccak/keccak256.cu \ fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ myriadgroestl.cpp cuda_myriadgroestl.cu \ @@ -66,6 +67,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v" blake32.o: blake32.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< +keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $< + qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< diff --git a/README.txt b/README.txt index 14bd6ab..203bba9 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.4.5-tpruvot (Oct 1st 2014) - "" +ccMiner release 1.4.5-tpruvot (Oct 1st 2014) - "Keccak 256" --------------------------------------------------------------- *************************************************************** @@ -35,6 +35,7 @@ TalkCoin DarkCoin and other X11 coins NEOS blake (256 14-rounds) BlakeCoin (256 8-rounds) +Keccak (Maxcoin) Deep, Doom and Qubit Pentablake (Blake 512 x5) @@ -55,14 +56,15 @@ This code is based on the pooler cpuminer 2.3.2 release and inherits its command line interface and options. -a, --algo=ALGO specify the algorithm to use - heavy use to mine Heavycoin mjollnir use to mine Mjollnircoin deep use to mine Deepcoin fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin dmd-gr use to mine Diamond-Groestl myr-gr use to mine Myriad-Groest + heavy use to mine Heavycoin jackpot use to mine Jackpotcoin + keccak use to mine Maxcoin luffa use to mine Doomcoin quark use to mine Quarkcoin qubit use to mine Qubit Algo diff --git a/ccminer.vcxproj b/ccminer.vcxproj index cb3a710..dcc26f9 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -383,6 +383,10 @@ %(AdditionalOptions) true + + 92 + + 80 --ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 5161ec5..6801915 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -67,6 +67,9 @@ {17b56151-79ec-4a32-bac3-9d94ae7f68fe} + + {9762c92c-9677-4044-8292-ff6ba4bfdd89} + @@ -463,5 +466,11 @@ Source Files\CUDA + + Source Files\CUDA\keccak + + + Source Files\CUDA\keccak + \ No newline at end of file diff --git a/cpu-miner.c b/cpu-miner.c index 23196a1..60f2c8c 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -139,6 +139,7 @@ typedef enum { ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, ALGO_HEAVY, /* Heavycoin hash */ + ALGO_KECCAK, ALGO_JACKPOT, ALGO_LUFFA_DOOM, ALGO_MJOLLNIR, /* Mjollnir hash */ @@ -166,6 +167,7 @@ static const char *algo_names[] = { "fugue256", "groestl", "heavy", + "keccak", "jackpot", "luffa", "mjollnir", @@ -253,6 +255,7 @@ Options:\n\ fugue256 Fuguecoin hash\n\ groestl Groestlcoin hash\n\ heavy Heavycoin hash\n\ + keccak Keccak-256 (Maxcoin) hash\n\ jackpot Jackpot hash\n\ luffa Doomcoin hash\n\ mjollnir Mjollnircoin hash\n\ @@ -860,6 +863,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) break; case ALGO_FUGUE256: case ALGO_GROESTL: + case ALGO_KECCAK: case ALGO_BLAKECOIN: case ALGO_WHC: SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root); @@ -925,6 +929,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH) diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); + else if (opt_algo == ALGO_KECCAK) + diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty)); else diff_to_target(work->target, sctx->job.diff / opt_difficulty); } @@ -1147,6 +1153,11 @@ continue_scan: max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); break; + case ALGO_KECCAK: + rc = scanhash_keccak256(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_MJOLLNIR: rc = scanhash_heavy(thr_id, work.data, work.target, max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); diff --git a/cpuminer-config.h b/cpuminer-config.h index 6979b65..0602c17 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -165,7 +165,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.4" +#define PACKAGE_VERSION "2014.09.28" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/keccak/cuda_keccak256.cu b/keccak/cuda_keccak256.cu new file mode 100644 index 0000000..2b9315e --- /dev/null +++ b/keccak/cuda_keccak256.cu @@ -0,0 +1,171 @@ +#include "miner.h" + +extern "C" { +#include +#include +} + +#include "cuda_helper.h" + +static const uint64_t host_keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +uint32_t *d_nounce[8]; +uint32_t *d_KNonce[8]; + +__constant__ uint32_t pTarget[8]; +__constant__ uint64_t keccak_round_constants[24]; +__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding) + + +static __device__ __forceinline__ +void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) { + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +__global__ +void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + uint64_t keccak_gpu_state[25]; + + //#pragma unroll 25 + for (int i=0; i<25; i++) { + if(i<9) {keccak_gpu_state[i] = c_PaddedMessage80[i];} + else {keccak_gpu_state[i] = 0;} + } + keccak_gpu_state[9]=REPLACE_HIWORD(c_PaddedMessage80[9],cuda_swab32(nounce)); + keccak_gpu_state[10]=0x0000000000000001; + keccak_gpu_state[16]=0x8000000000000000; + + keccak_block(keccak_gpu_state,keccak_round_constants); + + bool rc = false; + if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;} + + if (rc == true) { + if(resNounce[0] > nounce) + resNounce[0] = nounce; + } + } //thread +} + +void keccak256_cpu_init(int thr_id, int threads) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, + host_keccak_round_constants, + sizeof(host_keccak_round_constants), + 0, cudaMemcpyHostToDevice)); + + CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t))); +} + +__host__ +uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +{ + uint32_t result = 0xffffffff; + cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t)); + const int threadsperblock = 128; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + keccak256_gpu_hash_80<<>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]); + + MyStreamSynchronize(NULL, order, thr_id); + cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaThreadSynchronize(); + result = *d_nounce[thr_id]; + + return result; +} + +__host__ +void keccak256_setBlock_80(void *pdata,const void *pTargetIn) +{ + unsigned char PaddedMessage[80]; + memcpy(PaddedMessage, pdata, 80); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); +} \ No newline at end of file diff --git a/keccak/keccak256.cu b/keccak/keccak256.cu new file mode 100644 index 0000000..52108d0 --- /dev/null +++ b/keccak/keccak256.cu @@ -0,0 +1,103 @@ +/* + * Keccak 256 + * + */ + +extern "C" +{ +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_keccak.h" + +#include "miner.h" +} + +#include "cuda_helper.h" + +// in cpu-miner.c +extern int device_map[8]; + +static uint32_t *d_hash[8]; + +extern void keccak256_cpu_init(int thr_id, int threads); +extern void keccak256_setBlock_80(void *pdata,const void *ptarget); +extern uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); + +// CPU Hash +extern "C" void keccak256_hash(void *state, const void *input) +{ + sph_keccak_context ctx_keccak; + + uint32_t hash[16]; + + sph_keccak256_init(&ctx_keccak); + sph_keccak256 (&ctx_keccak, input, 80); + sph_keccak256_close(&ctx_keccak, (void*) hash); + + memcpy(state, hash, 32); +} + +extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000f; + + const uint32_t Htarg = ptarget[7]; + + const int throughput = 256*256*8*8; + + static bool init[8] = {0,0,0,0,0,0,0,0}; + if (!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + keccak256_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) { + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + } + + keccak256_setBlock_80((void*)endiandata, ptarget); + do { + int order = 0; + + uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + + uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); + + keccak256_hash(vhash64, endiandata); + + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + + pdata[19] = foundNonce; + *hashes_done = foundNonce - first_nonce + 1; + return 1; + + } else { + applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); + } + } + + if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} diff --git a/miner.h b/miner.h index 62cff79..8a75322 100644 --- a/miner.h +++ b/miner.h @@ -251,6 +251,10 @@ extern int scanhash_heavy(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen); +extern int scanhash_keccak256(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + extern int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -471,6 +475,7 @@ void doomhash(void *state, const void *input); void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); +void keccak256_hash(void *state, const void *input); unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); void myriadhash(void *state, const void *input); diff --git a/util.c b/util.c index 4586ddd..639baa7 100644 --- a/util.c +++ b/util.c @@ -1502,6 +1502,10 @@ void print_hash_tests(void) heavycoin_hash(&hash[0], &buf[0], 32); printpfx("heavy", hash); + memset(hash, 0, sizeof hash); + keccak256_hash(&hash[0], &buf[0]); + printpfx("keccak", hash); + memset(hash, 0, sizeof hash); jackpothash(&hash[0], &buf[0]); printpfx("jackpot", hash);