Add proper keccak-256 (maxcoin)

Cleaned from djm34 repo, tuned for the 750 Ti
2014-10-17 06:22:16 +02:00 · 2014-10-17 06:22:16 +02:00 · 0720797f1b
commit 0720797f1b
parent cdc29336f7
10 changed files with 316 additions and 3 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -25,6 +25,7 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
 			  heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
 			  heavy/cuda_sha256.cu heavy/cuda_sha256.h \
+			  keccak/cuda_keccak256.cu keccak/keccak256.cu \
 			  fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
 			  groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
 			  myriadgroestl.cpp cuda_myriadgroestl.cu \
@ -66,6 +67,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
 blake32.o: blake32.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<

+keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
+
 qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<

--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@

-ccMiner release 1.4.5-tpruvot (Oct 1st 2014) - ""
+ccMiner release 1.4.5-tpruvot (Oct 1st 2014) - "Keccak 256"
 ---------------------------------------------------------------

 ***************************************************************
@ -35,6 +35,7 @@ TalkCoin
 DarkCoin and other X11 coins
 NEOS blake (256 14-rounds)
 BlakeCoin (256 8-rounds)
+Keccak (Maxcoin)
 Deep, Doom and Qubit
 Pentablake (Blake 512 x5)

@ -55,14 +56,15 @@ This code is based on the pooler cpuminer 2.3.2 release and inherits
 its command line interface and options.

  -a, --algo=ALGO       specify the algorithm to use
-                          heavy       use to mine Heavycoin
                          mjollnir    use to mine Mjollnircoin
                          deep        use to mine Deepcoin
                          fugue256    use to mine Fuguecoin
                          groestl     use to mine Groestlcoin
                          dmd-gr      use to mine Diamond-Groestl
                          myr-gr      use to mine Myriad-Groest
+                          heavy       use to mine Heavycoin
                          jackpot     use to mine Jackpotcoin
+                          keccak      use to mine Maxcoin
                          luffa       use to mine Doomcoin
                          quark       use to mine Quarkcoin
                          qubit       use to mine Qubit Algo
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -383,6 +383,10 @@
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <FastMath>true</FastMath>
    </CudaCompile>
+    <CudaCompile Include="keccak\cuda_keccak256.cu">
+      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">92</MaxRegCount>
+    </CudaCompile>
+    <CudaCompile Include="keccak\keccak256.cu" />
    <CudaCompile Include="pentablake.cu">
      <MaxRegCount>80</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -67,6 +67,9 @@
    <Filter Include="Source Files\jansson">
      <UniqueIdentifier>{17b56151-79ec-4a32-bac3-9d94ae7f68fe}</UniqueIdentifier>
    </Filter>
+    <Filter Include="Source Files\CUDA\keccak">
+      <UniqueIdentifier>{9762c92c-9677-4044-8292-ff6ba4bfdd89}</UniqueIdentifier>
+    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="compat\jansson\dump.c">
@ -463,5 +466,11 @@
    <CudaCompile Include="pentablake.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
+    <CudaCompile Include="keccak\cuda_keccak256.cu">
+      <Filter>Source Files\CUDA\keccak</Filter>
+    </CudaCompile>
+    <CudaCompile Include="keccak\keccak256.cu">
+      <Filter>Source Files\CUDA\keccak</Filter>
+    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -139,6 +139,7 @@ typedef enum {
 	ALGO_FUGUE256,		/* Fugue256 */
 	ALGO_GROESTL,
 	ALGO_HEAVY,		/* Heavycoin hash */
+	ALGO_KECCAK,
 	ALGO_JACKPOT,
 	ALGO_LUFFA_DOOM,
 	ALGO_MJOLLNIR,		/* Mjollnir hash */
@ -166,6 +167,7 @@ static const char *algo_names[] = {
 	"fugue256",
 	"groestl",
 	"heavy",
+	"keccak",
 	"jackpot",
 	"luffa",
 	"mjollnir",
@ -253,6 +255,7 @@ Options:\n\
                        fugue256  Fuguecoin hash\n\
                        groestl   Groestlcoin hash\n\
                        heavy     Heavycoin hash\n\
+                        keccak    Keccak-256 (Maxcoin) hash\n\
                        jackpot   Jackpot hash\n\
                        luffa     Doomcoin hash\n\
                        mjollnir  Mjollnircoin hash\n\
@ -860,6 +863,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 			break;
 		case ALGO_FUGUE256:
 		case ALGO_GROESTL:
+		case ALGO_KECCAK:
 		case ALGO_BLAKECOIN:
 		case ALGO_WHC:
 			SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root);
@ -925,6 +929,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
 	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
+	else if (opt_algo == ALGO_KECCAK)
+		diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
 }
@ -1147,6 +1153,11 @@ continue_scan:
 			                      max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ);
 			break;

+		case ALGO_KECCAK:
+			rc = scanhash_keccak256(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done);
+			break;
+
 		case ALGO_MJOLLNIR:
 			rc = scanhash_heavy(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ);
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -165,7 +165,7 @@
 #define PACKAGE_URL ""

 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.4"
+#define PACKAGE_VERSION "2014.09.28"

 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/keccak/cuda_keccak256.cu
+++ b/keccak/cuda_keccak256.cu
@ -0,0 +1,171 @@
+#include "miner.h"
+
+extern "C" {
+#include <stdint.h>
+#include <memory.h>
+}
+
+#include "cuda_helper.h"
+
+static const uint64_t host_keccak_round_constants[24] = {
+	0x0000000000000001ull, 0x0000000000008082ull,
+	0x800000000000808aull, 0x8000000080008000ull,
+	0x000000000000808bull, 0x0000000080000001ull,
+	0x8000000080008081ull, 0x8000000000008009ull,
+	0x000000000000008aull, 0x0000000000000088ull,
+	0x0000000080008009ull, 0x000000008000000aull,
+	0x000000008000808bull, 0x800000000000008bull,
+	0x8000000000008089ull, 0x8000000000008003ull,
+	0x8000000000008002ull, 0x8000000000000080ull,
+	0x000000000000800aull, 0x800000008000000aull,
+	0x8000000080008081ull, 0x8000000000008080ull,
+	0x0000000080000001ull, 0x8000000080008008ull
+};
+
+uint32_t *d_nounce[8];
+uint32_t *d_KNonce[8];
+
+__constant__ uint32_t pTarget[8];
+__constant__ uint64_t keccak_round_constants[24];
+__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding)
+
+
+static __device__ __forceinline__
+void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) {
+	size_t i;
+	uint64_t t[5], u[5], v, w;
+
+	/* absorb input */
+
+	for (i = 0; i < 24; i++) {
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		u[0] = t[4] ^ ROTL64(t[1], 1);
+		u[1] = t[0] ^ ROTL64(t[2], 1);
+		u[2] = t[1] ^ ROTL64(t[3], 1);
+		u[3] = t[2] ^ ROTL64(t[4], 1);
+		u[4] = t[3] ^ ROTL64(t[0], 1);
+
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
+		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
+		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
+		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
+		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		v = s[ 1];
+		s[ 1] = ROTL64(s[ 6], 44);
+		s[ 6] = ROTL64(s[ 9], 20);
+		s[ 9] = ROTL64(s[22], 61);
+		s[22] = ROTL64(s[14], 39);
+		s[14] = ROTL64(s[20], 18);
+		s[20] = ROTL64(s[ 2], 62);
+		s[ 2] = ROTL64(s[12], 43);
+		s[12] = ROTL64(s[13], 25);
+		s[13] = ROTL64(s[19],  8);
+		s[19] = ROTL64(s[23], 56);
+		s[23] = ROTL64(s[15], 41);
+		s[15] = ROTL64(s[ 4], 27);
+		s[ 4] = ROTL64(s[24], 14);
+		s[24] = ROTL64(s[21],  2);
+		s[21] = ROTL64(s[ 8], 55);
+		s[ 8] = ROTL64(s[16], 45);
+		s[16] = ROTL64(s[ 5], 36);
+		s[ 5] = ROTL64(s[ 3], 28);
+		s[ 3] = ROTL64(s[18], 21);
+		s[18] = ROTL64(s[17], 15);
+		s[17] = ROTL64(s[11], 10);
+		s[11] = ROTL64(s[ 7],  6);
+		s[ 7] = ROTL64(s[10],  3);
+		s[10] = ROTL64(    v,  1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+		v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
+		v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
+		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
+		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
+		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= keccak_round_constants[i];
+	}
+}
+
+__global__
+void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = startNounce + thread;
+		uint64_t keccak_gpu_state[25];
+
+		//#pragma unroll 25
+		for (int i=0; i<25; i++) {
+			if(i<9) {keccak_gpu_state[i] = c_PaddedMessage80[i];}
+			else    {keccak_gpu_state[i] = 0;}
+		}
+		keccak_gpu_state[9]=REPLACE_HIWORD(c_PaddedMessage80[9],cuda_swab32(nounce));
+		keccak_gpu_state[10]=0x0000000000000001;
+		keccak_gpu_state[16]=0x8000000000000000;
+
+		keccak_block(keccak_gpu_state,keccak_round_constants);
+
+		bool rc = false;
+		if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;}
+
+		if (rc == true) {
+			if(resNounce[0] > nounce)
+				resNounce[0] = nounce;
+		}
+	} //thread
+}
+
+void keccak256_cpu_init(int thr_id, int threads)
+{
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants,
+						host_keccak_round_constants,
+						sizeof(host_keccak_round_constants),
+						0, cudaMemcpyHostToDevice));
+
+	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
+}
+
+__host__
+uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	uint32_t result = 0xffffffff;
+	cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
+	const int threadsperblock = 128;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	size_t shared_size = 0;
+
+	keccak256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]);
+
+	MyStreamSynchronize(NULL, order, thr_id);
+	cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+	result = *d_nounce[thr_id];
+
+	return result;
+}
+
+__host__
+void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
+{
+	unsigned char PaddedMessage[80];
+	memcpy(PaddedMessage, pdata, 80);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
+}
--- a/keccak/keccak256.cu
+++ b/keccak/keccak256.cu
@ -0,0 +1,103 @@
+/*
+ * Keccak 256
+ *
+ */
+
+extern "C"
+{
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_keccak.h"
+
+#include "miner.h"
+}
+
+#include "cuda_helper.h"
+
+// in cpu-miner.c
+extern int device_map[8];
+
+static uint32_t *d_hash[8];
+
+extern void keccak256_cpu_init(int thr_id, int threads);
+extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
+extern uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
+
+// CPU Hash
+extern "C" void keccak256_hash(void *state, const void *input)
+{
+	sph_keccak_context ctx_keccak;
+
+	uint32_t hash[16];
+
+	sph_keccak256_init(&ctx_keccak);
+	sph_keccak256 (&ctx_keccak, input, 80);
+	sph_keccak256_close(&ctx_keccak, (void*) hash);
+
+	memcpy(state, hash, 32);
+}
+
+extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done)
+{
+	const uint32_t first_nonce = pdata[19];
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x000f;
+
+	const uint32_t Htarg = ptarget[7];
+
+	const int throughput = 256*256*8*8;
+
+	static bool init[8] = {0,0,0,0,0,0,0,0};
+	if (!init[thr_id]) {
+		cudaSetDevice(device_map[thr_id]);
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		keccak256_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	uint32_t endiandata[20];
+	for (int k=0; k < 20; k++) {
+		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+	}
+
+	keccak256_setBlock_80((void*)endiandata, ptarget);
+	do {
+		int order = 0;
+
+		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		if  (foundNonce != 0xffffffff)
+		{
+
+			uint32_t vhash64[8];
+			be32enc(&endiandata[19], foundNonce);
+
+			keccak256_hash(vhash64, endiandata);
+
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+
+				pdata[19] = foundNonce;
+				*hashes_done = foundNonce - first_nonce + 1;
+				return 1;
+
+			} else {
+				applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+
+		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}
--- a/miner.h
+++ b/miner.h
@ -251,6 +251,10 @@ extern int scanhash_heavy(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done, uint32_t maxvote, int blocklen);

+extern int scanhash_keccak256(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 extern int scanhash_myriad(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -471,6 +475,7 @@ void doomhash(void *state, const void *input);
 void fresh_hash(void *state, const void *input);
 void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
+void keccak256_hash(void *state, const void *input);
 unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
--- a/util.c
+++ b/util.c
@ -1502,6 +1502,10 @@ void print_hash_tests(void)
 	heavycoin_hash(&hash[0], &buf[0], 32);
 	printpfx("heavy", hash);

+	memset(hash, 0, sizeof hash);
+	keccak256_hash(&hash[0], &buf[0]);
+	printpfx("keccak", hash);
+
 	memset(hash, 0, sizeof hash);
 	jackpothash(&hash[0], &buf[0]);
 	printpfx("jackpot", hash);