Add proper keccak-256 (maxcoin)

Cleaned from djm34 repo, tuned for the 750 Ti
10 years ago · 0720797f1b
10 changed files with 316 additions and 3 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -25,6 +25,7 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
 			  heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
 			  heavy/cuda_sha256.cu heavy/cuda_sha256.h \
 			  keccak/cuda_keccak256.cu keccak/keccak256.cu \
 			  fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
 			  groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
 			  myriadgroestl.cpp cuda_myriadgroestl.cu \
@ -66,6 +67,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
 blake32.o: blake32.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
 qubit/qubit_luffa512.o: qubit/qubit_luffa512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner release 1.4.5-tpruvot (Oct 1st 2014) - ""
+ccMiner release 1.4.5-tpruvot (Oct 1st 2014) - "Keccak 256"
 ---------------------------------------------------------------
 ***************************************************************
@ -35,6 +35,7 @@ TalkCoin
 DarkCoin and other X11 coins
 NEOS blake (256 14-rounds)
 BlakeCoin (256 8-rounds)
 Keccak (Maxcoin)
 Deep, Doom and Qubit
 Pentablake (Blake 512 x5)
@ -55,14 +56,15 @@ This code is based on the pooler cpuminer 2.3.2 release and inherits
 its command line interface and options.
  -a, --algo=ALGO       specify the algorithm to use
                          heavy       use to mine Heavycoin
                          mjollnir    use to mine Mjollnircoin
                          deep        use to mine Deepcoin
                          fugue256    use to mine Fuguecoin
                          groestl     use to mine Groestlcoin
                          dmd-gr      use to mine Diamond-Groestl
                          myr-gr      use to mine Myriad-Groest
                          heavy       use to mine Heavycoin
                          jackpot     use to mine Jackpotcoin
                          keccak      use to mine Maxcoin
                          luffa       use to mine Doomcoin
                          quark       use to mine Quarkcoin
                          qubit       use to mine Qubit Algo
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -383,6 +383,10 @@
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <FastMath>true</FastMath>
    </CudaCompile>
    <CudaCompile Include="keccak\cuda_keccak256.cu">
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">92</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="keccak\keccak256.cu" />
    <CudaCompile Include="pentablake.cu">
      <MaxRegCount>80</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -67,6 +67,9 @@
    <Filter Include="Source Files\jansson">
      <UniqueIdentifier>{17b56151-79ec-4a32-bac3-9d94ae7f68fe}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\keccak">
      <UniqueIdentifier>{9762c92c-9677-4044-8292-ff6ba4bfdd89}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="compat\jansson\dump.c">
@ -463,5 +466,11 @@
    <CudaCompile Include="pentablake.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="keccak\cuda_keccak256.cu">
      <Filter>Source Files\CUDA\keccak</Filter>
    </CudaCompile>
    <CudaCompile Include="keccak\keccak256.cu">
      <Filter>Source Files\CUDA\keccak</Filter>
    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -139,6 +139,7 @@ typedef enum {
 	ALGO_FUGUE256,		/* Fugue256 */
 	ALGO_GROESTL,
 	ALGO_HEAVY,		/* Heavycoin hash */
 	ALGO_KECCAK,
 	ALGO_JACKPOT,
 	ALGO_LUFFA_DOOM,
 	ALGO_MJOLLNIR,		/* Mjollnir hash */
@ -166,6 +167,7 @@ static const char *algo_names[] = {
 	"fugue256",
 	"groestl",
 	"heavy",
 	"keccak",
 	"jackpot",
 	"luffa",
 	"mjollnir",
@ -253,6 +255,7 @@ Options:\n\
                        fugue256  Fuguecoin hash\n\
                        groestl   Groestlcoin hash\n\
                        heavy     Heavycoin hash\n\
                        keccak    Keccak-256 (Maxcoin) hash\n\
                        jackpot   Jackpot hash\n\
                        luffa     Doomcoin hash\n\
                        mjollnir  Mjollnircoin hash\n\
@ -860,6 +863,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 			break;
 		case ALGO_FUGUE256:
 		case ALGO_GROESTL:
 		case ALGO_KECCAK:
 		case ALGO_BLAKECOIN:
 		case ALGO_WHC:
 			SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root);
@ -925,6 +929,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
 	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
 	else if (opt_algo == ALGO_KECCAK)
 		diff_to_target(work->target, sctx->job.diff / (128.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
 }
@ -1147,6 +1153,11 @@ continue_scan:
 			                      max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ);
 			break;
 		case ALGO_KECCAK:
 			rc = scanhash_keccak256(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_MJOLLNIR:
 			rc = scanhash_heavy(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ);
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -165,7 +165,7 @@
 #define PACKAGE_URL ""
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.4"
+#define PACKAGE_VERSION "2014.09.28"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/keccak/cuda_keccak256.cu
+++ b/keccak/cuda_keccak256.cu
@ -0,0 +1,171 @@
 #include "miner.h"
 extern "C" {
 #include <stdint.h>
 #include <memory.h>
 }
 #include "cuda_helper.h"
 static const uint64_t host_keccak_round_constants[24] = {
 	0x0000000000000001ull, 0x0000000000008082ull,
 	0x800000000000808aull, 0x8000000080008000ull,
 	0x000000000000808bull, 0x0000000080000001ull,
 	0x8000000080008081ull, 0x8000000000008009ull,
 	0x000000000000008aull, 0x0000000000000088ull,
 	0x0000000080008009ull, 0x000000008000000aull,
 	0x000000008000808bull, 0x800000000000008bull,
 	0x8000000000008089ull, 0x8000000000008003ull,
 	0x8000000000008002ull, 0x8000000000000080ull,
 	0x000000000000800aull, 0x800000008000000aull,
 	0x8000000080008081ull, 0x8000000000008080ull,
 	0x0000000080000001ull, 0x8000000080008008ull
 };
 uint32_t *d_nounce[8];
 uint32_t *d_KNonce[8];
 __constant__ uint32_t pTarget[8];
 __constant__ uint64_t keccak_round_constants[24];
 __constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding)
 static __device__ __forceinline__
 void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) {
 	size_t i;
 	uint64_t t[5], u[5], v, w;
 	/* absorb input */
 	for (i = 0; i < 24; i++) {
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
 		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
 		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
 		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		u[0] = t[4] ^ ROTL64(t[1], 1);
 		u[1] = t[0] ^ ROTL64(t[2], 1);
 		u[2] = t[1] ^ ROTL64(t[3], 1);
 		u[3] = t[2] ^ ROTL64(t[4], 1);
 		u[4] = t[3] ^ ROTL64(t[0], 1);
 		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
 		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
 		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
 		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
 		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
 		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
 		/* rho pi: b[..] = rotl(a[..], ..) */
 		v = s[ 1];
 		s[ 1] = ROTL64(s[ 6], 44);
 		s[ 6] = ROTL64(s[ 9], 20);
 		s[ 9] = ROTL64(s[22], 61);
 		s[22] = ROTL64(s[14], 39);
 		s[14] = ROTL64(s[20], 18);
 		s[20] = ROTL64(s[ 2], 62);
 		s[ 2] = ROTL64(s[12], 43);
 		s[12] = ROTL64(s[13], 25);
 		s[13] = ROTL64(s[19],  8);
 		s[19] = ROTL64(s[23], 56);
 		s[23] = ROTL64(s[15], 41);
 		s[15] = ROTL64(s[ 4], 27);
 		s[ 4] = ROTL64(s[24], 14);
 		s[24] = ROTL64(s[21],  2);
 		s[21] = ROTL64(s[ 8], 55);
 		s[ 8] = ROTL64(s[16], 45);
 		s[16] = ROTL64(s[ 5], 36);
 		s[ 5] = ROTL64(s[ 3], 28);
 		s[ 3] = ROTL64(s[18], 21);
 		s[18] = ROTL64(s[17], 15);
 		s[17] = ROTL64(s[11], 10);
 		s[11] = ROTL64(s[ 7],  6);
 		s[ 7] = ROTL64(s[10],  3);
 		s[10] = ROTL64(    v,  1);
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
 		v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
 		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
 		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
 		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
 		/* iota: a[0,0] ^= round constant */
 		s[0] ^= keccak_round_constants[i];
 	}
 }
 __global__
 void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
 		uint64_t keccak_gpu_state[25];
 		//#pragma unroll 25
 		for (int i=0; i<25; i++) {
 			if(i<9) {keccak_gpu_state[i] = c_PaddedMessage80[i];}
 			else    {keccak_gpu_state[i] = 0;}
 		}
 		keccak_gpu_state[9]=REPLACE_HIWORD(c_PaddedMessage80[9],cuda_swab32(nounce));
 		keccak_gpu_state[10]=0x0000000000000001;
 		keccak_gpu_state[16]=0x8000000000000000;
 		keccak_block(keccak_gpu_state,keccak_round_constants);
 		bool rc = false;
 		if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;}
 		if (rc == true) {
 			if(resNounce[0] > nounce)
 				resNounce[0] = nounce;
 		}
 	} //thread
 }
 void keccak256_cpu_init(int thr_id, int threads)
 {
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants,
 						host_keccak_round_constants,
 						sizeof(host_keccak_round_constants),
 						0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
 }
 __host__
 uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
 	const int threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	size_t shared_size = 0;
 	keccak256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]);
 	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	cudaThreadSynchronize();
 	result = *d_nounce[thr_id];
 	return result;
 }
 __host__
 void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
 {
 	unsigned char PaddedMessage[80];
 	memcpy(PaddedMessage, pdata, 80);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 }
--- a/keccak/keccak256.cu
+++ b/keccak/keccak256.cu
@ -0,0 +1,103 @@
 /*
 * Keccak 256
 *
 */
 extern "C"
 {
 #include "sph/sph_shavite.h"
 #include "sph/sph_simd.h"
 #include "sph/sph_keccak.h"
 #include "miner.h"
 }
 #include "cuda_helper.h"
 // in cpu-miner.c
 extern int device_map[8];
 static uint32_t *d_hash[8];
 extern void keccak256_cpu_init(int thr_id, int threads);
 extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
 extern uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
 // CPU Hash
 extern "C" void keccak256_hash(void *state, const void *input)
 {
 	sph_keccak_context ctx_keccak;
 	uint32_t hash[16];
 	sph_keccak256_init(&ctx_keccak);
 	sph_keccak256 (&ctx_keccak, input, 80);
 	sph_keccak256_close(&ctx_keccak, (void*) hash);
 	memcpy(state, hash, 32);
 }
 extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x000f;
 	const uint32_t Htarg = ptarget[7];
 	const int throughput = 256*256*8*8;
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
 		keccak256_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	uint32_t endiandata[20];
 	for (int k=0; k < 20; k++) {
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 	}
 	keccak256_setBlock_80((void*)endiandata, ptarget);
 	do {
 		int order = 0;
 		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			keccak256_hash(vhash64, endiandata);
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				pdata[19] = foundNonce;
 				*hashes_done = foundNonce - first_nonce + 1;
 				return 1;
 			} else {
 				applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
 		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] += throughput;
 	} while (!work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
--- a/miner.h
+++ b/miner.h
@ -251,6 +251,10 @@ extern int scanhash_heavy(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done, uint32_t maxvote, int blocklen);
 extern int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_myriad(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -471,6 +475,7 @@ void doomhash(void *state, const void *input);
 void fresh_hash(void *state, const void *input);
 void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void keccak256_hash(void *state, const void *input);
 unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
--- a/util.c
+++ b/util.c
@ -1502,6 +1502,10 @@ void print_hash_tests(void)
 	heavycoin_hash(&hash[0], &buf[0], 32);
 	printpfx("heavy", hash);
 	memset(hash, 0, sizeof hash);
 	keccak256_hash(&hash[0], &buf[0]);
 	printpfx("keccak", hash);
 	memset(hash, 0, sizeof hash);
 	jackpothash(&hash[0], &buf[0]);
 	printpfx("jackpot", hash);