ccminer/pluck/pluck.cu

/* Based on djm code */

extern "C" {
#include "miner.h"
}

#include <stdint.h>

static uint32_t *d_hash[MAX_GPUS] ;

extern void pluck_setBlockTarget(const void* data, const void *ptarget);
extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash);
extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order);

extern float tp_coef[MAX_GPUS];

#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
//note, this is 64 bytes
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
	uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15;
	int i;

	x00 = (B[0] ^= Bx[0]);
	x01 = (B[1] ^= Bx[1]);
	x02 = (B[2] ^= Bx[2]);
	x03 = (B[3] ^= Bx[3]);
	x04 = (B[4] ^= Bx[4]);
	x05 = (B[5] ^= Bx[5]);
	x06 = (B[6] ^= Bx[6]);
	x07 = (B[7] ^= Bx[7]);
	x08 = (B[8] ^= Bx[8]);
	x09 = (B[9] ^= Bx[9]);
	x10 = (B[10] ^= Bx[10]);
	x11 = (B[11] ^= Bx[11]);
	x12 = (B[12] ^= Bx[12]);
	x13 = (B[13] ^= Bx[13]);
	x14 = (B[14] ^= Bx[14]);
	x15 = (B[15] ^= Bx[15]);
	for (i = 0; i < 8; i += 2) {
		/* Operate on columns. */
		x04 ^= ROTL(x00 + x12, 7);  x09 ^= ROTL(x05 + x01, 7);
		x14 ^= ROTL(x10 + x06, 7);  x03 ^= ROTL(x15 + x11, 7);

		x08 ^= ROTL(x04 + x00, 9);  x13 ^= ROTL(x09 + x05, 9);
		x02 ^= ROTL(x14 + x10, 9);  x07 ^= ROTL(x03 + x15, 9);

		x12 ^= ROTL(x08 + x04, 13);  x01 ^= ROTL(x13 + x09, 13);
		x06 ^= ROTL(x02 + x14, 13);  x11 ^= ROTL(x07 + x03, 13);

		x00 ^= ROTL(x12 + x08, 18);  x05 ^= ROTL(x01 + x13, 18);
		x10 ^= ROTL(x06 + x02, 18);  x15 ^= ROTL(x11 + x07, 18);

		/* Operate on rows. */
		x01 ^= ROTL(x00 + x03, 7);  x06 ^= ROTL(x05 + x04, 7);
		x11 ^= ROTL(x10 + x09, 7);  x12 ^= ROTL(x15 + x14, 7);

		x02 ^= ROTL(x01 + x00, 9);  x07 ^= ROTL(x06 + x05, 9);
		x08 ^= ROTL(x11 + x10, 9);  x13 ^= ROTL(x12 + x15, 9);

		x03 ^= ROTL(x02 + x01, 13);  x04 ^= ROTL(x07 + x06, 13);
		x09 ^= ROTL(x08 + x11, 13);  x14 ^= ROTL(x13 + x12, 13);

		x00 ^= ROTL(x03 + x02, 18);  x05 ^= ROTL(x04 + x07, 18);
		x10 ^= ROTL(x09 + x08, 18);  x15 ^= ROTL(x14 + x13, 18);
	}
	B[0] += x00;
	B[1] += x01;
	B[2] += x02;
	B[3] += x03;
	B[4] += x04;
	B[5] += x05;
	B[6] += x06;
	B[7] += x07;
	B[8] += x08;
	B[9] += x09;
	B[10] += x10;
	B[11] += x11;
	B[12] += x12;
	B[13] += x13;
	B[14] += x14;
	B[15] += x15;
#undef ROTL
}

static void sha256_hash(unsigned char *hash, const unsigned char *data, int len)
{
	uint32_t S[16], T[16];
	int i, r;

	sha256_init(S);
	for (r = len; r > -9; r -= 64) {
		if (r < 64)
			memset(T, 0, 64);
		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
		if (r >= 0 && r < 64)
			((unsigned char *)T)[r] = 0x80;
		for (i = 0; i < 16; i++)
			T[i] = be32dec(T + i);

		if (r < 56)
			T[15] = 8 * len;
		sha256_transform(S, T, 0);
	}
	for (i = 0; i < 8; i++)
		be32enc((uint32_t *)hash + i, S[i]);
}

static void sha256_hash512(unsigned char *hash, const unsigned char *data)
{
	uint32_t S[16], T[16];
	int i;

	sha256_init(S);

	memcpy(T, data, 64);
	for (i = 0; i < 16; i++)
		T[i] = be32dec(T + i);
	sha256_transform(S, T, 0);

	memset(T, 0, 64);
	//memcpy(T, data + 64, 0);
	((unsigned char *)T)[0] = 0x80;
	for (i = 0; i < 16; i++)
		T[i] = be32dec(T + i);
	T[15] = 8 * 64;
	sha256_transform(S, T, 0);

	for (i = 0; i < 8; i++)
		be32enc((uint32_t *)hash + i, S[i]);
}

void pluckhash(uint32_t *hash, uint32_t *input)
{

	uint32_t data[20];
	//uint32_t midstate[8];

	const int HASH_MEMORY = 128 * 1024;
	uint8_t * scratchbuf = (uint8_t*)malloc(HASH_MEMORY);

	for (int k = 0; k<20; k++) { data[k] = input[k]; }

		uint8_t *hashbuffer = scratchbuf; //don't allocate this on stack, since it's huge..
		int size = HASH_MEMORY;
		memset(hashbuffer, 0, 64);

		sha256_hash(&hashbuffer[0], (uint8_t*)data, 80);
		for (int i = 64; i < size - 32; i += 32)
		{
			//i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area
			int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases
			uint32_t joint[16];
			uint32_t randbuffer[16];

			uint32_t randseed[16];
			memcpy(randseed, &hashbuffer[i - 64], 64);
			if (i>128)
			{
				memcpy(randbuffer, &hashbuffer[i - 128], 64);
			}
			else
			{
				memset(&randbuffer, 0, 64);
			}

			xor_salsa8(randbuffer, randseed);

			memcpy(joint, &hashbuffer[i - 32], 32);
			//use the last hash value as the seed
			for (int j = 32; j < 64; j += 4)
			{
				uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); //randmax - 32 as otherwise we go beyond memory that's already been written to
				joint[j / 4] = *((uint32_t*)&hashbuffer[rand]);
			}
			sha256_hash512(&hashbuffer[i], (uint8_t*)joint);
//			for (int k = 0; k<8; k++) { printf("sha hashbuffer %d %08x\n", k, ((uint32_t*)(hashbuffer+i))[k]); }
			memcpy(randseed, &hashbuffer[i - 32], 64); //use last hash value and previous hash value(post-mixing)
			if (i>128)
			{
				memcpy(randbuffer, &hashbuffer[i - 128], 64);
			}
			else
			{
				memset(randbuffer, 0, 64);
			}
			xor_salsa8(randbuffer, randseed);
			for (int j = 0; j < 32; j += 2)
			{
				uint32_t rand = randbuffer[j / 2] % randmax;
				*((uint32_t*)&hashbuffer[rand]) = *((uint32_t*)&hashbuffer[j + i - 4]);
			}
		}

//		for (int k = 0; k<8; k++) { printf("cpu final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }

		//note: off-by-one error is likely here...
/*
		for (int i = size - 64 - 1; i >= 64; i -= 64)
		{
			sha256_hash512(&hashbuffer[i - 64], &hashbuffer[i]);
		}

		for (int k = 0; k<8; k++) { printf("cpu after of by one final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }
*/
		memcpy((unsigned char*)hash, hashbuffer, 32);
}

static bool init[MAX_GPUS] = { 0 };

extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done)
{
	const uint32_t first_nonce = pdata[19];
	uint32_t endiandata[20];

	int intensity = 18; /* beware > 20 could work and create diff problems later */
	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
	// divide by 128 for this algo which require a lot of memory
	throughput = throughput / 128 - 256;
	throughput = min(throughput, max_nonce - first_nonce + 1);

	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0x0000ff;

	if (!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);
		//cudaDeviceReset();
		//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
		//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);

		cudaMalloc(&d_hash[thr_id], 32 * 1024 * sizeof(uint32_t) * throughput);

		pluck_cpu_init(thr_id, throughput, d_hash[thr_id]);
		init[thr_id] = true;
	}


	for (int k = 0; k < 20; k++)
		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

	pluck_setBlockTarget(endiandata,ptarget);

	do {
		uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0);
		if (foundNonce != UINT32_MAX)
		{
//			const uint32_t Htarg = ptarget[7];
//			uint32_t vhash64[8];
//			be32enc(&endiandata[19], foundNonce);
//			pluckhash(vhash64,endiandata);
//			printf("target %08x vhash64 %08x", ptarget[7], vhash64[7]);
//			if (vhash64[7] <= Htarg) { // && fulltest(vhash64, ptarget)) {
				*hashes_done = pdata[19] - first_nonce + throughput;
				pdata[19] = foundNonce;
				return 1;
//			} else {
//				applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
//			}
		}

		pdata[19] += throughput;

	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}
pluck: adaptation from djm repo remains the cpu validation check to do... throughput for this algo is divided by 128 to keep same kind of intensity values (default 18.0) 10 years ago			`/* Based on djm code */`

			`extern "C" {`
			`#include "miner.h"`
			`}`

			`#include <stdint.h>`

			`static uint32_t *d_hash[MAX_GPUS] ;`

			`extern void pluck_setBlockTarget(const void* data, const void *ptarget);`
			`extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash);`
			`extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order);`

			`extern float tp_coef[MAX_GPUS];`

			`#define ROTL(a, b) (((a) << (b)) \| ((a) >> (32 - (b))))`
			`//note, this is 64 bytes`
			`static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])`
			`{`
			`#define ROTL(a, b) (((a) << (b)) \| ((a) >> (32 - (b))))`
			`uint32_t x00, x01, x02, x03, x04, x05, x06, x07, x08, x09, x10, x11, x12, x13, x14, x15;`
			`int i;`

			`x00 = (B[0] ^= Bx[0]);`
			`x01 = (B[1] ^= Bx[1]);`
			`x02 = (B[2] ^= Bx[2]);`
			`x03 = (B[3] ^= Bx[3]);`
			`x04 = (B[4] ^= Bx[4]);`
			`x05 = (B[5] ^= Bx[5]);`
			`x06 = (B[6] ^= Bx[6]);`
			`x07 = (B[7] ^= Bx[7]);`
			`x08 = (B[8] ^= Bx[8]);`
			`x09 = (B[9] ^= Bx[9]);`
			`x10 = (B[10] ^= Bx[10]);`
			`x11 = (B[11] ^= Bx[11]);`
			`x12 = (B[12] ^= Bx[12]);`
			`x13 = (B[13] ^= Bx[13]);`
			`x14 = (B[14] ^= Bx[14]);`
			`x15 = (B[15] ^= Bx[15]);`
			`for (i = 0; i < 8; i += 2) {`
			`/* Operate on columns. */`
			`x04 ^= ROTL(x00 + x12, 7); x09 ^= ROTL(x05 + x01, 7);`
			`x14 ^= ROTL(x10 + x06, 7); x03 ^= ROTL(x15 + x11, 7);`

			`x08 ^= ROTL(x04 + x00, 9); x13 ^= ROTL(x09 + x05, 9);`
			`x02 ^= ROTL(x14 + x10, 9); x07 ^= ROTL(x03 + x15, 9);`

			`x12 ^= ROTL(x08 + x04, 13); x01 ^= ROTL(x13 + x09, 13);`
			`x06 ^= ROTL(x02 + x14, 13); x11 ^= ROTL(x07 + x03, 13);`

			`x00 ^= ROTL(x12 + x08, 18); x05 ^= ROTL(x01 + x13, 18);`
			`x10 ^= ROTL(x06 + x02, 18); x15 ^= ROTL(x11 + x07, 18);`

			`/* Operate on rows. */`
			`x01 ^= ROTL(x00 + x03, 7); x06 ^= ROTL(x05 + x04, 7);`
			`x11 ^= ROTL(x10 + x09, 7); x12 ^= ROTL(x15 + x14, 7);`

			`x02 ^= ROTL(x01 + x00, 9); x07 ^= ROTL(x06 + x05, 9);`
			`x08 ^= ROTL(x11 + x10, 9); x13 ^= ROTL(x12 + x15, 9);`

			`x03 ^= ROTL(x02 + x01, 13); x04 ^= ROTL(x07 + x06, 13);`
			`x09 ^= ROTL(x08 + x11, 13); x14 ^= ROTL(x13 + x12, 13);`

			`x00 ^= ROTL(x03 + x02, 18); x05 ^= ROTL(x04 + x07, 18);`
			`x10 ^= ROTL(x09 + x08, 18); x15 ^= ROTL(x14 + x13, 18);`
			`}`
			`B[0] += x00;`
			`B[1] += x01;`
			`B[2] += x02;`
			`B[3] += x03;`
			`B[4] += x04;`
			`B[5] += x05;`
			`B[6] += x06;`
			`B[7] += x07;`
			`B[8] += x08;`
			`B[9] += x09;`
			`B[10] += x10;`
			`B[11] += x11;`
			`B[12] += x12;`
			`B[13] += x13;`
			`B[14] += x14;`
			`B[15] += x15;`
			`#undef ROTL`
			`}`

			`static void sha256_hash(unsigned char hash, const unsigned char data, int len)`
			`{`
			`uint32_t S[16], T[16];`
			`int i, r;`

			`sha256_init(S);`
			`for (r = len; r > -9; r -= 64) {`
			`if (r < 64)`
			`memset(T, 0, 64);`
			`memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));`
			`if (r >= 0 && r < 64)`
			`((unsigned char *)T)[r] = 0x80;`
			`for (i = 0; i < 16; i++)`
			`T[i] = be32dec(T + i);`

			`if (r < 56)`
			`T[15] = 8 * len;`
			`sha256_transform(S, T, 0);`
			`}`
			`for (i = 0; i < 8; i++)`
			`be32enc((uint32_t *)hash + i, S[i]);`
			`}`

			`static void sha256_hash512(unsigned char hash, const unsigned char data)`
			`{`
			`uint32_t S[16], T[16];`
			`int i;`

			`sha256_init(S);`

			`memcpy(T, data, 64);`
			`for (i = 0; i < 16; i++)`
			`T[i] = be32dec(T + i);`
			`sha256_transform(S, T, 0);`

			`memset(T, 0, 64);`
			`//memcpy(T, data + 64, 0);`
			`((unsigned char *)T)[0] = 0x80;`
			`for (i = 0; i < 16; i++)`
			`T[i] = be32dec(T + i);`
			`T[15] = 8 * 64;`
			`sha256_transform(S, T, 0);`

			`for (i = 0; i < 8; i++)`
			`be32enc((uint32_t *)hash + i, S[i]);`
			`}`

			`void pluckhash(uint32_t hash, uint32_t input)`
			`{`

			`uint32_t data[20];`
			`//uint32_t midstate[8];`

			`const int HASH_MEMORY = 128 * 1024;`
			`uint8_t * scratchbuf = (uint8_t*)malloc(HASH_MEMORY);`

			`for (int k = 0; k<20; k++) { data[k] = input[k]; }`

			`uint8_t *hashbuffer = scratchbuf; //don't allocate this on stack, since it's huge..`
			`int size = HASH_MEMORY;`
			`memset(hashbuffer, 0, 64);`

			`sha256_hash(&hashbuffer[0], (uint8_t*)data, 80);`
			`for (int i = 64; i < size - 32; i += 32)`
			`{`
			`//i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area`
			`int randmax = i - 4; //we could use size here, but then it's probable to use 0 as the value in most cases`
			`uint32_t joint[16];`
			`uint32_t randbuffer[16];`

			`uint32_t randseed[16];`
			`memcpy(randseed, &hashbuffer[i - 64], 64);`
			`if (i>128)`
			`{`
			`memcpy(randbuffer, &hashbuffer[i - 128], 64);`
			`}`
			`else`
			`{`
			`memset(&randbuffer, 0, 64);`
			`}`

			`xor_salsa8(randbuffer, randseed);`

			`memcpy(joint, &hashbuffer[i - 32], 32);`
			`//use the last hash value as the seed`
			`for (int j = 32; j < 64; j += 4)`
			`{`
			`uint32_t rand = randbuffer[(j - 32) / 4] % (randmax - 32); //randmax - 32 as otherwise we go beyond memory that's already been written to`
			`joint[j / 4] = ((uint32_t)&hashbuffer[rand]);`
			`}`
			`sha256_hash512(&hashbuffer[i], (uint8_t*)joint);`
			`// for (int k = 0; k<8; k++) { printf("sha hashbuffer %d %08x\n", k, ((uint32_t*)(hashbuffer+i))[k]); }`
			`memcpy(randseed, &hashbuffer[i - 32], 64); //use last hash value and previous hash value(post-mixing)`
			`if (i>128)`
			`{`
			`memcpy(randbuffer, &hashbuffer[i - 128], 64);`
			`}`
			`else`
			`{`
			`memset(randbuffer, 0, 64);`
			`}`
			`xor_salsa8(randbuffer, randseed);`
			`for (int j = 0; j < 32; j += 2)`
			`{`
			`uint32_t rand = randbuffer[j / 2] % randmax;`
			`((uint32_t)&hashbuffer[rand]) = ((uint32_t)&hashbuffer[j + i - 4]);`
			`}`
			`}`

			`// for (int k = 0; k<8; k++) { printf("cpu final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }`

			`//note: off-by-one error is likely here...`
			`/*`
			`for (int i = size - 64 - 1; i >= 64; i -= 64)`
			`{`
			`sha256_hash512(&hashbuffer[i - 64], &hashbuffer[i]);`
			`}`

			`for (int k = 0; k<8; k++) { printf("cpu after of by one final hash %d %08x\n", k, ((uint32_t*)hashbuffer)[k]); }`
			`*/`
			`memcpy((unsigned char*)hash, hashbuffer, 32);`
			`}`

			`static bool init[MAX_GPUS] = { 0 };`

			`extern "C" int scanhash_pluck(int thr_id, uint32_t pdata, const uint32_t ptarget,`
			`uint32_t max_nonce, unsigned long *hashes_done)`
			`{`
			`const uint32_t first_nonce = pdata[19];`
			`uint32_t endiandata[20];`

			`int intensity = 18; /* beware > 20 could work and create diff problems later */`
			`uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);`
			`// divide by 128 for this algo which require a lot of memory`
			`throughput = throughput / 128 - 256;`
			`throughput = min(throughput, max_nonce - first_nonce + 1);`

			`if (opt_benchmark)`
			`((uint32_t*)ptarget)[7] = 0x0000ff;`

			`if (!init[thr_id])`
			`{`
			`cudaSetDevice(device_map[thr_id]);`
			`//cudaDeviceReset();`
			`//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);`

			`cudaMalloc(&d_hash[thr_id], 32 * 1024 * sizeof(uint32_t) * throughput);`

			`pluck_cpu_init(thr_id, throughput, d_hash[thr_id]);`
			`init[thr_id] = true;`
			`}`


			`for (int k = 0; k < 20; k++)`
			`be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);`

			`pluck_setBlockTarget(endiandata,ptarget);`

			`do {`
			`uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0);`
			`if (foundNonce != UINT32_MAX)`
			`{`
			`// const uint32_t Htarg = ptarget[7];`
			`// uint32_t vhash64[8];`
			`// be32enc(&endiandata[19], foundNonce);`
			`// pluckhash(vhash64,endiandata);`
			`// printf("target %08x vhash64 %08x", ptarget[7], vhash64[7]);`
			`// if (vhash64[7] <= Htarg) { // && fulltest(vhash64, ptarget)) {`
			`*hashes_done = pdata[19] - first_nonce + throughput;`
			`pdata[19] = foundNonce;`
			`return 1;`
			`// } else {`
			`// applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);`
			`// }`
			`}`

			`pdata[19] += throughput;`

			`} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);`

			`*hashes_done = pdata[19] - first_nonce;`
			`return 0;`
			`}`