/* Based on djm code */

extern "C" {
#include "miner.h"
}

#include <stdint.h>

// One device buffer pointer per mining thread, indexed by thr_id.
// Each entry is filled in by cudaMalloc() during the first scanhash_pluck() call for that thread.
static uint32_t *d_hash[MAX_GPUS] ;

// GPU-side entry points; their definitions are not in this file
// (presumably in the pluck CUDA kernel source -- TODO confirm).
extern void pluck_setBlockTarget(const void* data, const void *ptarget);
extern void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t *d_outputHash);
// scanhash_pluck() treats a return value of UINT32_MAX as "no nonce found".
extern uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int order);

// Declared here but not referenced by any code visible in this file.
extern float tp_coef[MAX_GPUS];

/*
 * XORs the 16-word (64-byte) block Bx into B, runs four double-rounds
 * (eight rounds) of add-rotate-xor mixing over a working copy of the result,
 * and finally adds that working copy back into B word by word.
 *
 * B  : 16 x uint32_t, updated in place.
 * Bx : 16 x uint32_t, read only.
 */
static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16])
{
#define ROTL(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
/* One four-step mixing chain; the four chains of a half-round touch disjoint words. */
#define SALSA_QR(a, b, c, d) do { \
		b ^= ROTL(a + d, 7); \
		c ^= ROTL(b + a, 9); \
		d ^= ROTL(c + b, 13); \
		a ^= ROTL(d + c, 18); \
	} while (0)

	uint32_t x[16];
	int k, round;

	/* Fold Bx into B and keep a working copy of the combined block. */
	for (k = 0; k < 16; k++)
		x[k] = (B[k] ^= Bx[k]);

	for (round = 0; round < 4; round++) {
		/* Column half-round. */
		SALSA_QR(x[0],  x[4],  x[8],  x[12]);
		SALSA_QR(x[5],  x[9],  x[13], x[1]);
		SALSA_QR(x[10], x[14], x[2],  x[6]);
		SALSA_QR(x[15], x[3],  x[7],  x[11]);

		/* Row half-round. */
		SALSA_QR(x[0],  x[1],  x[2],  x[3]);
		SALSA_QR(x[5],  x[6],  x[7],  x[4]);
		SALSA_QR(x[10], x[11], x[8],  x[9]);
		SALSA_QR(x[15], x[12], x[13], x[14]);
	}

	/* Feed-forward: add the mixed words onto the XORed input. */
	for (k = 0; k < 16; k++)
		B[k] += x[k];

#undef SALSA_QR
#undef ROTL
}

/*
 * Hashes `len` bytes at `data` and stores the eight resulting state words,
 * encoded with be32enc, into `hash` (32 bytes).
 *
 * The message is consumed in 64-byte blocks. The tail block is zero-filled,
 * gets a 0x80 marker byte right after the last message byte, and carries the
 * message bit length in its last word. When fewer than 8 bytes are left after
 * the marker, the length goes into one extra block instead.
 */
static void sha256_hash(unsigned char *hash, const unsigned char *data, int len)
{
	uint32_t state[16], block[16];
	int remaining = len;

	sha256_init(state);

	while (remaining > -9) {
		/* Number of message bytes that go into this block. */
		int chunk;
		if (remaining > 64)
			chunk = 64;
		else if (remaining < 0)
			chunk = 0;
		else
			chunk = remaining;

		/* Partial or padding-only blocks start out zeroed. */
		if (remaining < 64)
			memset(block, 0, 64);
		memcpy(block, data + len - remaining, chunk);

		/* Marker byte directly after the final message byte. */
		if (remaining >= 0 && remaining < 64)
			((unsigned char *)block)[remaining] = 0x80;

		for (int w = 0; w < 16; w++)
			block[w] = be32dec(&block[w]);

		/* Room for the length field: store the message length in bits. */
		if (remaining < 56)
			block[15] = 8 * len;

		sha256_transform(state, block, 0);
		remaining -= 64;
	}

	for (int w = 0; w < 8; w++)
		be32enc((uint32_t *)hash + w, state[w]);
}

/*
 * Fixed-size variant of sha256_hash: hashes exactly 64 bytes (512 bits) at
 * `data` and stores the eight resulting state words, encoded with be32enc,
 * into `hash` (32 bytes).
 */
static void sha256_hash512(unsigned char *hash, const unsigned char *data)
{
	uint32_t state[16], block[16];
	int w;

	sha256_init(state);

	/* Block 1: the 64 message bytes themselves. */
	memcpy(block, data, 64);
	for (w = 0; w < 16; ++w)
		block[w] = be32dec(&block[w]);
	sha256_transform(state, block, 0);

	/* Block 2: padding only -- marker byte, zeros, then the bit length. */
	memset(block, 0, 64);
	*(unsigned char *)block = 0x80;
	for (w = 0; w < 16; ++w)
		block[w] = be32dec(&block[w]);
	block[15] = 8 * 64;
	sha256_transform(state, block, 0);

	for (w = 0; w < 8; ++w)
		be32enc((uint32_t *)hash + w, state[w]);
}

/*
 * CPU version of the pluck hash.
 *
 * input : 20 x uint32_t (80 bytes) of block data; only read.
 * hash  : receives 32 bytes, copied from the start of the scratch buffer
 *         once the fill loop has finished.
 *
 * A 128 KiB scratch buffer is seeded with sha256_hash(input) and then
 * extended 32 bytes at a time. Each step
 *   1. derives eight pseudo-random byte offsets with xor_salsa8 and gathers
 *      one word from each into the second half of a 64-byte block whose
 *      first half is the previous 32 bytes of the buffer,
 *   2. hashes that block with sha256_hash512 into the next 32 bytes, and
 *   3. derives sixteen more offsets and scatters recently produced words
 *      back into the already written part of the buffer.
 *
 * The scratch buffer lives on the heap and is released before returning.
 * If it cannot be allocated, `hash` is filled with 0xff bytes and the
 * function returns without hashing.
 */
void pluckhash(uint32_t *hash, uint32_t *input)
{
	const int HASH_MEMORY = 128 * 1024;
	uint32_t data[20];

	// don't allocate this on the stack, since it's huge
	uint8_t *hashbuffer = (uint8_t*)malloc(HASH_MEMORY);
	if (hashbuffer == NULL) {
		memset(hash, 0xff, 32);
		return;
	}

	for (int k = 0; k < 20; k++)
		data[k] = input[k];

	// Only the first 64 bytes need clearing: every later byte is written
	// by sha256_hash512 before anything reads it.
	memset(hashbuffer, 0, 64);
	sha256_hash(&hashbuffer[0], (uint8_t*)data, 80);

	for (int i = 64; i < HASH_MEMORY - 32; i += 32)
	{
		// i - 4 because offsets address whole 32-bit words and must not
		// run past the area written so far
		const int randmax = i - 4;
		uint32_t joint[16];
		uint32_t randbuffer[16];
		uint32_t randseed[16];

		// Stage 1: offsets from the previous 64 bytes, mixed with an older block.
		memcpy(randseed, &hashbuffer[i - 64], 64);
		if (i > 128)
			memcpy(randbuffer, &hashbuffer[i - 128], 64);
		else
			memset(randbuffer, 0, 64);
		xor_salsa8(randbuffer, randseed);

		// First half of the block to hash: the last 32 bytes produced.
		memcpy(joint, &hashbuffer[i - 32], 32);
		// Second half: eight words gathered from pseudo-random offsets.
		for (int w = 0; w < 8; w++)
		{
			// randmax - 32, otherwise we would read beyond memory that has already been written
			const uint32_t offset = randbuffer[w] % (randmax - 32);
			// offsets are byte-granular, so copy instead of casting to uint32_t*
			memcpy(&joint[8 + w], &hashbuffer[offset], sizeof(uint32_t));
		}
		sha256_hash512(&hashbuffer[i], (uint8_t*)joint);

		// Stage 2: offsets from the previous 32 bytes plus the 32 just produced.
		memcpy(randseed, &hashbuffer[i - 32], 64);
		if (i > 128)
			memcpy(randbuffer, &hashbuffer[i - 128], 64);
		else
			memset(randbuffer, 0, 64);
		xor_salsa8(randbuffer, randseed);

		for (int j = 0; j < 32; j += 2)
		{
			const uint32_t offset = randbuffer[j / 2] % randmax;
			uint32_t word;
			// Read the source word completely before storing it; the
			// source and destination ranges may overlap.
			memcpy(&word, &hashbuffer[j + i - 4], sizeof(word));
			memcpy(&hashbuffer[offset], &word, sizeof(word));
		}
	}

	// Disabled in the original code and kept disabled here:
	//note: off-by-one error is likely here...
/*
	for (int i = size - 64 - 1; i >= 64; i -= 64)
	{
		sha256_hash512(&hashbuffer[i - 64], &hashbuffer[i]);
	}
*/
	memcpy((unsigned char*)hash, hashbuffer, 32);
	free(hashbuffer);
}

// Per-thread flag, indexed by thr_id: set to true after scanhash_pluck() has
// run its one-time device setup (cudaSetDevice, cudaMalloc, pluck_cpu_init).
static bool init[MAX_GPUS] = { 0 };

/*
 * Scans nonces on the GPU, starting at pdata[19].
 *
 * thr_id      : mining thread index; selects the device via device_map[] and
 *               the per-thread entries of d_hash[] / init[].
 * pdata       : 20-word block data; pdata[19] is the start nonce on entry.
 *               On return it holds the found nonce, or the position at
 *               which scanning stopped.
 * ptarget     : 8-word target; word 7 is overwritten when opt_benchmark is set.
 * max_nonce   : scanning stops once pdata[19] reaches this value.
 * hashes_done : receives the number of nonces covered by this call.
 *
 * Returns 1 when pluck_cpu_hash() reports a nonce, 0 otherwise. That covers
 * an exhausted range, a work restart, and a failed device setup (in which
 * case *hashes_done is 0).
 *
 * NOTE: a nonce reported by the GPU is returned as-is; it is not re-checked
 * on the CPU with pluckhash().
 */
extern "C" int scanhash_pluck(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
	uint32_t max_nonce, unsigned long *hashes_done)
{
	const uint32_t first_nonce = pdata[19];
	uint32_t endiandata[20];

	int intensity = 18; /* beware > 20 could work and create diff problems later */
	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
	// divide by 128 for this algo which require a lot of memory, then hold
	// back 256 threads. Saturate at 1 so a low intensity cannot wrap the
	// unsigned subtraction into a huge thread count.
	throughput /= 128;
	throughput = (throughput > 256) ? throughput - 256 : 1;

	// The device buffer is sized once, on the first call, so it must use the
	// unclamped thread count; a short nonce range on that call must not
	// undersize it for later calls.
	const uint32_t max_throughput = throughput;

	// Do not launch more threads than there are nonces left. Computed in
	// 64 bits because the "+ 1" overflows for a full 32-bit range.
	const uint64_t nonce_range = (max_nonce >= first_nonce)
		? (uint64_t)max_nonce - first_nonce + 1 : 1;
	if (nonce_range < throughput)
		throughput = (uint32_t)nonce_range;

	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0x0000ff;

	if (!init[thr_id])
	{
		// 32 * 1024 words (128 KiB) of scratch memory per thread
		const size_t scratch_bytes = (size_t)32 * 1024 * sizeof(uint32_t) * max_throughput;

		cudaError_t err = cudaSetDevice(device_map[thr_id]);
		if (err == cudaSuccess)
			err = cudaMalloc(&d_hash[thr_id], scratch_bytes);
		if (err != cudaSuccess) {
			applog(LOG_ERR, "GPU #%d: pluck device setup failed: %s",
				thr_id, cudaGetErrorString(err));
			*hashes_done = 0;
			return 0;
		}

		pluck_cpu_init(thr_id, max_throughput, d_hash[thr_id]);
		init[thr_id] = true;
	}

	for (int k = 0; k < 20; k++)
		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

	pluck_setBlockTarget(endiandata, ptarget);

	do {
		uint32_t foundNonce = pluck_cpu_hash(thr_id, throughput, pdata[19], 0);
		if (foundNonce != UINT32_MAX)
		{
			*hashes_done = (unsigned long)(pdata[19] - first_nonce) + throughput;
			pdata[19] = foundNonce;
			return 1;
		}

		// Saturate instead of wrapping: a wrapped nonce would compare below
		// max_nonce and restart the scan from the bottom of the range.
		if ((uint64_t)pdata[19] + throughput > UINT32_MAX) {
			pdata[19] = UINT32_MAX;
			break;
		}
		pdata[19] += throughput;

	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}