/*
 * "pluck" kernel implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2015  djm34
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author djm34
 * @author tpruvot
 */

#include <stdio.h>
#include <stdint.h>
#include <memory.h>

#include "cuda_vector.h"

#include "miner.h"

uint32_t *d_PlNonce[MAX_GPUS];

__device__  uint8_t *  hashbuffer;
__constant__  uint32_t pTarget[8];
__constant__  uint32_t c_data[20];

#define HASH_MEMORY_8bit 131072
#define HASH_MEMORY_32bit 32768
#define HASH_MEMORY 4096

static __constant__  uint32_t H256[8] = {
	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
	0x510E527F, 0x9B05688C,	0x1F83D9AB, 0x5BE0CD19
};

static  __constant__  uint32_t Ksha[64] = {
	0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
	0x3956C25B, 0x59F111F1,	0x923F82A4, 0xAB1C5ED5,
	0xD807AA98, 0x12835B01,	0x243185BE, 0x550C7DC3,
	0x72BE5D74, 0x80DEB1FE,	0x9BDC06A7, 0xC19BF174,
	0xE49B69C1, 0xEFBE4786,	0x0FC19DC6, 0x240CA1CC,
	0x2DE92C6F, 0x4A7484AA,	0x5CB0A9DC, 0x76F988DA,
	0x983E5152, 0xA831C66D,	0xB00327C8, 0xBF597FC7,
	0xC6E00BF3, 0xD5A79147,	0x06CA6351, 0x14292967,
	0x27B70A85, 0x2E1B2138,	0x4D2C6DFC, 0x53380D13,
	0x650A7354, 0x766A0ABB,	0x81C2C92E, 0x92722C85,
	0xA2BFE8A1, 0xA81A664B,	0xC24B8B70, 0xC76C51A3,
	0xD192E819, 0xD6990624,	0xF40E3585, 0x106AA070,
	0x19A4C116, 0x1E376C08,	0x2748774C, 0x34B0BCB5,
	0x391C0CB3, 0x4ED8AA4A,	0x5B9CCA4F, 0x682E6FF3,
	0x748F82EE, 0x78A5636F,	0x84C87814, 0x8CC70208,
	0x90BEFFFA, 0xA4506CEB,	0xBEF9A3F7, 0xC67178F2
};


#define SALSA(a,b,c,d) { \
    t = a+d; b^=rotateL(t,  7); \
    t = b+a; c^=rotateL(t,  9); \
    t = c+b; d^=rotateL(t, 13); \
    t = d+c; a^=rotateL(t, 18); \
}

#define SALSA_CORE(state) { \
	SALSA(state.s0,state.s4,state.s8,state.sc); \
	SALSA(state.s5,state.s9,state.sd,state.s1); \
	SALSA(state.sa,state.se,state.s2,state.s6); \
	SALSA(state.sf,state.s3,state.s7,state.sb); \
	SALSA(state.s0,state.s1,state.s2,state.s3); \
	SALSA(state.s5,state.s6,state.s7,state.s4); \
	SALSA(state.sa,state.sb,state.s8,state.s9); \
	SALSA(state.sf,state.sc,state.sd,state.se); \
}

#if __CUDA_ARCH__ >= 320
static __device__ __forceinline__ uint16 xor_salsa8(const uint16 &Bx)
{
	uint32_t t;
	uint16 state = Bx;
	SALSA_CORE(state);
	SALSA_CORE(state);
	SALSA_CORE(state);
	SALSA_CORE(state);
	return(state+Bx);
}
#endif

// sha256

static __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
	uint32_t r1 = ROTR32(x, 2);
	uint32_t r2 = ROTR32(x, 13);
	uint32_t r3 = ROTR32(x, 22);
	return xor3b(r1, r2, r3);
}

static __device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
	uint32_t r1 = ROTR32(x, 6);
	uint32_t r2 = ROTR32(x, 11);
	uint32_t r3 = ROTR32(x, 25);
	return xor3b(r1, r2, r3);
}

static __device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
	uint64_t r1 = ROTR32(x, 7);
	uint64_t r2 = ROTR32(x, 18);
	uint64_t r3 = shr_t32(x, 3);
	return xor3b(r1, r2, r3);
}

static __device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
	uint64_t r1 = ROTR32(x, 17);
	uint64_t r2 = ROTR32(x, 19);
	uint64_t r3 = shr_t32(x, 10);
	return xor3b(r1, r2, r3);
}

static __device__ __forceinline__ void sha2_step1(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
	const uint32_t f, const uint32_t g, uint32_t &h, const uint32_t in, const uint32_t Kshared)
{
	uint32_t t1, t2;
	uint32_t vxandx = xandx(e, f, g);
	uint32_t bsg21 = bsg2_1(e);
	uint32_t bsg20 = bsg2_0(a);
	uint32_t andorv = andor32(a, b, c);

	t1 = h + bsg21 + vxandx + Kshared + in;
	t2 = bsg20 + andorv;
	d = d + t1;
	h = t1 + t2;
}

static __device__ __forceinline__ void sha2_step2(const uint32_t a, const uint32_t b, const uint32_t c, uint32_t &d, const uint32_t e,
	const uint32_t f, const uint32_t g, uint32_t &h, uint32_t* in, const uint32_t pc, const uint32_t Kshared)
{
	uint32_t t1, t2;

	int pcidx1 = (pc - 2) & 0xF;
	int pcidx2 = (pc - 7) & 0xF;
	int pcidx3 = (pc - 15) & 0xF;
	uint32_t inx0 = in[pc];
	uint32_t inx1 = in[pcidx1];
	uint32_t inx2 = in[pcidx2];
	uint32_t inx3 = in[pcidx3];

	uint32_t ssg21 = ssg2_1(inx1);
	uint32_t ssg20 = ssg2_0(inx3);
	uint32_t vxandx = xandx(e, f, g);
	uint32_t bsg21 = bsg2_1(e);
	uint32_t bsg20 = bsg2_0(a);
	uint32_t andorv = andor32(a, b, c);

	in[pc] = ssg21 + inx2 + ssg20 + inx0;

	t1 = h + bsg21 + vxandx + Kshared + in[pc];
	t2 = bsg20 + andorv;
	d = d + t1;
	h = t1 + t2;
}

static __device__ __forceinline__
void sha2_round_body(uint32_t* in, uint32_t* r)
{
	uint32_t a = r[0];
	uint32_t b = r[1];
	uint32_t c = r[2];
	uint32_t d = r[3];
	uint32_t e = r[4];
	uint32_t f = r[5];
	uint32_t g = r[6];
	uint32_t h = r[7];

	sha2_step1(a, b, c, d, e, f, g, h, in[0], Ksha[0]);
	sha2_step1(h, a, b, c, d, e, f, g, in[1], Ksha[1]);
	sha2_step1(g, h, a, b, c, d, e, f, in[2], Ksha[2]);
	sha2_step1(f, g, h, a, b, c, d, e, in[3], Ksha[3]);
	sha2_step1(e, f, g, h, a, b, c, d, in[4], Ksha[4]);
	sha2_step1(d, e, f, g, h, a, b, c, in[5], Ksha[5]);
	sha2_step1(c, d, e, f, g, h, a, b, in[6], Ksha[6]);
	sha2_step1(b, c, d, e, f, g, h, a, in[7], Ksha[7]);
	sha2_step1(a, b, c, d, e, f, g, h, in[8], Ksha[8]);
	sha2_step1(h, a, b, c, d, e, f, g, in[9], Ksha[9]);
	sha2_step1(g, h, a, b, c, d, e, f, in[10], Ksha[10]);
	sha2_step1(f, g, h, a, b, c, d, e, in[11], Ksha[11]);
	sha2_step1(e, f, g, h, a, b, c, d, in[12], Ksha[12]);
	sha2_step1(d, e, f, g, h, a, b, c, in[13], Ksha[13]);
	sha2_step1(c, d, e, f, g, h, a, b, in[14], Ksha[14]);
	sha2_step1(b, c, d, e, f, g, h, a, in[15], Ksha[15]);

	#pragma unroll 3
	for (int i = 0; i<3; i++) {

		sha2_step2(a, b, c, d, e, f, g, h, in, 0, Ksha[16 + 16 * i]);
		sha2_step2(h, a, b, c, d, e, f, g, in, 1, Ksha[17 + 16 * i]);
		sha2_step2(g, h, a, b, c, d, e, f, in, 2, Ksha[18 + 16 * i]);
		sha2_step2(f, g, h, a, b, c, d, e, in, 3, Ksha[19 + 16 * i]);
		sha2_step2(e, f, g, h, a, b, c, d, in, 4, Ksha[20 + 16 * i]);
		sha2_step2(d, e, f, g, h, a, b, c, in, 5, Ksha[21 + 16 * i]);
		sha2_step2(c, d, e, f, g, h, a, b, in, 6, Ksha[22 + 16 * i]);
		sha2_step2(b, c, d, e, f, g, h, a, in, 7, Ksha[23 + 16 * i]);
		sha2_step2(a, b, c, d, e, f, g, h, in, 8, Ksha[24 + 16 * i]);
		sha2_step2(h, a, b, c, d, e, f, g, in, 9, Ksha[25 + 16 * i]);
		sha2_step2(g, h, a, b, c, d, e, f, in, 10, Ksha[26 + 16 * i]);
		sha2_step2(f, g, h, a, b, c, d, e, in, 11, Ksha[27 + 16 * i]);
		sha2_step2(e, f, g, h, a, b, c, d, in, 12, Ksha[28 + 16 * i]);
		sha2_step2(d, e, f, g, h, a, b, c, in, 13, Ksha[29 + 16 * i]);
		sha2_step2(c, d, e, f, g, h, a, b, in, 14, Ksha[30 + 16 * i]);
		sha2_step2(b, c, d, e, f, g, h, a, in, 15, Ksha[31 + 16 * i]);

	}

	r[0] += a;
	r[1] += b;
	r[2] += c;
	r[3] += d;
	r[4] += e;
	r[5] += f;
	r[6] += g;
	r[7] += h;
}

static __device__ __forceinline__ uint8 sha256_64(uint32_t *data)
{
	uint32_t __align__(64) in[16];
	uint32_t __align__(32) buf[8];

	((uint16 *)in)[0] = swapvec((uint16*)data);

	((uint8*)buf)[0] = ((uint8*)H256)[0];

	sha2_round_body(in, buf);

	#pragma unroll 14
	for (int i = 0; i<14; i++) { in[i + 1] = 0; }

	in[0] = 0x80000000;
	in[15] = 0x200;

	sha2_round_body(in, buf);
	return swapvec((uint8*)buf);
}

static __device__ __forceinline__ uint8 sha256_80(uint32_t nonce)
{
	uint32_t __align__(64) in[16];
	uint32_t __align__(32) buf[8];

	((uint16 *)in)[0] = swapvec((uint16*)c_data);
	((uint8*)buf)[0] = ((uint8*)H256)[0];

	sha2_round_body(in, buf);

	#pragma unroll 3
	for (int i = 0; i<3; i++) { in[i] = cuda_swab32(c_data[i + 16]); }

//	in[3] = cuda_swab32(nonce);
	in[3] = nonce;
	in[4] = 0x80000000;
	in[15] = 0x280;

	#pragma unroll
	for (int i = 5; i<15; i++) { in[i] = 0; }

	sha2_round_body(in, buf);
	return swapvec((uint8*)buf);
}

// Pluck Factor 128
#define SHIFT (1024 * 128)

__global__ __launch_bounds__(256, 1)
void pluck_gpu_hash0_v50(uint32_t threads, uint32_t startNonce)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
#if __CUDA_ARCH__ >= 320
		const uint32_t nonce = startNonce + thread;
		uint32_t shift = SHIFT * thread;
		((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
		((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
		for (int i = 2; i < 5; i++)
		{
			uint32_t randmax = i * 32 - 4;
			uint32_t randseed[16];
			uint32_t randbuffer[16];
			uint32_t joint[16];

			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);

			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);

			((uint8*)joint)[0] = ((uint8*)randseed)[1];

			#pragma unroll
			for (int j = 0; j < 8; j++) {
				uint32_t rand = randbuffer[j] % (randmax - 32);
				joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
			}

			uint8 truc = sha256_64(joint);
			((uint8*)(hashbuffer + shift))[i] = truc;
			((uint8*)randseed)[0] = ((uint8*)joint)[0];
			((uint8*)randseed)[1] = truc;

			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);

			for (int j = 0; j < 32; j += 2)
			{
				uint32_t rand = randbuffer[j / 2] % randmax;
				(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
				(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
			}
		} // main loop
#endif
	}
}

__global__ __launch_bounds__(256, 1)
void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
#if __CUDA_ARCH__ >= 320
		const uint32_t nonce = startNonce + thread;
		uint32_t shift = SHIFT * thread;

		for (int i = 5; i < HASH_MEMORY - 1; i++)
		{
			uint32_t randmax = i*32-4;
			uint32_t randseed[16];
			uint32_t randbuffer[16];
			uint32_t joint[16];
			uint8 Buffbuffer[2];

			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32*i-64]);
			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32*i-32]);


			Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32*i - 128]);
			Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32*i - 96]);

			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
			((uint16*)randbuffer)[0]= xor_salsa8(((uint16*)randseed)[0]);
			((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i-1)<<5]);

			#pragma unroll
			for (int j = 0; j < 8; j++) {
				uint32_t rand = randbuffer[j] % (randmax - 32);
				joint[j+8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
			}

			uint8 truc =  sha256_64(joint);
			((uint8*)(hashbuffer + shift))[i] = truc;
			((uint8*)randseed)[0] = ((uint8*)joint)[0];
			((uint8*)randseed)[1] = truc;

			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);

			for (int j = 0; j < 32; j += 2)
			{
				uint32_t rand = randbuffer[j / 2] % randmax;

				(hashbuffer+shift)[rand] =       __ldg(&(hashbuffer+shift)[randmax+j]);
				(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
			}

		} // main loop

		uint32_t outbuf =  __ldgtoint(&(hashbuffer + shift)[28]);

		if (outbuf <= pTarget[7]) {
			nonceVector[0] = nonce;
		}
#endif
	}
}

__global__ __launch_bounds__(128, 3)
void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
#if __CUDA_ARCH__ >= 320
		const uint32_t nonce = startNonce + thread;

		uint32_t shift = SHIFT * thread;
		((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
		((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
		for (int i = 2; i < 5; i++)
		{
			uint32_t randmax = i * 32 - 4;
			uint32_t randseed[16];
			uint32_t randbuffer[16];
			uint32_t joint[16];

			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);

			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);

			((uint8*)joint)[0] = ((uint8*)randseed)[1];

			#pragma unroll
			for (int j = 0; j < 8; j++) {
				uint32_t rand = randbuffer[j] % (randmax - 32);
				joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
			}

			uint8 truc = sha256_64(joint);
			((uint8*)(hashbuffer + shift))[i] = truc;
			((uint8*)randseed)[0] = ((uint8*)joint)[0];
			((uint8*)randseed)[1] = truc;

			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);

			for (int j = 0; j < 32; j += 2)
			{
				uint32_t rand = randbuffer[j / 2] % randmax;
				(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
				(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
			}
		} // main loop
#endif
	}
}

__global__ __launch_bounds__(128, 3)
void pluck_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
#if __CUDA_ARCH__ >= 320
		const uint32_t nonce = startNonce + thread;

		uint32_t shift = SHIFT * thread;

		for (int i = 5; i < HASH_MEMORY - 1; i++)
		{
			uint32_t randmax = i * 32 - 4;
			uint32_t randseed[16];
			uint32_t randbuffer[16];
			uint32_t joint[16];
			uint8 Buffbuffer[2];

			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);


			Buffbuffer[0] = __ldg8(&(hashbuffer + shift)[32 * i - 128]);
			Buffbuffer[1] = __ldg8(&(hashbuffer + shift)[32 * i - 96]);
			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];

			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);

			((uint8*)joint)[0] = __ldg8(&(hashbuffer + shift)[(i - 1) << 5]);

			#pragma unroll
			for (int j = 0; j < 8; j++)
			{
				uint32_t rand = randbuffer[j] % (randmax - 32);
				joint[j + 8] = __ldgtoint_unaligned(&(hashbuffer + shift)[rand]);
			}

			uint8 truc = sha256_64(joint);
			((uint8*)(hashbuffer + shift))[i] = truc;
			((uint8*)randseed)[0] = ((uint8*)joint)[0];
			((uint8*)randseed)[1] = truc;


			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
			for (int j = 0; j < 32; j += 2)
			{
				uint32_t rand = randbuffer[j / 2] % randmax;

				(hashbuffer + shift)[rand] = __ldg(&(hashbuffer + shift)[randmax + j]);
				(hashbuffer + shift)[rand + 1] = __ldg(&(hashbuffer + shift)[randmax + j + 1]);
				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
			}
		} // main loop

		uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);

		if (outbuf <= pTarget[7]) {
			nonceVector[0] = nonce;
		}
#endif
	}
}

void pluck_cpu_init(int thr_id, uint32_t threads, uint32_t* hash)
{
	cuda_get_arch(thr_id);
	cudaMemcpyToSymbol(hashbuffer, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
	cudaMalloc(&d_PlNonce[thr_id], sizeof(uint32_t));
}

__host__
uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,  int order)
{
	uint32_t result[8] = { 0xffffffff };
	cudaMemset(d_PlNonce[thr_id], 0xff, sizeof(uint32_t));

	const uint32_t threadsperblock = 128;

	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	dim3 grid50((threads + 256 - 1) / 256);
	dim3 block50(256);

	if (device_sm[device_map[thr_id]] <= 300) {
		applog(LOG_ERR,"Sorry pluck not supported on SM 3.0 devices");
		return 0;
	} else if (device_sm[device_map[thr_id]] >= 500) {
		pluck_gpu_hash0_v50 <<< grid50, block50 >>>(threads, startNounce);
		pluck_gpu_hash_v50  <<< grid50, block50 >>>(threads, startNounce, d_PlNonce[thr_id]);
	} else {
		pluck_gpu_hash0 <<< grid, block >>>(threads, startNounce);
		pluck_gpu_hash <<< grid, block >>>(threads, startNounce, d_PlNonce[thr_id]);
	}

	//MyStreamSynchronize(NULL, order, thr_id);
	CUDA_SAFE_CALL(cudaThreadSynchronize());
	cudaMemcpy(&result[thr_id], d_PlNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

	return result[thr_id];
}

__host__
void pluck_setBlockTarget(const void *pdata, const void *ptarget)
{
	unsigned char PaddedMessage[80];
	memcpy(PaddedMessage, pdata, 80);

	cudaMemcpyToSymbol(c_data, PaddedMessage, 80, 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
}