ccminer/heavy/cuda_sha256.cu

#include <stdio.h>
#include <memory.h>

#include "cuda_helper.h"

// globaler Speicher für alle HeftyHashes aller Threads
extern uint32_t *heavy_heftyHashes[8];
extern uint32_t *heavy_nonceVector[8];

// globaler Speicher für unsere Ergebnisse
uint32_t *d_hash2output[8];


/* Hash-Tabellen */
__constant__ uint32_t sha256_gpu_constantTable[64];

// muss expandiert werden
__constant__ uint32_t sha256_gpu_blockHeader[16]; // 2x512 Bit Message
__constant__ uint32_t sha256_gpu_register[8];

uint32_t sha256_cpu_hashTable[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
uint32_t sha256_cpu_constantTable[] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};

#define S(x, n)			(((x) >> (n)) | ((x) << (32 - (n))))
#define R(x, n)			((x) >> (n))
#define Ch(x, y, z)		((x & (y ^ z)) ^ z)
#define Maj(x, y, z)	((x & (y | z)) | (y & z))
#define S0(x)			(S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define S1(x)			(S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define s0(x)			(S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define s1(x)			(S(x, 17) ^ S(x, 19) ^ R(x, 10))

#define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )

// Die Hash-Funktion
template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// bestimme den aktuellen Zähler
		uint32_t nounce = startNounce + thread;
		nonceVector[thread] = nounce;

		// jeder thread in diesem  Block bekommt sein eigenes W Array im Shared memory
		uint32_t W1[16];
		uint32_t W2[16];

		// Initialisiere die register a bis h mit der Hash-Tabelle
		uint32_t regs[8];
		uint32_t hash[8];

		// pre
#pragma unroll 8
		for (int k=0; k < 8; k++)
		{
			regs[k] = sha256_gpu_register[k];
			hash[k] = regs[k];
		}

		// 2. Runde
		//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
		//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
#pragma unroll 16
		for(int k=0;k<16;k++)
			W1[k] = sha256_gpu_blockHeader[k];

		uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x);
#pragma unroll 8
		for(int k=0;k<8;k++)
			W1[((BLOCKSIZE-64)/4)+k] = heftyHashes[offset + k];

#pragma unroll 8
		for (int i=((BLOCKSIZE-64)/4); i < ((BLOCKSIZE-64)/4)+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;)
		W1[3] = SWAB32(nounce);

// Progress W1
#pragma unroll 16
		for(int j=0;j<16;j++)
		{
			uint32_t T1, T2;
			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j];
			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);

			#pragma unroll 7
			for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
			regs[0] = T1 + T2;
			regs[4] += T1;
		}

// Progress W2...W3
#pragma unroll 3
		for(int k=0;k<3;k++)
		{
	#pragma unroll 2
			for(int j=0;j<2;j++)
				W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
	#pragma unroll 5
			for(int j=2;j<7;j++)
				W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];

	#pragma unroll 8
			for(int j=7;j<15;j++)
				W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];

			W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];

			// Rundenfunktion
	#pragma unroll 16
			for(int j=0;j<16;j++)
			{
				uint32_t T1, T2;
				T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
				T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);

				#pragma unroll 7
				for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
				regs[0] = T1 + T2;
				regs[4] += T1;
			}

	#pragma unroll 16
			for(int j=0;j<16;j++)
				W1[j] = W2[j];
		}

/*
		for(int j=16;j<64;j++)
			W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];

#pragma unroll 64
		for(int j=0;j<64;j++)
		{
			uint32_t T1, T2;
			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j];
			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);

			#pragma unroll 7
			for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
			regs[0] = T1 + T2;
			regs[4] += T1;
		}
*/
#pragma unroll 8
		for(int k=0;k<8;k++)
			hash[k] += regs[k];

#pragma unroll 8
		for(int k=0;k<8;k++)
			((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
	}
}

// Setup-Funktionen
__host__ void sha256_cpu_init(int thr_id, int threads)
{
	// Kopiere die Hash-Tabellen in den GPU-Speicher
	cudaMemcpyToSymbol(	sha256_gpu_constantTable,
						sha256_cpu_constantTable,
						sizeof(uint32_t) * 64 );

	// Speicher für alle Ergebnisse belegen
	cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
}

static int BLOCKSIZE = 84;

__host__ void sha256_cpu_setBlock(void *data, int len)
	// data muss 80/84-Byte haben!
	// heftyHash hat 32-Byte
{
	// Nachricht expandieren und setzen
	uint32_t msgBlock[32];

	memset(msgBlock, 0, sizeof(uint32_t) * 32);
	memcpy(&msgBlock[0], data, len);
	if (len == 84) {
		memset(&msgBlock[21], 0, 32); // vorläufig  Nullen anstatt der Hefty1 Hashes einfüllen
		msgBlock[29] |= 0x80;
		msgBlock[31] = 928; // bitlen
	} else if (len == 80) {
		memset(&msgBlock[20], 0, 32); // vorläufig  Nullen anstatt der Hefty1 Hashes einfüllen
		msgBlock[28] |= 0x80;
		msgBlock[31] = 896; // bitlen
	}

	for(int i=0;i<31;i++) // Byteorder drehen
		msgBlock[i] = SWAB32(msgBlock[i]);

	// die erste Runde wird auf der CPU durchgeführt, da diese für
	// alle Threads gleich ist. Der Hash wird dann an die Threads
	// übergeben
	uint32_t W[64];

	// Erstelle expandierten Block W
	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
	for(int j=16;j<64;j++)
		W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];

	// Initialisiere die register a bis h mit der Hash-Tabelle
	uint32_t regs[8];
	uint32_t hash[8];

	// pre
	for (int k=0; k < 8; k++)
	{
		regs[k] = sha256_cpu_hashTable[k];
		hash[k] = regs[k];
	}

	// 1. Runde
	for(int j=0;j<64;j++)
	{
		uint32_t T1, T2;
		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j];
		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);

		//#pragma unroll 7
		for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
		// sollte mal noch durch memmov ersetzt werden!
//		memcpy(&regs[1], &regs[0], sizeof(uint32_t) * 7);
		regs[0] = T1 + T2;
		regs[4] += T1;
	}

	for(int k=0;k<8;k++)
		hash[k] += regs[k];

	// hash speichern
	cudaMemcpyToSymbol(	sha256_gpu_register,
						hash,
						sizeof(uint32_t) * 8 );

	// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
	cudaMemcpyToSymbol(	sha256_gpu_blockHeader,
						&msgBlock[16],
						64);

	BLOCKSIZE = len;
}

__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
{
	// Hefty1 Hashes kopieren
	if (copy)
		CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
	//else cudaThreadSynchronize();
}

__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
{
	const int threadsperblock = 256;

	// berechne wie viele Thread Blocks wir brauchen
	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	// Größe des dynamischen Shared Memory Bereichs
	size_t shared_size = 0;

	if (BLOCKSIZE == 84)
		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
	else if (BLOCKSIZE == 80) {
		sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
	}
}
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`#include <stdio.h>`
			`#include <memory.h>`

Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`#include "cuda_helper.h"`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`// globaler Speicher für alle HeftyHashes aller Threads`
			`extern uint32_t *heavy_heftyHashes[8];`
			`extern uint32_t *heavy_nonceVector[8];`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`// globaler Speicher für unsere Ergebnisse`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`uint32_t *d_hash2output[8];`


			`/* Hash-Tabellen */`
			`__constant__ uint32_t sha256_gpu_constantTable[64];`

			`// muss expandiert werden`
			`__constant__ uint32_t sha256_gpu_blockHeader[16]; // 2x512 Bit Message`
			`__constant__ uint32_t sha256_gpu_register[8];`

			`uint32_t sha256_cpu_hashTable[] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };`
			`uint32_t sha256_cpu_constantTable[] = {`
			`0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,`
			`0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,`
			`0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,`
			`0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,`
			`0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,`
			`0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,`
			`0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,`
			`0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,`
			`};`

			`#define S(x, n) (((x) >> (n)) \| ((x) << (32 - (n))))`
			`#define R(x, n) ((x) >> (n))`
			`#define Ch(x, y, z) ((x & (y ^ z)) ^ z)`
			`#define Maj(x, y, z) ((x & (y \| z)) \| (y & z))`
			`#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))`
			`#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))`
			`#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))`
			`#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))`

			`#define SWAB32(x) ( ((x & 0x000000FF) << 24) \| ((x & 0x0000FF00) << 8) \| ((x & 0x00FF0000) >> 8) \| ((x & 0xFF000000) >> 24) )`

			`// Die Hash-Funktion`
bump to revision v0.8 11 years ago			`template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void outputHash, uint32_t heftyHashes, uint32_t *nonceVector)`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`{`
			`int thread = (blockDim.x * blockIdx.x + threadIdx.x);`
			`if (thread < threads)`
			`{`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`// bestimme den aktuellen Zähler`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`uint32_t nounce = startNounce + thread;`
			`nonceVector[thread] = nounce;`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory`
			`uint32_t W1[16];`
			`uint32_t W2[16];`

			`// Initialisiere die register a bis h mit der Hash-Tabelle`
			`uint32_t regs[8];`
			`uint32_t hash[8];`

			`// pre`
			`#pragma unroll 8`
			`for (int k=0; k < 8; k++)`
			`{`
			`regs[k] = sha256_gpu_register[k];`
			`hash[k] = regs[k];`
			`}`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`// 2. Runde`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke`
			`//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`#pragma unroll 16`
			`for(int k=0;k<16;k++)`
			`W1[k] = sha256_gpu_blockHeader[k];`

			`uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x);`
			`#pragma unroll 8`
			`for(int k=0;k<8;k++)`
bump to revision v0.8 11 years ago			`W1[((BLOCKSIZE-64)/4)+k] = heftyHashes[offset + k];`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
			`#pragma unroll 8`
bump to revision v0.8 11 years ago			`for (int i=((BLOCKSIZE-64)/4); i < ((BLOCKSIZE-64)/4)+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;)`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`W1[3] = SWAB32(nounce);`

			`// Progress W1`
			`#pragma unroll 16`
			`for(int j=0;j<16;j++)`
			`{`
			`uint32_t T1, T2;`
			`T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j];`
			`T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`#pragma unroll 7`
			`for (int k=6; k >= 0; k--) regs[k+1] = regs[k];`
			`regs[0] = T1 + T2;`
			`regs[4] += T1;`
			`}`

			`// Progress W2...W3`
			`#pragma unroll 3`
			`for(int k=0;k<3;k++)`
			`{`
			`#pragma unroll 2`
			`for(int j=0;j<2;j++)`
			`W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];`
			`#pragma unroll 5`
			`for(int j=2;j<7;j++)`
			`W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];`

			`#pragma unroll 8`
			`for(int j=7;j<15;j++)`
			`W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];`

			`W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];`

			`// Rundenfunktion`
			`#pragma unroll 16`
			`for(int j=0;j<16;j++)`
			`{`
			`uint32_t T1, T2;`
			`T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];`
			`T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`#pragma unroll 7`
			`for (int l=6; l >= 0; l--) regs[l+1] = regs[l];`
			`regs[0] = T1 + T2;`
			`regs[4] += T1;`
			`}`

			`#pragma unroll 16`
			`for(int j=0;j<16;j++)`
			`W1[j] = W2[j];`
			`}`

			`/*`
			`for(int j=16;j<64;j++)`
			`W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`#pragma unroll 64`
			`for(int j=0;j<64;j++)`
			`{`
			`uint32_t T1, T2;`
			`T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j];`
			`T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`#pragma unroll 7`
			`for (int k=6; k >= 0; k--) regs[k+1] = regs[k];`
			`regs[0] = T1 + T2;`
			`regs[4] += T1;`
			`}`
			`*/`
			`#pragma unroll 8`
			`for(int k=0;k<8;k++)`
			`hash[k] += regs[k];`

			`#pragma unroll 8`
			`for(int k=0;k<8;k++)`
			`((uint32_t)outputHash)[8thread+k] = SWAB32(hash[k]);`
			`}`
			`}`

			`// Setup-Funktionen`
			`__host__ void sha256_cpu_init(int thr_id, int threads)`
			`{`
			`// Kopiere die Hash-Tabellen in den GPU-Speicher`
			`cudaMemcpyToSymbol( sha256_gpu_constantTable,`
			`sha256_cpu_constantTable,`
			`sizeof(uint32_t) * 64 );`

heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`// Speicher für alle Ergebnisse belegen`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);`
			`}`

bump to revision v0.8 11 years ago			`static int BLOCKSIZE = 84;`

			`__host__ void sha256_cpu_setBlock(void *data, int len)`
			`// data muss 80/84-Byte haben!`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`// heftyHash hat 32-Byte`
			`{`
			`// Nachricht expandieren und setzen`
			`uint32_t msgBlock[32];`

			`memset(msgBlock, 0, sizeof(uint32_t) * 32);`
bump to revision v0.8 11 years ago			`memcpy(&msgBlock[0], data, len);`
			`if (len == 84) {`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen`
bump to revision v0.8 11 years ago			`msgBlock[29] \|= 0x80;`
			`msgBlock[31] = 928; // bitlen`
			`} else if (len == 80) {`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen`
bump to revision v0.8 11 years ago			`msgBlock[28] \|= 0x80;`
			`msgBlock[31] = 896; // bitlen`
			`}`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`for(int i=0;i<31;i++) // Byteorder drehen`
			`msgBlock[i] = SWAB32(msgBlock[i]);`

heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`// die erste Runde wird auf der CPU durchgeführt, da diese für`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`// alle Threads gleich ist. Der Hash wird dann an die Threads`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`// übergeben`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`uint32_t W[64];`

			`// Erstelle expandierten Block W`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`for(int j=16;j<64;j++)`
			`W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];`

			`// Initialisiere die register a bis h mit der Hash-Tabelle`
			`uint32_t regs[8];`
			`uint32_t hash[8];`

			`// pre`
bump to revision v0.8 11 years ago			`for (int k=0; k < 8; k++)`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`{`
			`regs[k] = sha256_cpu_hashTable[k];`
			`hash[k] = regs[k];`
			`}`

			`// 1. Runde`
			`for(int j=0;j<64;j++)`
			`{`
			`uint32_t T1, T2;`
			`T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j];`
			`T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`//#pragma unroll 7`
			`for (int k=6; k >= 0; k--) regs[k+1] = regs[k];`
			`// sollte mal noch durch memmov ersetzt werden!`
			`// memcpy(&regs[1], &regs[0], sizeof(uint32_t) * 7);`
			`regs[0] = T1 + T2;`
			`regs[4] += T1;`
			`}`

			`for(int k=0;k<8;k++)`
			`hash[k] += regs[k];`

			`// hash speichern`
			`cudaMemcpyToSymbol( sha256_gpu_register,`
			`hash,`
			`sizeof(uint32_t) * 8 );`

			`// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)`
			`cudaMemcpyToSymbol( sha256_gpu_blockHeader,`
			`&msgBlock[16],`
			`64);`
bump to revision v0.8 11 years ago
			`BLOCKSIZE = len;`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`}`

			`__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)`
			`{`
			`// Hefty1 Hashes kopieren`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`if (copy)`
			`CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`//else cudaThreadSynchronize();`
			`}`

			`__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)`
			`{`
			`const int threadsperblock = 256;`

			`// berechne wie viele Thread Blocks wir brauchen`
			`dim3 grid((threads + threadsperblock-1)/threadsperblock);`
			`dim3 block(threadsperblock);`

heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`// Größe des dynamischen Shared Memory Bereichs`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`size_t shared_size = 0;`

bump to revision v0.8 11 years ago			`if (BLOCKSIZE == 84)`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);`
bump to revision v0.8 11 years ago			`else if (BLOCKSIZE == 80) {`
heavy: add error checks, fix strict aliasing and linux The core problem was the cuda hefty Thread per block set to high but took me several hours to find that... btw... +25% in heavy 12500 with 256 threads per block... vs 128 & 512 if max reg count is set to 80... 10 years ago			`sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);`
bump to revision v0.8 11 years ago			`}`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`}`