ccminer-gostd-lite/x13/cuda_hsr_sm3.cu

#include <stdio.h>
#include <stdint.h>
#include <memory.h>

#include <cuda_helper.h>
#include <miner.h>

#define  F(x, y, z) (((x) ^ (y) ^ (z)))
#define FF(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
#define GG(x, y, z) ((z)  ^ ((x) & ((y) ^ (z))))

#define P0(x) x ^ ROTL32(x,  9) ^ ROTL32(x, 17)
#define P1(x) x ^ ROTL32(x, 15) ^ ROTL32(x, 23)

static __forceinline__ __device__
void sm3_compress2(uint32_t digest[8], const uint32_t pblock[16])
{
	uint32_t tt1, tt2, i, t, ss1, ss2, x, y;
	uint32_t w[68];
	uint32_t a = digest[0];
	uint32_t b = digest[1];
	uint32_t c = digest[2];
	uint32_t d = digest[3];
	uint32_t e = digest[4];
	uint32_t f = digest[5];
	uint32_t g = digest[6];
	uint32_t h = digest[7];

	#pragma unroll
	for (i = 0; i<16; i++) {
		w[i] = cuda_swab32(pblock[i]);
	}

	for (i = 16; i<68; i++) {
		x = ROTL32(w[i - 3], 15);
		y = ROTL32(w[i - 13], 7);

		x ^= w[i - 16];
		x ^= w[i - 9];
		y ^= w[i - 6];

		w[i] = P1(x) ^ y;
	}

	for (i = 0; i<64; i++) {

		t = (i < 16) ? 0x79cc4519 : 0x7a879d8a;

		ss2 = ROTL32(a, 12);
		ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7);
		ss2 ^= ss1;

		tt1 = d + ss2 + (w[i] ^ w[i + 4]);
		tt2 = h + ss1 + w[i];

		if (i < 16) {
			tt1 += F(a, b, c);
			tt2 += F(e, f, g);
		}
		else {
			tt1 += FF(a, b, c);
			tt2 += GG(e, f, g);
		}
		d = c;
		c = ROTL32(b, 9);
		b = a;
		a = tt1;
		h = g;
		g = ROTL32(f, 19);
		f = e;
		e = P0(tt2);
	}

	digest[0] ^= a;
	digest[1] ^= b;
	digest[2] ^= c;
	digest[3] ^= d;
	digest[4] ^= e;
	digest[5] ^= f;
	digest[6] ^= g;
	digest[7] ^= h;
}

/***************************************************/
// GPU Hash Function
__global__
void sm3_gpu_hash_64(const uint32_t threads, uint32_t *g_hash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

	if (thread < threads)
	{
		const size_t hashPosition = thread;

		uint32_t digest[8];
		digest[0] = 0x7380166F;
		digest[1] = 0x4914B2B9;
		digest[2] = 0x172442D7;
		digest[3] = 0xDA8A0600;
		digest[4] = 0xA96F30BC;
		digest[5] = 0x163138AA;
		digest[6] = 0xE38DEE4D;
		digest[7] = 0xB0FB0E4E;

		uint32_t *pHash = &g_hash[hashPosition << 4];
		sm3_compress2(digest, pHash);

		uint32_t block[16];
		block[0] = 0x80;

		#pragma unroll
		for (int i = 1; i < 14; i++)
			block[i] = 0;

		// count
		block[14] = cuda_swab32(1 >> 23);
		block[15] = cuda_swab32((1 << 9) + (0 << 3));

		sm3_compress2(digest, block);

		for (int i = 0; i < 8; i++)
			pHash[i] = cuda_swab32(digest[i]);

		for (int i = 8; i < 16; i++)
			pHash[i] = 0;
	}
}

__host__
void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *g_hash, int order)
{
	const uint32_t threadsperblock = 256;

	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	sm3_gpu_hash_64 <<<grid, block>>>(threads, g_hash);
	//MyStreamSynchronize(NULL, order, thr_id);
}
import and clean hsr algo 7 years ago			`#include <stdio.h>`
			`#include <stdint.h>`
			`#include <memory.h>`

			`#include <cuda_helper.h>`
			`#include <miner.h>`

			`#define F(x, y, z) (((x) ^ (y) ^ (z)))`
			`#define FF(x, y, z) (((x) & (y)) \| ((x) & (z)) \| ((y) & (z)))`
			`#define GG(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))`

			`#define P0(x) x ^ ROTL32(x, 9) ^ ROTL32(x, 17)`
			`#define P1(x) x ^ ROTL32(x, 15) ^ ROTL32(x, 23)`

			`static __forceinline__ __device__`
			`void sm3_compress2(uint32_t digest[8], const uint32_t pblock[16])`
			`{`
			`uint32_t tt1, tt2, i, t, ss1, ss2, x, y;`
			`uint32_t w[68];`
			`uint32_t a = digest[0];`
			`uint32_t b = digest[1];`
			`uint32_t c = digest[2];`
			`uint32_t d = digest[3];`
			`uint32_t e = digest[4];`
			`uint32_t f = digest[5];`
			`uint32_t g = digest[6];`
			`uint32_t h = digest[7];`

			`#pragma unroll`
			`for (i = 0; i<16; i++) {`
			`w[i] = cuda_swab32(pblock[i]);`
			`}`

			`for (i = 16; i<68; i++) {`
			`x = ROTL32(w[i - 3], 15);`
			`y = ROTL32(w[i - 13], 7);`

			`x ^= w[i - 16];`
			`x ^= w[i - 9];`
			`y ^= w[i - 6];`

			`w[i] = P1(x) ^ y;`
			`}`

			`for (i = 0; i<64; i++) {`

			`t = (i < 16) ? 0x79cc4519 : 0x7a879d8a;`

			`ss2 = ROTL32(a, 12);`
			`ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7);`
			`ss2 ^= ss1;`

			`tt1 = d + ss2 + (w[i] ^ w[i + 4]);`
			`tt2 = h + ss1 + w[i];`

			`if (i < 16) {`
			`tt1 += F(a, b, c);`
			`tt2 += F(e, f, g);`
			`}`
			`else {`
			`tt1 += FF(a, b, c);`
			`tt2 += GG(e, f, g);`
			`}`
			`d = c;`
			`c = ROTL32(b, 9);`
			`b = a;`
			`a = tt1;`
			`h = g;`
			`g = ROTL32(f, 19);`
			`f = e;`
			`e = P0(tt2);`
			`}`

			`digest[0] ^= a;`
			`digest[1] ^= b;`
			`digest[2] ^= c;`
			`digest[3] ^= d;`
			`digest[4] ^= e;`
			`digest[5] ^= f;`
			`digest[6] ^= g;`
			`digest[7] ^= h;`
			`}`

			`/***************************************************/`
			`// GPU Hash Function`
			`__global__`
			`void sm3_gpu_hash_64(const uint32_t threads, uint32_t *g_hash)`
			`{`
			`const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);`

			`if (thread < threads)`
			`{`
			`const size_t hashPosition = thread;`

			`uint32_t digest[8];`
			`digest[0] = 0x7380166F;`
			`digest[1] = 0x4914B2B9;`
			`digest[2] = 0x172442D7;`
			`digest[3] = 0xDA8A0600;`
			`digest[4] = 0xA96F30BC;`
			`digest[5] = 0x163138AA;`
			`digest[6] = 0xE38DEE4D;`
			`digest[7] = 0xB0FB0E4E;`

			`uint32_t *pHash = &g_hash[hashPosition << 4];`
			`sm3_compress2(digest, pHash);`

			`uint32_t block[16];`
			`block[0] = 0x80;`

			`#pragma unroll`
			`for (int i = 1; i < 14; i++)`
			`block[i] = 0;`

			`// count`
			`block[14] = cuda_swab32(1 >> 23);`
			`block[15] = cuda_swab32((1 << 9) + (0 << 3));`

			`sm3_compress2(digest, block);`

			`for (int i = 0; i < 8; i++)`
			`pHash[i] = cuda_swab32(digest[i]);`

			`for (int i = 8; i < 16; i++)`
			`pHash[i] = 0;`
			`}`
			`}`

			`__host__`
			`void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *g_hash, int order)`
			`{`
			`const uint32_t threadsperblock = 256;`

			`dim3 grid((threads + threadsperblock - 1) / threadsperblock);`
			`dim3 block(threadsperblock);`

			`sm3_gpu_hash_64 <<<grid, block>>>(threads, g_hash);`
			`//MyStreamSynchronize(NULL, order, thr_id);`
			`}`