/* ccminer/quark/groestl_transf_quad.h */

/* File included in quark/groestl (quark/jha, nist5/X11+) and groestl/myriad coins for SM 3+ */

/*
 * merge8(z, x, y): interleave the two low bytes of x and y into z.
 * With selector 0x5140, __byte_perm produces (least significant byte first)
 * x.byte0, y.byte0, x.byte1, y.byte1.
 * The last line of the macro carries no line continuation, so the definition
 * does not depend on the line that follows it.
 */
#define merge8(z,x,y)\
	z=__byte_perm(x, y, 0x5140);

/*
 * SWAP8(x, y): exchange the upper 16 bits of x with the lower 16 bits of y.
 *   x <- x.lo16 | (y.lo16 << 16)      (selector 0x5410)
 *   y <- x.hi16 | (y.hi16 << 16)      (selector 0x7632)
 * Both results must be derived from the ORIGINAL x, so the new x is staged in
 * a scoped temporary. Assigning x first and then reading it again would make
 * the second permutation return y unchanged.
 * The braces keep the temporary private; the macro works with or without a
 * trailing semicolon at the call site.
 */
#define SWAP8(x,y)\
	{ \
		const unsigned int swap8_lo = __byte_perm(x, y, 0x5410); \
		y=__byte_perm(x, y, 0x7632); \
		x=swap8_lo; \
	}

/*
 * SWAP4(x, y): exchange the high nibble of every byte of x with the low
 * nibble of the corresponding byte of y (masked XOR swap, mask 0xf0f0f0f0).
 * Uses a caller-provided temporary named `t`, which is overwritten and left
 * holding the shifted difference mask.
 */
#define SWAP4(x,y)\
	t = (x ^ (y << 4)) & 0xf0f0f0f0UL; \
	x ^= t; \
	t >>= 4; \
	y ^= t;

/*
 * SWAP2(x, y): within every nibble, exchange the upper bit pair of x with the
 * lower bit pair of y (masked XOR swap, mask 0xcccccccc).
 * Uses a caller-provided temporary named `t`, which is overwritten and left
 * holding the shifted difference mask.
 */
#define SWAP2(x,y)\
	t = (x ^ (y << 2)) & 0xccccccccUL; \
	x ^= t; \
	t >>= 2; \
	y ^= t;

/*
 * SWAP1(x, y): exchange every odd-position bit of x with the even-position
 * bit directly below it in y (masked XOR swap, mask 0xaaaaaaaa).
 * The left shift by one is expressed as y + y.
 * Uses a caller-provided temporary named `t`, which is overwritten and left
 * holding the shifted difference mask.
 */
#define SWAP1(x,y)\
	t = (x ^ (y + y)) & 0xaaaaaaaaUL; \
	x ^= t; \
	t >>= 1; \
	y ^= t;


/*
 * Quad shuffle helper: fetch `v` from lane `lane` of the caller's 4-lane group
 * (shuffle width 4).
 * CUDA 9+ toolchains use __shfl_sync, because the legacy mask-less __shfl is
 * deprecated there and unavailable when targeting compute capability 7.0+.
 * Older toolchains keep the legacy intrinsic.
 * NOTE(review): the full-warp mask assumes every non-exited lane of the warp
 * reaches the call together -- confirm against the calling kernels.
 */
#ifndef GROESTL_QUAD_SHFL
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
#define GROESTL_QUAD_SHFL(v, lane) __shfl_sync(0xFFFFFFFFu, (int)(v), (lane), 4)
#else
#define GROESTL_QUAD_SHFL(v, lane) __shfl((int)(v), (lane), 4)
#endif
#endif

/**
 * Convert eight 32-bit words per lane into the bitsliced layout, cooperating
 * across the four lanes that share the same (threadIdx.x >> 2).
 *
 * @param input   8 words per lane; OVERWRITTEN with shuffled intermediate data.
 * @param output  receives the 8 transformed words (output[0..7]).
 *
 * Must be called by all four lanes of a quad together, since every lane reads
 * its neighbours' registers through warp shuffles.
 * Relies on the file's merge8 / SWAP1 / SWAP2 / SWAP4 macros.
 */
__device__ __forceinline__
void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
	uint32_t other[8];
	uint32_t d[8];
	uint32_t t; /* scratch register required by the SWAPn macros */
	const unsigned int n = threadIdx.x & 3; /* position of this lane inside its quad */

	#pragma unroll
	for (int i = 0; i < 8; i++) {
		/* source lane map 0,1,2,3 -> 0,2,1,3: lanes 1 and 2 trade words */
		input[i] = GROESTL_QUAD_SHFL(input[i], n ^ (3*(n >=1 && n <=2)));
		/* grab the word held by the next lane of the quad (wrapping) */
		other[i] = GROESTL_QUAD_SHFL(input[i], (threadIdx.x + 1) & 3);
		/* both words are then taken from the even lane of each lane pair */
		input[i] = GROESTL_QUAD_SHFL(input[i], threadIdx.x & 2);
		other[i] = GROESTL_QUAD_SHFL(other[i], threadIdx.x & 2);
		if (threadIdx.x & 1) {
			/* odd lanes work on the swapped 16-bit halves */
			input[i] = __byte_perm(input[i], 0, 0x1032);
			other[i] = __byte_perm(other[i], 0, 0x1032);
		}
	}

	/* interleave the low bytes of word k and word k+4 */
	merge8(d[0], input[0], input[4]);
	merge8(d[1], other[0], other[4]);
	merge8(d[2], input[1], input[5]);
	merge8(d[3], other[1], other[5]);
	merge8(d[4], input[2], input[6]);
	merge8(d[5], other[2], other[6]);
	merge8(d[6], input[3], input[7]);
	merge8(d[7], other[3], other[7]);

	/* bit exchanges at distance 1, 2 and 4 between word pairs */
	SWAP1(d[0], d[1]);
	SWAP1(d[2], d[3]);
	SWAP1(d[4], d[5]);
	SWAP1(d[6], d[7]);

	SWAP2(d[0], d[2]);
	SWAP2(d[1], d[3]);
	SWAP2(d[4], d[6]);
	SWAP2(d[5], d[7]);

	SWAP4(d[0], d[4]);
	SWAP4(d[1], d[5]);
	SWAP4(d[2], d[6]);
	SWAP4(d[3], d[7]);

	#pragma unroll
	for (int i = 0; i < 8; i++) {
		output[i] = d[i];
	}
}

/*
 * Quad shuffle helper: fetch `v` from lane `lane` of the caller's 4-lane group
 * (shuffle width 4).
 * CUDA 9+ toolchains use __shfl_sync, because the legacy mask-less __shfl is
 * deprecated there and unavailable when targeting compute capability 7.0+.
 * Older toolchains keep the legacy intrinsic.
 * NOTE(review): the full-warp mask assumes every non-exited lane of the warp
 * reaches the call together -- confirm against the calling kernels.
 */
#ifndef GROESTL_QUAD_SHFL
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
#define GROESTL_QUAD_SHFL(v, lane) __shfl_sync(0xFFFFFFFFu, (int)(v), (lane), 4)
#else
#define GROESTL_QUAD_SHFL(v, lane) __shfl((int)(v), (lane), 4)
#endif
#endif

/**
 * Convert eight bitsliced words per lane back into sixteen 32-bit words,
 * gathering the pieces held by the four lanes of a quad.
 *
 * @param input   8 bitsliced words per lane (input[0..7], read only).
 * @param output  16 words (output[0..15]). After the call only the lane with
 *                (threadIdx.x & 3) == 0 holds data; the other three lanes of
 *                the quad have all 16 words set to zero.
 *
 * Must be called by all four lanes of a quad together, since every lane reads
 * its neighbours' registers through warp shuffles.
 * Relies on the file's SWAP1 / SWAP2 / SWAP4 macros.
 */
__device__ __forceinline__
void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
	uint32_t d[4];
	uint32_t t; /* scratch register required by the SWAPn macros */

	/* collect the odd bytes of word k and word k+4 */
	d[0] = __byte_perm(input[0], input[4], 0x7531);
	d[1] = __byte_perm(input[1], input[5], 0x7531);
	d[2] = __byte_perm(input[2], input[6], 0x7531);
	d[3] = __byte_perm(input[3], input[7], 0x7531);

	SWAP1(d[0], d[1]);
	SWAP1(d[2], d[3]);

	SWAP2(d[0], d[2]);
	SWAP2(d[1], d[3]);

	/* 16-bit exchange between d[0]/d[2]; t preserves the new d[0] until d[2] is built */
	t = __byte_perm(d[0], d[2], 0x5410);
	d[2] = __byte_perm(d[0], d[2], 0x7632);
	d[0] = t;

	/* same 16-bit exchange between d[1]/d[3] */
	t = __byte_perm(d[1], d[3], 0x5410);
	d[3] = __byte_perm(d[1], d[3], 0x7632);
	d[1] = t;

	SWAP4(d[0], d[2]);
	SWAP4(d[1], d[3]);

	/* seed the even output slots; the odd slots are filled by the loop below */
	output[0] = d[0];
	output[2] = d[1];
	output[4] = d[0] >> 16;
	output[6] = d[1] >> 16;
	output[8] = d[2];
	output[10] = d[3];
	output[12] = d[2] >> 16;
	output[14] = d[3] >> 16;

	#pragma unroll 8
	for (int i = 0; i < 16; i+=2) {
		/* odd lanes swap their 16-bit halves first */
		if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
		/* keep our low 16 bits, take the high 16 bits from the next lane */
		output[i] = __byte_perm(output[i], GROESTL_QUAD_SHFL(output[i], (threadIdx.x+1)&3), 0x7610);
		/* the companion word is the combined value of the lane two steps ahead */
		output[i+1] = GROESTL_QUAD_SHFL(output[i], (threadIdx.x+2)&3);
		/* only the first lane of the quad keeps a result */
		if (threadIdx.x & 3) output[i] = output[i+1] = 0;
	}
}
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`/* File included in quark/groestl (quark/jha,nist5/X11+) and groest/myriad coins for SM 3+ */`

			`#define merge8(z,x,y)\`
			`z=__byte_perm(x, y, 0x5140); \`

			`#define SWAP8(x,y)\`
			`x=__byte_perm(x, y, 0x5410); \`
			`y=__byte_perm(x, y, 0x7632);`

			`#define SWAP4(x,y)\`
			`t = (y<<4); \`
			`t = (x ^ t); \`
			`t = 0xf0f0f0f0UL & t; \`
			`x = (x ^ t); \`
			`t= t>>4;\`
			`y= y ^ t;`

			`#define SWAP2(x,y)\`
			`t = (y<<2); \`
			`t = (x ^ t); \`
			`t = 0xccccccccUL & t; \`
			`x = (x ^ t); \`
			`t= t>>2;\`
			`y= y ^ t;`

			`#define SWAP1(x,y)\`
			`t = (y+y); \`
			`t = (x ^ t); \`
			`t = 0xaaaaaaaaUL & t; \`
			`x = (x ^ t); \`
			`t= t>>1;\`
			`y= y ^ t;`
Try to restore compat with 2.1 devices (GTX 460) 10 years ago
Move common check_cpu functions to root 10 years ago
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`__device__ __forceinline__`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`void to_bitslice_quad(uint32_t const __restrict__ input, uint32_t const __restrict__ output)`
bump to revision V1.1 with Killer Groestl 11 years ago			`{`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`uint32_t other[8];`
			`uint32_t d[8];`
			`uint32_t t;`
			`const unsigned int n = threadIdx.x & 3;`

			`#pragma unroll`
			`for (int i = 0; i < 8; i++) {`
			`input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);`
			`other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4);`
			`input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);`
			`other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);`
			`if (threadIdx.x & 1) {`
			`input[i] = __byte_perm(input[i], 0, 0x1032);`
			`other[i] = __byte_perm(other[i], 0, 0x1032);`
			`}`
			`}`

			`merge8(d[0], input[0], input[4]);`
			`merge8(d[1], other[0], other[4]);`
			`merge8(d[2], input[1], input[5]);`
			`merge8(d[3], other[1], other[5]);`
			`merge8(d[4], input[2], input[6]);`
			`merge8(d[5], other[2], other[6]);`
			`merge8(d[6], input[3], input[7]);`
			`merge8(d[7], other[3], other[7]);`

			`SWAP1(d[0], d[1]);`
			`SWAP1(d[2], d[3]);`
			`SWAP1(d[4], d[5]);`
			`SWAP1(d[6], d[7]);`

			`SWAP2(d[0], d[2]);`
			`SWAP2(d[1], d[3]);`
			`SWAP2(d[4], d[6]);`
			`SWAP2(d[5], d[7]);`

			`SWAP4(d[0], d[4]);`
			`SWAP4(d[1], d[5]);`
			`SWAP4(d[2], d[6]);`
			`SWAP4(d[3], d[7]);`

			`output[0] = d[0];`
			`output[1] = d[1];`
			`output[2] = d[2];`
			`output[3] = d[3];`
			`output[4] = d[4];`
			`output[5] = d[5];`
			`output[6] = d[6];`
			`output[7] = d[7];`
bump to revision V1.1 with Killer Groestl 11 years ago			`}`

groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`__device__ __forceinline__`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`void from_bitslice_quad(const uint32_t const __restrict__ input, uint32_t const __restrict__ output)`
bump to revision V1.1 with Killer Groestl 11 years ago			`{`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`uint32_t d[8];`
			`uint32_t t;`

			`d[0] = __byte_perm(input[0], input[4], 0x7531);`
			`d[1] = __byte_perm(input[1], input[5], 0x7531);`
			`d[2] = __byte_perm(input[2], input[6], 0x7531);`
			`d[3] = __byte_perm(input[3], input[7], 0x7531);`

			`SWAP1(d[0], d[1]);`
			`SWAP1(d[2], d[3]);`

			`SWAP2(d[0], d[2]);`
			`SWAP2(d[1], d[3]);`

			`t = __byte_perm(d[0], d[2], 0x5410);`
			`d[2] = __byte_perm(d[0], d[2], 0x7632);`
			`d[0] = t;`

			`t = __byte_perm(d[1], d[3], 0x5410);`
			`d[3] = __byte_perm(d[1], d[3], 0x7632);`
			`d[1] = t;`

			`SWAP4(d[0], d[2]);`
			`SWAP4(d[1], d[3]);`

			`output[0] = d[0];`
			`output[2] = d[1];`
			`output[4] = d[0] >> 16;`
			`output[6] = d[1] >> 16;`
			`output[8] = d[2];`
			`output[10] = d[3];`
			`output[12] = d[2] >> 16;`
			`output[14] = d[3] >> 16;`

			`#pragma unroll 8`
			`for (int i = 0; i < 16; i+=2) {`
			`if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);`
			`output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)&3, 4), 0x7610);`
			`output[i+1] = __shfl((int)output[i], (threadIdx.x+2)&3, 4);`
			`if (threadIdx.x & 3) output[i] = output[i+1] = 0;`
			`}`
bump to revision V1.1 with Killer Groestl 11 years ago			`}`