/* File included in quark/groestl (quark/jha,nist5/X11+) and groest/myriad coins for SM 3+ */

#define merge8(z,x,y)\
	z=__byte_perm(x, y, 0x5140); \

#define SWAP8(x,y)\
	x=__byte_perm(x, y, 0x5410); \
	y=__byte_perm(x, y, 0x7632);

#define SWAP4(x,y)\
	t = (y<<4); \
	t = (x ^ t); \
	t = 0xf0f0f0f0UL & t; \
	x = (x ^ t); \
	t=  t>>4;\
	y=  y ^ t;

#define SWAP2(x,y)\
	t = (y<<2); \
	t = (x ^ t); \
	t = 0xccccccccUL & t; \
	x = (x ^ t); \
	t=  t>>2;\
	y=  y ^ t;

#define SWAP1(x,y)\
	t = (y+y); \
	t = (x ^ t); \
	t = 0xaaaaaaaaUL & t; \
	x = (x ^ t); \
	t=  t>>1;\
	y=  y ^ t;


__device__ __forceinline__
void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
	uint32_t other[8];
	uint32_t d[8];
	uint32_t t;
	const unsigned int n = threadIdx.x & 3;

	#pragma unroll
	for (int i = 0; i < 8; i++) {
		input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
		other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4);
		input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
		other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
		if (threadIdx.x & 1) {
			input[i] = __byte_perm(input[i], 0, 0x1032);
			other[i] = __byte_perm(other[i], 0, 0x1032);
		}
	}

	merge8(d[0], input[0], input[4]);
	merge8(d[1], other[0], other[4]);
	merge8(d[2], input[1], input[5]);
	merge8(d[3], other[1], other[5]);
	merge8(d[4], input[2], input[6]);
	merge8(d[5], other[2], other[6]);
	merge8(d[6], input[3], input[7]);
	merge8(d[7], other[3], other[7]);

	SWAP1(d[0], d[1]);
	SWAP1(d[2], d[3]);
	SWAP1(d[4], d[5]);
	SWAP1(d[6], d[7]);

	SWAP2(d[0], d[2]);
	SWAP2(d[1], d[3]);
	SWAP2(d[4], d[6]);
	SWAP2(d[5], d[7]);

	SWAP4(d[0], d[4]);
	SWAP4(d[1], d[5]);
	SWAP4(d[2], d[6]);
	SWAP4(d[3], d[7]);

	output[0] = d[0];
	output[1] = d[1];
	output[2] = d[2];
	output[3] = d[3];
	output[4] = d[4];
	output[5] = d[5];
	output[6] = d[6];
	output[7] = d[7];
}

__device__ __forceinline__
void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
	uint32_t d[8];
	uint32_t t;

	d[0] = __byte_perm(input[0], input[4], 0x7531);
	d[1] = __byte_perm(input[1], input[5], 0x7531);
	d[2] = __byte_perm(input[2], input[6], 0x7531);
	d[3] = __byte_perm(input[3], input[7], 0x7531);

	SWAP1(d[0], d[1]);
	SWAP1(d[2], d[3]);

	SWAP2(d[0], d[2]);
	SWAP2(d[1], d[3]);

	t = __byte_perm(d[0], d[2], 0x5410);
	d[2] = __byte_perm(d[0], d[2], 0x7632);
	d[0] = t;

	t = __byte_perm(d[1], d[3], 0x5410);
	d[3] = __byte_perm(d[1], d[3], 0x7632);
	d[1] = t;

	SWAP4(d[0], d[2]);
	SWAP4(d[1], d[3]);

	output[0] = d[0];
	output[2] = d[1];
	output[4] = d[0] >> 16;
	output[6] = d[1] >> 16;
	output[8] = d[2];
	output[10] = d[3];
	output[12] = d[2] >> 16;
	output[14] = d[3] >> 16;

	#pragma unroll 8
	for (int i = 0; i < 16; i+=2) {
		if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
		output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)&3, 4), 0x7610);
		output[i+1] = __shfl((int)output[i], (threadIdx.x+2)&3, 4);
		if (threadIdx.x & 3) output[i] = output[i+1] = 0;
	}
}