/* File included in quark/groestl (quark/jha,nist5/X11+) and groest/myriad coins for SM 3+ */ #define merge8(z,x,y)\ z=__byte_perm(x, y, 0x5140); \ #define SWAP8(x,y)\ x=__byte_perm(x, y, 0x5410); \ y=__byte_perm(x, y, 0x7632); #define SWAP4(x,y)\ t = (y<<4); \ t = (x ^ t); \ t = 0xf0f0f0f0UL & t; \ x = (x ^ t); \ t= t>>4;\ y= y ^ t; #define SWAP2(x,y)\ t = (y<<2); \ t = (x ^ t); \ t = 0xccccccccUL & t; \ x = (x ^ t); \ t= t>>2;\ y= y ^ t; #define SWAP1(x,y)\ t = (y+y); \ t = (x ^ t); \ t = 0xaaaaaaaaUL & t; \ x = (x ^ t); \ t= t>>1;\ y= y ^ t; __device__ __forceinline__ void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { uint32_t other[8]; uint32_t d[8]; uint32_t t; const unsigned int n = threadIdx.x & 3; #pragma unroll for (int i = 0; i < 8; i++) { input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4); other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4); input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); if (threadIdx.x & 1) { input[i] = __byte_perm(input[i], 0, 0x1032); other[i] = __byte_perm(other[i], 0, 0x1032); } } merge8(d[0], input[0], input[4]); merge8(d[1], other[0], other[4]); merge8(d[2], input[1], input[5]); merge8(d[3], other[1], other[5]); merge8(d[4], input[2], input[6]); merge8(d[5], other[2], other[6]); merge8(d[6], input[3], input[7]); merge8(d[7], other[3], other[7]); SWAP1(d[0], d[1]); SWAP1(d[2], d[3]); SWAP1(d[4], d[5]); SWAP1(d[6], d[7]); SWAP2(d[0], d[2]); SWAP2(d[1], d[3]); SWAP2(d[4], d[6]); SWAP2(d[5], d[7]); SWAP4(d[0], d[4]); SWAP4(d[1], d[5]); SWAP4(d[2], d[6]); SWAP4(d[3], d[7]); output[0] = d[0]; output[1] = d[1]; output[2] = d[2]; output[3] = d[3]; output[4] = d[4]; output[5] = d[5]; output[6] = d[6]; output[7] = d[7]; } __device__ __forceinline__ void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output) { uint32_t d[8]; uint32_t t; d[0] = __byte_perm(input[0], input[4], 0x7531); d[1] = __byte_perm(input[1], input[5], 0x7531); d[2] = __byte_perm(input[2], input[6], 0x7531); d[3] = __byte_perm(input[3], input[7], 0x7531); SWAP1(d[0], d[1]); SWAP1(d[2], d[3]); SWAP2(d[0], d[2]); SWAP2(d[1], d[3]); t = __byte_perm(d[0], d[2], 0x5410); d[2] = __byte_perm(d[0], d[2], 0x7632); d[0] = t; t = __byte_perm(d[1], d[3], 0x5410); d[3] = __byte_perm(d[1], d[3], 0x7632); d[1] = t; SWAP4(d[0], d[2]); SWAP4(d[1], d[3]); output[0] = d[0]; output[2] = d[1]; output[4] = d[0] >> 16; output[6] = d[1] >> 16; output[8] = d[2]; output[10] = d[3]; output[12] = d[2] >> 16; output[14] = d[3] >> 16; #pragma unroll 8 for (int i = 0; i < 16; i+=2) { if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032); output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)&3, 4), 0x7610); output[i+1] = __shfl((int)output[i], (threadIdx.x+2)&3, 4); if (threadIdx.x & 3) output[i] = output[i+1] = 0; } }