1
0
mirror of https://github.com/GOSTSec/ccminer synced 2025-01-09 06:18:07 +00:00
ccminer/bitslice_transformations_quad.cu
Tanguy Pruvot b521acb480 groestl: use sp bitslice enhancement, prepare SM 2.x variant
todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions
2015-01-19 00:42:14 +01:00

134 lines
3.0 KiB
Plaintext

/* File included in quark/groestl (quark/jha,nist5/X11+) and groest/myriad coins for SM 3+ */
#define merge8(z,x,y)\
z=__byte_perm(x, y, 0x5140); \
#define SWAP8(x,y)\
x=__byte_perm(x, y, 0x5410); \
y=__byte_perm(x, y, 0x7632);
#define SWAP4(x,y)\
t = (y<<4); \
t = (x ^ t); \
t = 0xf0f0f0f0UL & t; \
x = (x ^ t); \
t= t>>4;\
y= y ^ t;
#define SWAP2(x,y)\
t = (y<<2); \
t = (x ^ t); \
t = 0xccccccccUL & t; \
x = (x ^ t); \
t= t>>2;\
y= y ^ t;
#define SWAP1(x,y)\
t = (y+y); \
t = (x ^ t); \
t = 0xaaaaaaaaUL & t; \
x = (x ^ t); \
t= t>>1;\
y= y ^ t;
__device__ __forceinline__
void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
uint32_t other[8];
uint32_t d[8];
uint32_t t;
const unsigned int n = threadIdx.x & 3;
#pragma unroll
for (int i = 0; i < 8; i++) {
input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
other[i] = __shfl((int)input[i], (threadIdx.x + 1) & 3, 4);
input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
if (threadIdx.x & 1) {
input[i] = __byte_perm(input[i], 0, 0x1032);
other[i] = __byte_perm(other[i], 0, 0x1032);
}
}
merge8(d[0], input[0], input[4]);
merge8(d[1], other[0], other[4]);
merge8(d[2], input[1], input[5]);
merge8(d[3], other[1], other[5]);
merge8(d[4], input[2], input[6]);
merge8(d[5], other[2], other[6]);
merge8(d[6], input[3], input[7]);
merge8(d[7], other[3], other[7]);
SWAP1(d[0], d[1]);
SWAP1(d[2], d[3]);
SWAP1(d[4], d[5]);
SWAP1(d[6], d[7]);
SWAP2(d[0], d[2]);
SWAP2(d[1], d[3]);
SWAP2(d[4], d[6]);
SWAP2(d[5], d[7]);
SWAP4(d[0], d[4]);
SWAP4(d[1], d[5]);
SWAP4(d[2], d[6]);
SWAP4(d[3], d[7]);
output[0] = d[0];
output[1] = d[1];
output[2] = d[2];
output[3] = d[3];
output[4] = d[4];
output[5] = d[5];
output[6] = d[6];
output[7] = d[7];
}
__device__ __forceinline__
void from_bitslice_quad(const uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
{
uint32_t d[8];
uint32_t t;
d[0] = __byte_perm(input[0], input[4], 0x7531);
d[1] = __byte_perm(input[1], input[5], 0x7531);
d[2] = __byte_perm(input[2], input[6], 0x7531);
d[3] = __byte_perm(input[3], input[7], 0x7531);
SWAP1(d[0], d[1]);
SWAP1(d[2], d[3]);
SWAP2(d[0], d[2]);
SWAP2(d[1], d[3]);
t = __byte_perm(d[0], d[2], 0x5410);
d[2] = __byte_perm(d[0], d[2], 0x7632);
d[0] = t;
t = __byte_perm(d[1], d[3], 0x5410);
d[3] = __byte_perm(d[1], d[3], 0x7632);
d[1] = t;
SWAP4(d[0], d[2]);
SWAP4(d[1], d[3]);
output[0] = d[0];
output[2] = d[1];
output[4] = d[0] >> 16;
output[6] = d[1] >> 16;
output[8] = d[2];
output[10] = d[3];
output[12] = d[2] >> 16;
output[14] = d[3] >> 16;
#pragma unroll 8
for (int i = 0; i < 16; i+=2) {
if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)&3, 4), 0x7610);
output[i+1] = __shfl((int)output[i], (threadIdx.x+2)&3, 4);
if (threadIdx.x & 3) output[i] = output[i+1] = 0;
}
}