1
0
mirror of https://github.com/GOSTSec/ccminer synced 2025-01-18 18:50:11 +00:00
ccminer/groestl_functions_quad.cu

351 lines
9.2 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "cuda_helper.h"
__device__ __forceinline__
void G256_Mul2(uint32_t *regs)
{
uint32_t tmp = regs[7];
regs[7] = regs[6];
regs[6] = regs[5];
regs[5] = regs[4];
regs[4] = regs[3] ^ tmp;
regs[3] = regs[2] ^ tmp;
regs[2] = regs[1];
regs[1] = regs[0] ^ tmp;
regs[0] = tmp;
}
__device__ __forceinline__
void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int rnd)
{
x0 = ~x0;
x1 = ~x1;
x2 = ~x2;
x3 = ~x3;
x4 = ~x4;
x5 = ~x5;
x6 = ~x6;
x7 = ~x7;
#if 0
if ((threadIdx.x & 3) != 3)
return;
int andmask = 0xFFFF0000;
#else
/* from sp: faster (branching problem with if ?) */
uint32_t andmask = -((threadIdx.x & 3) == 3) & 0xFFFF0000U;
#endif
x0 ^= ((- (rnd & 0x01) ) & andmask);
x1 ^= ((-((rnd & 0x02)>>1)) & andmask);
x2 ^= ((-((rnd & 0x04)>>2)) & andmask);
x3 ^= ((-((rnd & 0x08)>>3)) & andmask);
x4 ^= (0xAAAA0000 & andmask);
x5 ^= (0xCCCC0000 & andmask);
x6 ^= (0xF0F00000 & andmask);
x7 ^= (0xFF000000 & andmask);
}
__device__ __forceinline__
void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int rnd)
{
if (threadIdx.x & 3)
return;
int andmask = 0xFFFF;
x0 ^= ((- (rnd & 0x01) ) & andmask);
x1 ^= ((-((rnd & 0x02)>>1)) & andmask);
x2 ^= ((-((rnd & 0x04)>>2)) & andmask);
x3 ^= ((-((rnd & 0x08)>>3)) & andmask);
x4 ^= 0xAAAAU;
x5 ^= 0xCCCCU;
x6 ^= 0xF0F0U;
x7 ^= 0xFF00U;
}
__device__ __forceinline__
void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0)
{
uint32_t t0,t1,t2;
t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1));
t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0;
t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1;
t0 = (x2^x3) & (y2^y3);
x3 = (x3 & y3) ^ t0 ^ t1;
x2 = (x2 & y2) ^ t0 ^ t2;
t0 = (x0^x1) & (y0^y1);
x1 = (x1 & y1) ^ t0 ^ t1;
x0 = (x0 & y0) ^ t0 ^ t2;
}
__device__ __forceinline__
void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1,t2,t3,t4,t5,t6,a,b;
t3 = x7;
t2 = x6;
t1 = x5;
t0 = x4;
G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0);
a = (x4 ^ x0);
t0 ^= a;
t2 ^= (x7 ^ x3) ^ (x5 ^ x1);
t1 ^= (x5 ^ x1) ^ a;
t3 ^= (x6 ^ x2) ^ a;
b = t0 ^ t1;
t4 = (t2 ^ t3) & b;
a = t4 ^ t3 ^ t1;
t5 = (t3 & t1) ^ a;
t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
t4 = (t5 ^ t6) & b;
t1 = (t6 & t1) ^ t4;
t0 = (t5 & t0) ^ t4;
t4 = (t5 ^ t6) & (t2^t3);
t3 = (t6 & t3) ^ t4;
t2 = (t5 & t2) ^ t4;
G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2);
G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2);
}
__device__ __forceinline__
void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
uint32_t t0, t1;
t0 = x0 ^ x1 ^ x2;
t1 = x5 ^ x6;
x2 = t0 ^ t1 ^ x7;
x6 = t0 ^ x3 ^ x6;
x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;
x4 = x0 ^ x4 ^ t1;
x2 = t0 ^ t1 ^ x7;
x1 = x0 ^ x1 ^ t1;
x7 = x0 ^ t1 ^ x7;
x5 = x0 ^ t1;
}
__device__ __forceinline__
void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
uint32_t t0,t2,t3,t5;
x1 ^= x4;
t0 = x1 ^ x6;
x1 ^= x5;
t2 = x0 ^ x2;
x2 = x3 ^ x5;
t2 ^= x2 ^ x6;
x2 ^= x7;
t3 = x4 ^ x2 ^ x6;
t5 = x0 ^ x6;
x4 = x3 ^ x7;
x0 = x3 ^ x5;
x6 = t0;
x3 = t2;
x7 = t3;
x5 = t5;
}
__device__ __forceinline__
void sbox_quad(uint32_t *r)
{
transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]);
transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]);
r[0] = ~r[0];
r[1] = ~r[1];
r[5] = ~r[5];
r[6] = ~r[6];
}
__device__ __forceinline__
void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1;
int tpos = threadIdx.x & 0x03;
int shift1 = tpos << 1;
int shift2 = shift1+1 + ((tpos == 3)<<2);
t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
x0 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
t1 = __byte_perm(x1, 0, 0x3232)>>shift2;
x1 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x2, 0, 0x1010)>>shift1;
t1 = __byte_perm(x2, 0, 0x3232)>>shift2;
x2 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x3, 0, 0x1010)>>shift1;
t1 = __byte_perm(x3, 0, 0x3232)>>shift2;
x3 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x4, 0, 0x1010)>>shift1;
t1 = __byte_perm(x4, 0, 0x3232)>>shift2;
x4 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x5, 0, 0x1010)>>shift1;
t1 = __byte_perm(x5, 0, 0x3232)>>shift2;
x5 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x6, 0, 0x1010)>>shift1;
t1 = __byte_perm(x6, 0, 0x3232)>>shift2;
x6 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x7, 0, 0x1010)>>shift1;
t1 = __byte_perm(x7, 0, 0x3232)>>shift2;
x7 = __byte_perm(t0, t1, 0x5410);
}
__device__ __forceinline__
void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1;
int tpos = threadIdx.x & 0x03;
int shift1 = (1-(tpos>>1)) + ((tpos & 0x01)<<2);
int shift2 = shift1+2 + ((tpos == 1)<<2);
t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
x0 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
t1 = __byte_perm(x1, 0, 0x3232)>>shift2;
x1 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x2, 0, 0x1010)>>shift1;
t1 = __byte_perm(x2, 0, 0x3232)>>shift2;
x2 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x3, 0, 0x1010)>>shift1;
t1 = __byte_perm(x3, 0, 0x3232)>>shift2;
x3 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x4, 0, 0x1010)>>shift1;
t1 = __byte_perm(x4, 0, 0x3232)>>shift2;
x4 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x5, 0, 0x1010)>>shift1;
t1 = __byte_perm(x5, 0, 0x3232)>>shift2;
x5 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x6, 0, 0x1010)>>shift1;
t1 = __byte_perm(x6, 0, 0x3232)>>shift2;
x6 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x7, 0, 0x1010)>>shift1;
t1 = __byte_perm(x7, 0, 0x3232)>>shift2;
x7 = __byte_perm(t0, t1, 0x5410);
}
#if __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the threads own value of var is returned.
*/
#undef __shfl
#define __shfl(var, srcLane, width) (uint32_t)(var)
#endif
__device__ __forceinline__
void G256_MixFunction_quad(uint32_t *r)
{
#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432)
#define A(v, u) __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4)
#define S(idx, l) SHIFT64_16( A(idx, (l+1)), A(idx, l) )
#define DOUBLE_ODD(i, bc) ( S(i, (bc)) ^ A(i, (bc) + 1) )
#define DOUBLE_EVEN(i, bc) ( S(i, (bc)) ^ A(i, (bc) ) )
#define SINGLE_ODD(i, bc) ( S(i, (bc)) )
#define SINGLE_EVEN(i, bc) ( A(i, (bc)) )
uint32_t b[8];
#pragma unroll 8
for(int i=0;i<8;i++)
b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3);
G256_Mul2(b);
#pragma unroll 8
for(int i=0;i<8;i++)
b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6);
G256_Mul2(b);
#pragma unroll 8
for(int i=0;i<8;i++)
r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5);
#undef S
#undef A
#undef SHIFT64_16
#undef t
#undef X
}
__device__ __forceinline__
void groestl512_perm_P_quad(uint32_t *r)
{
for(int round=0;round<14;round++)
{
G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
sbox_quad(r);
G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
G256_MixFunction_quad(r);
}
}
__device__ __forceinline__
void groestl512_perm_Q_quad(uint32_t *r)
{
for(int round=0;round<14;round++)
{
G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
sbox_quad(r);
G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
G256_MixFunction_quad(r);
}
}
__device__ __forceinline__
void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message)
{
#pragma unroll 8
for(int u=0;u<8;u++) state[u] = message[u];
if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000;
groestl512_perm_P_quad(state);
if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000;
groestl512_perm_Q_quad(message);
#pragma unroll 8
for(int u=0;u<8;u++) state[u] ^= message[u];
#pragma unroll 8
for(int u=0;u<8;u++) message[u] = state[u];
groestl512_perm_P_quad(message);
#pragma unroll 8
for(int u=0;u<8;u++) state[u] ^= message[u];
}