/*
 * ccminer/groestl_functions_quad.cu
 * Mirror of https://github.com/GOSTSec/ccminer (synced 2025-01-01 18:55:59 +00:00)
 * Last commit: d9ea5f72ce by Tanguy Pruvot, 2014-08-19 03:29:11 +02:00
 *   "Remove duplicated defines present in cuda_helper.h;
 *    also add cudaDeviceReset() on Ctrl+C for nvprof"
 */
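// Bit-sliced Groestl-512 P/Q permutations computed by four cooperating threads per hash.
// Register r[i] of each thread holds bit plane i of two rows of the 8x16-byte state:
// the lower half-word is row 2*(threadIdx.x & 3), the upper half-word is row
// 2*(threadIdx.x & 3) + 1, with column j stored at bit position j of each half-word.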

#include "cuda_helper.h"
__device__ __forceinline__ void G256_Mul2(uint32_t *regs)
{
uint32_t tmp = regs[7];
regs[7] = regs[6];
regs[6] = regs[5];
regs[5] = regs[4];
regs[4] = regs[3] ^ tmp;
regs[3] = regs[2] ^ tmp;
regs[2] = regs[1];
regs[1] = regs[0] ^ tmp;
regs[0] = tmp;
}
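// AddRoundConstant for permutation Q: the whole state is complemented and the
// round-dependent constants are folded into row 7, which lives in the upper
// half-words of the thread with (threadIdx.x & 3) == 3.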
__device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
{
x0 = ~x0;
x1 = ~x1;
x2 = ~x2;
x3 = ~x3;
x4 = ~x4;
x5 = ~x5;
x6 = ~x6;
x7 = ~x7;
if ((threadIdx.x & 0x03) == 3) {
x0 ^= ((- (round & 0x01) ) & 0xFFFF0000);
x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF0000);
x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF0000);
x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF0000);
x4 ^= 0xAAAA0000;
x5 ^= 0xCCCC0000;
x6 ^= 0xF0F00000;
x7 ^= 0xFF000000;
}
}
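// AddRoundConstant for permutation P: the constants only touch row 0, held in the
// lower half-words of the thread with (threadIdx.x & 3) == 0; planes 4..7 receive the
// column-index pattern and planes 0..3 receive the round number.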
__device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
{
if ((threadIdx.x & 0x03) == 0)
{
x4 ^= 0xAAAA;
x5 ^= 0xCCCC;
x6 ^= 0xF0F0;
x7 ^= 0xFF00;
x0 ^= ((- (round & 0x01) ) & 0xFFFF);
x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF);
x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF);
x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF);
}
}
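// Bit-sliced multiplication in GF(2^4) (tower-field representation), built from
// GF(2^2) operations; the product overwrites x3..x0.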
__device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0)
{
uint32_t t0,t1,t2;
t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1));
t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0;
t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1;
t0 = (x2^x3) & (y2^y3);
x3 = (x3 & y3) ^ t0 ^ t1;
x2 = (x2 & y2) ^ t0 ^ t2;
t0 = (x0^x1) & (y0^y1);
x1 = (x1 & y1) ^ t0 ^ t1;
x0 = (x0 & y0) ^ t0 ^ t2;
}
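// Bit-sliced inversion in GF(2^8) via the tower field GF((2^4)^2); this is the
// non-linear core of the AES S-box used by Groestl's SubBytes.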
__device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1,t2,t3,t4,t5,t6,a,b;
t3 = x7;
t2 = x6;
t1 = x5;
t0 = x4;
G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0);
a = (x4 ^ x0);
t0 ^= a;
t2 ^= (x7 ^ x3) ^ (x5 ^ x1);
t1 ^= (x5 ^ x1) ^ a;
t3 ^= (x6 ^ x2) ^ a;
b = t0 ^ t1;
t4 = (t2 ^ t3) & b;
a = t4 ^ t3 ^ t1;
t5 = (t3 & t1) ^ a;
t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
t4 = (t5 ^ t6) & b;
t1 = (t6 & t1) ^ t4;
t0 = (t5 & t0) ^ t4;
t4 = (t5 ^ t6) & (t2^t3);
t3 = (t6 & t3) ^ t4;
t2 = (t5 & t2) ^ t4;
G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2);
G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2);
}
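// Change of basis from the standard AES polynomial representation into the
// tower-field representation expected by G256_inv_quad.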
__device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
uint32_t t0, t1;
t0 = x0 ^ x1 ^ x2;
t1 = x5 ^ x6;
x2 = t0 ^ t1 ^ x7;
x6 = t0 ^ x3 ^ x6;
x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;
x4 = x0 ^ x4 ^ t1;
x1 = x0 ^ x1 ^ t1;
x7 = x0 ^ t1 ^ x7;
x5 = x0 ^ t1;
}
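// Inverse change of basis back from the tower-field representation; together with the
// final complements in sbox_quad this realizes the affine part of the S-box.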
__device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
uint32_t t0,t2,t3,t5;
x1 ^= x4;
t0 = x1 ^ x6;
x1 ^= x5;
t2 = x0 ^ x2;
x2 = x3 ^ x5;
t2 ^= x2 ^ x6;
x2 ^= x7;
t3 = x4 ^ x2 ^ x6;
t5 = x0 ^ x6;
x4 = x3 ^ x7;
x0 = x3 ^ x5;
x6 = t0;
x3 = t2;
x7 = t3;
x5 = t5;
}
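// SubBytes: applies the AES S-box to every byte of the bit-sliced state -- basis change,
// GF(2^8) inversion, inverse basis change, then the affine constant 0x63 by
// complementing bit planes 0, 1, 5 and 6.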
__device__ __forceinline__ void sbox_quad(uint32_t *r)
{
transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]);
transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]);
r[0] = ~r[0];
r[1] = ~r[1];
r[5] = ~r[5];
r[6] = ~r[6];
}
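// ShiftBytesWide for permutation P: each 16-bit half-word is one row across the 16
// columns and is rotated by its row's shift amount (0,1,2,3,4,5,6,11), built from two
// __byte_perm based 16-bit rotations.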
__device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1;
int tpos = threadIdx.x & 0x03;
int shift1 = tpos << 1;
int shift2 = shift1+1 + ((tpos == 3)<<2);
t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
x0 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
t1 = __byte_perm(x1, 0, 0x3232)>>shift2;
x1 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x2, 0, 0x1010)>>shift1;
t1 = __byte_perm(x2, 0, 0x3232)>>shift2;
x2 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x3, 0, 0x1010)>>shift1;
t1 = __byte_perm(x3, 0, 0x3232)>>shift2;
x3 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x4, 0, 0x1010)>>shift1;
t1 = __byte_perm(x4, 0, 0x3232)>>shift2;
x4 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x5, 0, 0x1010)>>shift1;
t1 = __byte_perm(x5, 0, 0x3232)>>shift2;
x5 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x6, 0, 0x1010)>>shift1;
t1 = __byte_perm(x6, 0, 0x3232)>>shift2;
x6 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x7, 0, 0x1010)>>shift1;
t1 = __byte_perm(x7, 0, 0x3232)>>shift2;
x7 = __byte_perm(t0, t1, 0x5410);
}
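// ShiftBytesWide for permutation Q: same construction as the P variant, with the
// row shift amounts (1,3,5,11,0,2,4,6).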
__device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1;
int tpos = threadIdx.x & 0x03;
int shift1 = (1-(tpos>>1)) + ((tpos & 0x01)<<2);
int shift2 = shift1+2 + ((tpos == 1)<<2);
t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
x0 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
t1 = __byte_perm(x1, 0, 0x3232)>>shift2;
x1 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x2, 0, 0x1010)>>shift1;
t1 = __byte_perm(x2, 0, 0x3232)>>shift2;
x2 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x3, 0, 0x1010)>>shift1;
t1 = __byte_perm(x3, 0, 0x3232)>>shift2;
x3 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x4, 0, 0x1010)>>shift1;
t1 = __byte_perm(x4, 0, 0x3232)>>shift2;
x4 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x5, 0, 0x1010)>>shift1;
t1 = __byte_perm(x5, 0, 0x3232)>>shift2;
x5 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x6, 0, 0x1010)>>shift1;
t1 = __byte_perm(x6, 0, 0x3232)>>shift2;
x6 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x7, 0, 0x1010)>>shift1;
t1 = __byte_perm(x7, 0, 0x3232)>>shift2;
x7 = __byte_perm(t0, t1, 0x5410);
}
#if __CUDA_ARCH__ < 300
/**
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
*/
#undef __shfl
#define __shfl(var, srcLane, width) (uint32_t)(var)
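// Fallback for pre-sm_30 targets: each thread simply keeps its own value, which lets the
// file compile but leaves the shuffle-based row exchange in G256_MixFunction_quad
// functional only on compute capability 3.0 and newer.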
#endif
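// MixBytes: every output byte is a GF(2^8) linear combination of the eight bytes of its
// column, i.e. of all eight rows. A()/S() gather the rows held by the other threads of
// the quad via __shfl (with __byte_perm for odd row offsets); the two multiplications
// by 2 in the circulant matrix circ(02,02,03,04,05,03,05,07) are shared via G256_Mul2.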
__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
{
#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432)
#define A(v, u) __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4)
#define S(idx, l) SHIFT64_16( A(idx, (l+1)), A(idx, l) )
#define DOUBLE_ODD(i, bc) ( S(i, (bc)) ^ A(i, (bc) + 1) )
#define DOUBLE_EVEN(i, bc) ( S(i, (bc)) ^ A(i, (bc) ) )
#define SINGLE_ODD(i, bc) ( S(i, (bc)) )
#define SINGLE_EVEN(i, bc) ( A(i, (bc)) )
uint32_t b[8];
#pragma unroll 8
for(int i=0;i<8;i++)
b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3);
G256_Mul2(b);
#pragma unroll 8
for(int i=0;i<8;i++)
b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6);
G256_Mul2(b);
#pragma unroll 8
for(int i=0;i<8;i++)
r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5);
#undef DOUBLE_ODD
#undef DOUBLE_EVEN
#undef SINGLE_ODD
#undef SINGLE_EVEN
#undef S
#undef A
#undef SHIFT64_16
}
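// The 14-round P permutation of Groestl-512: AddRoundConstant, SubBytes, ShiftBytesWide, MixBytes.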
__device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r)
{
for(int round=0;round<14;round++)
{
G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
sbox_quad(r);
G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
G256_MixFunction_quad(r);
}
}
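// The 14-round Q permutation of Groestl-512, differing from P only in the round
// constants and the ShiftBytesWide offsets.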
__device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r)
{
for(int round=0;round<14;round++)
{
G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
sbox_quad(r);
G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
G256_MixFunction_quad(r);
}
}
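// One compression of a single padded message block, including the output transformation:
// state = P(h ^ m) ^ Q(m) ^ h with the Groestl-512 IV h (a single 0x02 byte, injected as
// 0x00008000 into bit plane 1 of thread 3), followed by the final P(x) ^ x.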
__device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message)
{
#pragma unroll 8
for(int u=0;u<8;u++) state[u] = message[u];
if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000;
groestl512_perm_P_quad(state);
if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000;
groestl512_perm_Q_quad(message);
#pragma unroll 8
for(int u=0;u<8;u++) state[u] ^= message[u];
#pragma unroll 8
for(int u=0;u<8;u++) message[u] = state[u];
groestl512_perm_P_quad(message);
#pragma unroll 8
for(int u=0;u<8;u++) state[u] ^= message[u];
}