mirror of
https://github.com/GOSTSec/sgminer
synced 2025-03-13 06:01:03 +00:00
new kernels (faster)
This commit is contained in:
parent
87d7abef81
commit
67046b3e82
@ -39,18 +39,51 @@ __constant static const sph_u64 blake2b_IV[8] =
|
|||||||
};
|
};
|
||||||
|
|
||||||
/*Blake2b's rotation*/
|
/*Blake2b's rotation*/
|
||||||
static inline sph_u64 rotr64( const sph_u64 w, const unsigned c ){
|
|
||||||
return rotate(w, (ulong)(64-c));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*Blake2b's G function*/
|
|
||||||
|
static inline uint2 ror2(uint2 v, unsigned a) {
|
||||||
|
uint2 result;
|
||||||
|
unsigned n = 64 - a;
|
||||||
|
if (n == 32) { return (uint2)(v.y,v.x); }
|
||||||
|
if (n < 32) {
|
||||||
|
result.y = ((v.y << (n)) | (v.x >> (32 - n)));
|
||||||
|
result.x = ((v.x << (n)) | (v.y >> (32 - n)));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
result.y = ((v.x << (n - 32)) | (v.y >> (64 - n)));
|
||||||
|
result.x = ((v.y << (n - 32)) | (v.x >> (64 - n)));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
static inline uint2 ror2l(uint2 v, unsigned a) {
|
||||||
|
uint2 result;
|
||||||
|
result.y = ((v.x << (32-a)) | (v.y >> (a)));
|
||||||
|
result.x = ((v.y << (32-a)) | (v.x >> (a)));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
static inline uint2 ror2r(uint2 v, unsigned a) {
|
||||||
|
uint2 result;
|
||||||
|
result.y = ((v.y << (64-a)) | (v.x >> (a-32)));
|
||||||
|
result.x = ((v.x << (64-a)) | (v.y >> (a-32)));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
/*
|
||||||
#define G(a,b,c,d) \
|
#define G(a,b,c,d) \
|
||||||
do { \
|
do { \
|
||||||
a += b; d ^= a; d = SPH_ROTR64(d, 32); \
|
a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = d.yx; \
|
||||||
c += d; b ^= c; b = SPH_ROTR64(b, 24); \
|
c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2l(b, 24); \
|
||||||
a += b; d ^= a; d = SPH_ROTR64(d, 16); \
|
a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = ror2l(d, 16); \
|
||||||
c += d; b ^= c; b = SPH_ROTR64(b, 63); \
|
c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2r(b, 63); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
*/
|
||||||
|
#define G(a,b,c,d) \
|
||||||
|
do { \
|
||||||
|
a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = d.yx; \
|
||||||
|
c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = as_uint2(as_uchar8(b).s34567012); \
|
||||||
|
a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = ror2l(d, 16); \
|
||||||
|
c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2r(b, 63); \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*One Round of the Blake2b's compression function*/
|
/*One Round of the Blake2b's compression function*/
|
||||||
@ -72,7 +105,7 @@ c += d; b ^= c; b = SPH_ROTR64(b, 63); \
|
|||||||
for (int i = 0; i < 8; i++) \
|
for (int i = 0; i < 8; i++) \
|
||||||
{ \
|
{ \
|
||||||
\
|
\
|
||||||
for (int j = 0; j < 12; j++) {state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut];} \
|
for (int j = 0; j < 12; j++) {state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) + as_ulong(Matrix[12 * i + j][rowInOut]));} \
|
||||||
round_lyra(state); \
|
round_lyra(state); \
|
||||||
for (int j = 0; j < 12; j++) {Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j];} \
|
for (int j = 0; j < 12; j++) {Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j];} \
|
||||||
\
|
\
|
||||||
@ -97,7 +130,7 @@ c += d; b ^= c; b = SPH_ROTR64(b, 63); \
|
|||||||
for (int i = 0; i < 8; i++) \
|
for (int i = 0; i < 8; i++) \
|
||||||
{ \
|
{ \
|
||||||
for (int j = 0; j < 12; j++) \
|
for (int j = 0; j < 12; j++) \
|
||||||
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
|
state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) + as_ulong(Matrix[12 * i + j][rowInOut])); \
|
||||||
\
|
\
|
||||||
round_lyra(state); \
|
round_lyra(state); \
|
||||||
for (int j = 0; j < 12; j++) {Matrix[j + 12 * i][rowOut] ^= state[j];} \
|
for (int j = 0; j < 12; j++) {Matrix[j + 12 * i][rowOut] ^= state[j];} \
|
||||||
|
@ -70,7 +70,56 @@ typedef long sph_s64;
|
|||||||
#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6
|
#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6
|
||||||
#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n))
|
#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n))
|
||||||
#define SPH_ROTL64(x,n) rotate(x,(ulong)n)
|
#define SPH_ROTL64(x,n) rotate(x,(ulong)n)
|
||||||
#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n))
|
//#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n))
|
||||||
|
|
||||||
|
/*
|
||||||
|
inline ulong rol64 (ulong l,ulong n)
|
||||||
|
{
|
||||||
|
if (n<=32) {
|
||||||
|
uint2 t = rotate(as_uint2(l), (n));
|
||||||
|
return as_ulong((uint2)(bitselect(t.s0, t.s1, (uint)(1 << (n)) - 1), bitselect(t.s0, t.s1, (uint)(~((1 << (n)) - 1))))); }
|
||||||
|
else {
|
||||||
|
uint2 t = rotate(as_uint2(l), (n - 32));
|
||||||
|
return as_ulong((uint2)(bitselect(t.s1, t.s0, (uint)(1 << (n - 32)) - 1), bitselect(t.s1, t.s0, (uint)(~((1 << (n - 32)) - 1)))));
|
||||||
|
}}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
static inline ulong rol64(const ulong vw, unsigned n) {
|
||||||
|
uint2 result;
|
||||||
|
uint2 v=as_uint2(vw);
|
||||||
|
if (n == 32) { return as_ulong((uint2)(v.y,v.x)); }
|
||||||
|
if (n < 32) {
|
||||||
|
result.y = ( (v.y << (n)) | (v.x >> (32 - n)) );
|
||||||
|
result.x = ( (v.x << (n)) | (v.y >> (32 - n)) );
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
result.y = ( (v.x << (n - 32)) | (v.y >> (64 - n)) );
|
||||||
|
result.x = ( (v.y << (n - 32)) | (v.x >> (64 - n)) );
|
||||||
|
}
|
||||||
|
return as_ulong(result);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
static inline sph_u64 ror64(sph_u64 vw, unsigned a) {
|
||||||
|
uint2 result;
|
||||||
|
uint2 v = as_uint2(vw);
|
||||||
|
unsigned n = (unsigned)(64 - a);
|
||||||
|
if (n == 32) { return as_ulong((uint2)(v.y,v.x)); }
|
||||||
|
if (n < 32) {
|
||||||
|
result.y = ((v.y << (n)) | (v.x >> (32 - n)));
|
||||||
|
result.x = ((v.x << (n)) | (v.y >> (32 - n)));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
result.y = ((v.x << (n - 32)) | (v.y >> (64 - n)));
|
||||||
|
result.x = ((v.y << (n - 32)) | (v.x >> (64 - n)));
|
||||||
|
}
|
||||||
|
return as_ulong(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define SPH_ROTR64(l,n) ror64(l,n)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#include "blake256.cl"
|
#include "blake256.cl"
|
||||||
@ -170,7 +219,7 @@ h[7]=h7;
|
|||||||
|
|
||||||
for (int i=0;i<8;i++) {hash->h4[i]=SWAP4(h[i]);}
|
for (int i=0;i<8;i++) {hash->h4[i]=SWAP4(h[i]);}
|
||||||
|
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -194,8 +243,8 @@ __kernel void search1(__global hash_t* hashes)
|
|||||||
|
|
||||||
keccak_block(keccak_gpu_state);
|
keccak_block(keccak_gpu_state);
|
||||||
for (int i = 0; i<4; i++) { hash->h8[i] = keccak_gpu_state[i]; }
|
for (int i = 0; i<4; i++) { hash->h8[i] = keccak_gpu_state[i]; }
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -211,18 +260,18 @@ __kernel void search2(__global hash_t* hashes)
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
sph_u64 state[16];
|
uint2 state[16];
|
||||||
|
|
||||||
for (int i = 0; i<4; i++) { state[i] = hash->h8[i];} //password
|
for (int i = 0; i<4; i++) { state[i] = as_uint2(hash->h8[i]);} //password
|
||||||
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
|
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
|
||||||
|
|
||||||
for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
|
for (int i = 0; i<8; i++) { state[i + 8] = as_uint2(blake2b_IV[i]); }
|
||||||
|
|
||||||
// blake2blyra x2
|
// blake2blyra x2
|
||||||
|
|
||||||
for (int i = 0; i<24; i++) { round_lyra(state); } //because 12 is not enough
|
for (int i = 0; i<24; i++) { round_lyra(state); } //because 12 is not enough
|
||||||
|
|
||||||
sph_u64 Matrix[96][8]; // very uncool
|
__private uint2 Matrix[96][8]; // very uncool
|
||||||
/// reducedSqueezeRow0
|
/// reducedSqueezeRow0
|
||||||
|
|
||||||
for (int i = 0; i < 8; i++)
|
for (int i = 0; i < 8; i++)
|
||||||
@ -248,30 +297,31 @@ __kernel void search2(__global hash_t* hashes)
|
|||||||
reduceDuplexRowSetup(5, 2, 6);
|
reduceDuplexRowSetup(5, 2, 6);
|
||||||
reduceDuplexRowSetup(6, 1, 7);
|
reduceDuplexRowSetup(6, 1, 7);
|
||||||
|
|
||||||
sph_u64 rowa;
|
sph_u32 rowa;
|
||||||
rowa = state[0] & 7;
|
rowa = state[0].x & 7;
|
||||||
|
|
||||||
reduceDuplexRow(7, rowa, 0);
|
reduceDuplexRow(7, rowa, 0);
|
||||||
rowa = state[0] & 7;
|
rowa = state[0].x & 7;
|
||||||
reduceDuplexRow(0, rowa, 3);
|
reduceDuplexRow(0, rowa, 3);
|
||||||
rowa = state[0] & 7;
|
rowa = state[0].x & 7;
|
||||||
reduceDuplexRow(3, rowa, 6);
|
reduceDuplexRow(3, rowa, 6);
|
||||||
rowa = state[0] & 7;
|
rowa = state[0].x & 7;
|
||||||
reduceDuplexRow(6, rowa, 1);
|
reduceDuplexRow(6, rowa, 1);
|
||||||
rowa = state[0] & 7;
|
rowa = state[0].x & 7;
|
||||||
reduceDuplexRow(1, rowa, 4);
|
reduceDuplexRow(1, rowa, 4);
|
||||||
rowa = state[0] & 7;
|
rowa = state[0].x & 7;
|
||||||
reduceDuplexRow(4, rowa, 7);
|
reduceDuplexRow(4, rowa, 7);
|
||||||
rowa = state[0] & 7;
|
rowa = state[0].x & 7;
|
||||||
reduceDuplexRow(7, rowa, 2);
|
reduceDuplexRow(7, rowa, 2);
|
||||||
rowa = state[0] & 7;
|
rowa = state[0].x & 7;
|
||||||
reduceDuplexRow(2, rowa, 5);
|
reduceDuplexRow(2, rowa, 5);
|
||||||
|
|
||||||
absorbblock(rowa);
|
absorbblock(rowa);
|
||||||
|
|
||||||
for (int i = 0; i<4; i++) {hash->h8[i] = state[i];}
|
for (int i = 0; i<4; i++) {hash->h8[i] = as_ulong(state[i]);}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -344,44 +394,99 @@ __kernel void search3(__global hash_t* hashes)
|
|||||||
hash->h8[1] = p1;
|
hash->h8[1] = p1;
|
||||||
hash->h8[2] = p2;
|
hash->h8[2] = p2;
|
||||||
hash->h8[3] = p3;
|
hash->h8[3] = p3;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
||||||
__kernel void search4(__global hash_t* hashes, __global uint* output, const uint target)
|
__kernel void search4(__global hash_t* hashes, __global uint* output, const ulong target)
|
||||||
{
|
{
|
||||||
|
// __local ulong T0[256], T1[256], T2[256], T3[256], T4[256], T5[256], T6[256], T7[256];
|
||||||
|
// uint u = get_local_id(0);
|
||||||
|
/*
|
||||||
|
for (uint u = get_local_id(0); u < 256; u += get_local_size(0)) {
|
||||||
|
|
||||||
|
|
||||||
|
T0[u] = T0_G[u];
|
||||||
|
T1[u] = T1_G[u];
|
||||||
|
T2[u] = T2_G[u];
|
||||||
|
T3[u] = T3_G[u];
|
||||||
|
T4[u] = T4_G[u];
|
||||||
|
T5[u] = T5_G[u];
|
||||||
|
T6[u] = T6_G[u];
|
||||||
|
T7[u] = T7_G[u];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
T1[u] = SPH_ROTL64(T0[u], 8UL);
|
||||||
|
T2[u] = SPH_ROTL64(T0[u], 16UL);
|
||||||
|
T3[u] = SPH_ROTL64(T0[u], 24UL);
|
||||||
|
T4[u] = SPH_ROTL64(T0[u], 32UL);
|
||||||
|
T5[u] = SPH_ROTL64(T0[u], 40UL);
|
||||||
|
T6[u] = SPH_ROTL64(T0[u], 48UL);
|
||||||
|
T7[u] = SPH_ROTL64(T0[u], 56UL);
|
||||||
|
|
||||||
|
*/
|
||||||
uint gid = get_global_id(0);
|
uint gid = get_global_id(0);
|
||||||
|
|
||||||
__global hash_t *hash = &(hashes[gid - get_global_offset(0)]);
|
__global hash_t *hash = &(hashes[gid - get_global_offset(0)]);
|
||||||
|
|
||||||
sph_u64 message[8], state[8];
|
|
||||||
sph_u64 t[8];
|
|
||||||
for (int k = 0; k<4; k++) { message[k] = hash->h8[k]; }
|
__private ulong message[8], state[8];
|
||||||
|
__private ulong t[8];
|
||||||
|
|
||||||
|
|
||||||
|
for (int u = 0; u<4; u++) { message[u] = hash->h8[u]; }
|
||||||
|
|
||||||
message[4] = 0x80UL;
|
message[4] = 0x80UL;
|
||||||
message[5] = 0UL;
|
message[5] = 0UL;
|
||||||
message[6] = 0UL;
|
message[6] = 0UL;
|
||||||
message[7] = 0x0100000000000000UL;
|
message[7] = 0x0100000000000000UL;
|
||||||
|
|
||||||
|
|
||||||
for (int u = 0; u<8; u++) { state[u] = message[u]; }
|
for (int u = 0; u<8; u++) { state[u] = message[u]; }
|
||||||
state[7] ^= 0x0001000000000000UL;
|
state[7] ^= 0x0001000000000000UL;
|
||||||
|
|
||||||
PERM_SMALL_P(state);
|
|
||||||
|
|
||||||
|
for (int r = 0; r < 10; r ++) {ROUND_SMALL_P(state, r); }
|
||||||
|
|
||||||
state[7] ^= 0x0001000000000000UL;
|
state[7] ^= 0x0001000000000000UL;
|
||||||
PERM_SMALL_Q(message);
|
|
||||||
|
|
||||||
|
for (int r = 0; r < 10; r ++) {ROUND_SMALL_Q(message, r); }
|
||||||
|
|
||||||
|
|
||||||
for (int u = 0; u<8; u++) { state[u] ^= message[u]; }
|
for (int u = 0; u<8; u++) { state[u] ^= message[u]; }
|
||||||
message[7] = state[7];
|
message[7] = state[7];
|
||||||
|
|
||||||
PERM_SMALL_Pf(state);
|
for (int r = 0; r < 9; r ++) {ROUND_SMALL_P(state, r); }
|
||||||
|
uchar8 State;
|
||||||
|
State.s0 =as_uchar8(state[7]^0x79).s0;
|
||||||
|
State.s1 =as_uchar8(state[0]^0x09).s1;
|
||||||
|
State.s2 =as_uchar8(state[1]^0x19).s2;
|
||||||
|
State.s3 =as_uchar8(state[2]^0x29).s3;
|
||||||
|
State.s4 =as_uchar8(state[3]^0x39).s4;
|
||||||
|
State.s5 =as_uchar8(state[4]^0x49).s5;
|
||||||
|
State.s6 =as_uchar8(state[5]^0x59).s6;
|
||||||
|
State.s7 =as_uchar8(state[6]^0x69).s7;
|
||||||
|
|
||||||
state[7] ^= message[7];
|
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
state[7] =T0_G[State.s0]
|
||||||
|
^ R64(T0_G[State.s1], 8)
|
||||||
|
^ R64(T0_G[State.s2], 16)
|
||||||
|
^ R64(T0_G[State.s3], 24)
|
||||||
|
^ T4_G[State.s4]
|
||||||
|
^ R64(T4_G[State.s5], 8)
|
||||||
|
^ R64(T4_G[State.s6], 16)
|
||||||
|
^ R64(T4_G[State.s7], 24) ^message[7];
|
||||||
|
|
||||||
bool result = ( as_uint2(state[7]).y <= target);
|
// t[7] ^= message[7];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
|
||||||
|
bool result = ( state[7] <= target);
|
||||||
if (result) {
|
if (result) {
|
||||||
output[atomic_inc(output + 0xFF)] = SWAP4(gid);
|
output[atomic_inc(output + 0xFF)] = SWAP4(gid);
|
||||||
}
|
}
|
||||||
|
@ -57,8 +57,8 @@
|
|||||||
#define USE_LE 1
|
#define USE_LE 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if USE_LE
|
|
||||||
|
|
||||||
|
#if USE_LE
|
||||||
#if SPH_64
|
#if SPH_64
|
||||||
#define C64e(x) ((SPH_C64(x) >> 56) \
|
#define C64e(x) ((SPH_C64(x) >> 56) \
|
||||||
| ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
|
| ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
|
||||||
@ -1173,6 +1173,8 @@ __constant static const sph_u64 T7[] = {
|
|||||||
^ R64(T4[B64_7(a[b7])], 24); \
|
^ R64(T4[B64_7(a[b7])], 24); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
|
#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
|
||||||
@ -1186,6 +1188,9 @@ __constant static const sph_u64 T7[] = {
|
|||||||
^ T7[B64_7(a[b7])]; \
|
^ T7[B64_7(a[b7])]; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if SPH_SMALL_FOOTPRINT_GROESTL
|
#if SPH_SMALL_FOOTPRINT_GROESTL
|
||||||
@ -1417,6 +1422,9 @@ __constant static const sph_u64 T7[] = {
|
|||||||
} while (0)
|
} while (0)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define PERM_BIG_P(a) do { \
|
#define PERM_BIG_P(a) do { \
|
||||||
int r; \
|
int r; \
|
||||||
for (r = 0; r < 14; ++r) { \
|
for (r = 0; r < 14; ++r) { \
|
||||||
@ -1429,4 +1437,125 @@ __constant static const sph_u64 T7[] = {
|
|||||||
for (r = 0; r < 14; ++r) { \
|
for (r = 0; r < 14; ++r) { \
|
||||||
ROUND_BIG_Q(a, r); \
|
ROUND_BIG_Q(a, r); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
#if SPH_SMALL_FOOTPRINT_GROESTL
|
||||||
|
|
||||||
|
#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
|
||||||
|
t[d] = T0[B64_0(a[b0])] \
|
||||||
|
^ R64(T0[B64_1(a[b1])], 8) \
|
||||||
|
^ R64(T0[B64_2(a[b2])], 16) \
|
||||||
|
^ R64(T0[B64_3(a[b3])], 24) \
|
||||||
|
^ T4[B64_4(a[b4])] \
|
||||||
|
^ R64(T4[B64_5(a[b5])], 8) \
|
||||||
|
^ R64(T4[B64_6(a[b6])], 16) \
|
||||||
|
^ R64(T4[B64_7(a[b7])], 24); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
|
||||||
|
t[d] = T0[B64_0(a[b0])] \
|
||||||
|
^ T1[B64_1(a[b1])] \
|
||||||
|
^ T2[B64_2(a[b2])] \
|
||||||
|
^ T3[B64_3(a[b3])] \
|
||||||
|
^ T4[B64_4(a[b4])] \
|
||||||
|
^ T5[B64_5(a[b5])] \
|
||||||
|
^ T6[B64_6(a[b6])] \
|
||||||
|
^ T7[B64_7(a[b7])]; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#define ROUND_SMALL_P(a, r) do { \
|
||||||
|
sph_u64 t[8]; \
|
||||||
|
a[0] ^= PC64(0x00, r); \
|
||||||
|
a[1] ^= PC64(0x10, r); \
|
||||||
|
a[2] ^= PC64(0x20, r); \
|
||||||
|
a[3] ^= PC64(0x30, r); \
|
||||||
|
a[4] ^= PC64(0x40, r); \
|
||||||
|
a[5] ^= PC64(0x50, r); \
|
||||||
|
a[6] ^= PC64(0x60, r); \
|
||||||
|
a[7] ^= PC64(0x70, r); \
|
||||||
|
RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \
|
||||||
|
RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \
|
||||||
|
RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \
|
||||||
|
RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \
|
||||||
|
RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \
|
||||||
|
RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \
|
||||||
|
RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \
|
||||||
|
RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \
|
||||||
|
a[0] = t[0]; \
|
||||||
|
a[1] = t[1]; \
|
||||||
|
a[2] = t[2]; \
|
||||||
|
a[3] = t[3]; \
|
||||||
|
a[4] = t[4]; \
|
||||||
|
a[5] = t[5]; \
|
||||||
|
a[6] = t[6]; \
|
||||||
|
a[7] = t[7]; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define ROUND_SMALL_Q(a, r) do { \
|
||||||
|
sph_u64 t[8]; \
|
||||||
|
a[0] ^= QC64(0x00, r); \
|
||||||
|
a[1] ^= QC64(0x10, r); \
|
||||||
|
a[2] ^= QC64(0x20, r); \
|
||||||
|
a[3] ^= QC64(0x30, r); \
|
||||||
|
a[4] ^= QC64(0x40, r); \
|
||||||
|
a[5] ^= QC64(0x50, r); \
|
||||||
|
a[6] ^= QC64(0x60, r); \
|
||||||
|
a[7] ^= QC64(0x70, r); \
|
||||||
|
RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \
|
||||||
|
RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \
|
||||||
|
RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \
|
||||||
|
RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \
|
||||||
|
RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \
|
||||||
|
RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \
|
||||||
|
RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \
|
||||||
|
RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \
|
||||||
|
a[0] = t[0]; \
|
||||||
|
a[1] = t[1]; \
|
||||||
|
a[2] = t[2]; \
|
||||||
|
a[3] = t[3]; \
|
||||||
|
a[4] = t[4]; \
|
||||||
|
a[5] = t[5]; \
|
||||||
|
a[6] = t[6]; \
|
||||||
|
a[7] = t[7]; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#if SPH_SMALL_FOOTPRINT_GROESTL
|
||||||
|
|
||||||
|
#define PERM_SMALL_P(a) do { \
|
||||||
|
int r; \
|
||||||
|
for (r = 0; r < 10; r ++) \
|
||||||
|
ROUND_SMALL_P(a, r); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define PERM_SMALL_Q(a) do { \
|
||||||
|
int r; \
|
||||||
|
for (r = 0; r < 10; r ++) \
|
||||||
|
ROUND_SMALL_Q(a, r); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Apparently, unrolling more than that confuses GCC, resulting in
|
||||||
|
* lower performance, even though L1 cache would be no problem.
|
||||||
|
*/
|
||||||
|
#define PERM_SMALL_P(a) do { \
|
||||||
|
int r; \
|
||||||
|
for (r = 0; r < 10; r += 2) { \
|
||||||
|
ROUND_SMALL_P(a, r + 0); \
|
||||||
|
ROUND_SMALL_P(a, r + 1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define PERM_SMALL_Q(a) do { \
|
||||||
|
int r; \
|
||||||
|
for (r = 0; r < 10; r += 2) { \
|
||||||
|
ROUND_SMALL_Q(a, r + 0); \
|
||||||
|
ROUND_SMALL_Q(a, r + 1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
1120
kernel/groestl256.cl
1120
kernel/groestl256.cl
File diff suppressed because it is too large
Load Diff
1854
kernel/groestlcoin-v1.cl
Normal file
1854
kernel/groestlcoin-v1.cl
Normal file
File diff suppressed because it is too large
Load Diff
@ -68,9 +68,7 @@ __constant static const sph_u64 t12[6] =
|
|||||||
0xff00000000000000UL,
|
0xff00000000000000UL,
|
||||||
0xff00000000000008UL
|
0xff00000000000008UL
|
||||||
};
|
};
|
||||||
static inline ulong ROTL64(const ulong v, const ulong n){
|
|
||||||
return rotate(v,n);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT) { \
|
#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT) { \
|
||||||
p0 += p1; p1 = SPH_ROTL64(p1, ROT256[ROT][0]); p1 ^= p0; \
|
p0 += p1; p1 = SPH_ROTL64(p1, ROT256[ROT][0]); p1 ^= p0; \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user