Browse Source

Old Kernel Fix

Old kernels would produce HW. Also, corrected an hamsi problem for
people who didn't use hamsi-expand-big 1.
djm34
ystarnaud 11 years ago
parent
commit
22c34fbf45
  1. 4
      algorithm.c
  2. 11
      kernel/bitblock.cl
  3. 657
      kernel/bitblockold.cl
  4. 2
      kernel/hamsi.cl
  5. 11
      kernel/marucoin-mod.cl
  6. 540
      kernel/marucoin-modold.cl
  7. 11
      kernel/x14.cl
  8. 602
      kernel/x14old.cl

4
algorithm.c

@ -94,8 +94,8 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru @@ -94,8 +94,8 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
static void append_hamsi_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
{
char buf[255];
sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d%s ",
opt_hamsi_expand_big, ((opt_hamsi_short)?" -D SPH_HAMSI_SHORT=1 ":""));
sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d -D SPH_HAMSI_SHORT=%d ",
opt_hamsi_expand_big, ((opt_hamsi_short)?1:0));
strcat(data->compiler_options, buf);
sprintf(buf, "big%u%s", (unsigned int)opt_hamsi_expand_big, ((opt_hamsi_short)?"hs":""));

11
kernel/bitblock.cl

@ -80,9 +80,6 @@ typedef int sph_s32; @@ -80,9 +80,6 @@ typedef int sph_s32;
#ifndef SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 1
#endif
#ifndef SPH_HAMSI_SHORT
#define SPH_HAMSI_SHORT 1
#endif
#include "blake.cl"
#include "bmw.cl"
@ -1108,6 +1105,8 @@ __kernel void search11(__global hash_t* hashes) @@ -1108,6 +1105,8 @@ __kernel void search11(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
#ifdef INPUT_BIG_LOCAL
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
@ -1117,6 +1116,9 @@ __kernel void search11(__global hash_t* hashes) @@ -1117,6 +1116,9 @@ __kernel void search11(__global hash_t* hashes)
T512_L[i] = T512_C[i];
barrier(CLK_LOCAL_MEM_FENCE);
#else
#define INPUT_BIG_LOCAL INPUT_BIG
#endif
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
@ -1128,7 +1130,8 @@ __kernel void search11(__global hash_t* hashes) @@ -1128,7 +1130,8 @@ __kernel void search11(__global hash_t* hashes)
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8) {
for(int i = 0; i < 64; i += 8)
{
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;

657
kernel/bitblockold.cl

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
/*
* X13 kernel implementation.
* X15 kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
@ -75,7 +75,9 @@ typedef long sph_s64; @@ -75,7 +75,9 @@ typedef long sph_s64;
#define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0
#define SPH_KECCAK_UNROLL 1
#ifndef SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 4
#endif
#include "blake.cl"
#include "bmw.cl"
@ -115,8 +117,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -115,8 +117,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// blake
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -125,13 +127,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -125,13 +127,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;;
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
{
T1 = SPH_T64(T1 + 1);
}
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
M0 = DEC64BE(block + 0);
M1 = DEC64BE(block + 8);
M2 = DEC64BE(block + 16);
@ -170,16 +172,13 @@ __kernel void search1(__global hash_t* hashes) @@ -170,16 +172,13 @@ __kernel void search1(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// bmw
sph_u64 BMW_H[16];
#pragma unroll 16
for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u];
sph_u64 mv[16],q[32];
sph_u64 tmp;
sph_u64 BMW_h1[16], BMW_h2[16];
sph_u64 mv[16];
mv[ 0] = SWAP8(hash->h8[0]);
mv[ 1] = SWAP8(hash->h8[1]);
@ -196,243 +195,35 @@ __kernel void search1(__global hash_t* hashes) @@ -196,243 +195,35 @@ __kernel void search1(__global hash_t* hashes)
mv[12] = 0;
mv[13] = 0;
mv[14] = 0;
mv[15] = SPH_C64(512);
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
mv[15] = 0x200;
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
FOLDb;
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
#undef M
#undef H
#undef dH
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x])
#define H(x) (final_b[x])
#define dH(x) (BMW_h1[x])
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
FOLDb;
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#undef M
#undef H
#undef dH
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash->h8[0] = SWAP8(BMW_H[8]);
hash->h8[1] = SWAP8(BMW_H[9]);
hash->h8[2] = SWAP8(BMW_H[10]);
hash->h8[3] = SWAP8(BMW_H[11]);
hash->h8[4] = SWAP8(BMW_H[12]);
hash->h8[5] = SWAP8(BMW_H[13]);
hash->h8[6] = SWAP8(BMW_H[14]);
hash->h8[7] = SWAP8(BMW_H[15]);
hash->h8[0] = SWAP8(BMW_h1[8]);
hash->h8[1] = SWAP8(BMW_h1[9]);
hash->h8[2] = SWAP8(BMW_h1[10]);
hash->h8[3] = SWAP8(BMW_h1[11]);
hash->h8[4] = SWAP8(BMW_h1[12]);
hash->h8[5] = SWAP8(BMW_h1[13]);
hash->h8[6] = SWAP8(BMW_h1[14]);
hash->h8[7] = SWAP8(BMW_h1[15]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -451,15 +242,14 @@ __kernel void search2(__global hash_t* hashes) @@ -451,15 +242,14 @@ __kernel void search2(__global hash_t* hashes)
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T4_L[i] = T4[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
@ -472,38 +262,47 @@ __kernel void search2(__global hash_t* hashes) @@ -472,38 +262,47 @@ __kernel void search2(__global hash_t* hashes)
#define T7 T7_L
// groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u];
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -528,14 +327,10 @@ __kernel void search3(__global hash_t* hashes) @@ -528,14 +327,10 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8(hash->h8[5]);
m6 = SWAP8(hash->h8[6]);
m7 = SWAP8(hash->h8[7]);
UBI_BIG(480, 64);
bcount = 0;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
UBI_BIG(510, 8);
hash->h8[0] = SWAP8(h0);
hash->h8[1] = SWAP8(h1);
hash->h8[2] = SWAP8(h2);
@ -562,8 +357,7 @@ __kernel void search4(__global hash_t* hashes) @@ -562,8 +357,7 @@ __kernel void search4(__global hash_t* hashes)
for(int i = 0; i < 2; i++)
{
if (i == 0)
{
if (i == 0) {
h0h ^= DEC64E(hash->h8[0]);
h0l ^= DEC64E(hash->h8[1]);
h1h ^= DEC64E(hash->h8[2]);
@ -572,9 +366,7 @@ __kernel void search4(__global hash_t* hashes) @@ -572,9 +366,7 @@ __kernel void search4(__global hash_t* hashes)
h2l ^= DEC64E(hash->h8[5]);
h3h ^= DEC64E(hash->h8[6]);
h3l ^= DEC64E(hash->h8[7]);
}
else if(i == 1)
{
} else if(i == 1) {
h4h ^= DEC64E(hash->h8[0]);
h4l ^= DEC64E(hash->h8[1]);
h5h ^= DEC64E(hash->h8[2]);
@ -635,7 +427,6 @@ __kernel void search5(__global hash_t* hashes) @@ -635,7 +427,6 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8(hash->h8[7]);
a31 ^= 0x8000000000000001;
KECCAK_F_1600;
// Finalize the "lane complement"
a10 = ~a10;
a20 = ~a20;
@ -682,8 +473,7 @@ __kernel void search6(__global hash_t* hashes) @@ -682,8 +473,7 @@ __kernel void search6(__global hash_t* hashes)
MI5;
LUFFA_P5;
if(i == 0)
{
if(i == 0) {
M0 = hash->h4[9];
M1 = hash->h4[8];
M2 = hash->h4[11];
@ -692,16 +482,12 @@ __kernel void search6(__global hash_t* hashes) @@ -692,16 +482,12 @@ __kernel void search6(__global hash_t* hashes)
M5 = hash->h4[12];
M6 = hash->h4[15];
M7 = hash->h4[14];
}
else if(i == 1)
{
} else if(i == 1) {
M0 = 0x80000000;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
}
else if(i == 2)
} else if(i == 2) {
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
else if(i == 3)
{
} else if(i == 3) {
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
@ -751,12 +537,10 @@ __kernel void search7(__global hash_t* hashes) @@ -751,12 +537,10 @@ __kernel void search7(__global hash_t* hashes)
x6 ^= SWAP4(hash->h4[7]);
x7 ^= SWAP4(hash->h4[6]);
for (int i = 0; i < 13; i ++)
{
for (int i = 0; i < 13; i ++) {
SIXTEEN_ROUNDS;
if (i == 0)
{
if (i == 0) {
x0 ^= SWAP4(hash->h4[9]);
x1 ^= SWAP4(hash->h4[8]);
x2 ^= SWAP4(hash->h4[11]);
@ -765,12 +549,12 @@ __kernel void search7(__global hash_t* hashes) @@ -765,12 +549,12 @@ __kernel void search7(__global hash_t* hashes)
x5 ^= SWAP4(hash->h4[12]);
x6 ^= SWAP4(hash->h4[15]);
x7 ^= SWAP4(hash->h4[14]);
}
else if(i == 1)
} else if(i == 1) {
x0 ^= 0x80;
else if (i == 2)
} else if (i == 2) {
xv ^= SPH_C32(1);
}
}
hash->h4[0] = x0;
hash->h4[1] = x1;
@ -797,7 +581,6 @@ __kernel void search8(__global hash_t* hashes) @@ -797,7 +581,6 @@ __kernel void search8(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
int init = get_local_id(0);
@ -826,7 +609,7 @@ __kernel void search8(__global hash_t* hashes) @@ -826,7 +609,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0];
rk01 = hash->h4[1];
@ -892,8 +675,7 @@ __kernel void search9(__global hash_t* hashes) @@ -892,8 +675,7 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);
FFT256(0, 1, 0, ll1);
for (int i = 0; i < 256; i ++)
{
for (int i = 0; i < 256; i ++) {
s32 tq;
tq = q[i] + yoff_b_n[i];
@ -929,17 +711,14 @@ __kernel void search9(__global hash_t* hashes) @@ -929,17 +711,14 @@ __kernel void search9(__global hash_t* hashes)
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
IF, 4, 13, PP8_4_);
STEP_BIG(
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
IF, 13, 10, PP8_5_);
STEP_BIG(
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
IF, 10, 25, PP8_6_);
STEP_BIG(
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
@ -958,27 +737,22 @@ __kernel void search9(__global hash_t* hashes) @@ -958,27 +737,22 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
STEP_BIG(
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4, 13, PP8_4_);
STEP_BIG(
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13, 10, PP8_5_);
STEP_BIG(
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10, 25, PP8_6_);
STEP_BIG(
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25, 4, PP8_0_);
#undef q
hash->h4[0] = A0;
@ -1006,7 +780,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1006,7 +780,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
{
uint gid = get_global_id(0);
uint offset = get_global_offset(0);
__global hash_t *hash = &(hashes[gid-offset]);
hash_t hash;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
@ -1023,9 +797,20 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1023,9 +797,20 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef INPUT_BIG_LOCAL
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
for (int i = init; i < 1024; i += step)
T512_L[i] = T512_C[i];
barrier(CLK_LOCAL_MEM_FENCE);
#else
#define INPUT_BIG_LOCAL INPUT_BIG
#endif
// mixtab
__local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256];
for (int i = init; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i];
@ -1033,37 +818,17 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1033,37 +818,17 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
mixtab2[i] = mixtab2_c[i];
mixtab3[i] = mixtab3_c[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
for (int i = init; i < 1024; i += step)
T512_L[i] = T512_C[i];
barrier(CLK_LOCAL_MEM_FENCE);
__local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256];
for (int i = init; i < 256; i += step)
{
LT0[i] = plain_T0[i];
LT1[i] = plain_T1[i];
LT2[i] = plain_T2[i];
LT3[i] = plain_T3[i];
LT4[i] = plain_T4[i];
LT5[i] = plain_T5[i];
LT6[i] = plain_T6[i];
LT7[i] = plain_T7[i];
for (int i = 0; i < 8; i++) {
hash.h8[i] = hashes[gid-offset].h8[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
// echo
for (int i = 0; i < 8; i++)
hash->h8[i] = hashes[gid-offset].h8[i];
{
// echo
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1;
sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71;
Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL;
@ -1090,14 +855,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1090,14 +855,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
W61 = Vb61;
W70 = Vb70;
W71 = Vb71;
W80 = hash->h8[0];
W81 = hash->h8[1];
W90 = hash->h8[2];
W91 = hash->h8[3];
WA0 = hash->h8[4];
WA1 = hash->h8[5];
WB0 = hash->h8[6];
WB1 = hash->h8[7];
W80 = hash.h8[0];
W81 = hash.h8[1];
W90 = hash.h8[2];
W91 = hash.h8[3];
WA0 = hash.h8[4];
WA1 = hash.h8[5];
WB0 = hash.h8[6];
WB1 = hash.h8[7];
WC0 = 0x80;
WC1 = 0;
WD0 = 0;
@ -1107,19 +872,25 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1107,19 +872,25 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
WF0 = 0x200;
WF1 = 0;
for (unsigned u = 0; u < 10; u ++)
for (unsigned u = 0; u < 10; u ++) {
BIG_ROUND;
}
hash->h8[0] ^= Vb00 ^ W00 ^ W80;
hash->h8[1] ^= Vb01 ^ W01 ^ W81;
hash->h8[2] ^= Vb10 ^ W10 ^ W90;
hash->h8[3] ^= Vb11 ^ W11 ^ W91;
hash->h8[4] ^= Vb20 ^ W20 ^ WA0;
hash->h8[5] ^= Vb21 ^ W21 ^ WA1;
hash->h8[6] ^= Vb30 ^ W30 ^ WB0;
hash->h8[7] ^= Vb31 ^ W31 ^ WB1;
hash.h8[0] ^= Vb00 ^ W00 ^ W80;
hash.h8[1] ^= Vb01 ^ W01 ^ W81;
hash.h8[2] ^= Vb10 ^ W10 ^ W90;
hash.h8[3] ^= Vb11 ^ W11 ^ W91;
hash.h8[4] ^= Vb20 ^ W20 ^ WA0;
hash.h8[5] ^= Vb21 ^ W21 ^ WA1;
hash.h8[6] ^= Vb30 ^ W30 ^ WB0;
hash.h8[7] ^= Vb31 ^ W31 ^ WB1;
}
// hamsi
{
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
@ -1128,39 +899,38 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1128,39 +899,38 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8)
{
#define buf(u) hash.h1[i + u]
for(int i = 0; i < 64; i += 8) {
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG_LOCAL;
PF_BIG;
T_BIG;
for (unsigned u = 0; u < 16; u ++)
hash->h4[u] = h[u];
hash.h4[u] = h[u];
}
// fugue
{
sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
sph_u32 S30, S31, S32, S33, S34, S35;
ulong fc_bit_count = (sph_u64) 0x200;
ulong fc_bit_count = (sph_u64) 64 << 3;
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
@ -1168,25 +938,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1168,25 +938,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2]));
FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5]));
FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8]));
FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB]));
FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE]));
FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
FUGUE512_3((hash.h4[0x0]), (hash.h4[0x1]), (hash.h4[0x2]));
FUGUE512_3((hash.h4[0x3]), (hash.h4[0x4]), (hash.h4[0x5]));
FUGUE512_3((hash.h4[0x6]), (hash.h4[0x7]), (hash.h4[0x8]));
FUGUE512_3((hash.h4[0x9]), (hash.h4[0xA]), (hash.h4[0xB]));
FUGUE512_3((hash.h4[0xC]), (hash.h4[0xD]), (hash.h4[0xE]));
FUGUE512_3((hash.h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
// apply round shift if necessary
int i;
for (i = 0; i < 32; i ++)
{
for (i = 0; i < 32; i ++) {
ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03);
}
for (i = 0; i < 13; i ++)
{
for (i = 0; i < 13; i ++) {
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
@ -1217,24 +984,27 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1217,24 +984,27 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S18 ^= S00;
S27 ^= S00;
hash->h4[0] = SWAP4(S01);
hash->h4[1] = SWAP4(S02);
hash->h4[2] = SWAP4(S03);
hash->h4[3] = SWAP4(S04);
hash->h4[4] = SWAP4(S09);
hash->h4[5] = SWAP4(S10);
hash->h4[6] = SWAP4(S11);
hash->h4[7] = SWAP4(S12);
hash->h4[8] = SWAP4(S18);
hash->h4[9] = SWAP4(S19);
hash->h4[10] = SWAP4(S20);
hash->h4[11] = SWAP4(S21);
hash->h4[12] = SWAP4(S27);
hash->h4[13] = SWAP4(S28);
hash->h4[14] = SWAP4(S29);
hash->h4[15] = SWAP4(S30);
hash.h4[0] = SWAP4(S01);
hash.h4[1] = SWAP4(S02);
hash.h4[2] = SWAP4(S03);
hash.h4[3] = SWAP4(S04);
hash.h4[4] = SWAP4(S09);
hash.h4[5] = SWAP4(S10);
hash.h4[6] = SWAP4(S11);
hash.h4[7] = SWAP4(S12);
hash.h4[8] = SWAP4(S18);
hash.h4[9] = SWAP4(S19);
hash.h4[10] = SWAP4(S20);
hash.h4[11] = SWAP4(S21);
hash.h4[12] = SWAP4(S27);
hash.h4[13] = SWAP4(S28);
hash.h4[14] = SWAP4(S29);
hash.h4[15] = SWAP4(S30);
}
//shabal
{
sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[2], A03 = A_init_512[3], A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[6], A07 = A_init_512[7],
A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
sph_u32 B0 = B_init_512[0], B1 = B_init_512[1], B2 = B_init_512[2], B3 = B_init_512[3], B4 = B_init_512[4], B5 = B_init_512[5], B6 = B_init_512[6], B7 = B_init_512[7],
@ -1244,22 +1014,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1244,22 +1014,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
sph_u32 Wlow = 1, Whigh = 0;
M0 = hash->h4[0];
M1 = hash->h4[1];
M2 = hash->h4[2];
M3 = hash->h4[3];
M4 = hash->h4[4];
M5 = hash->h4[5];
M6 = hash->h4[6];
M7 = hash->h4[7];
M8 = hash->h4[8];
M9 = hash->h4[9];
MA = hash->h4[10];
MB = hash->h4[11];
MC = hash->h4[12];
MD = hash->h4[13];
ME = hash->h4[14];
MF = hash->h4[15];
M0 = hash.h4[0];
M1 = hash.h4[1];
M2 = hash.h4[2];
M3 = hash.h4[3];
M4 = hash.h4[4];
M5 = hash.h4[5];
M6 = hash.h4[6];
M7 = hash.h4[7];
M8 = hash.h4[8];
M9 = hash.h4[9];
MA = hash.h4[10];
MB = hash.h4[11];
MC = hash.h4[12];
MD = hash.h4[13];
ME = hash.h4[14];
MF = hash.h4[15];
INPUT_BLOCK_ADD;
XOR_W;
@ -1274,44 +1044,44 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1274,44 +1044,44 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
for (unsigned i = 0; i < 3; i ++)
{
for (unsigned i = 0; i < 3; i ++) {
SWAP_BC;
XOR_W;
APPLY_P;
}
hash->h4[0] = B0;
hash->h4[1] = B1;
hash->h4[2] = B2;
hash->h4[3] = B3;
hash->h4[4] = B4;
hash->h4[5] = B5;
hash->h4[6] = B6;
hash->h4[7] = B7;
hash->h4[8] = B8;
hash->h4[9] = B9;
hash->h4[10] = BA;
hash->h4[11] = BB;
hash->h4[12] = BC;
hash->h4[13] = BD;
hash->h4[14] = BE;
hash->h4[15] = BF;
hash.h4[0] = B0;
hash.h4[1] = B1;
hash.h4[2] = B2;
hash.h4[3] = B3;
hash.h4[4] = B4;
hash.h4[5] = B5;
hash.h4[6] = B6;
hash.h4[7] = B7;
hash.h4[8] = B8;
hash.h4[9] = B9;
hash.h4[10] = BA;
hash.h4[11] = BB;
hash.h4[12] = BC;
hash.h4[13] = BD;
hash.h4[14] = BE;
hash.h4[15] = BF;
}
//whirlpool
{
sph_u64 n0, n1, n2, n3, n4, n5, n6, n7;
sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;
sph_u64 state[8];
n0 = (hash->h8[0]);
n1 = (hash->h8[1]);
n2 = (hash->h8[2]);
n3 = (hash->h8[3]);
n4 = (hash->h8[4]);
n5 = (hash->h8[5]);
n6 = (hash->h8[6]);
n7 = (hash->h8[7]);
n0 = (hash.h8[0]);
n1 = (hash.h8[1]);
n2 = (hash.h8[2]);
n3 = (hash.h8[3]);
n4 = (hash.h8[4]);
n5 = (hash.h8[5]);
n6 = (hash.h8[6]);
n7 = (hash.h8[7]);
h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0;
@ -1324,9 +1094,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1324,9 +1094,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
n6 ^= h6;
n7 ^= h7;
#pragma unroll 10
for (unsigned r = 0; r < 10; r ++)
{
for (unsigned r = 0; r < 10; r ++) {
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
@ -1335,14 +1103,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1335,14 +1103,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
TRANSFER(n, tmp);
}
state[0] = n0 ^ (hash->h8[0]);
state[1] = n1 ^ (hash->h8[1]);
state[2] = n2 ^ (hash->h8[2]);
state[3] = n3 ^ (hash->h8[3]);
state[4] = n4 ^ (hash->h8[4]);
state[5] = n5 ^ (hash->h8[5]);
state[6] = n6 ^ (hash->h8[6]);
state[7] = n7 ^ (hash->h8[7]);
state[0] = n0 ^ (hash.h8[0]);
state[1] = n1 ^ (hash.h8[1]);
state[2] = n2 ^ (hash.h8[2]);
state[3] = n3 ^ (hash.h8[3]);
state[4] = n4 ^ (hash.h8[4]);
state[5] = n5 ^ (hash.h8[5]);
state[6] = n6 ^ (hash.h8[6]);
state[7] = n7 ^ (hash.h8[7]);
n0 = 0x80;
n1 = n2 = n3 = n4 = n5 = n6 = 0;
@ -1366,12 +1134,10 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1366,12 +1134,10 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
n6 ^= h6;
n7 ^= h7;
#pragma unroll 10
for (unsigned r = 0; r < 10; r ++)
{
for (unsigned r = 0; r < 10; r ++) {
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
ROUND_KSCHED(LT, h, tmp, plain_RC[r]);
ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
TRANSFER(h, tmp);
ROUND_WENC(plain_T, n, h, tmp);
TRANSFER(n, tmp);
@ -1387,9 +1153,10 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1387,9 +1153,10 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
state[7] ^= n7 ^ 0x2000000000000;
for (unsigned i = 0; i < 8; i ++)
hash->h8[i] = state[i];
hash.h8[i] = state[i];
}
bool result = (hash->h8[3] <= target);
bool result = (hash.h8[3] <= target);
if (result)
output[atomic_inc(output+0xFF)] = SWAP4(gid);

2
kernel/hamsi.cl

@ -105,7 +105,7 @@ @@ -105,7 +105,7 @@
//temp fix for shortened implementation of X15
#ifdef SPH_HAMSI_SHORT
#if SPH_HAMSI_EXPAND_BIG == 1
#if SPH_HAMSI_SHORT == 1 && SPH_HAMSI_EXPAND_BIG == 1
#include "hamsi_helper_big.cl"
#else
#include "hamsi_helper.cl"

11
kernel/marucoin-mod.cl

@ -78,9 +78,6 @@ typedef long sph_s64; @@ -78,9 +78,6 @@ typedef long sph_s64;
#if !defined SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 1
#endif
#ifndef SPH_HAMSI_SHORT
#define SPH_HAMSI_SHORT 1
#endif
#include "blake.cl"
#include "bmw.cl"
@ -1093,6 +1090,8 @@ __kernel void search11(__global hash_t* hashes) @@ -1093,6 +1090,8 @@ __kernel void search11(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
#ifdef INPUT_BIG_LOCAL
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
@ -1102,6 +1101,9 @@ __kernel void search11(__global hash_t* hashes) @@ -1102,6 +1101,9 @@ __kernel void search11(__global hash_t* hashes)
T512_L[i] = T512_C[i];
barrier(CLK_LOCAL_MEM_FENCE);
#else
#define INPUT_BIG_LOCAL INPUT_BIG
#endif
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
@ -1113,7 +1115,8 @@ __kernel void search11(__global hash_t* hashes) @@ -1113,7 +1115,8 @@ __kernel void search11(__global hash_t* hashes)
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8) {
for(int i = 0; i < 64; i += 8)
{
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;

540
kernel/marucoin-modold.cl

@ -30,8 +30,8 @@ @@ -30,8 +30,8 @@
* @author phm <phm@inbox.com>
*/
#ifndef X13MOD_CL
#define X13MOD_CL
#ifndef MARUCOIN_MOD_CL
#define MARUCOIN_MOD_CL
#if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1
@ -55,12 +55,12 @@ typedef long sph_s64; @@ -55,12 +55,12 @@ typedef long sph_s64;
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
#define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
@ -74,7 +74,7 @@ typedef long sph_s64; @@ -74,7 +74,7 @@ typedef long sph_s64;
#define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0
#define SPH_KECCAK_UNROLL 0
#define SPH_KECCAK_UNROLL 1
#ifndef SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 4
#endif
@ -115,8 +115,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -115,8 +115,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// blake
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -125,13 +125,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -125,13 +125,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;;
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
{
T1 = SPH_T64(T1 + 1);
}
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
M0 = DEC64BE(block + 0);
M1 = DEC64BE(block + 8);
M2 = DEC64BE(block + 16);
@ -170,16 +170,13 @@ __kernel void search1(__global hash_t* hashes) @@ -170,16 +170,13 @@ __kernel void search1(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// bmw
sph_u64 BMW_H[16];
#pragma unroll 16
for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u];
sph_u64 mv[16],q[32];
sph_u64 tmp;
sph_u64 BMW_h1[16], BMW_h2[16];
sph_u64 mv[16];
mv[ 0] = SWAP8(hash->h8[0]);
mv[ 1] = SWAP8(hash->h8[1]);
@ -196,243 +193,35 @@ __kernel void search1(__global hash_t* hashes) @@ -196,243 +193,35 @@ __kernel void search1(__global hash_t* hashes)
mv[12] = 0;
mv[13] = 0;
mv[14] = 0;
mv[15] = SPH_C64(512);
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
mv[15] = 0x200;
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
FOLDb;
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#undef M
#undef H
#undef dH
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x])
#define H(x) (final_b[x])
#define dH(x) (BMW_h1[x])
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
FOLDb;
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#undef M
#undef H
#undef dH
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash->h8[0] = SWAP8(BMW_H[8]);
hash->h8[1] = SWAP8(BMW_H[9]);
hash->h8[2] = SWAP8(BMW_H[10]);
hash->h8[3] = SWAP8(BMW_H[11]);
hash->h8[4] = SWAP8(BMW_H[12]);
hash->h8[5] = SWAP8(BMW_H[13]);
hash->h8[6] = SWAP8(BMW_H[14]);
hash->h8[7] = SWAP8(BMW_H[15]);
hash->h8[0] = SWAP8(BMW_h1[8]);
hash->h8[1] = SWAP8(BMW_h1[9]);
hash->h8[2] = SWAP8(BMW_h1[10]);
hash->h8[3] = SWAP8(BMW_h1[11]);
hash->h8[4] = SWAP8(BMW_h1[12]);
hash->h8[5] = SWAP8(BMW_h1[13]);
hash->h8[6] = SWAP8(BMW_h1[14]);
hash->h8[7] = SWAP8(BMW_h1[15]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -451,15 +240,14 @@ __kernel void search2(__global hash_t* hashes) @@ -451,15 +240,14 @@ __kernel void search2(__global hash_t* hashes)
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T4_L[i] = T4[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
@ -472,38 +260,47 @@ __kernel void search2(__global hash_t* hashes) @@ -472,38 +260,47 @@ __kernel void search2(__global hash_t* hashes)
#define T7 T7_L
// groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u];
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -528,14 +325,10 @@ __kernel void search3(__global hash_t* hashes) @@ -528,14 +325,10 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8(hash->h8[5]);
m6 = SWAP8(hash->h8[6]);
m7 = SWAP8(hash->h8[7]);
UBI_BIG(480, 64);
bcount = 0;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
UBI_BIG(510, 8);
hash->h8[0] = SWAP8(h0);
hash->h8[1] = SWAP8(h1);
hash->h8[2] = SWAP8(h2);
@ -562,8 +355,7 @@ __kernel void search4(__global hash_t* hashes) @@ -562,8 +355,7 @@ __kernel void search4(__global hash_t* hashes)
for(int i = 0; i < 2; i++)
{
if (i == 0)
{
if (i == 0) {
h0h ^= DEC64E(hash->h8[0]);
h0l ^= DEC64E(hash->h8[1]);
h1h ^= DEC64E(hash->h8[2]);
@ -572,9 +364,7 @@ __kernel void search4(__global hash_t* hashes) @@ -572,9 +364,7 @@ __kernel void search4(__global hash_t* hashes)
h2l ^= DEC64E(hash->h8[5]);
h3h ^= DEC64E(hash->h8[6]);
h3l ^= DEC64E(hash->h8[7]);
}
else if(i == 1)
{
} else if(i == 1) {
h4h ^= DEC64E(hash->h8[0]);
h4l ^= DEC64E(hash->h8[1]);
h5h ^= DEC64E(hash->h8[2]);
@ -635,7 +425,6 @@ __kernel void search5(__global hash_t* hashes) @@ -635,7 +425,6 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8(hash->h8[7]);
a31 ^= 0x8000000000000001;
KECCAK_F_1600;
// Finalize the "lane complement"
a10 = ~a10;
a20 = ~a20;
@ -682,8 +471,7 @@ __kernel void search6(__global hash_t* hashes) @@ -682,8 +471,7 @@ __kernel void search6(__global hash_t* hashes)
MI5;
LUFFA_P5;
if(i == 0)
{
if(i == 0) {
M0 = hash->h4[9];
M1 = hash->h4[8];
M2 = hash->h4[11];
@ -692,16 +480,12 @@ __kernel void search6(__global hash_t* hashes) @@ -692,16 +480,12 @@ __kernel void search6(__global hash_t* hashes)
M5 = hash->h4[12];
M6 = hash->h4[15];
M7 = hash->h4[14];
}
else if(i == 1)
{
} else if(i == 1) {
M0 = 0x80000000;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
}
else if(i == 2)
} else if(i == 2) {
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
else if(i == 3)
{
} else if(i == 3) {
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
@ -751,12 +535,10 @@ __kernel void search7(__global hash_t* hashes) @@ -751,12 +535,10 @@ __kernel void search7(__global hash_t* hashes)
x6 ^= SWAP4(hash->h4[7]);
x7 ^= SWAP4(hash->h4[6]);
for (int i = 0; i < 13; i ++)
{
for (int i = 0; i < 13; i ++) {
SIXTEEN_ROUNDS;
if (i == 0)
{
if (i == 0) {
x0 ^= SWAP4(hash->h4[9]);
x1 ^= SWAP4(hash->h4[8]);
x2 ^= SWAP4(hash->h4[11]);
@ -765,12 +547,12 @@ __kernel void search7(__global hash_t* hashes) @@ -765,12 +547,12 @@ __kernel void search7(__global hash_t* hashes)
x5 ^= SWAP4(hash->h4[12]);
x6 ^= SWAP4(hash->h4[15]);
x7 ^= SWAP4(hash->h4[14]);
}
else if(i == 1)
} else if(i == 1) {
x0 ^= 0x80;
else if (i == 2)
} else if (i == 2) {
xv ^= SPH_C32(1);
}
}
hash->h4[0] = x0;
hash->h4[1] = x1;
@ -797,7 +579,6 @@ __kernel void search8(__global hash_t* hashes) @@ -797,7 +579,6 @@ __kernel void search8(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
int init = get_local_id(0);
@ -826,7 +607,7 @@ __kernel void search8(__global hash_t* hashes) @@ -826,7 +607,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0];
rk01 = hash->h4[1];
@ -892,8 +673,7 @@ __kernel void search9(__global hash_t* hashes) @@ -892,8 +673,7 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);
FFT256(0, 1, 0, ll1);
for (int i = 0; i < 256; i ++)
{
for (int i = 0; i < 256; i ++) {
s32 tq;
tq = q[i] + yoff_b_n[i];
@ -929,17 +709,14 @@ __kernel void search9(__global hash_t* hashes) @@ -929,17 +709,14 @@ __kernel void search9(__global hash_t* hashes)
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
IF, 4, 13, PP8_4_);
STEP_BIG(
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
IF, 13, 10, PP8_5_);
STEP_BIG(
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
IF, 10, 25, PP8_6_);
STEP_BIG(
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
@ -958,27 +735,22 @@ __kernel void search9(__global hash_t* hashes) @@ -958,27 +735,22 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
STEP_BIG(
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4, 13, PP8_4_);
STEP_BIG(
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13, 10, PP8_5_);
STEP_BIG(
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10, 25, PP8_6_);
STEP_BIG(
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25, 4, PP8_0_);
#undef q
hash->h4[0] = A0;
@ -1006,7 +778,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1006,7 +778,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
{
uint gid = get_global_id(0);
uint offset = get_global_offset(0);
__global hash_t *hash = &(hashes[gid-offset]);
hash_t hash;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
@ -1023,9 +795,21 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1023,9 +795,21 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef INPUT_BIG_LOCAL
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
for (int i = init; i < 1024; i += step)
T512_L[i] = T512_C[i];
barrier(CLK_LOCAL_MEM_FENCE);
#else
#define INPUT_BIG_LOCAL INPUT_BIG
#endif
// mixtab
__local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256];
for (int i = init; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i];
@ -1033,13 +817,17 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1033,13 +817,17 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
mixtab2[i] = mixtab2_c[i];
mixtab3[i] = mixtab3_c[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < 8; i++)
hash->h8[i] = hashes[gid-offset].h8[i];
for (int i = 0; i < 8; i++) {
hash.h8[i] = hashes[gid-offset].h8[i];
}
// echo
{
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1;
sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71;
Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL;
@ -1066,14 +854,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1066,14 +854,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
W61 = Vb61;
W70 = Vb70;
W71 = Vb71;
W80 = hash->h8[0];
W81 = hash->h8[1];
W90 = hash->h8[2];
W91 = hash->h8[3];
WA0 = hash->h8[4];
WA1 = hash->h8[5];
WB0 = hash->h8[6];
WB1 = hash->h8[7];
W80 = hash.h8[0];
W81 = hash.h8[1];
W90 = hash.h8[2];
W91 = hash.h8[3];
WA0 = hash.h8[4];
WA1 = hash.h8[5];
WB0 = hash.h8[6];
WB1 = hash.h8[7];
WC0 = 0x80;
WC1 = 0;
WD0 = 0;
@ -1083,26 +871,24 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1083,26 +871,24 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
WF0 = 0x200;
WF1 = 0;
for (unsigned u = 0; u < 10; u ++)
for (unsigned u = 0; u < 10; u ++) {
BIG_ROUND;
}
hash->h8[0] ^= Vb00 ^ W00 ^ W80;
hash->h8[1] ^= Vb01 ^ W01 ^ W81;
hash->h8[2] ^= Vb10 ^ W10 ^ W90;
hash->h8[3] ^= Vb11 ^ W11 ^ W91;
hash->h8[4] ^= Vb20 ^ W20 ^ WA0;
hash->h8[5] ^= Vb21 ^ W21 ^ WA1;
hash->h8[6] ^= Vb30 ^ W30 ^ WB0;
hash->h8[7] ^= Vb31 ^ W31 ^ WB1;
hash.h8[0] ^= Vb00 ^ W00 ^ W80;
hash.h8[1] ^= Vb01 ^ W01 ^ W81;
hash.h8[2] ^= Vb10 ^ W10 ^ W90;
hash.h8[3] ^= Vb11 ^ W11 ^ W91;
hash.h8[4] ^= Vb20 ^ W20 ^ WA0;
hash.h8[5] ^= Vb21 ^ W21 ^ WA1;
hash.h8[6] ^= Vb30 ^ W30 ^ WB0;
hash.h8[7] ^= Vb31 ^ W31 ^ WB1;
// hamsi
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
}
for (int i = init; i < 1024; i += step)
T512_L[i] = T512_C[i];
// hamsi
barrier(CLK_LOCAL_MEM_FENCE);
{
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
@ -1112,39 +898,38 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1112,39 +898,38 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8)
{
#define buf(u) hash.h1[i + u]
for(int i = 0; i < 64; i += 8) {
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG_LOCAL;
PF_BIG;
T_BIG;
for (unsigned u = 0; u < 16; u ++)
hash->h4[u] = h[u];
hash.h4[u] = h[u];
}
// fugue
{
sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
sph_u32 S30, S31, S32, S33, S34, S35;
ulong fc_bit_count = (sph_u64) 0x200;
ulong fc_bit_count = (sph_u64) 64 << 3;
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
@ -1152,25 +937,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1152,25 +937,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2]));
FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5]));
FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8]));
FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB]));
FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE]));
FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
FUGUE512_3((hash.h4[0x0]), (hash.h4[0x1]), (hash.h4[0x2]));
FUGUE512_3((hash.h4[0x3]), (hash.h4[0x4]), (hash.h4[0x5]));
FUGUE512_3((hash.h4[0x6]), (hash.h4[0x7]), (hash.h4[0x8]));
FUGUE512_3((hash.h4[0x9]), (hash.h4[0xA]), (hash.h4[0xB]));
FUGUE512_3((hash.h4[0xC]), (hash.h4[0xD]), (hash.h4[0xE]));
FUGUE512_3((hash.h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
// apply round shift if necessary
int i;
for (i = 0; i < 32; i ++)
{
for (i = 0; i < 32; i ++) {
ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03);
}
for (i = 0; i < 13; i ++)
{
for (i = 0; i < 13; i ++) {
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
@ -1201,28 +983,30 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1201,28 +983,30 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S18 ^= S00;
S27 ^= S00;
hash->h4[0] = SWAP4(S01);
hash->h4[1] = SWAP4(S02);
hash->h4[2] = SWAP4(S03);
hash->h4[3] = SWAP4(S04);
hash->h4[4] = SWAP4(S09);
hash->h4[5] = SWAP4(S10);
hash->h4[6] = SWAP4(S11);
hash->h4[7] = SWAP4(S12);
hash->h4[8] = SWAP4(S18);
hash->h4[9] = SWAP4(S19);
hash->h4[10] = SWAP4(S20);
hash->h4[11] = SWAP4(S21);
hash->h4[12] = SWAP4(S27);
hash->h4[13] = SWAP4(S28);
hash->h4[14] = SWAP4(S29);
hash->h4[15] = SWAP4(S30);
bool result = (hash->h8[3] <= target);
hash.h4[0] = SWAP4(S01);
hash.h4[1] = SWAP4(S02);
hash.h4[2] = SWAP4(S03);
hash.h4[3] = SWAP4(S04);
hash.h4[4] = SWAP4(S09);
hash.h4[5] = SWAP4(S10);
hash.h4[6] = SWAP4(S11);
hash.h4[7] = SWAP4(S12);
hash.h4[8] = SWAP4(S18);
hash.h4[9] = SWAP4(S19);
hash.h4[10] = SWAP4(S20);
hash.h4[11] = SWAP4(S21);
hash.h4[12] = SWAP4(S27);
hash.h4[13] = SWAP4(S28);
hash.h4[14] = SWAP4(S29);
hash.h4[15] = SWAP4(S30);
}
bool result = (hash.h8[3] <= target);
if (result)
output[atomic_inc(output+0xFF)] = SWAP4(gid);
barrier(CLK_GLOBAL_MEM_FENCE);
}
#endif // X13MOD_CL
#endif // MARUCOIN_MOD_CL

11
kernel/x14.cl

@ -80,9 +80,6 @@ typedef int sph_s32; @@ -80,9 +80,6 @@ typedef int sph_s32;
#ifndef SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 1
#endif
#ifndef SPH_HAMSI_SHORT
#define SPH_HAMSI_SHORT 1
#endif
#include "blake.cl"
#include "bmw.cl"
@ -1107,6 +1104,8 @@ __kernel void search11(__global hash_t* hashes) @@ -1107,6 +1104,8 @@ __kernel void search11(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
#ifdef INPUT_BIG_LOCAL
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
@ -1116,6 +1115,9 @@ __kernel void search11(__global hash_t* hashes) @@ -1116,6 +1115,9 @@ __kernel void search11(__global hash_t* hashes)
T512_L[i] = T512_C[i];
barrier(CLK_LOCAL_MEM_FENCE);
#else
#define INPUT_BIG_LOCAL INPUT_BIG
#endif
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
@ -1127,7 +1129,8 @@ __kernel void search11(__global hash_t* hashes) @@ -1127,7 +1129,8 @@ __kernel void search11(__global hash_t* hashes)
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8) {
for(int i = 0; i < 64; i += 8)
{
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;

602
kernel/x14old.cl

@ -30,8 +30,8 @@ @@ -30,8 +30,8 @@
* @author phm <phm@inbox.com>
*/
#ifndef X14OLD_CL
#define X14OLD_CL
#ifndef X14_CL
#define X14_CL
#if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1
@ -94,6 +94,7 @@ typedef long sph_s64; @@ -94,6 +94,7 @@ typedef long sph_s64;
#include "fugue.cl"
#include "shabal.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
@ -116,8 +117,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -116,8 +117,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// blake
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -126,13 +127,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -126,13 +127,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;;
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
{
T1 = SPH_T64(T1 + 1);
}
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
M0 = DEC64BE(block + 0);
M1 = DEC64BE(block + 8);
M2 = DEC64BE(block + 16);
@ -171,16 +172,13 @@ __kernel void search1(__global hash_t* hashes) @@ -171,16 +172,13 @@ __kernel void search1(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// bmw
sph_u64 BMW_H[16];
#pragma unroll 16
for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u];
sph_u64 mv[16],q[32];
sph_u64 tmp;
sph_u64 BMW_h1[16], BMW_h2[16];
sph_u64 mv[16];
mv[ 0] = SWAP8(hash->h8[0]);
mv[ 1] = SWAP8(hash->h8[1]);
@ -197,243 +195,35 @@ __kernel void search1(__global hash_t* hashes) @@ -197,243 +195,35 @@ __kernel void search1(__global hash_t* hashes)
mv[12] = 0;
mv[13] = 0;
mv[14] = 0;
mv[15] = SPH_C64(512);
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
mv[15] = 0x200;
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
FOLDb;
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#undef M
#undef H
#undef dH
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x])
#define H(x) (final_b[x])
#define dH(x) (BMW_h1[x])
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
FOLDb;
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#undef M
#undef H
#undef dH
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash->h8[0] = SWAP8(BMW_H[8]);
hash->h8[1] = SWAP8(BMW_H[9]);
hash->h8[2] = SWAP8(BMW_H[10]);
hash->h8[3] = SWAP8(BMW_H[11]);
hash->h8[4] = SWAP8(BMW_H[12]);
hash->h8[5] = SWAP8(BMW_H[13]);
hash->h8[6] = SWAP8(BMW_H[14]);
hash->h8[7] = SWAP8(BMW_H[15]);
hash->h8[0] = SWAP8(BMW_h1[8]);
hash->h8[1] = SWAP8(BMW_h1[9]);
hash->h8[2] = SWAP8(BMW_h1[10]);
hash->h8[3] = SWAP8(BMW_h1[11]);
hash->h8[4] = SWAP8(BMW_h1[12]);
hash->h8[5] = SWAP8(BMW_h1[13]);
hash->h8[6] = SWAP8(BMW_h1[14]);
hash->h8[7] = SWAP8(BMW_h1[15]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -452,15 +242,14 @@ __kernel void search2(__global hash_t* hashes) @@ -452,15 +242,14 @@ __kernel void search2(__global hash_t* hashes)
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T4_L[i] = T4[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
@ -473,38 +262,47 @@ __kernel void search2(__global hash_t* hashes) @@ -473,38 +262,47 @@ __kernel void search2(__global hash_t* hashes)
#define T7 T7_L
// groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u];
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -529,14 +327,10 @@ __kernel void search3(__global hash_t* hashes) @@ -529,14 +327,10 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8(hash->h8[5]);
m6 = SWAP8(hash->h8[6]);
m7 = SWAP8(hash->h8[7]);
UBI_BIG(480, 64);
bcount = 0;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
UBI_BIG(510, 8);
hash->h8[0] = SWAP8(h0);
hash->h8[1] = SWAP8(h1);
hash->h8[2] = SWAP8(h2);
@ -563,8 +357,7 @@ __kernel void search4(__global hash_t* hashes) @@ -563,8 +357,7 @@ __kernel void search4(__global hash_t* hashes)
for(int i = 0; i < 2; i++)
{
if (i == 0)
{
if (i == 0) {
h0h ^= DEC64E(hash->h8[0]);
h0l ^= DEC64E(hash->h8[1]);
h1h ^= DEC64E(hash->h8[2]);
@ -573,9 +366,7 @@ __kernel void search4(__global hash_t* hashes) @@ -573,9 +366,7 @@ __kernel void search4(__global hash_t* hashes)
h2l ^= DEC64E(hash->h8[5]);
h3h ^= DEC64E(hash->h8[6]);
h3l ^= DEC64E(hash->h8[7]);
}
else if(i == 1)
{
} else if(i == 1) {
h4h ^= DEC64E(hash->h8[0]);
h4l ^= DEC64E(hash->h8[1]);
h5h ^= DEC64E(hash->h8[2]);
@ -636,7 +427,6 @@ __kernel void search5(__global hash_t* hashes) @@ -636,7 +427,6 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8(hash->h8[7]);
a31 ^= 0x8000000000000001;
KECCAK_F_1600;
// Finalize the "lane complement"
a10 = ~a10;
a20 = ~a20;
@ -683,8 +473,7 @@ __kernel void search6(__global hash_t* hashes) @@ -683,8 +473,7 @@ __kernel void search6(__global hash_t* hashes)
MI5;
LUFFA_P5;
if(i == 0)
{
if(i == 0) {
M0 = hash->h4[9];
M1 = hash->h4[8];
M2 = hash->h4[11];
@ -693,16 +482,12 @@ __kernel void search6(__global hash_t* hashes) @@ -693,16 +482,12 @@ __kernel void search6(__global hash_t* hashes)
M5 = hash->h4[12];
M6 = hash->h4[15];
M7 = hash->h4[14];
}
else if(i == 1)
{
} else if(i == 1) {
M0 = 0x80000000;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
}
else if(i == 2)
} else if(i == 2) {
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
else if(i == 3)
{
} else if(i == 3) {
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
@ -752,12 +537,10 @@ __kernel void search7(__global hash_t* hashes) @@ -752,12 +537,10 @@ __kernel void search7(__global hash_t* hashes)
x6 ^= SWAP4(hash->h4[7]);
x7 ^= SWAP4(hash->h4[6]);
for (int i = 0; i < 13; i ++)
{
for (int i = 0; i < 13; i ++) {
SIXTEEN_ROUNDS;
if (i == 0)
{
if (i == 0) {
x0 ^= SWAP4(hash->h4[9]);
x1 ^= SWAP4(hash->h4[8]);
x2 ^= SWAP4(hash->h4[11]);
@ -766,12 +549,12 @@ __kernel void search7(__global hash_t* hashes) @@ -766,12 +549,12 @@ __kernel void search7(__global hash_t* hashes)
x5 ^= SWAP4(hash->h4[12]);
x6 ^= SWAP4(hash->h4[15]);
x7 ^= SWAP4(hash->h4[14]);
}
else if(i == 1)
} else if(i == 1) {
x0 ^= 0x80;
else if (i == 2)
} else if (i == 2) {
xv ^= SPH_C32(1);
}
}
hash->h4[0] = x0;
hash->h4[1] = x1;
@ -798,7 +581,6 @@ __kernel void search8(__global hash_t* hashes) @@ -798,7 +581,6 @@ __kernel void search8(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
int init = get_local_id(0);
@ -827,7 +609,7 @@ __kernel void search8(__global hash_t* hashes) @@ -827,7 +609,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0];
rk01 = hash->h4[1];
@ -893,8 +675,7 @@ __kernel void search9(__global hash_t* hashes) @@ -893,8 +675,7 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);
FFT256(0, 1, 0, ll1);
for (int i = 0; i < 256; i ++)
{
for (int i = 0; i < 256; i ++) {
s32 tq;
tq = q[i] + yoff_b_n[i];
@ -930,17 +711,14 @@ __kernel void search9(__global hash_t* hashes) @@ -930,17 +711,14 @@ __kernel void search9(__global hash_t* hashes)
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
IF, 4, 13, PP8_4_);
STEP_BIG(
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
IF, 13, 10, PP8_5_);
STEP_BIG(
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
IF, 10, 25, PP8_6_);
STEP_BIG(
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
@ -959,27 +737,22 @@ __kernel void search9(__global hash_t* hashes) @@ -959,27 +737,22 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
STEP_BIG(
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4, 13, PP8_4_);
STEP_BIG(
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13, 10, PP8_5_);
STEP_BIG(
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10, 25, PP8_6_);
STEP_BIG(
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25, 4, PP8_0_);
#undef q
hash->h4[0] = A0;
@ -1007,7 +780,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1007,7 +780,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
{
uint gid = get_global_id(0);
uint offset = get_global_offset(0);
__global hash_t *hash = &(hashes[gid-offset]);
hash_t hash;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
@ -1024,9 +797,20 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1024,9 +797,20 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef INPUT_BIG_LOCAL
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
for (int i = init; i < 1024; i += step)
T512_L[i] = T512_C[i];
barrier(CLK_LOCAL_MEM_FENCE);
#else
#define INPUT_BIG_LOCAL INPUT_BIG
#endif
// mixtab
__local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256];
for (int i = init; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i];
@ -1034,21 +818,17 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1034,21 +818,17 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
mixtab2[i] = mixtab2_c[i];
mixtab3[i] = mixtab3_c[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
for (int i = init; i < 1024; i += step)
T512_L[i] = T512_C[i];
for (int i = 0; i < 8; i++) {
hash.h8[i] = hashes[gid-offset].h8[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
// echo
for (int i = 0; i < 8; i++)
hash->h8[i] = hashes[gid-offset].h8[i];
{
// echo
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1;
sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71;
Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL;
@ -1075,14 +855,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1075,14 +855,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
W61 = Vb61;
W70 = Vb70;
W71 = Vb71;
W80 = hash->h8[0];
W81 = hash->h8[1];
W90 = hash->h8[2];
W91 = hash->h8[3];
WA0 = hash->h8[4];
WA1 = hash->h8[5];
WB0 = hash->h8[6];
WB1 = hash->h8[7];
W80 = hash.h8[0];
W81 = hash.h8[1];
W90 = hash.h8[2];
W91 = hash.h8[3];
WA0 = hash.h8[4];
WA1 = hash.h8[5];
WB0 = hash.h8[6];
WB1 = hash.h8[7];
WC0 = 0x80;
WC1 = 0;
WD0 = 0;
@ -1092,19 +872,25 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1092,19 +872,25 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
WF0 = 0x200;
WF1 = 0;
for (unsigned u = 0; u < 10; u ++)
for (unsigned u = 0; u < 10; u ++) {
BIG_ROUND;
}
hash.h8[0] ^= Vb00 ^ W00 ^ W80;
hash.h8[1] ^= Vb01 ^ W01 ^ W81;
hash.h8[2] ^= Vb10 ^ W10 ^ W90;
hash.h8[3] ^= Vb11 ^ W11 ^ W91;
hash.h8[4] ^= Vb20 ^ W20 ^ WA0;
hash.h8[5] ^= Vb21 ^ W21 ^ WA1;
hash.h8[6] ^= Vb30 ^ W30 ^ WB0;
hash.h8[7] ^= Vb31 ^ W31 ^ WB1;
hash->h8[0] ^= Vb00 ^ W00 ^ W80;
hash->h8[1] ^= Vb01 ^ W01 ^ W81;
hash->h8[2] ^= Vb10 ^ W10 ^ W90;
hash->h8[3] ^= Vb11 ^ W11 ^ W91;
hash->h8[4] ^= Vb20 ^ W20 ^ WA0;
hash->h8[5] ^= Vb21 ^ W21 ^ WA1;
hash->h8[6] ^= Vb30 ^ W30 ^ WB0;
hash->h8[7] ^= Vb31 ^ W31 ^ WB1;
}
// hamsi
{
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
@ -1113,39 +899,38 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1113,39 +899,38 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8)
{
#define buf(u) hash.h1[i + u]
for(int i = 0; i < 64; i += 8) {
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG_LOCAL;
PF_BIG;
T_BIG;
for (unsigned u = 0; u < 16; u ++)
hash->h4[u] = h[u];
hash.h4[u] = h[u];
}
// fugue
{
sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
sph_u32 S30, S31, S32, S33, S34, S35;
ulong fc_bit_count = (sph_u64) 0x200;
ulong fc_bit_count = (sph_u64) 64 << 3;
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
@ -1153,25 +938,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1153,25 +938,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2]));
FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5]));
FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8]));
FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB]));
FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE]));
FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
FUGUE512_3((hash.h4[0x0]), (hash.h4[0x1]), (hash.h4[0x2]));
FUGUE512_3((hash.h4[0x3]), (hash.h4[0x4]), (hash.h4[0x5]));
FUGUE512_3((hash.h4[0x6]), (hash.h4[0x7]), (hash.h4[0x8]));
FUGUE512_3((hash.h4[0x9]), (hash.h4[0xA]), (hash.h4[0xB]));
FUGUE512_3((hash.h4[0xC]), (hash.h4[0xD]), (hash.h4[0xE]));
FUGUE512_3((hash.h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
// apply round shift if necessary
int i;
for (i = 0; i < 32; i ++)
{
for (i = 0; i < 32; i ++) {
ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03);
}
for (i = 0; i < 13; i ++)
{
for (i = 0; i < 13; i ++) {
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
@ -1202,24 +984,27 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1202,24 +984,27 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S18 ^= S00;
S27 ^= S00;
hash->h4[0] = SWAP4(S01);
hash->h4[1] = SWAP4(S02);
hash->h4[2] = SWAP4(S03);
hash->h4[3] = SWAP4(S04);
hash->h4[4] = SWAP4(S09);
hash->h4[5] = SWAP4(S10);
hash->h4[6] = SWAP4(S11);
hash->h4[7] = SWAP4(S12);
hash->h4[8] = SWAP4(S18);
hash->h4[9] = SWAP4(S19);
hash->h4[10] = SWAP4(S20);
hash->h4[11] = SWAP4(S21);
hash->h4[12] = SWAP4(S27);
hash->h4[13] = SWAP4(S28);
hash->h4[14] = SWAP4(S29);
hash->h4[15] = SWAP4(S30);
hash.h4[0] = SWAP4(S01);
hash.h4[1] = SWAP4(S02);
hash.h4[2] = SWAP4(S03);
hash.h4[3] = SWAP4(S04);
hash.h4[4] = SWAP4(S09);
hash.h4[5] = SWAP4(S10);
hash.h4[6] = SWAP4(S11);
hash.h4[7] = SWAP4(S12);
hash.h4[8] = SWAP4(S18);
hash.h4[9] = SWAP4(S19);
hash.h4[10] = SWAP4(S20);
hash.h4[11] = SWAP4(S21);
hash.h4[12] = SWAP4(S27);
hash.h4[13] = SWAP4(S28);
hash.h4[14] = SWAP4(S29);
hash.h4[15] = SWAP4(S30);
}
//shabal
{
sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[2], A03 = A_init_512[3], A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[6], A07 = A_init_512[7],
A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
sph_u32 B0 = B_init_512[0], B1 = B_init_512[1], B2 = B_init_512[2], B3 = B_init_512[3], B4 = B_init_512[4], B5 = B_init_512[5], B6 = B_init_512[6], B7 = B_init_512[7],
@ -1229,22 +1014,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1229,22 +1014,22 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
sph_u32 Wlow = 1, Whigh = 0;
M0 = hash->h4[0];
M1 = hash->h4[1];
M2 = hash->h4[2];
M3 = hash->h4[3];
M4 = hash->h4[4];
M5 = hash->h4[5];
M6 = hash->h4[6];
M7 = hash->h4[7];
M8 = hash->h4[8];
M9 = hash->h4[9];
MA = hash->h4[10];
MB = hash->h4[11];
MC = hash->h4[12];
MD = hash->h4[13];
ME = hash->h4[14];
MF = hash->h4[15];
M0 = hash.h4[0];
M1 = hash.h4[1];
M2 = hash.h4[2];
M3 = hash.h4[3];
M4 = hash.h4[4];
M5 = hash.h4[5];
M6 = hash.h4[6];
M7 = hash.h4[7];
M8 = hash.h4[8];
M9 = hash.h4[9];
MA = hash.h4[10];
MB = hash.h4[11];
MC = hash.h4[12];
MD = hash.h4[13];
ME = hash.h4[14];
MF = hash.h4[15];
INPUT_BLOCK_ADD;
XOR_W;
@ -1259,36 +1044,35 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo @@ -1259,36 +1044,35 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
for (unsigned i = 0; i < 3; i ++)
{
for (unsigned i = 0; i < 3; i ++) {
SWAP_BC;
XOR_W;
APPLY_P;
}
hash->h4[0] = B0;
hash->h4[1] = B1;
hash->h4[2] = B2;
hash->h4[3] = B3;
hash->h4[4] = B4;
hash->h4[5] = B5;
hash->h4[6] = B6;
hash->h4[7] = B7;
hash->h4[8] = B8;
hash->h4[9] = B9;
hash->h4[10] = BA;
hash->h4[11] = BB;
hash->h4[12] = BC;
hash->h4[13] = BD;
hash->h4[14] = BE;
hash->h4[15] = BF;
bool result = (hash->h8[3] <= target);
hash.h4[0] = B0;
hash.h4[1] = B1;
hash.h4[2] = B2;
hash.h4[3] = B3;
hash.h4[4] = B4;
hash.h4[5] = B5;
hash.h4[6] = B6;
hash.h4[7] = B7;
hash.h4[8] = B8;
hash.h4[9] = B9;
hash.h4[10] = BA;
hash.h4[11] = BB;
hash.h4[12] = BC;
hash.h4[13] = BD;
hash.h4[14] = BE;
hash.h4[15] = BF;
}
bool result = (hash.h8[3] <= target);
if (result)
output[atomic_inc(output+0xFF)] = SWAP4(gid);
barrier(CLK_GLOBAL_MEM_FENCE);
}
#endif // X14OLD_CL
#endif // X14_CL
Loading…
Cancel
Save