@ -75,8 +75,8 @@ typedef long sph_s64;
@@ -75,8 +75,8 @@ typedef long sph_s64;
// Compile-time switches, all set to fixed values here. Presumably they are read by
// the hash implementations pulled in by the includes that follow — TODO confirm.
# define SPH_GROESTL_BIG_ENDIAN 0
# define SPH_CUBEHASH_UNROLL 0
# define SPH_KECCAK_UNROLL 0
// SPH_HAMSI_EXPAND_BIG defaults to 4 unless it was already defined.
// NOTE(review): two equivalent guards (`# if !defined` and `# ifndef`) open below but
// only one `# endif` is visible. This looks like the before/after lines of a diff
// hunk merged together; as written the conditionals are unbalanced — confirm
// against the real file.
# if !defined SPH_HAMSI_EXPAND_BIG
# define SPH_HAMSI_EXPAND_BIG 4
# ifndef SPH_HAMSI_EXPAND_BIG
# define SPH_HAMSI_EXPAND_BIG 4
# endif
# include "blake.cl"
@ -115,8 +115,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
@@ -115,8 +115,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
{
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
// blake
// blake
sph_u64 H0 = SPH_C64 ( 0x6A09E667F3BCC908 ) , H1 = SPH_C64 ( 0xBB67AE8584CAA73B ) ;
sph_u64 H2 = SPH_C64 ( 0x3C6EF372FE94F82B ) , H3 = SPH_C64 ( 0xA54FF53A5F1D36F1 ) ;
sph_u64 H4 = SPH_C64 ( 0x510E527FADE682D1 ) , H5 = SPH_C64 ( 0x9B05688C2B3E6C1F ) ;
@ -125,13 +125,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
@@ -125,13 +125,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 T0 = SPH_C64 ( 0xFFFFFFFFFFFFFC00 ) + ( 80 << 3 ) , T1 = 0xFFFFFFFFFFFFFFFF ;;
if ( ( T0 = SPH_T64 ( T0 + 1024 ) ) < 1024 )
{
T1 = SPH_T64 ( T1 + 1 ) ;
}
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7 ;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF ;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7 ;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF ;
M0 = DEC64BE ( block + 0 ) ;
M1 = DEC64BE ( block + 8 ) ;
M2 = DEC64BE ( block + 16 ) ;
@ -170,58 +170,269 @@ __kernel void search1(__global hash_t* hashes)
@@ -170,58 +170,269 @@ __kernel void search1(__global hash_t* hashes)
{
// Start of the search1 kernel body (name taken from the hunk header): each work-item
// reads and later rewrites one hash_t slot of `hashes`.
// NOTE(review): this span mixes two revisions of the same setup code. `mv` is declared
// twice, mv[0..9] are assigned twice, and mv[15] is assigned twice (0x200 and
// SPH_C64 ( 512 )). It will not compile as shown; reconcile with the real file.
uint gid = get_global_id ( 0 ) ;
// The slot index is the global id rebased by the launch's global offset.
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
// bmw
// (presumably Blue Midnight Wish, 512-bit variant, judging by BMW_IV512 — TODO confirm)
// Chaining value, loaded from the BMW_IV512 table defined outside this view.
sph_u64 BMW_H[16] ;
# pragma unroll 16
for ( unsigned u = 0 ; u < 16; u++)
BMW_H[u] = BMW_IV512[u] ;
// Scratch arrays; no visible statement in this body assigns them directly.
sph_u64 BMW_h1[16], BMW_h2[16 ];
// First copy of the message-block setup: eight words from the hash slot passed
// through SWAP8 (macro defined outside this view — presumably a 64-bit byte swap).
sph_u64 mv[16] ;
mv[ 0] = SWAP8 ( hash->h8[0] ) ;
mv[ 1] = SWAP8 ( hash->h8[1] ) ;
mv[ 2] = SWAP8 ( hash->h8[2] ) ;
mv[ 3] = SWAP8 ( hash->h8[3] ) ;
mv[ 4] = SWAP8 ( hash->h8[4] ) ;
mv[ 5] = SWAP8 ( hash->h8[5] ) ;
mv[ 6] = SWAP8 ( hash->h8[6] ) ;
mv[ 7] = SWAP8 ( hash->h8[7] ) ;
mv[ 8] = 0x80 ;
mv[ 9] = 0 ;
// Second copy of the same setup, which also declares q (32 words) and tmp.
sph_u64 mv[16],q[32 ];
sph_u64 tmp ;
mv[0] = SWAP8 ( hash->h8[0] ) ;
mv[1] = SWAP8 ( hash->h8[1] ) ;
mv[2] = SWAP8 ( hash->h8[2] ) ;
mv[3] = SWAP8 ( hash->h8[3] ) ;
mv[4] = SWAP8 ( hash->h8[4] ) ;
mv[5] = SWAP8 ( hash->h8[5] ) ;
mv[6] = SWAP8 ( hash->h8[6] ) ;
mv[7] = SWAP8 ( hash->h8[7] ) ;
// Words 8..15 are constants: 0x80 in word 8, zeros, and 0x200 (= 512) in word 15.
// This looks like padding plus a bit length for a 64-byte message — TODO confirm.
mv[8] = 0x80 ;
mv[9] = 0 ;
mv[10] = 0 ;
mv[11] = 0 ;
mv[12] = 0 ;
mv[13] = 0 ;
mv[14] = 0 ;
mv[15] = 0x200 ;
// Aliases for the message, the chaining value and BMW_h2. The statements that
// follow index mv / BMW_H directly, so these may only matter to macros defined
// elsewhere (e.g. FOLDb) — TODO confirm.
# define M ( x ) ( mv[x] )
# define H ( x ) ( BMW_H[x] )
# define dH ( x ) ( BMW_h2[x] )
mv[15] = SPH_C64 ( 512 ) ;
// First pass, q[0..15]. Each `tmp` is a signed sum of five (mv[j] ^ BMW_H[j]) words;
// tmp is then mixed and BMW_H[(k + 1) % 16] is added to give q[k].
// The mixing expression repeats with period five: q[4], q[9] and q[14] use the short
// form SHR ( tmp, 1 ) ^ tmp, while the others use one of four shift/rotate sets.
// SHR, SHL and SPH_ROTL64 are defined outside this view — presumably logical shifts
// and a 64-bit rotate-left; TODO confirm.
tmp = ( mv[5] ^ BMW_H[5] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) + ( mv[14] ^ BMW_H[14] ) ;
q[0] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[1] ;
tmp = ( mv[6] ^ BMW_H[6] ) - ( mv[8] ^ BMW_H[8] ) + ( mv[11] ^ BMW_H[11] ) + ( mv[14] ^ BMW_H[14] ) - ( mv[15] ^ BMW_H[15] ) ;
q[1] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[2] ;
tmp = ( mv[0] ^ BMW_H[0] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[2] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[3] ;
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[1] ^ BMW_H[1] ) + ( mv[8] ^ BMW_H[8] ) - ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) ;
q[3] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[4] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[2] ^ BMW_H[2] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[14] ^ BMW_H[14] ) ;
q[4] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[5] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[2] ^ BMW_H[2] ) + ( mv[10] ^ BMW_H[10] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[5] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[6] ;
tmp = ( mv[4] ^ BMW_H[4] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) - ( mv[11] ^ BMW_H[11] ) + ( mv[13] ^ BMW_H[13] ) ;
q[6] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[7] ;
tmp = ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[12] ^ BMW_H[12] ) - ( mv[14] ^ BMW_H[14] ) ;
q[7] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[8] ;
tmp = ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[6] ^ BMW_H[6] ) + ( mv[13] ^ BMW_H[13] ) - ( mv[15] ^ BMW_H[15] ) ;
q[8] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[9] ;
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) + ( mv[6] ^ BMW_H[6] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[14] ^ BMW_H[14] ) ;
q[9] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[10] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[15] ^ BMW_H[15] ) ;
q[10] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[11] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[9] ^ BMW_H[9] ) ;
q[11] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[12] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[3] ^ BMW_H[3] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[10] ^ BMW_H[10] ) ;
q[12] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[13] ;
tmp = ( mv[2] ^ BMW_H[2] ) + ( mv[4] ^ BMW_H[4] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[11] ^ BMW_H[11] ) ;
q[13] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[14] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[8] ^ BMW_H[8] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[12] ^ BMW_H[12] ) ;
q[14] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[15] ;
tmp = ( mv[12] ^ BMW_H[12] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[13] ^ BMW_H[13] ) ;
q[15] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[0] ;
// First pass, q[16..31]: expansion of q[0..15].
// q[16] and q[17] add up a shift/rotate mix of each of the sixteen preceding q words.
// q[18..31] start instead from CONST_EXP2 (macro defined outside this view —
// presumably a cheaper combination of preceding q words; TODO confirm).
// Every step also adds
//   ( ( i + 16 ) * 0x0555555555555555 + rotl ( mv[i] ) + rotl ( mv[i + 3] ) - rotl ( mv[i + 10] ) ) ^ BMW_H[i + 7]
// with the array indices taken modulo 16. The loop is split at i = 6, 9 and 13 so each
// wrapped index can be written as a plain subtraction (i - 6, i - 9, i - 13).
# pragma unroll 2
for ( int i=0 ;i<2;i++)
{
q[i+16] =
( SHR ( q[i], 1 ) ^ SHL ( q[i], 2 ) ^ SPH_ROTL64 ( q[i], 13 ) ^ SPH_ROTL64 ( q[i], 43 ) ) +
( SHR ( q[i+1], 2 ) ^ SHL ( q[i+1], 1 ) ^ SPH_ROTL64 ( q[i+1], 19 ) ^ SPH_ROTL64 ( q[i+1], 53 ) ) +
( SHR ( q[i+2], 2 ) ^ SHL ( q[i+2], 2 ) ^ SPH_ROTL64 ( q[i+2], 28 ) ^ SPH_ROTL64 ( q[i+2], 59 ) ) +
( SHR ( q[i+3], 1 ) ^ SHL ( q[i+3], 3 ) ^ SPH_ROTL64 ( q[i+3], 4 ) ^ SPH_ROTL64 ( q[i+3], 37 ) ) +
( SHR ( q[i+4], 1 ) ^ SHL ( q[i+4], 2 ) ^ SPH_ROTL64 ( q[i+4], 13 ) ^ SPH_ROTL64 ( q[i+4], 43 ) ) +
( SHR ( q[i+5], 2 ) ^ SHL ( q[i+5], 1 ) ^ SPH_ROTL64 ( q[i+5], 19 ) ^ SPH_ROTL64 ( q[i+5], 53 ) ) +
( SHR ( q[i+6], 2 ) ^ SHL ( q[i+6], 2 ) ^ SPH_ROTL64 ( q[i+6], 28 ) ^ SPH_ROTL64 ( q[i+6], 59 ) ) +
( SHR ( q[i+7], 1 ) ^ SHL ( q[i+7], 3 ) ^ SPH_ROTL64 ( q[i+7], 4 ) ^ SPH_ROTL64 ( q[i+7], 37 ) ) +
( SHR ( q[i+8], 1 ) ^ SHL ( q[i+8], 2 ) ^ SPH_ROTL64 ( q[i+8], 13 ) ^ SPH_ROTL64 ( q[i+8], 43 ) ) +
( SHR ( q[i+9], 2 ) ^ SHL ( q[i+9], 1 ) ^ SPH_ROTL64 ( q[i+9], 19 ) ^ SPH_ROTL64 ( q[i+9], 53 ) ) +
( SHR ( q[i+10], 2 ) ^ SHL ( q[i+10], 2 ) ^ SPH_ROTL64 ( q[i+10], 28 ) ^ SPH_ROTL64 ( q[i+10], 59 ) ) +
( SHR ( q[i+11], 1 ) ^ SHL ( q[i+11], 3 ) ^ SPH_ROTL64 ( q[i+11], 4 ) ^ SPH_ROTL64 ( q[i+11], 37 ) ) +
( SHR ( q[i+12], 1 ) ^ SHL ( q[i+12], 2 ) ^ SPH_ROTL64 ( q[i+12], 13 ) ^ SPH_ROTL64 ( q[i+12], 43 ) ) +
( SHR ( q[i+13], 2 ) ^ SHL ( q[i+13], 1 ) ^ SPH_ROTL64 ( q[i+13], 19 ) ^ SPH_ROTL64 ( q[i+13], 53 ) ) +
( SHR ( q[i+14], 2 ) ^ SHL ( q[i+14], 2 ) ^ SPH_ROTL64 ( q[i+14], 28 ) ^ SPH_ROTL64 ( q[i+14], 59 ) ) +
( SHR ( q[i+15], 1 ) ^ SHL ( q[i+15], 3 ) ^ SPH_ROTL64 ( q[i+15], 4 ) ^ SPH_ROTL64 ( q[i+15], 37 ) ) +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
// NOTE(review): FOLDb is a macro defined outside this view. It is invoked here before
// q[18..31] have been written, while the statements after the expansion fold q into
// BMW_H by hand. This looks like a line left over from an earlier revision — confirm.
FOLDb ;
# pragma unroll 4
for ( int i=2 ;i<6;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
// NOTE(review): M, H and dH are dropped here, midway through the expansion. No visible
// statement spells them out; FOLDb / CONST_EXP2 may expand to them — confirm that
// CONST_EXP2 in the loops below does not need them.
# undef M
# undef H
# undef dH
// i = 6..8: mv[i + 10] wraps to mv[i - 6].
# pragma unroll 3
for ( int i=6 ;i<9;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i+7] ) ;
}
// i = 9..12: BMW_H[i + 7] additionally wraps to BMW_H[i - 9].
# pragma unroll 4
for ( int i=9 ;i<13;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
// i = 13..15: mv[i + 3] additionally wraps to mv[i - 13].
# pragma unroll 3
for ( int i=13 ;i<16;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i-13], ( i-13 ) +1 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
// First-pass fold of q[0..31] back into BMW_H.
// XL64 is the xor of q[16..23]; XH64 additionally xors in q[24..31].
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23] ;
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31] ;
// Lower half: a shifted XH64, a shifted q[16 + k] (q[20] is used unshifted) and mv[k]
// are xored together, then ( XL64 ^ q[24 + k] ^ q[k] ) is added.
BMW_H[0] = ( SHL ( XH64, 5 ) ^ SHR ( q[16],5 ) ^ mv[0] ) + ( XL64 ^ q[24] ^ q[0] ) ;
BMW_H[1] = ( SHR ( XH64, 7 ) ^ SHL ( q[17],8 ) ^ mv[1] ) + ( XL64 ^ q[25] ^ q[1] ) ;
BMW_H[2] = ( SHR ( XH64, 5 ) ^ SHL ( q[18],5 ) ^ mv[2] ) + ( XL64 ^ q[26] ^ q[2] ) ;
BMW_H[3] = ( SHR ( XH64, 1 ) ^ SHL ( q[19],5 ) ^ mv[3] ) + ( XL64 ^ q[27] ^ q[3] ) ;
BMW_H[4] = ( SHR ( XH64, 3 ) ^ q[20] ^ mv[4] ) + ( XL64 ^ q[28] ^ q[4] ) ;
BMW_H[5] = ( SHL ( XH64, 6 ) ^ SHR ( q[21],6 ) ^ mv[5] ) + ( XL64 ^ q[29] ^ q[5] ) ;
BMW_H[6] = ( SHR ( XH64, 4 ) ^ SHL ( q[22],6 ) ^ mv[6] ) + ( XL64 ^ q[30] ^ q[6] ) ;
BMW_H[7] = ( SHR ( XH64,11 ) ^ SHL ( q[23],2 ) ^ mv[7] ) + ( XL64 ^ q[31] ^ q[7] ) ;
// Upper half: each word rotates one of the lower-half words written just above
// (BMW_H[4..7], then BMW_H[0..3]), so these statements must stay after the lower half.
BMW_H[8] = SPH_ROTL64 ( BMW_H[4], 9 ) + ( XH64 ^ q[24] ^ mv[8] ) + ( SHL ( XL64,8 ) ^ q[23] ^ q[8] ) ;
BMW_H[9] = SPH_ROTL64 ( BMW_H[5],10 ) + ( XH64 ^ q[25] ^ mv[9] ) + ( SHR ( XL64,6 ) ^ q[16] ^ q[9] ) ;
BMW_H[10] = SPH_ROTL64 ( BMW_H[6],11 ) + ( XH64 ^ q[26] ^ mv[10] ) + ( SHL ( XL64,6 ) ^ q[17] ^ q[10] ) ;
BMW_H[11] = SPH_ROTL64 ( BMW_H[7],12 ) + ( XH64 ^ q[27] ^ mv[11] ) + ( SHL ( XL64,4 ) ^ q[18] ^ q[11] ) ;
BMW_H[12] = SPH_ROTL64 ( BMW_H[0],13 ) + ( XH64 ^ q[28] ^ mv[12] ) + ( SHR ( XL64,3 ) ^ q[19] ^ q[12] ) ;
BMW_H[13] = SPH_ROTL64 ( BMW_H[1],14 ) + ( XH64 ^ q[29] ^ mv[13] ) + ( SHR ( XL64,4 ) ^ q[20] ^ q[13] ) ;
BMW_H[14] = SPH_ROTL64 ( BMW_H[2],15 ) + ( XH64 ^ q[30] ^ mv[14] ) + ( SHR ( XL64,7 ) ^ q[21] ^ q[14] ) ;
BMW_H[15] = SPH_ROTL64 ( BMW_H[3],16 ) + ( XH64 ^ q[31] ^ mv[15] ) + ( SHR ( XL64,2 ) ^ q[22] ^ q[15] ) ;
// Prepare the second pass: the sixteen words just produced in BMW_H become the
// message block mv, and BMW_H is refilled with the constants
// 0xaaaaaaaaaaaaaaa0 + i for i = 0..15.
# pragma unroll 16
for ( int i=0 ;i<16;i++)
{
mv[i] = BMW_H[i] ;
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + ( sph_u64 ) i ;
}
// Second pass, q[0..15]: the same statements as the first pass, now run on the
// re-initialised mv / BMW_H. Each `tmp` is a signed sum of five (mv[j] ^ BMW_H[j])
// words; tmp is mixed and BMW_H[(k + 1) % 16] is added to give q[k]. The mixing
// expression repeats with period five (q[4], q[9], q[14] use SHR ( tmp, 1 ) ^ tmp).
tmp = ( mv[5] ^ BMW_H[5] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) + ( mv[14] ^ BMW_H[14] ) ;
q[0] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[1] ;
tmp = ( mv[6] ^ BMW_H[6] ) - ( mv[8] ^ BMW_H[8] ) + ( mv[11] ^ BMW_H[11] ) + ( mv[14] ^ BMW_H[14] ) - ( mv[15] ^ BMW_H[15] ) ;
q[1] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[2] ;
tmp = ( mv[0] ^ BMW_H[0] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[2] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[3] ;
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[1] ^ BMW_H[1] ) + ( mv[8] ^ BMW_H[8] ) - ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) ;
q[3] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[4] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[2] ^ BMW_H[2] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[14] ^ BMW_H[14] ) ;
q[4] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[5] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[2] ^ BMW_H[2] ) + ( mv[10] ^ BMW_H[10] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[5] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[6] ;
tmp = ( mv[4] ^ BMW_H[4] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) - ( mv[11] ^ BMW_H[11] ) + ( mv[13] ^ BMW_H[13] ) ;
q[6] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[7] ;
tmp = ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[12] ^ BMW_H[12] ) - ( mv[14] ^ BMW_H[14] ) ;
q[7] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[8] ;
tmp = ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[6] ^ BMW_H[6] ) + ( mv[13] ^ BMW_H[13] ) - ( mv[15] ^ BMW_H[15] ) ;
q[8] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[9] ;
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) + ( mv[6] ^ BMW_H[6] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[14] ^ BMW_H[14] ) ;
q[9] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[10] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[15] ^ BMW_H[15] ) ;
q[10] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[11] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[9] ^ BMW_H[9] ) ;
q[11] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[12] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[3] ^ BMW_H[3] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[10] ^ BMW_H[10] ) ;
q[12] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[13] ;
tmp = ( mv[2] ^ BMW_H[2] ) + ( mv[4] ^ BMW_H[4] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[11] ^ BMW_H[11] ) ;
q[13] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[14] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[8] ^ BMW_H[8] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[12] ^ BMW_H[12] ) ;
q[14] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[15] ;
tmp = ( mv[12] ^ BMW_H[12] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[13] ^ BMW_H[13] ) ;
q[15] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[0] ;
// Second pass, q[16..31]: expansion of q[0..15], same shape as in the first pass.
// q[16] and q[17] add up a shift/rotate mix of each of the sixteen preceding q words.
// q[18..31] start from CONST_EXP2 (macro defined outside this view — TODO confirm
// what it reads). Every step adds
//   ( ( i + 16 ) * 0x0555555555555555 + rotl ( mv[i] ) + rotl ( mv[i + 3] ) - rotl ( mv[i + 10] ) ) ^ BMW_H[i + 7]
// with array indices taken modulo 16; the loops are split at i = 6, 9 and 13 so the
// wrapped indices become i - 6, i - 9 and i - 13.
# pragma unroll 2
for ( int i=0 ;i<2;i++)
{
q[i+16] =
( SHR ( q[i], 1 ) ^ SHL ( q[i], 2 ) ^ SPH_ROTL64 ( q[i], 13 ) ^ SPH_ROTL64 ( q[i], 43 ) ) +
( SHR ( q[i+1], 2 ) ^ SHL ( q[i+1], 1 ) ^ SPH_ROTL64 ( q[i+1], 19 ) ^ SPH_ROTL64 ( q[i+1], 53 ) ) +
( SHR ( q[i+2], 2 ) ^ SHL ( q[i+2], 2 ) ^ SPH_ROTL64 ( q[i+2], 28 ) ^ SPH_ROTL64 ( q[i+2], 59 ) ) +
( SHR ( q[i+3], 1 ) ^ SHL ( q[i+3], 3 ) ^ SPH_ROTL64 ( q[i+3], 4 ) ^ SPH_ROTL64 ( q[i+3], 37 ) ) +
( SHR ( q[i+4], 1 ) ^ SHL ( q[i+4], 2 ) ^ SPH_ROTL64 ( q[i+4], 13 ) ^ SPH_ROTL64 ( q[i+4], 43 ) ) +
( SHR ( q[i+5], 2 ) ^ SHL ( q[i+5], 1 ) ^ SPH_ROTL64 ( q[i+5], 19 ) ^ SPH_ROTL64 ( q[i+5], 53 ) ) +
( SHR ( q[i+6], 2 ) ^ SHL ( q[i+6], 2 ) ^ SPH_ROTL64 ( q[i+6], 28 ) ^ SPH_ROTL64 ( q[i+6], 59 ) ) +
( SHR ( q[i+7], 1 ) ^ SHL ( q[i+7], 3 ) ^ SPH_ROTL64 ( q[i+7], 4 ) ^ SPH_ROTL64 ( q[i+7], 37 ) ) +
( SHR ( q[i+8], 1 ) ^ SHL ( q[i+8], 2 ) ^ SPH_ROTL64 ( q[i+8], 13 ) ^ SPH_ROTL64 ( q[i+8], 43 ) ) +
( SHR ( q[i+9], 2 ) ^ SHL ( q[i+9], 1 ) ^ SPH_ROTL64 ( q[i+9], 19 ) ^ SPH_ROTL64 ( q[i+9], 53 ) ) +
( SHR ( q[i+10], 2 ) ^ SHL ( q[i+10], 2 ) ^ SPH_ROTL64 ( q[i+10], 28 ) ^ SPH_ROTL64 ( q[i+10], 59 ) ) +
( SHR ( q[i+11], 1 ) ^ SHL ( q[i+11], 3 ) ^ SPH_ROTL64 ( q[i+11], 4 ) ^ SPH_ROTL64 ( q[i+11], 37 ) ) +
( SHR ( q[i+12], 1 ) ^ SHL ( q[i+12], 2 ) ^ SPH_ROTL64 ( q[i+12], 13 ) ^ SPH_ROTL64 ( q[i+12], 43 ) ) +
( SHR ( q[i+13], 2 ) ^ SHL ( q[i+13], 1 ) ^ SPH_ROTL64 ( q[i+13], 19 ) ^ SPH_ROTL64 ( q[i+13], 53 ) ) +
( SHR ( q[i+14], 2 ) ^ SHL ( q[i+14], 2 ) ^ SPH_ROTL64 ( q[i+14], 28 ) ^ SPH_ROTL64 ( q[i+14], 59 ) ) +
( SHR ( q[i+15], 1 ) ^ SHL ( q[i+15], 3 ) ^ SPH_ROTL64 ( q[i+15], 4 ) ^ SPH_ROTL64 ( q[i+15], 37 ) ) +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 4
for ( int i=2 ;i<6;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
// NOTE(review): these aliases are introduced in the middle of the expansion.
// H is mapped to `final_b`, which is not declared anywhere in the visible body, and
// M / dH point at BMW_h2 / BMW_h1, which no visible statement assigns. They look like
// leftovers from a revision that used FOLDb — confirm.
# define M ( x ) ( BMW_h2[x] )
# define H ( x ) ( final_b[x] )
# define dH ( x ) ( BMW_h1[x] )
// i = 6..8: mv[i + 10] wraps to mv[i - 6].
# pragma unroll 3
for ( int i=6 ;i<9;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i+7] ) ;
}
// NOTE(review): FOLDb (macro defined outside this view) runs here before q[25..31]
// have been written for this pass — presumably a stale line; confirm.
FOLDb ;
// i = 9..12: BMW_H[i + 7] additionally wraps to BMW_H[i - 9].
# pragma unroll 4
for ( int i=9 ;i<13;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
# undef M
# undef H
# undef dH
// i = 13..15: mv[i + 3] additionally wraps to mv[i - 13].
# pragma unroll 3
for ( int i=13 ;i<16;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i-13], ( i-13 ) +1 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
// NOTE(review): BMW_h1 is declared earlier in this body but no visible statement
// assigns it (FOLDb may do so through the dH alias — confirm). Either way, these
// eight stores to hash->h8[0..7] are overwritten by the stores from BMW_H[8..15]
// at the end of this block, so they look like lines from an older revision.
hash->h8[0] = SWAP8 ( BMW_h1[8] ) ;
hash->h8[1] = SWAP8 ( BMW_h1[9] ) ;
hash->h8[2] = SWAP8 ( BMW_h1[10] ) ;
hash->h8[3] = SWAP8 ( BMW_h1[11] ) ;
hash->h8[4] = SWAP8 ( BMW_h1[12] ) ;
hash->h8[5] = SWAP8 ( BMW_h1[13] ) ;
hash->h8[6] = SWAP8 ( BMW_h1[14] ) ;
hash->h8[7] = SWAP8 ( BMW_h1[15] ) ;
// Second-pass fold of q[0..31] into BMW_H, reusing XL64 / XH64.
// XL64 is the xor of q[16..23]; XH64 additionally xors in q[24..31].
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23] ;
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31] ;
// Lower half: shifted XH64, shifted q[16 + k] (q[20] unshifted) and mv[k] are xored,
// then ( XL64 ^ q[24 + k] ^ q[k] ) is added.
BMW_H[0] = ( SHL ( XH64, 5 ) ^ SHR ( q[16],5 ) ^ mv[0] ) + ( XL64 ^ q[24] ^ q[0] ) ;
BMW_H[1] = ( SHR ( XH64, 7 ) ^ SHL ( q[17],8 ) ^ mv[1] ) + ( XL64 ^ q[25] ^ q[1] ) ;
BMW_H[2] = ( SHR ( XH64, 5 ) ^ SHL ( q[18],5 ) ^ mv[2] ) + ( XL64 ^ q[26] ^ q[2] ) ;
BMW_H[3] = ( SHR ( XH64, 1 ) ^ SHL ( q[19],5 ) ^ mv[3] ) + ( XL64 ^ q[27] ^ q[3] ) ;
BMW_H[4] = ( SHR ( XH64, 3 ) ^ q[20] ^ mv[4] ) + ( XL64 ^ q[28] ^ q[4] ) ;
BMW_H[5] = ( SHL ( XH64, 6 ) ^ SHR ( q[21],6 ) ^ mv[5] ) + ( XL64 ^ q[29] ^ q[5] ) ;
BMW_H[6] = ( SHR ( XH64, 4 ) ^ SHL ( q[22],6 ) ^ mv[6] ) + ( XL64 ^ q[30] ^ q[6] ) ;
BMW_H[7] = ( SHR ( XH64,11 ) ^ SHL ( q[23],2 ) ^ mv[7] ) + ( XL64 ^ q[31] ^ q[7] ) ;
// Upper half: each word rotates a lower-half word written just above, so the
// statement order matters.
BMW_H[8] = SPH_ROTL64 ( BMW_H[4], 9 ) + ( XH64 ^ q[24] ^ mv[8] ) + ( SHL ( XL64,8 ) ^ q[23] ^ q[8] ) ;
BMW_H[9] = SPH_ROTL64 ( BMW_H[5],10 ) + ( XH64 ^ q[25] ^ mv[9] ) + ( SHR ( XL64,6 ) ^ q[16] ^ q[9] ) ;
BMW_H[10] = SPH_ROTL64 ( BMW_H[6],11 ) + ( XH64 ^ q[26] ^ mv[10] ) + ( SHL ( XL64,6 ) ^ q[17] ^ q[10] ) ;
BMW_H[11] = SPH_ROTL64 ( BMW_H[7],12 ) + ( XH64 ^ q[27] ^ mv[11] ) + ( SHL ( XL64,4 ) ^ q[18] ^ q[11] ) ;
BMW_H[12] = SPH_ROTL64 ( BMW_H[0],13 ) + ( XH64 ^ q[28] ^ mv[12] ) + ( SHR ( XL64,3 ) ^ q[19] ^ q[12] ) ;
BMW_H[13] = SPH_ROTL64 ( BMW_H[1],14 ) + ( XH64 ^ q[29] ^ mv[13] ) + ( SHR ( XL64,4 ) ^ q[20] ^ q[13] ) ;
BMW_H[14] = SPH_ROTL64 ( BMW_H[2],15 ) + ( XH64 ^ q[30] ^ mv[14] ) + ( SHR ( XL64,7 ) ^ q[21] ^ q[14] ) ;
BMW_H[15] = SPH_ROTL64 ( BMW_H[3],16 ) + ( XH64 ^ q[31] ^ mv[15] ) + ( SHR ( XL64,2 ) ^ q[22] ^ q[15] ) ;
// Result: the upper eight words of BMW_H, passed through SWAP8 (presumably a 64-bit
// byte swap — defined outside this view), replace the first eight 64-bit words of
// this work-item's hash slot.
hash->h8[0] = SWAP8 ( BMW_H[8] ) ;
hash->h8[1] = SWAP8 ( BMW_H[9] ) ;
hash->h8[2] = SWAP8 ( BMW_H[10] ) ;
hash->h8[3] = SWAP8 ( BMW_H[11] ) ;
hash->h8[4] = SWAP8 ( BMW_H[12] ) ;
hash->h8[5] = SWAP8 ( BMW_H[13] ) ;
hash->h8[6] = SWAP8 ( BMW_H[14] ) ;
hash->h8[7] = SWAP8 ( BMW_H[15] ) ;
// OpenCL barrier() with a global-memory fence, executed after the stores above.
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
}
@ -240,67 +451,59 @@ __kernel void search2(__global hash_t* hashes)
@@ -240,67 +451,59 @@ __kernel void search2(__global hash_t* hashes)
for ( int i = init ; i < 256; i += step)
{
T0_L[i] = T0[i] ;
T4_L[i] = T4[i] ;
T1_L[i] = T1[i] ;
T2_L[i] = T2[i] ;
T3_L[i] = T3[i] ;
T4_L[i] = T4[i] ;
T5_L[i] = T5[i] ;
T6_L[i] = T6[i] ;
T7_L[i] = T7[i] ;
}
barrier ( CLK_LOCAL_MEM_FENCE ) ;
# define T0 T0_L
# define T1 T1_L
# define T2 T2_L
# define T3 T3_L
# define T4 T4_L
# define T5 T5_L
# define T6 T6_L
# define T7 T7_L
# define T0 T0_L
# define T1 T1_L
# define T2 T2_L
# define T3 T3_L
# define T4 T4_L
# define T5 T5_L
# define T6 T6_L
# define T7 T7_L
// groestl
sph_u64 H[16] ;
for ( unsigned int u = 0 ; u < 15; u ++)
H[u] = 0 ;
# if USE_LE
H[15] = ( ( sph_u64 ) ( 512 & 0xFF ) << 56 ) | ( ( sph_u64 ) ( 512 & 0xFF00 ) << 40 ) ;
# else
H[15] = ( sph_u64 ) 512 ;
# endif
sph_u64 H[16] = {0, 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0x0002000000000000} ;
sph_u64 g[16], m[16] ;
m[0] = DEC64E ( hash->h8[0] ) ;
m[1] = DEC64E ( hash->h8[1] ) ;
m[2] = DEC64E ( hash->h8[2] ) ;
m[3] = DEC64E ( hash->h8[3] ) ;
m[4] = DEC64E ( hash->h8[4] ) ;
m[5] = DEC64E ( hash->h8[5] ) ;
m[6] = DEC64E ( hash->h8[6] ) ;
m[7] = DEC64E ( hash->h8[7] ) ;
for ( unsigned int u = 0 ; u < 16; u ++)
g[u] = m[u] ^ H[u] ;
m[8] = 0x80 ; g[8] = m[8] ^ H[8] ;
m[9 ] = 0 ; g[9] = m[9] ^ H[9] ;
m[10 ] = 0 ; g[10] = m[10] ^ H[10] ;
m[11 ] = 0 ; g[11] = m[11] ^ H[11] ;
m[12 ] = 0 ; g[12] = m[12] ^ H[12] ;
m[13] = 0 ; g[13] = m[13] ^ H[13] ;
m[14] = 0 ; g[14] = m[14] ^ H[14] ;
m[15] = 0x100000000000000 ; g[15] = m[15] ^ H[15];
g[0] = m[0] = DEC64E ( hash->h8[0] ) ;
g[1] = m[1] = DEC64E ( hash->h8[1] ) ;
g[2] = m[2] = DEC64E ( hash->h8[2] ) ;
g[3] = m[3] = DEC64E ( hash->h8[3] ) ;
g[4] = m[4] = DEC64E ( hash->h8[4] ) ;
g[5] = m[5] = DEC64E ( hash->h8[5] ) ;
g[6] = m[6] = DEC64E ( hash->h8[6] ) ;
g[7] = m[7] = DEC64E ( hash->h8[7] ) ;
g[8] = m[8] = 0x80 ;
g[9] = m[9] = 0 ;
g[10] = m[10] = 0 ;
g[11] = m[11 ] = 0 ;
g[12] = m[12 ] = 0 ;
g[13] = m[13 ] = 0 ;
g[14] = m[14 ] = 0 ;
g[15] = 0x102000000000000 ;
m[15] = 0x100000000000000 ;
PERM_BIG_P ( g ) ;
PERM_BIG_Q ( m ) ;
for ( unsigned int u = 0 ; u < 16; u ++)
H[u] ^= g[u] ^ m[u] ;
sph_u64 xH[16] ;
for ( unsigned int u = 0 ; u < 16; u ++)
xH[u] = H[u] ;
xH[u] = H[u] ^= g[u] ^ m[u] ;
PERM_BIG_P ( xH ) ;
for ( unsigned int u = 0 ; u < 16; u ++)
H[u] ^= xH[u] ;
for ( unsigned int u = 0 ; u < 8; u ++)
hash->h8[u] = DEC64E ( H[u + 8] ) ;
for ( unsigned int u = 8 ; u < 16; u ++)
hash->h8[u-8] = DEC64E ( H[u] ^ xH[u] ) ;
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
}
@ -325,10 +528,14 @@ __kernel void search3(__global hash_t* hashes)
@@ -325,10 +528,14 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8 ( hash->h8[5] ) ;
m6 = SWAP8 ( hash->h8[6] ) ;
m7 = SWAP8 ( hash->h8[7] ) ;
UBI_BIG ( 480 , 64 ) ;
bcount = 0 ;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0 ;
UBI_BIG ( 510 , 8 ) ;
hash->h8[0] = SWAP8 ( h0 ) ;
hash->h8[1] = SWAP8 ( h1 ) ;
hash->h8[2] = SWAP8 ( h2 ) ;
@ -355,7 +562,8 @@ __kernel void search4(__global hash_t* hashes)
@@ -355,7 +562,8 @@ __kernel void search4(__global hash_t* hashes)
for ( int i = 0 ; i < 2; i++)
{
if ( i == 0 ) {
if ( i == 0 )
{
h0h ^= DEC64E ( hash->h8[0] ) ;
h0l ^= DEC64E ( hash->h8[1] ) ;
h1h ^= DEC64E ( hash->h8[2] ) ;
@ -364,7 +572,9 @@ __kernel void search4(__global hash_t* hashes)
@@ -364,7 +572,9 @@ __kernel void search4(__global hash_t* hashes)
h2l ^= DEC64E ( hash->h8[5] ) ;
h3h ^= DEC64E ( hash->h8[6] ) ;
h3l ^= DEC64E ( hash->h8[7] ) ;
} else if ( i == 1 ) {
}
else if ( i == 1 )
{
h4h ^= DEC64E ( hash->h8[0] ) ;
h4l ^= DEC64E ( hash->h8[1] ) ;
h5h ^= DEC64E ( hash->h8[2] ) ;
@ -425,6 +635,7 @@ __kernel void search5(__global hash_t* hashes)
@@ -425,6 +635,7 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8 ( hash->h8[7] ) ;
a31 ^= 0x8000000000000001 ;
KECCAK_F_1600 ;
// Finalize the "lane complement"
a10 = ~a10 ;
a20 = ~a20 ;
@ -471,7 +682,8 @@ __kernel void search6(__global hash_t* hashes)
@@ -471,7 +682,8 @@ __kernel void search6(__global hash_t* hashes)
MI5 ;
LUFFA_P5 ;
if ( i == 0 ) {
if ( i == 0 )
{
M0 = hash->h4[9] ;
M1 = hash->h4[8] ;
M2 = hash->h4[11] ;
@ -480,12 +692,16 @@ __kernel void search6(__global hash_t* hashes)
@@ -480,12 +692,16 @@ __kernel void search6(__global hash_t* hashes)
M5 = hash->h4[12] ;
M6 = hash->h4[15] ;
M7 = hash->h4[14] ;
} else if ( i == 1 ) {
}
else if ( i == 1 )
{
M0 = 0x80000000 ;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0 ;
} else if ( i == 2 ) {
}
else if ( i == 2 )
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0 ;
} else if ( i == 3 ) {
else if ( i == 3 )
{
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40 ;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41 ;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42 ;
@ -535,10 +751,12 @@ __kernel void search7(__global hash_t* hashes)
@@ -535,10 +751,12 @@ __kernel void search7(__global hash_t* hashes)
x6 ^= SWAP4 ( hash->h4[7] ) ;
x7 ^= SWAP4 ( hash->h4[6] ) ;
for ( int i = 0 ; i < 13; i ++) {
for ( int i = 0 ; i < 13; i ++)
{
SIXTEEN_ROUNDS ;
if ( i == 0 ) {
if ( i == 0 )
{
x0 ^= SWAP4 ( hash->h4[9] ) ;
x1 ^= SWAP4 ( hash->h4[8] ) ;
x2 ^= SWAP4 ( hash->h4[11] ) ;
@ -547,12 +765,12 @@ __kernel void search7(__global hash_t* hashes)
@@ -547,12 +765,12 @@ __kernel void search7(__global hash_t* hashes)
x5 ^= SWAP4 ( hash->h4[12] ) ;
x6 ^= SWAP4 ( hash->h4[15] ) ;
x7 ^= SWAP4 ( hash->h4[14] ) ;
} else if ( i == 1 ) {
}
else if ( i == 1 )
x0 ^= 0x80 ;
} else if ( i == 2 ) {
else if ( i == 2 )
xv ^= SPH_C32 ( 1 ) ;
}
}
hash->h4[0] = x0 ;
hash->h4[1] = x1 ;
@ -579,6 +797,7 @@ __kernel void search8(__global hash_t* hashes)
@@ -579,6 +797,7 @@ __kernel void search8(__global hash_t* hashes)
{
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256] ;
int init = get_local_id ( 0 ) ;
@ -607,7 +826,7 @@ __kernel void search8(__global hash_t* hashes)
@@ -607,7 +826,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17 ;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F ;
sph_u32 sc_count0 = ( 64 << 3 ) , sc_count1 = 0 , sc_count2 = 0 , sc_count3 = 0 ;
sph_u32 sc_count0 = 0x200 , sc_count1 = 0 , sc_count2 = 0 , sc_count3 = 0 ;
rk00 = hash->h4[0] ;
rk01 = hash->h4[1] ;
@ -673,7 +892,8 @@ __kernel void search9(__global hash_t* hashes)
@@ -673,7 +892,8 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32 ( 0x09254899 ) , D1 = C32 ( 0xD699C7BC ) , D2 = C32 ( 0x9019B6DC ) , D3 = C32 ( 0x2B9022E4 ) , D4 = C32 ( 0x8FA14956 ) , D5 = C32 ( 0x21BF9BD3 ) , D6 = C32 ( 0xB94D0943 ) , D7 = C32 ( 0x6FFDDC22 ) ;
FFT256 ( 0 , 1 , 0 , ll1 ) ;
for ( int i = 0 ; i < 256; i ++) {
for ( int i = 0 ; i < 256; i ++)
{
s32 tq ;
tq = q[i] + yoff_b_n[i] ;
@ -709,14 +929,17 @@ __kernel void search9(__global hash_t* hashes)
@@ -709,14 +929,17 @@ __kernel void search9(__global hash_t* hashes)
C32 ( 0x0BA16B95 ) , C32 ( 0x72F999AD ) , C32 ( 0x9FECC2AE ) , C32 ( 0xBA3264FC ) ,
C32 ( 0x5E894929 ) , C32 ( 0x8E9F30E5 ) , C32 ( 0x2F1DAA37 ) , C32 ( 0xF0F2C558 ) ,
IF, 4 , 13 , PP8_4_ ) ;
STEP_BIG (
C32 ( 0xAC506643 ) , C32 ( 0xA90635A5 ) , C32 ( 0xE25B878B ) , C32 ( 0xAAB7878F ) ,
C32 ( 0x88817F7A ) , C32 ( 0x0A02892B ) , C32 ( 0x559A7550 ) , C32 ( 0x598F657E ) ,
IF, 13 , 10 , PP8_5_ ) ;
STEP_BIG (
C32 ( 0x7EEF60A1 ) , C32 ( 0x6B70E3E8 ) , C32 ( 0x9C1714D1 ) , C32 ( 0xB958E2A8 ) ,
C32 ( 0xAB02675E ) , C32 ( 0xED1C014F ) , C32 ( 0xCD8D65BB ) , C32 ( 0xFDB7A257 ) ,
IF, 10 , 25 , PP8_6_ ) ;
STEP_BIG (
C32 ( 0x09254899 ) , C32 ( 0xD699C7BC ) , C32 ( 0x9019B6DC ) , C32 ( 0x2B9022E4 ) ,
C32 ( 0x8FA14956 ) , C32 ( 0x21BF9BD3 ) , C32 ( 0xB94D0943 ) , C32 ( 0x6FFDDC22 ) ,
@ -735,22 +958,27 @@ __kernel void search9(__global hash_t* hashes)
@@ -735,22 +958,27 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG ( 1_, 1 , 28 , 19 , 22 , 7 ) ;
ONE_ROUND_BIG ( 2_, 2 , 29 , 9 , 15 , 5 ) ;
ONE_ROUND_BIG ( 3_, 3 , 4 , 13 , 10 , 25 ) ;
STEP_BIG (
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4 , 13 , PP8_4_ ) ;
STEP_BIG (
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13 , 10 , PP8_5_ ) ;
STEP_BIG (
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10 , 25 , PP8_6_ ) ;
STEP_BIG (
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25 , 4 , PP8_0_ ) ;
# undef q
hash->h4[0] = A0 ;
@ -778,7 +1006,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
@@ -778,7 +1006,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
{
uint gid = get_global_id ( 0 ) ;
uint offset = get_global_offset ( 0 ) ;
hash_t hash ;
__global hash_t * hash = & ( hashes[gid-offset] ) ;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256] ;
@ -795,14 +1023,23 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
@@ -795,14 +1023,23 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
barrier ( CLK_LOCAL_MEM_FENCE ) ;
for ( int i = 0 ; i < 8; i++) {
hash.h8[i] = hashes[gid-offset].h8[i] ;
//mixtab
__local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256] ;
for ( int i = init ; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i] ;
mixtab1[i] = mixtab1_c[i] ;
mixtab2[i] = mixtab2_c[i] ;
mixtab3[i] = mixtab3_c[i] ;
}
// echo
barrier ( CLK_LOCAL_MEM_FENCE ) ;
{
for ( int i = 0 ; i < 8; i++)
hash->h8[i] = hashes[gid-offset].h8[i] ;
// echo
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1 ;
sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71 ;
Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL ;
@ -829,14 +1066,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
@@ -829,14 +1066,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
W61 = Vb61 ;
W70 = Vb70 ;
W71 = Vb71 ;
W80 = hash. h8[0] ;
W81 = hash. h8[1] ;
W90 = hash. h8[2] ;
W91 = hash. h8[3] ;
WA0 = hash. h8[4] ;
WA1 = hash. h8[5] ;
WB0 = hash. h8[6] ;
WB1 = hash. h8[7] ;
W80 = hash-> h8[0] ;
W81 = hash-> h8[1] ;
W90 = hash-> h8[2] ;
W91 = hash-> h8[3] ;
WA0 = hash-> h8[4] ;
WA1 = hash-> h8[5] ;
WB0 = hash-> h8[6] ;
WB1 = hash-> h8[7] ;
WC0 = 0x80 ;
WC1 = 0 ;
WD0 = 0 ;
@ -846,24 +1083,26 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
@@ -846,24 +1083,26 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
WF0 = 0x200 ;
WF1 = 0 ;
for ( unsigned u = 0 ; u < 10; u ++) {
for ( unsigned u = 0 ; u < 10; u ++)
BIG_ROUND ;
}
hash.h8[0] ^= Vb00 ^ W00 ^ W80 ;
hash.h8[1] ^= Vb01 ^ W01 ^ W81 ;
hash.h8[2] ^= Vb10 ^ W10 ^ W90 ;
hash.h8[3] ^= Vb11 ^ W11 ^ W91 ;
hash.h8[4] ^= Vb20 ^ W20 ^ WA0 ;
hash.h8[5] ^= Vb21 ^ W21 ^ WA1 ;
hash.h8[6] ^= Vb30 ^ W30 ^ WB0 ;
hash.h8[7] ^= Vb31 ^ W31 ^ WB1 ;
}
hash->h8[0] ^= Vb00 ^ W00 ^ W80 ;
hash->h8[1] ^= Vb01 ^ W01 ^ W81 ;
hash->h8[2] ^= Vb10 ^ W10 ^ W90 ;
hash->h8[3] ^= Vb11 ^ W11 ^ W91 ;
hash->h8[4] ^= Vb20 ^ W20 ^ WA0 ;
hash->h8[5] ^= Vb21 ^ W21 ^ WA1 ;
hash->h8[6] ^= Vb30 ^ W30 ^ WB0 ;
hash->h8[7] ^= Vb31 ^ W31 ^ WB1 ;
// hamsi
__local sph_u32 T512_L[1024] ;
__constant const sph_u32 *T512_C = &T512[0][0] ;
{
for ( int i = init ; i < 1024; i += step)
T512_L[i] = T512_C[i] ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3] ;
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7] ;
@ -873,51 +1112,39 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
@@ -873,51 +1112,39 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF ;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF } ;
# define buf ( u ) hash.h1[i + u]
for ( int i = 0 ; i < 64; i += 8) {
INPUT_BIG ;
# define buf ( u ) hash->h1[i + u]
for ( int i = 0 ; i < 64; i += 8)
{
INPUT_BIG_LOCAL ;
P_BIG ;
T_BIG ;
}
# undef buf
# define buf ( u ) ( u == 0 ? 0x80 : 0 )
INPUT_BIG ;
# undef buf
# define buf ( u ) ( u == 0 ? 0x80 : 0 )
INPUT_BIG_LOCAL ;
P_BIG ;
T_BIG ;
# undef buf
# define buf ( u ) ( u == 6 ? 2 : 0 )
INPUT_BIG ;
# undef buf
# define buf ( u ) ( u == 6 ? 2 : 0 )
INPUT_BIG_LOCAL ;
PF_BIG ;
T_BIG ;
for ( unsigned u = 0 ; u < 16; u ++)
hash.h4[u] = h[u] ;
}
//mixtab
__local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256] ;
init = get_local_id ( 0 ) ;
step = get_local_size ( 0 ) ;
for ( int i = init ; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i] ;
mixtab1[i] = mixtab1_c[i] ;
mixtab2[i] = mixtab2_c[i] ;
mixtab3[i] = mixtab3_c[i] ;
}
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
hash->h4[u] = h[u] ;
// fugue
{
sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09 ;
sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19 ;
sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29 ;
sph_u32 S30, S31, S32, S33, S34, S35 ;
ulong fc_bit_count = ( sph_u64 ) 64 << 3 ;
ulong fc_bit_count = ( sph_u64 ) 0x200 ;
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0 ;
S20 = SPH_C32 ( 0x8807a57e ) ; S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
@ -925,22 +1152,25 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
@@ -925,22 +1152,25 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S28 = SPH_C32 ( 0xaac6e2c9 ) ; S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32 ( 0x25ea78e7 ) ; S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
FUGUE512_3 ( ( hash. h4[0x0] ) , ( hash. h4[0x1] ) , ( hash. h4[0x2] ) ) ;
FUGUE512_3 ( ( hash. h4[0x3] ) , ( hash. h4[0x4] ) , ( hash. h4[0x5] ) ) ;
FUGUE512_3 ( ( hash. h4[0x6] ) , ( hash. h4[0x7] ) , ( hash. h4[0x8] ) ) ;
FUGUE512_3 ( ( hash. h4[0x9] ) , ( hash. h4[0xA] ) , ( hash. h4[0xB] ) ) ;
FUGUE512_3 ( ( hash. h4[0xC] ) , ( hash. h4[0xD] ) , ( hash. h4[0xE] ) ) ;
FUGUE512_3 ( ( hash. h4[0xF] ) , as_uint2 ( fc_bit_count ) . y, as_uint2 ( fc_bit_count ) . x ) ;
FUGUE512_3 ( ( hash-> h4[0x0] ) , ( hash-> h4[0x1] ) , ( hash-> h4[0x2] ) ) ;
FUGUE512_3 ( ( hash-> h4[0x3] ) , ( hash-> h4[0x4] ) , ( hash-> h4[0x5] ) ) ;
FUGUE512_3 ( ( hash-> h4[0x6] ) , ( hash-> h4[0x7] ) , ( hash-> h4[0x8] ) ) ;
FUGUE512_3 ( ( hash-> h4[0x9] ) , ( hash-> h4[0xA] ) , ( hash-> h4[0xB] ) ) ;
FUGUE512_3 ( ( hash-> h4[0xC] ) , ( hash-> h4[0xD] ) , ( hash-> h4[0xE] ) ) ;
FUGUE512_3 ( ( hash-> h4[0xF] ) , as_uint2 ( fc_bit_count ) . y, as_uint2 ( fc_bit_count ) . x ) ;
// apply round shift if necessary
int i ;
for ( i = 0 ; i < 32; i ++) {
for ( i = 0 ; i < 32; i ++)
{
ROR3 ;
CMIX36 ( S00, S01, S02, S04, S05, S06, S18, S19, S20 ) ;
SMIX ( S00, S01, S02, S03 ) ;
}
for ( i = 0 ; i < 13; i ++) {
for ( i = 0 ; i < 13; i ++)
{
S04 ^= S00 ;
S09 ^= S00 ;
S18 ^= S00 ;
@ -971,26 +1201,24 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
@@ -971,26 +1201,24 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S18 ^= S00 ;
S27 ^= S00 ;
hash.h4[0] = SWAP4 ( S01 ) ;
hash.h4[1] = SWAP4 ( S02 ) ;
hash.h4[2] = SWAP4 ( S03 ) ;
hash.h4[3] = SWAP4 ( S04 ) ;
hash.h4[4] = SWAP4 ( S09 ) ;
hash.h4[5] = SWAP4 ( S10 ) ;
hash.h4[6] = SWAP4 ( S11 ) ;
hash.h4[7] = SWAP4 ( S12 ) ;
hash.h4[8] = SWAP4 ( S18 ) ;
hash.h4[9] = SWAP4 ( S19 ) ;
hash.h4[10] = SWAP4 ( S20 ) ;
hash.h4[11] = SWAP4 ( S21 ) ;
hash.h4[12] = SWAP4 ( S27 ) ;
hash.h4[13] = SWAP4 ( S28 ) ;
hash.h4[14] = SWAP4 ( S29 ) ;
hash.h4[15] = SWAP4 ( S30 ) ;
}
bool result = ( hash.h8[3] <= target ) ;
hash->h4[0] = SWAP4 ( S01 ) ;
hash->h4[1] = SWAP4 ( S02 ) ;
hash->h4[2] = SWAP4 ( S03 ) ;
hash->h4[3] = SWAP4 ( S04 ) ;
hash->h4[4] = SWAP4 ( S09 ) ;
hash->h4[5] = SWAP4 ( S10 ) ;
hash->h4[6] = SWAP4 ( S11 ) ;
hash->h4[7] = SWAP4 ( S12 ) ;
hash->h4[8] = SWAP4 ( S18 ) ;
hash->h4[9] = SWAP4 ( S19 ) ;
hash->h4[10] = SWAP4 ( S20 ) ;
hash->h4[11] = SWAP4 ( S21 ) ;
hash->h4[12] = SWAP4 ( S27 ) ;
hash->h4[13] = SWAP4 ( S28 ) ;
hash->h4[14] = SWAP4 ( S29 ) ;
hash->h4[15] = SWAP4 ( S30 ) ;
bool result = ( hash->h8[3] <= target ) ;
if ( result )
output[atomic_inc ( output+0xFF ) ] = SWAP4 ( gid ) ;