@ -75,8 +75,8 @@ typedef long sph_s64;
# define SPH_GROESTL_BIG_ENDIAN 0
# define SPH_GROESTL_BIG_ENDIAN 0
# define SPH_CUBEHASH_UNROLL 0
# define SPH_CUBEHASH_UNROLL 0
# define SPH_KECCAK_UNROLL 0
# define SPH_KECCAK_UNROLL 0
# if !defined SPH_HAMSI_EXPAND_BIG
# ifndef SPH_HAMSI_EXPAND_BIG
# define SPH_HAMSI_EXPAND_BIG 4
# define SPH_HAMSI_EXPAND_BIG 4
# endif
# endif
# include "blake.cl"
# include "blake.cl"
@ -115,8 +115,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
{
{
uint gid = get_global_id ( 0 ) ;
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
// blake
// blake
sph_u64 H0 = SPH_C64 ( 0x6A09E667F3BCC908 ) , H1 = SPH_C64 ( 0xBB67AE8584CAA73B ) ;
sph_u64 H0 = SPH_C64 ( 0x6A09E667F3BCC908 ) , H1 = SPH_C64 ( 0xBB67AE8584CAA73B ) ;
sph_u64 H2 = SPH_C64 ( 0x3C6EF372FE94F82B ) , H3 = SPH_C64 ( 0xA54FF53A5F1D36F1 ) ;
sph_u64 H2 = SPH_C64 ( 0x3C6EF372FE94F82B ) , H3 = SPH_C64 ( 0xA54FF53A5F1D36F1 ) ;
sph_u64 H4 = SPH_C64 ( 0x510E527FADE682D1 ) , H5 = SPH_C64 ( 0x9B05688C2B3E6C1F ) ;
sph_u64 H4 = SPH_C64 ( 0x510E527FADE682D1 ) , H5 = SPH_C64 ( 0x9B05688C2B3E6C1F ) ;
@ -125,13 +125,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 T0 = SPH_C64 ( 0xFFFFFFFFFFFFFC00 ) + ( 80 << 3 ) , T1 = 0xFFFFFFFFFFFFFFFF ;;
sph_u64 T0 = SPH_C64 ( 0xFFFFFFFFFFFFFC00 ) + ( 80 << 3 ) , T1 = 0xFFFFFFFFFFFFFFFF ;;
if ( ( T0 = SPH_T64 ( T0 + 1024 ) ) < 1024 )
if ( ( T0 = SPH_T64 ( T0 + 1024 ) ) < 1024 )
{
T1 = SPH_T64 ( T1 + 1 ) ;
T1 = SPH_T64 ( T1 + 1 ) ;
}
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7 ;
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7 ;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF ;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF ;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7 ;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7 ;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF ;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF ;
M0 = DEC64BE ( block + 0 ) ;
M0 = DEC64BE ( block + 0 ) ;
M1 = DEC64BE ( block + 8 ) ;
M1 = DEC64BE ( block + 8 ) ;
M2 = DEC64BE ( block + 16 ) ;
M2 = DEC64BE ( block + 16 ) ;
@ -170,58 +170,269 @@ __kernel void search1(__global hash_t* hashes)
{
{
uint gid = get_global_id ( 0 ) ;
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
// bmw
// bmw
sph_u64 BMW_H[16] ;
sph_u64 BMW_H[16] ;
# pragma unroll 16
for ( unsigned u = 0 ; u < 16; u++)
for ( unsigned u = 0 ; u < 16; u++)
BMW_H[u] = BMW_IV512[u] ;
BMW_H[u] = BMW_IV512[u] ;
sph_u64 BMW_h1[16], BMW_h2[16 ];
sph_u64 mv[16],q[32 ];
sph_u64 mv[16] ;
sph_u64 tmp ;
mv[ 0] = SWAP8 ( hash->h8[0] ) ;
mv[0] = SWAP8 ( hash->h8[0] ) ;
mv[ 1] = SWAP8 ( hash->h8[1] ) ;
mv[1] = SWAP8 ( hash->h8[1] ) ;
mv[ 2] = SWAP8 ( hash->h8[2] ) ;
mv[2] = SWAP8 ( hash->h8[2] ) ;
mv[ 3] = SWAP8 ( hash->h8[3] ) ;
mv[3] = SWAP8 ( hash->h8[3] ) ;
mv[ 4] = SWAP8 ( hash->h8[4] ) ;
mv[4] = SWAP8 ( hash->h8[4] ) ;
mv[ 5] = SWAP8 ( hash->h8[5] ) ;
mv[5] = SWAP8 ( hash->h8[5] ) ;
mv[ 6] = SWAP8 ( hash->h8[6] ) ;
mv[6] = SWAP8 ( hash->h8[6] ) ;
mv[ 7] = SWAP8 ( hash->h8[7] ) ;
mv[7] = SWAP8 ( hash->h8[7] ) ;
mv[ 8] = 0x80 ;
mv[8] = 0x80 ;
mv[ 9] = 0 ;
mv[9] = 0 ;
mv[10] = 0 ;
mv[10] = 0 ;
mv[11] = 0 ;
mv[11] = 0 ;
mv[12] = 0 ;
mv[12] = 0 ;
mv[13] = 0 ;
mv[13] = 0 ;
mv[14] = 0 ;
mv[14] = 0 ;
mv[15] = 0x200 ;
mv[15] = SPH_C64 ( 512 ) ;
# define M ( x ) ( mv[x] )
# define H ( x ) ( BMW_H[x] )
tmp = ( mv[5] ^ BMW_H[5] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) + ( mv[14] ^ BMW_H[14] ) ;
# define dH ( x ) ( BMW_h2[x] )
q[0] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[1] ;
tmp = ( mv[6] ^ BMW_H[6] ) - ( mv[8] ^ BMW_H[8] ) + ( mv[11] ^ BMW_H[11] ) + ( mv[14] ^ BMW_H[14] ) - ( mv[15] ^ BMW_H[15] ) ;
q[1] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[2] ;
tmp = ( mv[0] ^ BMW_H[0] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[2] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[3] ;
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[1] ^ BMW_H[1] ) + ( mv[8] ^ BMW_H[8] ) - ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) ;
q[3] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[4] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[2] ^ BMW_H[2] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[14] ^ BMW_H[14] ) ;
q[4] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[5] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[2] ^ BMW_H[2] ) + ( mv[10] ^ BMW_H[10] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[5] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[6] ;
tmp = ( mv[4] ^ BMW_H[4] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) - ( mv[11] ^ BMW_H[11] ) + ( mv[13] ^ BMW_H[13] ) ;
q[6] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[7] ;
tmp = ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[12] ^ BMW_H[12] ) - ( mv[14] ^ BMW_H[14] ) ;
q[7] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[8] ;
tmp = ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[6] ^ BMW_H[6] ) + ( mv[13] ^ BMW_H[13] ) - ( mv[15] ^ BMW_H[15] ) ;
q[8] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[9] ;
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) + ( mv[6] ^ BMW_H[6] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[14] ^ BMW_H[14] ) ;
q[9] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[10] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[15] ^ BMW_H[15] ) ;
q[10] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[11] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[9] ^ BMW_H[9] ) ;
q[11] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[12] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[3] ^ BMW_H[3] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[10] ^ BMW_H[10] ) ;
q[12] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[13] ;
tmp = ( mv[2] ^ BMW_H[2] ) + ( mv[4] ^ BMW_H[4] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[11] ^ BMW_H[11] ) ;
q[13] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[14] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[8] ^ BMW_H[8] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[12] ^ BMW_H[12] ) ;
q[14] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[15] ;
tmp = ( mv[12] ^ BMW_H[12] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[13] ^ BMW_H[13] ) ;
q[15] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[0] ;
# pragma unroll 2
for ( int i=0 ;i<2;i++)
{
q[i+16] =
( SHR ( q[i], 1 ) ^ SHL ( q[i], 2 ) ^ SPH_ROTL64 ( q[i], 13 ) ^ SPH_ROTL64 ( q[i], 43 ) ) +
( SHR ( q[i+1], 2 ) ^ SHL ( q[i+1], 1 ) ^ SPH_ROTL64 ( q[i+1], 19 ) ^ SPH_ROTL64 ( q[i+1], 53 ) ) +
( SHR ( q[i+2], 2 ) ^ SHL ( q[i+2], 2 ) ^ SPH_ROTL64 ( q[i+2], 28 ) ^ SPH_ROTL64 ( q[i+2], 59 ) ) +
( SHR ( q[i+3], 1 ) ^ SHL ( q[i+3], 3 ) ^ SPH_ROTL64 ( q[i+3], 4 ) ^ SPH_ROTL64 ( q[i+3], 37 ) ) +
( SHR ( q[i+4], 1 ) ^ SHL ( q[i+4], 2 ) ^ SPH_ROTL64 ( q[i+4], 13 ) ^ SPH_ROTL64 ( q[i+4], 43 ) ) +
( SHR ( q[i+5], 2 ) ^ SHL ( q[i+5], 1 ) ^ SPH_ROTL64 ( q[i+5], 19 ) ^ SPH_ROTL64 ( q[i+5], 53 ) ) +
( SHR ( q[i+6], 2 ) ^ SHL ( q[i+6], 2 ) ^ SPH_ROTL64 ( q[i+6], 28 ) ^ SPH_ROTL64 ( q[i+6], 59 ) ) +
( SHR ( q[i+7], 1 ) ^ SHL ( q[i+7], 3 ) ^ SPH_ROTL64 ( q[i+7], 4 ) ^ SPH_ROTL64 ( q[i+7], 37 ) ) +
( SHR ( q[i+8], 1 ) ^ SHL ( q[i+8], 2 ) ^ SPH_ROTL64 ( q[i+8], 13 ) ^ SPH_ROTL64 ( q[i+8], 43 ) ) +
( SHR ( q[i+9], 2 ) ^ SHL ( q[i+9], 1 ) ^ SPH_ROTL64 ( q[i+9], 19 ) ^ SPH_ROTL64 ( q[i+9], 53 ) ) +
( SHR ( q[i+10], 2 ) ^ SHL ( q[i+10], 2 ) ^ SPH_ROTL64 ( q[i+10], 28 ) ^ SPH_ROTL64 ( q[i+10], 59 ) ) +
( SHR ( q[i+11], 1 ) ^ SHL ( q[i+11], 3 ) ^ SPH_ROTL64 ( q[i+11], 4 ) ^ SPH_ROTL64 ( q[i+11], 37 ) ) +
( SHR ( q[i+12], 1 ) ^ SHL ( q[i+12], 2 ) ^ SPH_ROTL64 ( q[i+12], 13 ) ^ SPH_ROTL64 ( q[i+12], 43 ) ) +
( SHR ( q[i+13], 2 ) ^ SHL ( q[i+13], 1 ) ^ SPH_ROTL64 ( q[i+13], 19 ) ^ SPH_ROTL64 ( q[i+13], 53 ) ) +
( SHR ( q[i+14], 2 ) ^ SHL ( q[i+14], 2 ) ^ SPH_ROTL64 ( q[i+14], 28 ) ^ SPH_ROTL64 ( q[i+14], 59 ) ) +
( SHR ( q[i+15], 1 ) ^ SHL ( q[i+15], 3 ) ^ SPH_ROTL64 ( q[i+15], 4 ) ^ SPH_ROTL64 ( q[i+15], 37 ) ) +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
FOLDb ;
# pragma unroll 4
for ( int i=2 ;i<6;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
# undef M
# pragma unroll 3
# undef H
for ( int i=6 ;i<9;i++)
# undef dH
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 4
for ( int i=9 ;i<13;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
# pragma unroll 3
for ( int i=13 ;i<16;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i-13], ( i-13 ) +1 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23] ;
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31] ;
BMW_H[0] = ( SHL ( XH64, 5 ) ^ SHR ( q[16],5 ) ^ mv[0] ) + ( XL64 ^ q[24] ^ q[0] ) ;
BMW_H[1] = ( SHR ( XH64, 7 ) ^ SHL ( q[17],8 ) ^ mv[1] ) + ( XL64 ^ q[25] ^ q[1] ) ;
BMW_H[2] = ( SHR ( XH64, 5 ) ^ SHL ( q[18],5 ) ^ mv[2] ) + ( XL64 ^ q[26] ^ q[2] ) ;
BMW_H[3] = ( SHR ( XH64, 1 ) ^ SHL ( q[19],5 ) ^ mv[3] ) + ( XL64 ^ q[27] ^ q[3] ) ;
BMW_H[4] = ( SHR ( XH64, 3 ) ^ q[20] ^ mv[4] ) + ( XL64 ^ q[28] ^ q[4] ) ;
BMW_H[5] = ( SHL ( XH64, 6 ) ^ SHR ( q[21],6 ) ^ mv[5] ) + ( XL64 ^ q[29] ^ q[5] ) ;
BMW_H[6] = ( SHR ( XH64, 4 ) ^ SHL ( q[22],6 ) ^ mv[6] ) + ( XL64 ^ q[30] ^ q[6] ) ;
BMW_H[7] = ( SHR ( XH64,11 ) ^ SHL ( q[23],2 ) ^ mv[7] ) + ( XL64 ^ q[31] ^ q[7] ) ;
BMW_H[8] = SPH_ROTL64 ( BMW_H[4], 9 ) + ( XH64 ^ q[24] ^ mv[8] ) + ( SHL ( XL64,8 ) ^ q[23] ^ q[8] ) ;
BMW_H[9] = SPH_ROTL64 ( BMW_H[5],10 ) + ( XH64 ^ q[25] ^ mv[9] ) + ( SHR ( XL64,6 ) ^ q[16] ^ q[9] ) ;
BMW_H[10] = SPH_ROTL64 ( BMW_H[6],11 ) + ( XH64 ^ q[26] ^ mv[10] ) + ( SHL ( XL64,6 ) ^ q[17] ^ q[10] ) ;
BMW_H[11] = SPH_ROTL64 ( BMW_H[7],12 ) + ( XH64 ^ q[27] ^ mv[11] ) + ( SHL ( XL64,4 ) ^ q[18] ^ q[11] ) ;
BMW_H[12] = SPH_ROTL64 ( BMW_H[0],13 ) + ( XH64 ^ q[28] ^ mv[12] ) + ( SHR ( XL64,3 ) ^ q[19] ^ q[12] ) ;
BMW_H[13] = SPH_ROTL64 ( BMW_H[1],14 ) + ( XH64 ^ q[29] ^ mv[13] ) + ( SHR ( XL64,4 ) ^ q[20] ^ q[13] ) ;
BMW_H[14] = SPH_ROTL64 ( BMW_H[2],15 ) + ( XH64 ^ q[30] ^ mv[14] ) + ( SHR ( XL64,7 ) ^ q[21] ^ q[14] ) ;
BMW_H[15] = SPH_ROTL64 ( BMW_H[3],16 ) + ( XH64 ^ q[31] ^ mv[15] ) + ( SHR ( XL64,2 ) ^ q[22] ^ q[15] ) ;
# pragma unroll 16
for ( int i=0 ;i<16;i++)
{
mv[i] = BMW_H[i] ;
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + ( sph_u64 ) i ;
}
tmp = ( mv[5] ^ BMW_H[5] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) + ( mv[14] ^ BMW_H[14] ) ;
q[0] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[1] ;
tmp = ( mv[6] ^ BMW_H[6] ) - ( mv[8] ^ BMW_H[8] ) + ( mv[11] ^ BMW_H[11] ) + ( mv[14] ^ BMW_H[14] ) - ( mv[15] ^ BMW_H[15] ) ;
q[1] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[2] ;
tmp = ( mv[0] ^ BMW_H[0] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[2] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[3] ;
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[1] ^ BMW_H[1] ) + ( mv[8] ^ BMW_H[8] ) - ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) ;
q[3] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[4] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[2] ^ BMW_H[2] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[14] ^ BMW_H[14] ) ;
q[4] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[5] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[2] ^ BMW_H[2] ) + ( mv[10] ^ BMW_H[10] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[5] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[6] ;
tmp = ( mv[4] ^ BMW_H[4] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) - ( mv[11] ^ BMW_H[11] ) + ( mv[13] ^ BMW_H[13] ) ;
q[6] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[7] ;
tmp = ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[12] ^ BMW_H[12] ) - ( mv[14] ^ BMW_H[14] ) ;
q[7] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[8] ;
tmp = ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[6] ^ BMW_H[6] ) + ( mv[13] ^ BMW_H[13] ) - ( mv[15] ^ BMW_H[15] ) ;
q[8] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[9] ;
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) + ( mv[6] ^ BMW_H[6] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[14] ^ BMW_H[14] ) ;
q[9] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[10] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[15] ^ BMW_H[15] ) ;
q[10] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[11] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[9] ^ BMW_H[9] ) ;
q[11] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[12] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[3] ^ BMW_H[3] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[10] ^ BMW_H[10] ) ;
q[12] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 1 ) ^ SPH_ROTL64 ( tmp, 19 ) ^ SPH_ROTL64 ( tmp, 53 ) ) + BMW_H[13] ;
tmp = ( mv[2] ^ BMW_H[2] ) + ( mv[4] ^ BMW_H[4] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[11] ^ BMW_H[11] ) ;
q[13] = ( SHR ( tmp, 2 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 28 ) ^ SPH_ROTL64 ( tmp, 59 ) ) + BMW_H[14] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[8] ^ BMW_H[8] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[12] ^ BMW_H[12] ) ;
q[14] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[15] ;
tmp = ( mv[12] ^ BMW_H[12] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[13] ^ BMW_H[13] ) ;
q[15] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[0] ;
# pragma unroll 2
for ( int i=0 ;i<2;i++)
{
q[i+16] =
( SHR ( q[i], 1 ) ^ SHL ( q[i], 2 ) ^ SPH_ROTL64 ( q[i], 13 ) ^ SPH_ROTL64 ( q[i], 43 ) ) +
( SHR ( q[i+1], 2 ) ^ SHL ( q[i+1], 1 ) ^ SPH_ROTL64 ( q[i+1], 19 ) ^ SPH_ROTL64 ( q[i+1], 53 ) ) +
( SHR ( q[i+2], 2 ) ^ SHL ( q[i+2], 2 ) ^ SPH_ROTL64 ( q[i+2], 28 ) ^ SPH_ROTL64 ( q[i+2], 59 ) ) +
( SHR ( q[i+3], 1 ) ^ SHL ( q[i+3], 3 ) ^ SPH_ROTL64 ( q[i+3], 4 ) ^ SPH_ROTL64 ( q[i+3], 37 ) ) +
( SHR ( q[i+4], 1 ) ^ SHL ( q[i+4], 2 ) ^ SPH_ROTL64 ( q[i+4], 13 ) ^ SPH_ROTL64 ( q[i+4], 43 ) ) +
( SHR ( q[i+5], 2 ) ^ SHL ( q[i+5], 1 ) ^ SPH_ROTL64 ( q[i+5], 19 ) ^ SPH_ROTL64 ( q[i+5], 53 ) ) +
( SHR ( q[i+6], 2 ) ^ SHL ( q[i+6], 2 ) ^ SPH_ROTL64 ( q[i+6], 28 ) ^ SPH_ROTL64 ( q[i+6], 59 ) ) +
( SHR ( q[i+7], 1 ) ^ SHL ( q[i+7], 3 ) ^ SPH_ROTL64 ( q[i+7], 4 ) ^ SPH_ROTL64 ( q[i+7], 37 ) ) +
( SHR ( q[i+8], 1 ) ^ SHL ( q[i+8], 2 ) ^ SPH_ROTL64 ( q[i+8], 13 ) ^ SPH_ROTL64 ( q[i+8], 43 ) ) +
( SHR ( q[i+9], 2 ) ^ SHL ( q[i+9], 1 ) ^ SPH_ROTL64 ( q[i+9], 19 ) ^ SPH_ROTL64 ( q[i+9], 53 ) ) +
( SHR ( q[i+10], 2 ) ^ SHL ( q[i+10], 2 ) ^ SPH_ROTL64 ( q[i+10], 28 ) ^ SPH_ROTL64 ( q[i+10], 59 ) ) +
( SHR ( q[i+11], 1 ) ^ SHL ( q[i+11], 3 ) ^ SPH_ROTL64 ( q[i+11], 4 ) ^ SPH_ROTL64 ( q[i+11], 37 ) ) +
( SHR ( q[i+12], 1 ) ^ SHL ( q[i+12], 2 ) ^ SPH_ROTL64 ( q[i+12], 13 ) ^ SPH_ROTL64 ( q[i+12], 43 ) ) +
( SHR ( q[i+13], 2 ) ^ SHL ( q[i+13], 1 ) ^ SPH_ROTL64 ( q[i+13], 19 ) ^ SPH_ROTL64 ( q[i+13], 53 ) ) +
( SHR ( q[i+14], 2 ) ^ SHL ( q[i+14], 2 ) ^ SPH_ROTL64 ( q[i+14], 28 ) ^ SPH_ROTL64 ( q[i+14], 59 ) ) +
( SHR ( q[i+15], 1 ) ^ SHL ( q[i+15], 3 ) ^ SPH_ROTL64 ( q[i+15], 4 ) ^ SPH_ROTL64 ( q[i+15], 37 ) ) +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 4
for ( int i=2 ;i<6;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
# define M ( x ) ( BMW_h2[x] )
# pragma unroll 3
# define H ( x ) ( final_b[x] )
for ( int i=6 ;i<9;i++)
# define dH ( x ) ( BMW_h1[x] )
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i+7] ) ;
}
FOLDb ;
# pragma unroll 4
for ( int i=9 ;i<13;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
# undef M
# pragma unroll 3
# undef H
for ( int i=13 ;i<16;i++)
# undef dH
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i-13], ( i-13 ) +1 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
hash->h8[0] = SWAP8 ( BMW_h1[8] ) ;
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23] ;
hash->h8[1] = SWAP8 ( BMW_h1[9] ) ;
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31] ;
hash->h8[2] = SWAP8 ( BMW_h1[10] ) ;
hash->h8[3] = SWAP8 ( BMW_h1[11] ) ;
BMW_H[0] = ( SHL ( XH64, 5 ) ^ SHR ( q[16],5 ) ^ mv[0] ) + ( XL64 ^ q[24] ^ q[0] ) ;
hash->h8[4] = SWAP8 ( BMW_h1[12] ) ;
BMW_H[1] = ( SHR ( XH64, 7 ) ^ SHL ( q[17],8 ) ^ mv[1] ) + ( XL64 ^ q[25] ^ q[1] ) ;
hash->h8[5] = SWAP8 ( BMW_h1[13] ) ;
BMW_H[2] = ( SHR ( XH64, 5 ) ^ SHL ( q[18],5 ) ^ mv[2] ) + ( XL64 ^ q[26] ^ q[2] ) ;
hash->h8[6] = SWAP8 ( BMW_h1[14] ) ;
BMW_H[3] = ( SHR ( XH64, 1 ) ^ SHL ( q[19],5 ) ^ mv[3] ) + ( XL64 ^ q[27] ^ q[3] ) ;
hash->h8[7] = SWAP8 ( BMW_h1[15] ) ;
BMW_H[4] = ( SHR ( XH64, 3 ) ^ q[20] ^ mv[4] ) + ( XL64 ^ q[28] ^ q[4] ) ;
BMW_H[5] = ( SHL ( XH64, 6 ) ^ SHR ( q[21],6 ) ^ mv[5] ) + ( XL64 ^ q[29] ^ q[5] ) ;
BMW_H[6] = ( SHR ( XH64, 4 ) ^ SHL ( q[22],6 ) ^ mv[6] ) + ( XL64 ^ q[30] ^ q[6] ) ;
BMW_H[7] = ( SHR ( XH64,11 ) ^ SHL ( q[23],2 ) ^ mv[7] ) + ( XL64 ^ q[31] ^ q[7] ) ;
BMW_H[8] = SPH_ROTL64 ( BMW_H[4], 9 ) + ( XH64 ^ q[24] ^ mv[8] ) + ( SHL ( XL64,8 ) ^ q[23] ^ q[8] ) ;
BMW_H[9] = SPH_ROTL64 ( BMW_H[5],10 ) + ( XH64 ^ q[25] ^ mv[9] ) + ( SHR ( XL64,6 ) ^ q[16] ^ q[9] ) ;
BMW_H[10] = SPH_ROTL64 ( BMW_H[6],11 ) + ( XH64 ^ q[26] ^ mv[10] ) + ( SHL ( XL64,6 ) ^ q[17] ^ q[10] ) ;
BMW_H[11] = SPH_ROTL64 ( BMW_H[7],12 ) + ( XH64 ^ q[27] ^ mv[11] ) + ( SHL ( XL64,4 ) ^ q[18] ^ q[11] ) ;
BMW_H[12] = SPH_ROTL64 ( BMW_H[0],13 ) + ( XH64 ^ q[28] ^ mv[12] ) + ( SHR ( XL64,3 ) ^ q[19] ^ q[12] ) ;
BMW_H[13] = SPH_ROTL64 ( BMW_H[1],14 ) + ( XH64 ^ q[29] ^ mv[13] ) + ( SHR ( XL64,4 ) ^ q[20] ^ q[13] ) ;
BMW_H[14] = SPH_ROTL64 ( BMW_H[2],15 ) + ( XH64 ^ q[30] ^ mv[14] ) + ( SHR ( XL64,7 ) ^ q[21] ^ q[14] ) ;
BMW_H[15] = SPH_ROTL64 ( BMW_H[3],16 ) + ( XH64 ^ q[31] ^ mv[15] ) + ( SHR ( XL64,2 ) ^ q[22] ^ q[15] ) ;
hash->h8[0] = SWAP8 ( BMW_H[8] ) ;
hash->h8[1] = SWAP8 ( BMW_H[9] ) ;
hash->h8[2] = SWAP8 ( BMW_H[10] ) ;
hash->h8[3] = SWAP8 ( BMW_H[11] ) ;
hash->h8[4] = SWAP8 ( BMW_H[12] ) ;
hash->h8[5] = SWAP8 ( BMW_H[13] ) ;
hash->h8[6] = SWAP8 ( BMW_H[14] ) ;
hash->h8[7] = SWAP8 ( BMW_H[15] ) ;
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
}
}
@ -240,67 +451,59 @@ __kernel void search2(__global hash_t* hashes)
for ( int i = init ; i < 256; i += step)
for ( int i = init ; i < 256; i += step)
{
{
T0_L[i] = T0[i] ;
T0_L[i] = T0[i] ;
T4_L[i] = T4[i] ;
T1_L[i] = T1[i] ;
T1_L[i] = T1[i] ;
T2_L[i] = T2[i] ;
T2_L[i] = T2[i] ;
T3_L[i] = T3[i] ;
T3_L[i] = T3[i] ;
T4_L[i] = T4[i] ;
T5_L[i] = T5[i] ;
T5_L[i] = T5[i] ;
T6_L[i] = T6[i] ;
T6_L[i] = T6[i] ;
T7_L[i] = T7[i] ;
T7_L[i] = T7[i] ;
}
}
barrier ( CLK_LOCAL_MEM_FENCE ) ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
# define T0 T0_L
# define T0 T0_L
# define T1 T1_L
# define T1 T1_L
# define T2 T2_L
# define T2 T2_L
# define T3 T3_L
# define T3 T3_L
# define T4 T4_L
# define T4 T4_L
# define T5 T5_L
# define T5 T5_L
# define T6 T6_L
# define T6 T6_L
# define T7 T7_L
# define T7 T7_L
// groestl
// groestl
sph_u64 H[16] = {0, 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0x0002000000000000} ;
sph_u64 H[16] ;
for ( unsigned int u = 0 ; u < 15; u ++)
H[u] = 0 ;
# if USE_LE
H[15] = ( ( sph_u64 ) ( 512 & 0xFF ) << 56 ) | ( ( sph_u64 ) ( 512 & 0xFF00 ) << 40 ) ;
# else
H[15] = ( sph_u64 ) 512 ;
# endif
sph_u64 g[16], m[16] ;
sph_u64 g[16], m[16] ;
m[0] = DEC64E ( hash->h8[0] ) ;
g[0] = m[0] = DEC64E ( hash->h8[0] ) ;
m[1] = DEC64E ( hash->h8[1] ) ;
g[1] = m[1] = DEC64E ( hash->h8[1] ) ;
m[2] = DEC64E ( hash->h8[2] ) ;
g[2] = m[2] = DEC64E ( hash->h8[2] ) ;
m[3] = DEC64E ( hash->h8[3] ) ;
g[3] = m[3] = DEC64E ( hash->h8[3] ) ;
m[4] = DEC64E ( hash->h8[4] ) ;
g[4] = m[4] = DEC64E ( hash->h8[4] ) ;
m[5] = DEC64E ( hash->h8[5] ) ;
g[5] = m[5] = DEC64E ( hash->h8[5] ) ;
m[6] = DEC64E ( hash->h8[6] ) ;
g[6] = m[6] = DEC64E ( hash->h8[6] ) ;
m[7] = DEC64E ( hash->h8[7] ) ;
g[7] = m[7] = DEC64E ( hash->h8[7] ) ;
for ( unsigned int u = 0 ; u < 16; u ++)
g[8] = m[8] = 0x80 ;
g[u] = m[u] ^ H[u] ;
g[9] = m[9] = 0 ;
m[8] = 0x80 ; g[8] = m[8] ^ H[8] ;
g[10] = m[10] = 0 ;
m[9 ] = 0 ; g[9] = m[9] ^ H[9] ;
g[11] = m[11 ] = 0 ;
m[10 ] = 0 ; g[10] = m[10] ^ H[10] ;
g[12] = m[12 ] = 0 ;
m[11 ] = 0 ; g[11] = m[11] ^ H[11] ;
g[13] = m[13 ] = 0 ;
m[12 ] = 0 ; g[12] = m[12] ^ H[12] ;
g[14] = m[14 ] = 0 ;
m[13] = 0 ; g[13] = m[13] ^ H[13] ;
g[15] = 0x102000000000000 ;
m[14] = 0 ; g[14] = m[14] ^ H[14] ;
m[15] = 0x100000000000000 ;
m[15] = 0x100000000000000 ; g[15] = m[15] ^ H[15];
PERM_BIG_P ( g ) ;
PERM_BIG_P ( g ) ;
PERM_BIG_Q ( m ) ;
PERM_BIG_Q ( m ) ;
for ( unsigned int u = 0 ; u < 16; u ++)
H[u] ^= g[u] ^ m[u] ;
sph_u64 xH[16] ;
sph_u64 xH[16] ;
for ( unsigned int u = 0 ; u < 16; u ++)
for ( unsigned int u = 0 ; u < 16; u ++)
xH[u] = H[u] ;
xH[u] = H[u] ^= g[u] ^ m[u] ;
PERM_BIG_P ( xH ) ;
PERM_BIG_P ( xH ) ;
for ( unsigned int u = 0 ; u < 16; u ++)
H[u] ^= xH[u] ;
for ( unsigned int u = 8 ; u < 16; u ++)
for ( unsigned int u = 0 ; u < 8; u ++)
hash->h8[u-8] = DEC64E ( H[u] ^ xH[u] ) ;
hash->h8[u] = DEC64E ( H[u + 8] ) ;
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
}
}
@ -325,10 +528,14 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8 ( hash->h8[5] ) ;
m5 = SWAP8 ( hash->h8[5] ) ;
m6 = SWAP8 ( hash->h8[6] ) ;
m6 = SWAP8 ( hash->h8[6] ) ;
m7 = SWAP8 ( hash->h8[7] ) ;
m7 = SWAP8 ( hash->h8[7] ) ;
UBI_BIG ( 480 , 64 ) ;
UBI_BIG ( 480 , 64 ) ;
bcount = 0 ;
bcount = 0 ;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0 ;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0 ;
UBI_BIG ( 510 , 8 ) ;
UBI_BIG ( 510 , 8 ) ;
hash->h8[0] = SWAP8 ( h0 ) ;
hash->h8[0] = SWAP8 ( h0 ) ;
hash->h8[1] = SWAP8 ( h1 ) ;
hash->h8[1] = SWAP8 ( h1 ) ;
hash->h8[2] = SWAP8 ( h2 ) ;
hash->h8[2] = SWAP8 ( h2 ) ;
@ -355,7 +562,8 @@ __kernel void search4(__global hash_t* hashes)
for ( int i = 0 ; i < 2; i++)
for ( int i = 0 ; i < 2; i++)
{
{
if ( i == 0 ) {
if ( i == 0 )
{
h0h ^= DEC64E ( hash->h8[0] ) ;
h0h ^= DEC64E ( hash->h8[0] ) ;
h0l ^= DEC64E ( hash->h8[1] ) ;
h0l ^= DEC64E ( hash->h8[1] ) ;
h1h ^= DEC64E ( hash->h8[2] ) ;
h1h ^= DEC64E ( hash->h8[2] ) ;
@ -364,7 +572,9 @@ __kernel void search4(__global hash_t* hashes)
h2l ^= DEC64E ( hash->h8[5] ) ;
h2l ^= DEC64E ( hash->h8[5] ) ;
h3h ^= DEC64E ( hash->h8[6] ) ;
h3h ^= DEC64E ( hash->h8[6] ) ;
h3l ^= DEC64E ( hash->h8[7] ) ;
h3l ^= DEC64E ( hash->h8[7] ) ;
} else if ( i == 1 ) {
}
else if ( i == 1 )
{
h4h ^= DEC64E ( hash->h8[0] ) ;
h4h ^= DEC64E ( hash->h8[0] ) ;
h4l ^= DEC64E ( hash->h8[1] ) ;
h4l ^= DEC64E ( hash->h8[1] ) ;
h5h ^= DEC64E ( hash->h8[2] ) ;
h5h ^= DEC64E ( hash->h8[2] ) ;
@ -425,6 +635,7 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8 ( hash->h8[7] ) ;
a21 ^= SWAP8 ( hash->h8[7] ) ;
a31 ^= 0x8000000000000001 ;
a31 ^= 0x8000000000000001 ;
KECCAK_F_1600 ;
KECCAK_F_1600 ;
// Finalize the "lane complement"
// Finalize the "lane complement"
a10 = ~a10 ;
a10 = ~a10 ;
a20 = ~a20 ;
a20 = ~a20 ;
@ -471,7 +682,8 @@ __kernel void search6(__global hash_t* hashes)
MI5 ;
MI5 ;
LUFFA_P5 ;
LUFFA_P5 ;
if ( i == 0 ) {
if ( i == 0 )
{
M0 = hash->h4[9] ;
M0 = hash->h4[9] ;
M1 = hash->h4[8] ;
M1 = hash->h4[8] ;
M2 = hash->h4[11] ;
M2 = hash->h4[11] ;
@ -480,12 +692,16 @@ __kernel void search6(__global hash_t* hashes)
M5 = hash->h4[12] ;
M5 = hash->h4[12] ;
M6 = hash->h4[15] ;
M6 = hash->h4[15] ;
M7 = hash->h4[14] ;
M7 = hash->h4[14] ;
} else if ( i == 1 ) {
}
else if ( i == 1 )
{
M0 = 0x80000000 ;
M0 = 0x80000000 ;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0 ;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0 ;
} else if ( i == 2 ) {
}
else if ( i == 2 )
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0 ;
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0 ;
} else if ( i == 3 ) {
else if ( i == 3 )
{
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40 ;
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40 ;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41 ;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41 ;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42 ;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42 ;
@ -535,10 +751,12 @@ __kernel void search7(__global hash_t* hashes)
x6 ^= SWAP4 ( hash->h4[7] ) ;
x6 ^= SWAP4 ( hash->h4[7] ) ;
x7 ^= SWAP4 ( hash->h4[6] ) ;
x7 ^= SWAP4 ( hash->h4[6] ) ;
for ( int i = 0 ; i < 13; i ++) {
for ( int i = 0 ; i < 13; i ++)
{
SIXTEEN_ROUNDS ;
SIXTEEN_ROUNDS ;
if ( i == 0 ) {
if ( i == 0 )
{
x0 ^= SWAP4 ( hash->h4[9] ) ;
x0 ^= SWAP4 ( hash->h4[9] ) ;
x1 ^= SWAP4 ( hash->h4[8] ) ;
x1 ^= SWAP4 ( hash->h4[8] ) ;
x2 ^= SWAP4 ( hash->h4[11] ) ;
x2 ^= SWAP4 ( hash->h4[11] ) ;
@ -547,12 +765,12 @@ __kernel void search7(__global hash_t* hashes)
x5 ^= SWAP4 ( hash->h4[12] ) ;
x5 ^= SWAP4 ( hash->h4[12] ) ;
x6 ^= SWAP4 ( hash->h4[15] ) ;
x6 ^= SWAP4 ( hash->h4[15] ) ;
x7 ^= SWAP4 ( hash->h4[14] ) ;
x7 ^= SWAP4 ( hash->h4[14] ) ;
} else if ( i == 1 ) {
}
else if ( i == 1 )
x0 ^= 0x80 ;
x0 ^= 0x80 ;
} else if ( i == 2 ) {
else if ( i == 2 )
xv ^= SPH_C32 ( 1 ) ;
xv ^= SPH_C32 ( 1 ) ;
}
}
}
hash->h4[0] = x0 ;
hash->h4[0] = x0 ;
hash->h4[1] = x1 ;
hash->h4[1] = x1 ;
@ -579,6 +797,7 @@ __kernel void search8(__global hash_t* hashes)
{
{
uint gid = get_global_id ( 0 ) ;
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256] ;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256] ;
int init = get_local_id ( 0 ) ;
int init = get_local_id ( 0 ) ;
@ -607,7 +826,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17 ;
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17 ;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F ;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F ;
sph_u32 sc_count0 = ( 64 << 3 ) , sc_count1 = 0 , sc_count2 = 0 , sc_count3 = 0 ;
sph_u32 sc_count0 = 0x200 , sc_count1 = 0 , sc_count2 = 0 , sc_count3 = 0 ;
rk00 = hash->h4[0] ;
rk00 = hash->h4[0] ;
rk01 = hash->h4[1] ;
rk01 = hash->h4[1] ;
@ -673,7 +892,8 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32 ( 0x09254899 ) , D1 = C32 ( 0xD699C7BC ) , D2 = C32 ( 0x9019B6DC ) , D3 = C32 ( 0x2B9022E4 ) , D4 = C32 ( 0x8FA14956 ) , D5 = C32 ( 0x21BF9BD3 ) , D6 = C32 ( 0xB94D0943 ) , D7 = C32 ( 0x6FFDDC22 ) ;
u32 D0 = C32 ( 0x09254899 ) , D1 = C32 ( 0xD699C7BC ) , D2 = C32 ( 0x9019B6DC ) , D3 = C32 ( 0x2B9022E4 ) , D4 = C32 ( 0x8FA14956 ) , D5 = C32 ( 0x21BF9BD3 ) , D6 = C32 ( 0xB94D0943 ) , D7 = C32 ( 0x6FFDDC22 ) ;
FFT256 ( 0 , 1 , 0 , ll1 ) ;
FFT256 ( 0 , 1 , 0 , ll1 ) ;
for ( int i = 0 ; i < 256; i ++) {
for ( int i = 0 ; i < 256; i ++)
{
s32 tq ;
s32 tq ;
tq = q[i] + yoff_b_n[i] ;
tq = q[i] + yoff_b_n[i] ;
@ -709,14 +929,17 @@ __kernel void search9(__global hash_t* hashes)
C32 ( 0x0BA16B95 ) , C32 ( 0x72F999AD ) , C32 ( 0x9FECC2AE ) , C32 ( 0xBA3264FC ) ,
C32 ( 0x0BA16B95 ) , C32 ( 0x72F999AD ) , C32 ( 0x9FECC2AE ) , C32 ( 0xBA3264FC ) ,
C32 ( 0x5E894929 ) , C32 ( 0x8E9F30E5 ) , C32 ( 0x2F1DAA37 ) , C32 ( 0xF0F2C558 ) ,
C32 ( 0x5E894929 ) , C32 ( 0x8E9F30E5 ) , C32 ( 0x2F1DAA37 ) , C32 ( 0xF0F2C558 ) ,
IF, 4 , 13 , PP8_4_ ) ;
IF, 4 , 13 , PP8_4_ ) ;
STEP_BIG (
STEP_BIG (
C32 ( 0xAC506643 ) , C32 ( 0xA90635A5 ) , C32 ( 0xE25B878B ) , C32 ( 0xAAB7878F ) ,
C32 ( 0xAC506643 ) , C32 ( 0xA90635A5 ) , C32 ( 0xE25B878B ) , C32 ( 0xAAB7878F ) ,
C32 ( 0x88817F7A ) , C32 ( 0x0A02892B ) , C32 ( 0x559A7550 ) , C32 ( 0x598F657E ) ,
C32 ( 0x88817F7A ) , C32 ( 0x0A02892B ) , C32 ( 0x559A7550 ) , C32 ( 0x598F657E ) ,
IF, 13 , 10 , PP8_5_ ) ;
IF, 13 , 10 , PP8_5_ ) ;
STEP_BIG (
STEP_BIG (
C32 ( 0x7EEF60A1 ) , C32 ( 0x6B70E3E8 ) , C32 ( 0x9C1714D1 ) , C32 ( 0xB958E2A8 ) ,
C32 ( 0x7EEF60A1 ) , C32 ( 0x6B70E3E8 ) , C32 ( 0x9C1714D1 ) , C32 ( 0xB958E2A8 ) ,
C32 ( 0xAB02675E ) , C32 ( 0xED1C014F ) , C32 ( 0xCD8D65BB ) , C32 ( 0xFDB7A257 ) ,
C32 ( 0xAB02675E ) , C32 ( 0xED1C014F ) , C32 ( 0xCD8D65BB ) , C32 ( 0xFDB7A257 ) ,
IF, 10 , 25 , PP8_6_ ) ;
IF, 10 , 25 , PP8_6_ ) ;
STEP_BIG (
STEP_BIG (
C32 ( 0x09254899 ) , C32 ( 0xD699C7BC ) , C32 ( 0x9019B6DC ) , C32 ( 0x2B9022E4 ) ,
C32 ( 0x09254899 ) , C32 ( 0xD699C7BC ) , C32 ( 0x9019B6DC ) , C32 ( 0x2B9022E4 ) ,
C32 ( 0x8FA14956 ) , C32 ( 0x21BF9BD3 ) , C32 ( 0xB94D0943 ) , C32 ( 0x6FFDDC22 ) ,
C32 ( 0x8FA14956 ) , C32 ( 0x21BF9BD3 ) , C32 ( 0xB94D0943 ) , C32 ( 0x6FFDDC22 ) ,
@ -735,22 +958,27 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG ( 1_, 1 , 28 , 19 , 22 , 7 ) ;
ONE_ROUND_BIG ( 1_, 1 , 28 , 19 , 22 , 7 ) ;
ONE_ROUND_BIG ( 2_, 2 , 29 , 9 , 15 , 5 ) ;
ONE_ROUND_BIG ( 2_, 2 , 29 , 9 , 15 , 5 ) ;
ONE_ROUND_BIG ( 3_, 3 , 4 , 13 , 10 , 25 ) ;
ONE_ROUND_BIG ( 3_, 3 , 4 , 13 , 10 , 25 ) ;
STEP_BIG (
STEP_BIG (
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4 , 13 , PP8_4_ ) ;
IF, 4 , 13 , PP8_4_ ) ;
STEP_BIG (
STEP_BIG (
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13 , 10 , PP8_5_ ) ;
IF, 13 , 10 , PP8_5_ ) ;
STEP_BIG (
STEP_BIG (
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10 , 25 , PP8_6_ ) ;
IF, 10 , 25 , PP8_6_ ) ;
STEP_BIG (
STEP_BIG (
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25 , 4 , PP8_0_ ) ;
IF, 25 , 4 , PP8_0_ ) ;
# undef q
# undef q
hash->h4[0] = A0 ;
hash->h4[0] = A0 ;
@ -778,7 +1006,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
{
{
uint gid = get_global_id ( 0 ) ;
uint gid = get_global_id ( 0 ) ;
uint offset = get_global_offset ( 0 ) ;
uint offset = get_global_offset ( 0 ) ;
hash_t hash ;
__global hash_t * hash = & ( hashes[gid-offset] ) ;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256] ;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256] ;
@ -795,14 +1023,23 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
barrier ( CLK_LOCAL_MEM_FENCE ) ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
for ( int i = 0 ; i < 8; i++) {
//mixtab
hash.h8[i] = hashes[gid-offset].h8[i] ;
__local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256] ;
for ( int i = init ; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i] ;
mixtab1[i] = mixtab1_c[i] ;
mixtab2[i] = mixtab2_c[i] ;
mixtab3[i] = mixtab3_c[i] ;
}
}
// echo
barrier ( CLK_LOCAL_MEM_FENCE ) ;
{
for ( int i = 0 ; i < 8; i++)
hash->h8[i] = hashes[gid-offset].h8[i] ;
// echo
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1 ;
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1 ;
sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71 ;
sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71 ;
Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL ;
Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL ;
@ -829,14 +1066,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
W61 = Vb61 ;
W61 = Vb61 ;
W70 = Vb70 ;
W70 = Vb70 ;
W71 = Vb71 ;
W71 = Vb71 ;
W80 = hash. h8[0] ;
W80 = hash-> h8[0] ;
W81 = hash. h8[1] ;
W81 = hash-> h8[1] ;
W90 = hash. h8[2] ;
W90 = hash-> h8[2] ;
W91 = hash. h8[3] ;
W91 = hash-> h8[3] ;
WA0 = hash. h8[4] ;
WA0 = hash-> h8[4] ;
WA1 = hash. h8[5] ;
WA1 = hash-> h8[5] ;
WB0 = hash. h8[6] ;
WB0 = hash-> h8[6] ;
WB1 = hash. h8[7] ;
WB1 = hash-> h8[7] ;
WC0 = 0x80 ;
WC0 = 0x80 ;
WC1 = 0 ;
WC1 = 0 ;
WD0 = 0 ;
WD0 = 0 ;
@ -846,24 +1083,26 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
WF0 = 0x200 ;
WF0 = 0x200 ;
WF1 = 0 ;
WF1 = 0 ;
for ( unsigned u = 0 ; u < 10; u ++) {
for ( unsigned u = 0 ; u < 10; u ++)
BIG_ROUND ;
BIG_ROUND ;
}
hash.h8[0] ^= Vb00 ^ W00 ^ W80 ;
hash.h8[1] ^= Vb01 ^ W01 ^ W81 ;
hash.h8[2] ^= Vb10 ^ W10 ^ W90 ;
hash.h8[3] ^= Vb11 ^ W11 ^ W91 ;
hash.h8[4] ^= Vb20 ^ W20 ^ WA0 ;
hash.h8[5] ^= Vb21 ^ W21 ^ WA1 ;
hash.h8[6] ^= Vb30 ^ W30 ^ WB0 ;
hash.h8[7] ^= Vb31 ^ W31 ^ WB1 ;
}
hash->h8[0] ^= Vb00 ^ W00 ^ W80 ;
hash->h8[1] ^= Vb01 ^ W01 ^ W81 ;
hash->h8[2] ^= Vb10 ^ W10 ^ W90 ;
hash->h8[3] ^= Vb11 ^ W11 ^ W91 ;
hash->h8[4] ^= Vb20 ^ W20 ^ WA0 ;
hash->h8[5] ^= Vb21 ^ W21 ^ WA1 ;
hash->h8[6] ^= Vb30 ^ W30 ^ WB0 ;
hash->h8[7] ^= Vb31 ^ W31 ^ WB1 ;
// hamsi
// hamsi
__local sph_u32 T512_L[1024] ;
__constant const sph_u32 *T512_C = &T512[0][0] ;
{
for ( int i = init ; i < 1024; i += step)
T512_L[i] = T512_C[i] ;
barrier ( CLK_LOCAL_MEM_FENCE ) ;
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3] ;
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3] ;
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7] ;
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7] ;
@ -873,51 +1112,39 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF ;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF ;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF } ;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF } ;
# define buf ( u ) hash.h1[i + u]
# define buf ( u ) hash->h1[i + u]
for ( int i = 0 ; i < 64; i += 8) {
INPUT_BIG ;
for ( int i = 0 ; i < 64; i += 8)
{
INPUT_BIG_LOCAL ;
P_BIG ;
P_BIG ;
T_BIG ;
T_BIG ;
}
}
# undef buf
# define buf ( u ) ( u == 0 ? 0x80 : 0 )
# undef buf
INPUT_BIG ;
# define buf ( u ) ( u == 0 ? 0x80 : 0 )
INPUT_BIG_LOCAL ;
P_BIG ;
P_BIG ;
T_BIG ;
T_BIG ;
# undef buf
# define buf ( u ) ( u == 6 ? 2 : 0 )
# undef buf
INPUT_BIG ;
# define buf ( u ) ( u == 6 ? 2 : 0 )
INPUT_BIG_LOCAL ;
PF_BIG ;
PF_BIG ;
T_BIG ;
T_BIG ;
for ( unsigned u = 0 ; u < 16; u ++)
for ( unsigned u = 0 ; u < 16; u ++)
hash.h4[u] = h[u] ;
hash->h4[u] = h[u] ;
}
//mixtab
__local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256] ;
init = get_local_id ( 0 ) ;
step = get_local_size ( 0 ) ;
for ( int i = init ; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i] ;
mixtab1[i] = mixtab1_c[i] ;
mixtab2[i] = mixtab2_c[i] ;
mixtab3[i] = mixtab3_c[i] ;
}
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
// fugue
// fugue
{
sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09 ;
sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09 ;
sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19 ;
sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19 ;
sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29 ;
sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29 ;
sph_u32 S30, S31, S32, S33, S34, S35 ;
sph_u32 S30, S31, S32, S33, S34, S35 ;
ulong fc_bit_count = ( sph_u64 ) 64 << 3 ;
ulong fc_bit_count = ( sph_u64 ) 0x200 ;
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0 ;
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0 ;
S20 = SPH_C32 ( 0x8807a57e ) ; S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
S20 = SPH_C32 ( 0x8807a57e ) ; S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
@ -925,22 +1152,25 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S28 = SPH_C32 ( 0xaac6e2c9 ) ; S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S28 = SPH_C32 ( 0xaac6e2c9 ) ; S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32 ( 0x25ea78e7 ) ; S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
S32 = SPH_C32 ( 0x25ea78e7 ) ; S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
FUGUE512_3 ( ( hash. h4[0x0] ) , ( hash. h4[0x1] ) , ( hash. h4[0x2] ) ) ;
FUGUE512_3 ( ( hash-> h4[0x0] ) , ( hash-> h4[0x1] ) , ( hash-> h4[0x2] ) ) ;
FUGUE512_3 ( ( hash. h4[0x3] ) , ( hash. h4[0x4] ) , ( hash. h4[0x5] ) ) ;
FUGUE512_3 ( ( hash-> h4[0x3] ) , ( hash-> h4[0x4] ) , ( hash-> h4[0x5] ) ) ;
FUGUE512_3 ( ( hash. h4[0x6] ) , ( hash. h4[0x7] ) , ( hash. h4[0x8] ) ) ;
FUGUE512_3 ( ( hash-> h4[0x6] ) , ( hash-> h4[0x7] ) , ( hash-> h4[0x8] ) ) ;
FUGUE512_3 ( ( hash. h4[0x9] ) , ( hash. h4[0xA] ) , ( hash. h4[0xB] ) ) ;
FUGUE512_3 ( ( hash-> h4[0x9] ) , ( hash-> h4[0xA] ) , ( hash-> h4[0xB] ) ) ;
FUGUE512_3 ( ( hash. h4[0xC] ) , ( hash. h4[0xD] ) , ( hash. h4[0xE] ) ) ;
FUGUE512_3 ( ( hash-> h4[0xC] ) , ( hash-> h4[0xD] ) , ( hash-> h4[0xE] ) ) ;
FUGUE512_3 ( ( hash. h4[0xF] ) , as_uint2 ( fc_bit_count ) . y, as_uint2 ( fc_bit_count ) . x ) ;
FUGUE512_3 ( ( hash-> h4[0xF] ) , as_uint2 ( fc_bit_count ) . y, as_uint2 ( fc_bit_count ) . x ) ;
// apply round shift if necessary
// apply round shift if necessary
int i ;
int i ;
for ( i = 0 ; i < 32; i ++) {
for ( i = 0 ; i < 32; i ++)
{
ROR3 ;
ROR3 ;
CMIX36 ( S00, S01, S02, S04, S05, S06, S18, S19, S20 ) ;
CMIX36 ( S00, S01, S02, S04, S05, S06, S18, S19, S20 ) ;
SMIX ( S00, S01, S02, S03 ) ;
SMIX ( S00, S01, S02, S03 ) ;
}
}
for ( i = 0 ; i < 13; i ++) {
for ( i = 0 ; i < 13; i ++)
{
S04 ^= S00 ;
S04 ^= S00 ;
S09 ^= S00 ;
S09 ^= S00 ;
S18 ^= S00 ;
S18 ^= S00 ;
@ -971,26 +1201,24 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S18 ^= S00 ;
S18 ^= S00 ;
S27 ^= S00 ;
S27 ^= S00 ;
hash.h4[0] = SWAP4 ( S01 ) ;
hash->h4[0] = SWAP4 ( S01 ) ;
hash.h4[1] = SWAP4 ( S02 ) ;
hash->h4[1] = SWAP4 ( S02 ) ;
hash.h4[2] = SWAP4 ( S03 ) ;
hash->h4[2] = SWAP4 ( S03 ) ;
hash.h4[3] = SWAP4 ( S04 ) ;
hash->h4[3] = SWAP4 ( S04 ) ;
hash.h4[4] = SWAP4 ( S09 ) ;
hash->h4[4] = SWAP4 ( S09 ) ;
hash.h4[5] = SWAP4 ( S10 ) ;
hash->h4[5] = SWAP4 ( S10 ) ;
hash.h4[6] = SWAP4 ( S11 ) ;
hash->h4[6] = SWAP4 ( S11 ) ;
hash.h4[7] = SWAP4 ( S12 ) ;
hash->h4[7] = SWAP4 ( S12 ) ;
hash.h4[8] = SWAP4 ( S18 ) ;
hash->h4[8] = SWAP4 ( S18 ) ;
hash.h4[9] = SWAP4 ( S19 ) ;
hash->h4[9] = SWAP4 ( S19 ) ;
hash.h4[10] = SWAP4 ( S20 ) ;
hash->h4[10] = SWAP4 ( S20 ) ;
hash.h4[11] = SWAP4 ( S21 ) ;
hash->h4[11] = SWAP4 ( S21 ) ;
hash.h4[12] = SWAP4 ( S27 ) ;
hash->h4[12] = SWAP4 ( S27 ) ;
hash.h4[13] = SWAP4 ( S28 ) ;
hash->h4[13] = SWAP4 ( S28 ) ;
hash.h4[14] = SWAP4 ( S29 ) ;
hash->h4[14] = SWAP4 ( S29 ) ;
hash.h4[15] = SWAP4 ( S30 ) ;
hash->h4[15] = SWAP4 ( S30 ) ;
}
bool result = ( hash->h8[3] <= target ) ;
bool result = ( hash.h8[3] <= target ) ;
if ( result )
if ( result )
output[atomic_inc ( output+0xFF ) ] = SWAP4 ( gid ) ;
output[atomic_inc ( output+0xFF ) ] = SWAP4 ( gid ) ;