@@ -72,13 +72,17 @@ typedef int sph_s32;
# define SPH_SIMD_NOCOPY 0
# define SPH_KECCAK_NOCOPY 0
# define SPH_COMPACT_BLAKE_64 0
-# define SPH_LUFFA_PARALLEL 1
+# define SPH_LUFFA_PARALLEL 0
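// the parallel Luffa path is switched off here (presumably slower on the target GPUs)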
# define SPH_SMALL_FOOTPRINT_GROESTL 0
# define SPH_GROESTL_BIG_ENDIAN 0
# define SPH_CUBEHASH_UNROLL 0
# define SPH_KECCAK_UNROLL 1
# ifndef SPH_HAMSI_EXPAND_BIG
# define SPH_HAMSI_EXPAND_BIG 1
+# define SPH_HAMSI_SHORT 1
# endif
+# ifndef SPH_HAMSI_SHORT
+# define SPH_HAMSI_SHORT 1
+# endif
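// Hamsi defaults apply only when the host code has not predefined these macros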
# include "blake.cl"
# include "bmw.cl"
@@ -126,10 +130,10 @@ typedef union {
__attribute__ ( ( reqd_work_group_size ( WORKSIZE, 1 , 1 ) ) )
__kernel void search ( __global unsigned char* block, __global hash_t* hashes )
{
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
// blake
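// BLAKE-512 initial state: the same IV words as SHA-512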
sph_u64 H0 = SPH_C64 ( 0x6A09E667F3BCC908 ) , H1 = SPH_C64 ( 0xBB67AE8584CAA73B ) ;
sph_u64 H2 = SPH_C64 ( 0x3C6EF372FE94F82B ) , H3 = SPH_C64 ( 0xA54FF53A5F1D36F1 ) ;
sph_u64 H4 = SPH_C64 ( 0x510E527FADE682D1 ) , H5 = SPH_C64 ( 0x9B05688C2B3E6C1F ) ;
@@ -144,6 +148,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF ;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7 ;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF ;
M0 = DEC64BE ( block + 0 ) ;
M1 = DEC64BE ( block + 8 ) ;
M2 = DEC64BE ( block + 16 ) ;
@@ -180,18 +185,18 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
__attribute__ ( ( reqd_work_group_size ( WORKSIZE, 1 , 1 ) ) )
__kernel void search1 ( __global hash_t* hashes )
{
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
// bmw
sph_u64 BMW_H[16] ;
# pragma unroll 16
for ( unsigned u = 0 ; u < 16; u++)
BMW_H[u] = BMW_IV512[u] ;
sph_u64 mv[16],q[32] ;
sph_u64 tmp ;
mv[0] = SWAP8 ( hash->h8[0] ) ;
mv[1] = SWAP8 ( hash->h8[1] ) ;
@@ -211,7 +216,7 @@ __kernel void search1(__global hash_t* hashes)
mv[15] = SPH_C64 ( 512 ) ;
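// mv[15]: the 512-bit message length word of the padded BMW block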
tmp = ( mv[5] ^ BMW_H[5] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) + ( mv[14] ^ BMW_H[14] ) ;
q[0] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[1] ;
tmp = ( mv[6] ^ BMW_H[6] ) - ( mv[8] ^ BMW_H[8] ) + ( mv[11] ^ BMW_H[11] ) + ( mv[14] ^ BMW_H[14] ) - ( mv[15] ^ BMW_H[15] ) ;
q[1] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[2] ;
tmp = ( mv[0] ^ BMW_H[0] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
@@ -221,7 +226,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[2] ^ BMW_H[2] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[14] ^ BMW_H[14] ) ;
q[4] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[5] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[2] ^ BMW_H[2] ) + ( mv[10] ^ BMW_H[10] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[5] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[6] ;
tmp = ( mv[4] ^ BMW_H[4] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) - ( mv[11] ^ BMW_H[11] ) + ( mv[13] ^ BMW_H[13] ) ;
q[6] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[7] ;
tmp = ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[12] ^ BMW_H[12] ) - ( mv[14] ^ BMW_H[14] ) ;
@@ -231,7 +236,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) + ( mv[6] ^ BMW_H[6] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[14] ^ BMW_H[14] ) ;
q[9] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[10] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[15] ^ BMW_H[15] ) ;
q[10] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[11] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[9] ^ BMW_H[9] ) ;
q[11] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[12] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[3] ^ BMW_H[3] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[10] ^ BMW_H[10] ) ;
@@ -243,60 +248,60 @@ __kernel void search1(__global hash_t* hashes)
tmp = ( mv[12] ^ BMW_H[12] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[13] ^ BMW_H[13] ) ;
q[15] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[0] ;
# pragma unroll 2
for ( int i=0 ;i<2;i++)
{
q[i+16] =
( SHR ( q[i], 1 ) ^ SHL ( q[i], 2 ) ^ SPH_ROTL64 ( q[i], 13 ) ^ SPH_ROTL64 ( q[i], 43 ) ) +
( SHR ( q[i+1], 2 ) ^ SHL ( q[i+1], 1 ) ^ SPH_ROTL64 ( q[i+1], 19 ) ^ SPH_ROTL64 ( q[i+1], 53 ) ) +
( SHR ( q[i+2], 2 ) ^ SHL ( q[i+2], 2 ) ^ SPH_ROTL64 ( q[i+2], 28 ) ^ SPH_ROTL64 ( q[i+2], 59 ) ) +
( SHR ( q[i+3], 1 ) ^ SHL ( q[i+3], 3 ) ^ SPH_ROTL64 ( q[i+3], 4 ) ^ SPH_ROTL64 ( q[i+3], 37 ) ) +
( SHR ( q[i+4], 1 ) ^ SHL ( q[i+4], 2 ) ^ SPH_ROTL64 ( q[i+4], 13 ) ^ SPH_ROTL64 ( q[i+4], 43 ) ) +
( SHR ( q[i+5], 2 ) ^ SHL ( q[i+5], 1 ) ^ SPH_ROTL64 ( q[i+5], 19 ) ^ SPH_ROTL64 ( q[i+5], 53 ) ) +
( SHR ( q[i+6], 2 ) ^ SHL ( q[i+6], 2 ) ^ SPH_ROTL64 ( q[i+6], 28 ) ^ SPH_ROTL64 ( q[i+6], 59 ) ) +
( SHR ( q[i+7], 1 ) ^ SHL ( q[i+7], 3 ) ^ SPH_ROTL64 ( q[i+7], 4 ) ^ SPH_ROTL64 ( q[i+7], 37 ) ) +
( SHR ( q[i+8], 1 ) ^ SHL ( q[i+8], 2 ) ^ SPH_ROTL64 ( q[i+8], 13 ) ^ SPH_ROTL64 ( q[i+8], 43 ) ) +
( SHR ( q[i+9], 2 ) ^ SHL ( q[i+9], 1 ) ^ SPH_ROTL64 ( q[i+9], 19 ) ^ SPH_ROTL64 ( q[i+9], 53 ) ) +
( SHR ( q[i+10], 2 ) ^ SHL ( q[i+10], 2 ) ^ SPH_ROTL64 ( q[i+10], 28 ) ^ SPH_ROTL64 ( q[i+10], 59 ) ) +
( SHR ( q[i+11], 1 ) ^ SHL ( q[i+11], 3 ) ^ SPH_ROTL64 ( q[i+11], 4 ) ^ SPH_ROTL64 ( q[i+11], 37 ) ) +
( SHR ( q[i+12], 1 ) ^ SHL ( q[i+12], 2 ) ^ SPH_ROTL64 ( q[i+12], 13 ) ^ SPH_ROTL64 ( q[i+12], 43 ) ) +
( SHR ( q[i+13], 2 ) ^ SHL ( q[i+13], 1 ) ^ SPH_ROTL64 ( q[i+13], 19 ) ^ SPH_ROTL64 ( q[i+13], 53 ) ) +
( SHR ( q[i+14], 2 ) ^ SHL ( q[i+14], 2 ) ^ SPH_ROTL64 ( q[i+14], 28 ) ^ SPH_ROTL64 ( q[i+14], 59 ) ) +
( SHR ( q[i+15], 1 ) ^ SHL ( q[i+15], 3 ) ^ SPH_ROTL64 ( q[i+15], 4 ) ^ SPH_ROTL64 ( q[i+15], 37 ) ) +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 4
for ( int i=2 ;i<6;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 3
for ( int i=6 ;i<9;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 4
for ( int i=9 ;i<13;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
# pragma unroll 3
for ( int i=13 ;i<16;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i-13], ( i-13 ) +1 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23] ;
@@ -320,7 +325,7 @@ __kernel void search1(__global hash_t* hashes)
BMW_H[14] = SPH_ROTL64 ( BMW_H[2],15 ) + ( XH64 ^ q[30] ^ mv[14] ) + ( SHR ( XL64,7 ) ^ q[21] ^ q[14] ) ;
BMW_H[15] = SPH_ROTL64 ( BMW_H[3],16 ) + ( XH64 ^ q[31] ^ mv[15] ) + ( SHR ( XL64,2 ) ^ q[22] ^ q[15] ) ;
# pragma unroll 16
for ( int i=0 ;i<16;i++)
{
mv[i] = BMW_H[i] ;
@@ -328,7 +333,7 @@ __kernel void search1(__global hash_t* hashes)
}
tmp = ( mv[5] ^ BMW_H[5] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[10] ^ BMW_H[10] ) + ( mv[13] ^ BMW_H[13] ) + ( mv[14] ^ BMW_H[14] ) ;
q[0] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[1] ;
tmp = ( mv[6] ^ BMW_H[6] ) - ( mv[8] ^ BMW_H[8] ) + ( mv[11] ^ BMW_H[11] ) + ( mv[14] ^ BMW_H[14] ) - ( mv[15] ^ BMW_H[15] ) ;
q[1] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[2] ;
tmp = ( mv[0] ^ BMW_H[0] ) + ( mv[7] ^ BMW_H[7] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
@@ -338,7 +343,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[2] ^ BMW_H[2] ) + ( mv[9] ^ BMW_H[9] ) - ( mv[11] ^ BMW_H[11] ) - ( mv[14] ^ BMW_H[14] ) ;
q[4] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[5] ;
tmp = ( mv[3] ^ BMW_H[3] ) - ( mv[2] ^ BMW_H[2] ) + ( mv[10] ^ BMW_H[10] ) - ( mv[12] ^ BMW_H[12] ) + ( mv[15] ^ BMW_H[15] ) ;
q[5] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[6] ;
tmp = ( mv[4] ^ BMW_H[4] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) - ( mv[11] ^ BMW_H[11] ) + ( mv[13] ^ BMW_H[13] ) ;
q[6] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[7] ;
tmp = ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[5] ^ BMW_H[5] ) - ( mv[12] ^ BMW_H[12] ) - ( mv[14] ^ BMW_H[14] ) ;
@@ -348,7 +353,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = ( mv[0] ^ BMW_H[0] ) - ( mv[3] ^ BMW_H[3] ) + ( mv[6] ^ BMW_H[6] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[14] ^ BMW_H[14] ) ;
q[9] = ( SHR ( tmp, 1 ) ^ tmp ) + BMW_H[10] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[1] ^ BMW_H[1] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[7] ^ BMW_H[7] ) + ( mv[15] ^ BMW_H[15] ) ;
q[10] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[11] ;
tmp = ( mv[8] ^ BMW_H[8] ) - ( mv[0] ^ BMW_H[0] ) - ( mv[2] ^ BMW_H[2] ) - ( mv[5] ^ BMW_H[5] ) + ( mv[9] ^ BMW_H[9] ) ;
q[11] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 2 ) ^ SPH_ROTL64 ( tmp, 13 ) ^ SPH_ROTL64 ( tmp, 43 ) ) + BMW_H[12] ;
tmp = ( mv[1] ^ BMW_H[1] ) + ( mv[3] ^ BMW_H[3] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[10] ^ BMW_H[10] ) ;
@@ -360,64 +365,65 @@ __kernel void search1(__global hash_t* hashes)
tmp = ( mv[12] ^ BMW_H[12] ) - ( mv[4] ^ BMW_H[4] ) - ( mv[6] ^ BMW_H[6] ) - ( mv[9] ^ BMW_H[9] ) + ( mv[13] ^ BMW_H[13] ) ;
q[15] = ( SHR ( tmp, 1 ) ^ SHL ( tmp, 3 ) ^ SPH_ROTL64 ( tmp, 4 ) ^ SPH_ROTL64 ( tmp, 37 ) ) + BMW_H[0] ;
# pragma unroll 2
for ( int i=0 ;i<2;i++)
{
q[i+16] =
( SHR ( q[i], 1 ) ^ SHL ( q[i], 2 ) ^ SPH_ROTL64 ( q[i], 13 ) ^ SPH_ROTL64 ( q[i], 43 ) ) +
( SHR ( q[i+1], 2 ) ^ SHL ( q[i+1], 1 ) ^ SPH_ROTL64 ( q[i+1], 19 ) ^ SPH_ROTL64 ( q[i+1], 53 ) ) +
( SHR ( q[i+2], 2 ) ^ SHL ( q[i+2], 2 ) ^ SPH_ROTL64 ( q[i+2], 28 ) ^ SPH_ROTL64 ( q[i+2], 59 ) ) +
( SHR ( q[i+3], 1 ) ^ SHL ( q[i+3], 3 ) ^ SPH_ROTL64 ( q[i+3], 4 ) ^ SPH_ROTL64 ( q[i+3], 37 ) ) +
( SHR ( q[i+4], 1 ) ^ SHL ( q[i+4], 2 ) ^ SPH_ROTL64 ( q[i+4], 13 ) ^ SPH_ROTL64 ( q[i+4], 43 ) ) +
( SHR ( q[i+5], 2 ) ^ SHL ( q[i+5], 1 ) ^ SPH_ROTL64 ( q[i+5], 19 ) ^ SPH_ROTL64 ( q[i+5], 53 ) ) +
( SHR ( q[i+6], 2 ) ^ SHL ( q[i+6], 2 ) ^ SPH_ROTL64 ( q[i+6], 28 ) ^ SPH_ROTL64 ( q[i+6], 59 ) ) +
( SHR ( q[i+7], 1 ) ^ SHL ( q[i+7], 3 ) ^ SPH_ROTL64 ( q[i+7], 4 ) ^ SPH_ROTL64 ( q[i+7], 37 ) ) +
( SHR ( q[i+8], 1 ) ^ SHL ( q[i+8], 2 ) ^ SPH_ROTL64 ( q[i+8], 13 ) ^ SPH_ROTL64 ( q[i+8], 43 ) ) +
( SHR ( q[i+9], 2 ) ^ SHL ( q[i+9], 1 ) ^ SPH_ROTL64 ( q[i+9], 19 ) ^ SPH_ROTL64 ( q[i+9], 53 ) ) +
( SHR ( q[i+10], 2 ) ^ SHL ( q[i+10], 2 ) ^ SPH_ROTL64 ( q[i+10], 28 ) ^ SPH_ROTL64 ( q[i+10], 59 ) ) +
( SHR ( q[i+11], 1 ) ^ SHL ( q[i+11], 3 ) ^ SPH_ROTL64 ( q[i+11], 4 ) ^ SPH_ROTL64 ( q[i+11], 37 ) ) +
( SHR ( q[i+12], 1 ) ^ SHL ( q[i+12], 2 ) ^ SPH_ROTL64 ( q[i+12], 13 ) ^ SPH_ROTL64 ( q[i+12], 43 ) ) +
( SHR ( q[i+13], 2 ) ^ SHL ( q[i+13], 1 ) ^ SPH_ROTL64 ( q[i+13], 19 ) ^ SPH_ROTL64 ( q[i+13], 53 ) ) +
( SHR ( q[i+14], 2 ) ^ SHL ( q[i+14], 2 ) ^ SPH_ROTL64 ( q[i+14], 28 ) ^ SPH_ROTL64 ( q[i+14], 59 ) ) +
( SHR ( q[i+15], 1 ) ^ SHL ( q[i+15], 3 ) ^ SPH_ROTL64 ( q[i+15], 4 ) ^ SPH_ROTL64 ( q[i+15], 37 ) ) +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 4
for ( int i=2 ;i<6;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i+10], i+11 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 3
for ( int i=6 ;i<9;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i+7] ) ;
}
# pragma unroll 4
for ( int i=9 ;i<13;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i+3], i+4 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
# pragma unroll 3
for ( int i=13 ;i<16;i++)
{
q[i+16] = CONST_EXP2 +
( ( ( ( i+16 ) * ( 0x0555555555555555ull ) ) + SPH_ROTL64 ( mv[i], i+1 ) +
SPH_ROTL64 ( mv[i-13], ( i-13 ) +1 ) - SPH_ROTL64 ( mv[i-6], ( i-6 ) +1 ) ) ^ BMW_H[i-9] ) ;
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23] ;
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31] ;
BMW_H[0] = ( SHL ( XH64, 5 ) ^ SHR ( q[16],5 ) ^ mv[0] ) + ( XL64 ^ q[24] ^ q[0] ) ;
BMW_H[1] = ( SHR ( XH64, 7 ) ^ SHL ( q[17],8 ) ^ mv[1] ) + ( XL64 ^ q[25] ^ q[1] ) ;
BMW_H[2] = ( SHR ( XH64, 5 ) ^ SHL ( q[18],5 ) ^ mv[2] ) + ( XL64 ^ q[26] ^ q[2] ) ;
@@ -454,94 +460,67 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
-# if !SPH_SMALL_FOOTPRINT_GROESTL
-__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256] ;
-__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256] ;
-# else
-__local sph_u64 T0_C[256], T4_C[256] ;
-# endif
+__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256] ;
int init = get_local_id ( 0 ) ;
int step = get_local_size ( 0 ) ;
for ( int i = init ; i < 256; i += step)
{
-T0_C[i] = T0[i] ;
-T4_C[i] = T4[i] ;
-# if !SPH_SMALL_FOOTPRINT_GROESTL
-T1_C[i] = T1[i] ;
-T2_C[i] = T2[i] ;
-T3_C[i] = T3[i] ;
-T5_C[i] = T5[i] ;
-T6_C[i] = T6[i] ;
-T7_C[i] = T7[i] ;
-# endif
+T0_L[i] = T0[i] ;
+T4_L[i] = T4[i] ;
+T1_L[i] = T1[i] ;
+T2_L[i] = T2[i] ;
+T3_L[i] = T3[i] ;
+T5_L[i] = T5[i] ;
+T6_L[i] = T6[i] ;
+T7_L[i] = T7[i] ;
}
-barrier ( CLK_LOCAL_MEM_FENCE ) ; // groestl
+barrier ( CLK_LOCAL_MEM_FENCE ) ;
-# define T0 T0_C
-# define T1 T1_C
-# define T2 T2_C
-# define T3 T3_C
-# define T4 T4_C
-# define T5 T5_C
-# define T6 T6_C
-# define T7 T7_C
+# define T0 T0_L
+# define T1 T1_L
+# define T2 T2_L
+# define T3 T3_L
+# define T4 T4_L
+# define T5 T5_L
+# define T6 T6_L
+# define T7 T7_L
+// groestl
-sph_u64 H[16] ;
-for ( unsigned int u = 0 ; u < 15; u ++)
-H[u] = 0 ;
-# if USE_LE
-H[15] = ( ( sph_u64 ) ( 512 & 0xFF ) << 56 ) | ( ( sph_u64 ) ( 512 & 0xFF00 ) << 40 ) ;
-# else
-H[15] = ( sph_u64 ) 512 ;
-# endif
+sph_u64 H[16] = {0, 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0x0002000000000000} ;
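// H[15] is the groestl-512 output bit length 512, stored big-endian: 0x0002000000000000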
sph_u64 g[16], m[16] ;
-m[0] = DEC64E ( hash->h8[0] ) ;
-m[1] = DEC64E ( hash->h8[1] ) ;
-m[2] = DEC64E ( hash->h8[2] ) ;
-m[3] = DEC64E ( hash->h8[3] ) ;
-m[4] = DEC64E ( hash->h8[4] ) ;
-m[5] = DEC64E ( hash->h8[5] ) ;
-m[6] = DEC64E ( hash->h8[6] ) ;
-m[7] = DEC64E ( hash->h8[7] ) ;
-for ( unsigned int u = 0 ; u < 16; u ++)
-g[u] = m[u] ^ H[u] ;
-m[8] = 0x80 ; g[8] = m[8] ^ H[8];
-m[9] = 0 ; g[9] = m[9] ^ H[9];
-m[10] = 0 ; g[10] = m[10] ^ H[10];
-m[11] = 0 ; g[11] = m[11] ^ H[11];
-m[12] = 0 ; g[12] = m[12] ^ H[12];
-m[13] = 0 ; g[13] = m[13] ^ H[13];
-m[14] = 0 ; g[14] = m[14] ^ H[14];
-m[15] = 0x100000000000000 ; g[15] = m[15] ^ H[15];
+g[0] = m[0] = DEC64E ( hash->h8[0] ) ;
+g[1] = m[1] = DEC64E ( hash->h8[1] ) ;
+g[2] = m[2] = DEC64E ( hash->h8[2] ) ;
+g[3] = m[3] = DEC64E ( hash->h8[3] ) ;
+g[4] = m[4] = DEC64E ( hash->h8[4] ) ;
+g[5] = m[5] = DEC64E ( hash->h8[5] ) ;
+g[6] = m[6] = DEC64E ( hash->h8[6] ) ;
+g[7] = m[7] = DEC64E ( hash->h8[7] ) ;
+g[8] = m[8] = 0x80 ;
+g[9] = m[9] = 0 ;
+g[10] = m[10] = 0 ;
+g[11] = m[11] = 0 ;
+g[12] = m[12] = 0 ;
+g[13] = m[13] = 0 ;
+g[14] = m[14] = 0 ;
+g[15] = 0x102000000000000 ;
+m[15] = 0x100000000000000 ;
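// padding: 0x80 then zeros, block count 1 big-endian in m[15]; since H[0..14] == 0 only
// g[15] needs the fold: m[15] ^ H[15] = 0x0100000000000000 ^ 0x0002000000000000 = 0x0102000000000000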
PERM_BIG_P ( g ) ;
PERM_BIG_Q ( m ) ;
-for ( unsigned int u = 0 ; u < 16; u ++)
-H[u] ^= g[u] ^ m[u] ;
sph_u64 xH[16] ;
for ( unsigned int u = 0 ; u < 16; u ++)
-xH[u] = H[u] ;
+xH[u] = H[u] ^= g[u] ^ m[u] ;
PERM_BIG_P ( xH ) ;
-for ( unsigned int u = 0 ; u < 16; u ++)
-H[u] ^= xH[u] ;
-for ( unsigned int u = 0 ; u < 8; u ++)
-hash->h8[u] = DEC64E ( H[u + 8] ) ;
+for ( unsigned int u = 8 ; u < 16; u ++)
+hash->h8[u-8] = DEC64E ( H[u] ^ xH[u] ) ;
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
}
@@ -566,10 +545,14 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8 ( hash->h8[5] ) ;
m6 = SWAP8 ( hash->h8[6] ) ;
m7 = SWAP8 ( hash->h8[7] ) ;
+UBI_BIG ( 480 , 64 ) ;
+bcount = 0 ;
+m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0 ;
+UBI_BIG ( 510 , 8 ) ;
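// skein-512 close: final message UBI pass, then the output transform on an all-zero block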
hash->h8[0] = SWAP8 ( h0 ) ;
hash->h8[1] = SWAP8 ( h1 ) ;
hash->h8[2] = SWAP8 ( h2 ) ;
@@ -588,7 +571,7 @@ __kernel void search4(__global hash_t* hashes)
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
// jh
sph_u64 h0h = C64e ( 0x6fd14b963e00aa17 ) , h0l = C64e ( 0x636a2e057a15d543 ) , h1h = C64e ( 0x8a225e8d0c97ef0b ) , h1l = C64e ( 0xe9341259f2b3c361 ) , h2h = C64e ( 0x891da0c1536f801e ) , h2l = C64e ( 0x2aa9056bea2b6d80 ) , h3h = C64e ( 0x588eccdb2075baa6 ) , h3l = C64e ( 0xa90f3a76baf83bf7 ) ;
sph_u64 h4h = C64e ( 0x0169e60541e34a69 ) , h4l = C64e ( 0x46b58a8e2e6fe65a ) , h5h = C64e ( 0x1047a7d0c1843c24 ) , h5l = C64e ( 0x3b6e71b12d5ac199 ) , h6h = C64e ( 0xcf57f6ec9db1f856 ) , h6l = C64e ( 0xa706887c5716b156 ) , h7h = C64e ( 0xe3c2fcdfe68517fb ) , h7l = C64e ( 0x545a4678cc8cdd4b ) ;
@@ -669,6 +652,7 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8 ( hash->h8[7] ) ;
a31 ^= 0x8000000000000001 ;
KECCAK_F_1600 ;
+// Finalize the "lane complement"
a10 = ~a10 ;
a20 = ~a20 ;
@@ -830,6 +814,7 @@ __kernel void search8(__global hash_t* hashes)
{
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256] ;
int init = get_local_id ( 0 ) ;
@@ -858,7 +843,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17 ;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F ;
-sph_u32 sc_count0 = ( 64 << 3 ) , sc_count1 = 0 , sc_count2 = 0 , sc_count3 = 0 ;
+sph_u32 sc_count0 = 0x200 , sc_count1 = 0 , sc_count2 = 0 , sc_count3 = 0 ;
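// 0x200 == 64 << 3 : the same 512-bit SHAvite message counter, written as a literal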
rk00 = hash->h4[0] ;
rk01 = hash->h4[1] ;
@@ -913,10 +898,8 @@ __kernel void search9(__global hash_t* hashes)
// simd
s32 q[256] ;
unsigned char x[128] ;
for ( unsigned int i = 0 ; i < 64; i++)
x[i] = hash->h1[i] ;
for ( unsigned int i = 64 ; i < 128; i++)
x[i] = 0 ;
@@ -926,14 +909,15 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32 ( 0x09254899 ) , D1 = C32 ( 0xD699C7BC ) , D2 = C32 ( 0x9019B6DC ) , D3 = C32 ( 0x2B9022E4 ) , D4 = C32 ( 0x8FA14956 ) , D5 = C32 ( 0x21BF9BD3 ) , D6 = C32 ( 0xB94D0943 ) , D7 = C32 ( 0x6FFDDC22 ) ;
FFT256 ( 0 , 1 , 0 , ll1 ) ;
-for ( int i = 0 ; i < 256; i ++) {
+for ( int i = 0 ; i < 256; i ++)
+{
s32 tq ;
tq = q[i] + yoff_b_n[i] ;
tq = REDS2 ( tq ) ;
tq = REDS1 ( tq ) ;
tq = REDS1 ( tq ) ;
q[i] = ( tq <= 128 ? tq : tq - 257 ) ;
}
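// q[i] is now a balanced residue modulo 257, in the range -128..128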
A0 ^= hash->h4[0] ;
@@ -959,21 +943,24 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG ( 3_, 3 , 4 , 13 , 10 , 25 ) ;
STEP_BIG (
C32 ( 0x0BA16B95 ) , C32 ( 0x72F999AD ) , C32 ( 0x9FECC2AE ) , C32 ( 0xBA3264FC ) ,
C32 ( 0x5E894929 ) , C32 ( 0x8E9F30E5 ) , C32 ( 0x2F1DAA37 ) , C32 ( 0xF0F2C558 ) ,
IF, 4 , 13 , PP8_4_ ) ;
STEP_BIG (
C32 ( 0xAC506643 ) , C32 ( 0xA90635A5 ) , C32 ( 0xE25B878B ) , C32 ( 0xAAB7878F ) ,
C32 ( 0x88817F7A ) , C32 ( 0x0A02892B ) , C32 ( 0x559A7550 ) , C32 ( 0x598F657E ) ,
IF, 13 , 10 , PP8_5_ ) ;
STEP_BIG (
C32 ( 0x7EEF60A1 ) , C32 ( 0x6B70E3E8 ) , C32 ( 0x9C1714D1 ) , C32 ( 0xB958E2A8 ) ,
C32 ( 0xAB02675E ) , C32 ( 0xED1C014F ) , C32 ( 0xCD8D65BB ) , C32 ( 0xFDB7A257 ) ,
IF, 10 , 25 , PP8_6_ ) ;
STEP_BIG (
C32 ( 0x09254899 ) , C32 ( 0xD699C7BC ) , C32 ( 0x9019B6DC ) , C32 ( 0x2B9022E4 ) ,
C32 ( 0x8FA14956 ) , C32 ( 0x21BF9BD3 ) , C32 ( 0xB94D0943 ) , C32 ( 0x6FFDDC22 ) ,
IF, 25 , 4 , PP8_0_ ) ;
u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7 ;
u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7 ;
@@ -988,22 +975,27 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG ( 1_, 1 , 28 , 19 , 22 , 7 ) ;
ONE_ROUND_BIG ( 2_, 2 , 29 , 9 , 15 , 5 ) ;
ONE_ROUND_BIG ( 3_, 3 , 4 , 13 , 10 , 25 ) ;
STEP_BIG (
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4 , 13 , PP8_4_ ) ;
STEP_BIG (
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13 , 10 , PP8_5_ ) ;
STEP_BIG (
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10 , 25 , PP8_6_ ) ;
STEP_BIG (
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25 , 4 , PP8_0_ ) ;
# undef q
hash->h4[0] = A0 ;
@@ -1114,49 +1106,52 @@ __kernel void search10(__global hash_t* hashes)
__attribute__ ( ( reqd_work_group_size ( WORKSIZE, 1 , 1 ) ) )
__kernel void search11 ( __global hash_t* hashes )
{
uint gid = get_global_id ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-get_global_offset ( 0 ) ] ) ;
__local sph_u32 T512_L[1024] ;
__constant const sph_u32 *T512_C = &T512[0][0] ;
int init = get_local_id ( 0 ) ;
int step = get_local_size ( 0 ) ;
for ( int i = init ; i < 1024; i += step)
-{
T512_L[i] = T512_C[i] ;
-}
barrier ( CLK_LOCAL_MEM_FENCE ) ;
-{
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3] ;
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7] ;
sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11] ;
sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15] ;
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7 ;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF ;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF } ;
# define buf ( u ) hash->h1[i + u]
for ( int i = 0 ; i < 64; i += 8) {
INPUT_BIG_LOCAL ;
P_BIG ;
T_BIG ;
}
# undef buf
# define buf ( u ) ( u == 0 ? 0x80 : 0 )
INPUT_BIG_LOCAL ;
P_BIG ;
T_BIG ;
# undef buf
# define buf ( u ) ( u == 6 ? 2 : 0 )
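// length block: byte 6 = 0x02 gives the big-endian bit count 0x0200 = 512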
INPUT_BIG_LOCAL ;
PF_BIG ;
T_BIG ;
for ( unsigned u = 0 ; u < 16; u ++)
hash->h4[u] = h[u] ;
-}
barrier ( CLK_GLOBAL_MEM_FENCE ) ;
}
__attribute__ ( ( reqd_work_group_size ( WORKSIZE, 1 , 1 ) ) )
@@ -1346,115 +1341,115 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
uint offset = get_global_offset ( 0 ) ;
__global hash_t *hash = & ( hashes[gid-offset] ) ;
__local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256] ;
int init = get_local_id ( 0 ) ;
int step = get_local_size ( 0 ) ;
for ( int i = init ; i < 256; i += step)
{
LT0[i] = plain_T0[i] ;
LT1[i] = plain_T1[i] ;
LT2[i] = plain_T2[i] ;
LT3[i] = plain_T3[i] ;
LT4[i] = plain_T4[i] ;
LT5[i] = plain_T5[i] ;
LT6[i] = plain_T6[i] ;
LT7[i] = plain_T7[i] ;
}
barrier ( CLK_LOCAL_MEM_FENCE ) ;
// whirlpool
sph_u64 n0, n1, n2, n3, n4, n5, n6, n7 ;
sph_u64 h0, h1, h2, h3, h4, h5, h6, h7 ;
sph_u64 state[8] ;
n0 = ( hash->h8[0] ) ;
n1 = ( hash->h8[1] ) ;
n2 = ( hash->h8[2] ) ;
n3 = ( hash->h8[3] ) ;
n4 = ( hash->h8[4] ) ;
n5 = ( hash->h8[5] ) ;
n6 = ( hash->h8[6] ) ;
n7 = ( hash->h8[7] ) ;
h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0 ;
n0 ^= h0 ;
n1 ^= h1 ;
n2 ^= h2 ;
n3 ^= h3 ;
n4 ^= h4 ;
n5 ^= h5 ;
n6 ^= h6 ;
n7 ^= h7 ;
+# pragma unroll 10
-for ( unsigned r = 0 ; r < 10; r ++) {
+for ( unsigned r = 0 ; r < 10; r ++)
+{
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 ;
ROUND_KSCHED ( plain_T, h, tmp, plain_RC[r] ) ;
TRANSFER ( h, tmp ) ;
ROUND_WENC ( plain_T, n, h, tmp ) ;
TRANSFER ( n, tmp ) ;
}
state[0] = n0 ^ ( hash->h8[0] ) ;
state[1] = n1 ^ ( hash->h8[1] ) ;
state[2] = n2 ^ ( hash->h8[2] ) ;
state[3] = n3 ^ ( hash->h8[3] ) ;
state[4] = n4 ^ ( hash->h8[4] ) ;
state[5] = n5 ^ ( hash->h8[5] ) ;
state[6] = n6 ^ ( hash->h8[6] ) ;
state[7] = n7 ^ ( hash->h8[7] ) ;
n0 = 0x80 ;
n1 = n2 = n3 = n4 = n5 = n6 = 0 ;
n7 = 0x2000000000000 ;
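// padding block: 0x80 lead byte in n0, bit length 512 in the closing bytes ( n7 = 0x0002000000000000 )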
h0 = state[0] ;
h1 = state[1] ;
h2 = state[2] ;
h3 = state[3] ;
h4 = state[4] ;
h5 = state[5] ;
h6 = state[6] ;
h7 = state[7] ;
n0 ^= h0 ;
n1 ^= h1 ;
n2 ^= h2 ;
n3 ^= h3 ;
n4 ^= h4 ;
n5 ^= h5 ;
n6 ^= h6 ;
n7 ^= h7 ;
+# pragma unroll 10
-for ( unsigned r = 0 ; r < 10; r ++) {
+for ( unsigned r = 0 ; r < 10; r ++)
+{
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 ;
+// for ( uint i = 0 ; i < 8 ; i++)
+// output[ ( NUMHASH * 9 ) * 15 + gid * 9 + i] = hash->h8[i] ;
ROUND_KSCHED ( LT, h, tmp, plain_RC[r] ) ;
TRANSFER ( h, tmp ) ;
ROUND_WENC ( plain_T, n, h, tmp ) ;
TRANSFER ( n, tmp ) ;
}
+// output[ ( NUMHASH * 9 ) * 15 + gid * 9 + 8] = nonce ;
state[0] ^= n0 ^ 0x80 ;
state[1] ^= n1 ;
state[2] ^= n2 ;
state[3] ^= n3 ;
state[4] ^= n4 ;
state[5] ^= n5 ;
state[6] ^= n6 ;
state[7] ^= n7 ^ 0x2000000000000 ;
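// Miyaguchi-Preneel feed-forward: chaining value ^= cipher output ^ message block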
for ( unsigned i = 0 ; i < 8; i ++)
hash->h8[i] = state[i] ;
bool result = ( hash->h8[3] <= target ) ;
if ( result )