/* * Sifcoin kernel implementation. * * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * ===========================(LICENSE END)============================= * * @author phm */ #ifndef SIFCOIN_CL #define SIFCOIN_CL #if __ENDIAN_LITTLE__ #define SPH_LITTLE_ENDIAN 1 #else #define SPH_BIG_ENDIAN 1 #endif #define SPH_UPTR sph_u64 typedef unsigned int sph_u32; typedef int sph_s32; #ifndef __OPENCL_VERSION__ typedef unsigned long long sph_u64; typedef long long sph_s64; #else typedef unsigned long sph_u64; typedef long sph_s64; #endif #define SPH_64 1 #define SPH_64_TRUE 1 #define SPH_C32(x) ((sph_u32)(x ## U)) #define SPH_T32(x) (as_uint(x)) #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_C64(x) ((sph_u64)(x ## UL)) #define SPH_T64(x) (as_ulong(x)) #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ECHO_64 1 #define SPH_KECCAK_64 1 #define SPH_JH_64 1 #define SPH_SIMD_NOCOPY 0 #define SPH_KECCAK_NOCOPY 0 #define SPH_SMALL_FOOTPRINT_GROESTL 0 #define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_CUBEHASH_UNROLL 0 #ifndef SPH_COMPACT_BLAKE_64 #define SPH_COMPACT_BLAKE_64 0 #endif #ifndef SPH_LUFFA_PARALLEL #define SPH_LUFFA_PARALLEL 0 #endif #ifndef SPH_KECCAK_UNROLL #define SPH_KECCAK_UNROLL 0 #endif #include "blake.cl" #include "bmw.cl" #include "groestl.cl" #include "jh.cl" #include "keccak.cl" #include "skein.cl" #define SWAP4(x) as_uint(as_uchar4(x).wzyx) #define SWAP8(x) as_ulong(as_uchar8(x).s76543210) #if SPH_BIG_ENDIAN #define DEC64E(x) (x) #define DEC64BE(x) (*(const __global sph_u64 *) (x)); #else #define DEC64E(x) SWAP8(x) #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); #endif #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) #define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) { uint gid = get_global_id(0); union { unsigned char h1[64]; uint h4[16]; ulong h8[8]; } hash; // blake { sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179); sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; if ((T0 = SPH_T64(T0 + 1024)) < 1024) { T1 = SPH_T64(T1 + 1); } sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; M0 = DEC64BE(block + 0); M1 = DEC64BE(block + 8); M2 = DEC64BE(block + 16); M3 = DEC64BE(block + 24); M4 = DEC64BE(block + 32); M5 = DEC64BE(block + 40); M6 = DEC64BE(block + 48); M7 = DEC64BE(block + 56); M8 = DEC64BE(block + 64); M9 = DEC64BE(block + 72); M9 &= 0xFFFFFFFF00000000; M9 ^= SWAP4(gid); MA = 0x8000000000000000; MB = 0; MC = 0; MD = 1; ME = 0; MF = 0x280; COMPRESS64; hash.h8[0] = H0; hash.h8[1] = H1; hash.h8[2] = H2; hash.h8[3] = H3; hash.h8[4] = H4; hash.h8[5] = H5; hash.h8[6] = H6; hash.h8[7] = H7; } // bmw { sph_u64 BMW_H[16]; for(unsigned u = 0; u < 16; u++) BMW_H[u] = BMW_IV512[u]; sph_u64 mv[16],q[32]; sph_u64 tmp; mv[ 0] = SWAP8(hash.h8[0]); mv[ 1] = SWAP8(hash.h8[1]); mv[ 2] = SWAP8(hash.h8[2]); mv[ 3] = SWAP8(hash.h8[3]); mv[ 4] = SWAP8(hash.h8[4]); mv[ 5] = SWAP8(hash.h8[5]); mv[ 6] = SWAP8(hash.h8[6]); mv[ 7] = SWAP8(hash.h8[7]); mv[ 8] = 0x80; mv[ 9] = 0; mv[10] = 0; mv[11] = 0; mv[12] = 0; mv[13] = 0; mv[14] = 0; mv[15] = 0x200; tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; #pragma unroll 2 for(int i=0;i<2;i++) { q[i+16] = (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } #pragma unroll 4 for(int i=2;i<6;i++) { q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } #pragma unroll 3 for(int i=6;i<9;i++) { q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); } #pragma unroll 4 for(int i=9;i<13;i++) { q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); } #pragma unroll 3 for(int i=13;i<16;i++) { q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); } sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); #pragma unroll 16 for(int i=0;i<16;i++) { mv[i] = BMW_H[i]; BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; } tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; #pragma unroll 2 for(int i=0;i<2;i++) { q[i+16] = (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } #pragma unroll 4 for(int i=2;i<6;i++) { q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } #pragma unroll 3 for(int i=6;i<9;i++) { q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); } #pragma unroll 4 for(int i=9;i<13;i++) { q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); } #pragma unroll 3 for(int i=13;i<16;i++) { q[i+16] = CONST_EXP2 + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); } XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); hash.h8[0] = SWAP8(BMW_H[8]); hash.h8[1] = SWAP8(BMW_H[9]); hash.h8[2] = SWAP8(BMW_H[10]); hash.h8[3] = SWAP8(BMW_H[11]); hash.h8[4] = SWAP8(BMW_H[12]); hash.h8[5] = SWAP8(BMW_H[13]); hash.h8[6] = SWAP8(BMW_H[14]); hash.h8[7] = SWAP8(BMW_H[15]); } // groestl { #if !SPH_SMALL_FOOTPRINT_GROESTL __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; #else __local sph_u64 T0_C[256], T4_C[256]; #endif int init = get_local_id(0); int step = get_local_size(0); for (int i = init; i < 256; i += step) { T0_C[i] = T0[i]; T4_C[i] = T4[i]; #if !SPH_SMALL_FOOTPRINT_GROESTL T1_C[i] = T1[i]; T2_C[i] = T2[i]; T3_C[i] = T3[i]; T5_C[i] = T5[i]; T6_C[i] = T6[i]; T7_C[i] = T7[i]; #endif } barrier(CLK_LOCAL_MEM_FENCE); // groestl #define T0 T0_C #define T1 T1_C #define T2 T2_C #define T3 T3_C #define T4 T4_C #define T5 T5_C #define T6 T6_C #define T7 T7_C sph_u64 H[16]; //#pragma unroll 15 for (unsigned int u = 0; u < 15; u ++) H[u] = 0; #if USE_LE H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); #else H[15] = (sph_u64)512; #endif sph_u64 g[16], m[16]; m[0] = DEC64E(hash.h8[0]); m[1] = DEC64E(hash.h8[1]); m[2] = DEC64E(hash.h8[2]); m[3] = DEC64E(hash.h8[3]); m[4] = DEC64E(hash.h8[4]); m[5] = DEC64E(hash.h8[5]); m[6] = DEC64E(hash.h8[6]); m[7] = DEC64E(hash.h8[7]); //#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) g[u] = m[u] ^ H[u]; m[8] = 0x80; g[8] = m[8] ^ H[8]; m[9] = 0; g[9] = m[9] ^ H[9]; m[10] = 0; g[10] = m[10] ^ H[10]; m[11] = 0; g[11] = m[11] ^ H[11]; m[12] = 0; g[12] = m[12] ^ H[12]; m[13] = 0; g[13] = m[13] ^ H[13]; m[14] = 0; g[14] = m[14] ^ H[14]; m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; PERM_BIG_P(g); PERM_BIG_Q(m); //#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) H[u] ^= g[u] ^ m[u]; sph_u64 xH[16]; //#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) xH[u] = H[u]; PERM_BIG_P(xH); //#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) H[u] ^= xH[u]; //#pragma unroll 8 for (unsigned int u = 0; u < 8; u ++) hash.h8[u] = DEC64E(H[u + 8]); barrier(CLK_GLOBAL_MEM_FENCE); } // jh { sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); sph_u64 tmp; for(int i = 0; i < 2; i++) { if (i == 0) { h0h ^= DEC64E(hash.h8[0]); h0l ^= DEC64E(hash.h8[1]); h1h ^= DEC64E(hash.h8[2]); h1l ^= DEC64E(hash.h8[3]); h2h ^= DEC64E(hash.h8[4]); h2l ^= DEC64E(hash.h8[5]); h3h ^= DEC64E(hash.h8[6]); h3l ^= DEC64E(hash.h8[7]); } else if(i == 1) { h4h ^= DEC64E(hash.h8[0]); h4l ^= DEC64E(hash.h8[1]); h5h ^= DEC64E(hash.h8[2]); h5l ^= DEC64E(hash.h8[3]); h6h ^= DEC64E(hash.h8[4]); h6l ^= DEC64E(hash.h8[5]); h7h ^= DEC64E(hash.h8[6]); h7l ^= DEC64E(hash.h8[7]); h0h ^= 0x80; h3l ^= 0x2000000000000; } E8; } h4h ^= 0x80; h7l ^= 0x2000000000000; hash.h8[0] = DEC64E(h4h); hash.h8[1] = DEC64E(h4l); hash.h8[2] = DEC64E(h5h); hash.h8[3] = DEC64E(h5l); hash.h8[4] = DEC64E(h6h); hash.h8[5] = DEC64E(h6l); hash.h8[6] = DEC64E(h7h); hash.h8[7] = DEC64E(h7l); } // keccak { sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); a00 ^= SWAP8(hash.h8[0]); a10 ^= SWAP8(hash.h8[1]); a20 ^= SWAP8(hash.h8[2]); a30 ^= SWAP8(hash.h8[3]); a40 ^= SWAP8(hash.h8[4]); a01 ^= SWAP8(hash.h8[5]); a11 ^= SWAP8(hash.h8[6]); a21 ^= SWAP8(hash.h8[7]); a31 ^= 0x8000000000000001; KECCAK_F_1600; // Finalize the "lane complement" a10 = ~a10; a20 = ~a20; hash.h8[0] = SWAP8(a00); hash.h8[1] = SWAP8(a10); hash.h8[2] = SWAP8(a20); hash.h8[3] = SWAP8(a30); hash.h8[4] = SWAP8(a40); hash.h8[5] = SWAP8(a01); hash.h8[6] = SWAP8(a11); hash.h8[7] = SWAP8(a21); } // skein { sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; sph_u64 bcount = 0; m0 = SWAP8(hash.h8[0]); m1 = SWAP8(hash.h8[1]); m2 = SWAP8(hash.h8[2]); m3 = SWAP8(hash.h8[3]); m4 = SWAP8(hash.h8[4]); m5 = SWAP8(hash.h8[5]); m6 = SWAP8(hash.h8[6]); m7 = SWAP8(hash.h8[7]); UBI_BIG(480, 64); bcount = 0; m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; UBI_BIG(510, 8); hash.h8[0] = SWAP8(h0); hash.h8[1] = SWAP8(h1); hash.h8[2] = SWAP8(h2); hash.h8[3] = SWAP8(h3); hash.h8[4] = SWAP8(h4); hash.h8[5] = SWAP8(h5); hash.h8[6] = SWAP8(h6); hash.h8[7] = SWAP8(h7); } bool result = (SWAP8(hash.h8[3]) <= target); if (result) output[output[0xFF]++] = SWAP4(gid); } #endif // SIFCOIN_CL