diff --git a/Makefile.am b/Makefile.am index c014fe13..920ebadc 100644 --- a/Makefile.am +++ b/Makefile.am @@ -63,6 +63,7 @@ sgminer_SOURCES += algorithm/marucoin.c algorithm/marucoin.h sgminer_SOURCES += algorithm/maxcoin.c algorithm/maxcoin.h sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h +sgminer_SOURCES += algorithm/x14.c algorithm/x14.h bin_SCRIPTS = $(top_srcdir)/kernel/*.cl diff --git a/algorithm.c b/algorithm.c index d468196a..631d92bc 100644 --- a/algorithm.c +++ b/algorithm.c @@ -27,6 +27,7 @@ #include "algorithm/maxcoin.h" #include "algorithm/talkcoin.h" #include "algorithm/bitblock.h" +#include "algorithm/x14.h" #include "compat.h" @@ -40,6 +41,8 @@ const char *algorithm_type_str[] = { "NScrypt", "X11", "X13", + "X14", + "X15", "Keccak", "Quarkcoin", "Twecoin", @@ -91,11 +94,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru static void append_hamsi_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm) { char buf[255]; - sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d", - opt_hamsi_expand_big); + sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d%s ", + opt_hamsi_expand_big, ((opt_hamsi_short)?" -D SPH_HAMSI_SHORT=1 ":"")); strcat(data->compiler_options, buf); - sprintf(buf, "big%u", (unsigned int)opt_hamsi_expand_big); + sprintf(buf, "big%u%s", (unsigned int)opt_hamsi_expand_big, ((opt_hamsi_short)?"hs":"")); strcat(data->binary_filename, buf); } @@ -419,6 +422,100 @@ static cl_int queue_talkcoin_mod_kernel(struct __clState *clState, struct _dev_b return status; } +static cl_int queue_x14_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel; + unsigned int num; + cl_ulong le_target; + cl_int status = 0; + + le_target = *(cl_ulong *)(blk->work->device_target + 24); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); + + // blake - search + kernel = &clState->kernel; + num = 0; + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->padbuffer8); + // bmw - search1 + kernel = clState->extra_kernels; + CL_SET_ARG_0(clState->padbuffer8); + // groestl - search2 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // skein - search3 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // jh - search4 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // keccak - search5 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // luffa - search6 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // cubehash - search7 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // shavite - search8 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // simd - search9 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // echo - search10 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // hamsi - search11 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // fugue - search12 + CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8); + // shabal - search13 + num = 0; + CL_NEXTKERNEL_SET_ARG(clState->padbuffer8); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(le_target); + + return status; +} + +static cl_int queue_x14_old_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel; + unsigned int num; + cl_ulong le_target; + cl_int status = 0; + + le_target = *(cl_ulong *)(blk->work->device_target + 24); + flip80(clState->cldata, blk->work->data); + 
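// upload the byte-flipped 80-byte block header with a blocking write; the
+  // chained search kernels below then hand the hash state along via padbuffer8
+  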
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+  // blake - search
+  kernel = &clState->kernel;
+  num = 0;
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  // bmw - search1
+  kernel = clState->extra_kernels;
+  CL_SET_ARG_0(clState->padbuffer8);
+  // groestl - search2
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // skein - search3
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // jh - search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // keccak - search5
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // luffa - search6
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // cubehash - search7
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // shavite - search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // simd - search9
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // combined echo, hamsi, fugue, shabal - search10
+  num = 0;
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+
+  return status;
+}
+
 typedef struct _algorithm_settings_t {
   const char *name; /* Human-readable identifier */
   algorithm_type_t type; //common algorithm type
@@ -477,8 +574,13 @@ static algorithm_settings_t algos[] = {
   { "marucoin-mod", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_hamsi_compiler_options},
   { "marucoin-modold", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_hamsi_compiler_options},
-  { "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
+  { "x14", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_hamsi_compiler_options},
+  { "x14old", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_hamsi_compiler_options},
+  { "bitblock", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_hamsi_compiler_options},
+  { "bitblockold", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_hamsi_compiler_options},
+
+  { "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
 // kernels starting from this will have difficulty calculated by using fuguecoin algorithm
 #define A_FUGUE(a, b) \
   { a, ALGO_FUGUE, 1, 256, 256, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL}
diff --git a/algorithm.h b/algorithm.h
index 45549b4d..3bbf8c35 100644
--- a/algorithm.h
+++ b/algorithm.h
@@ -16,6 +16,8 @@ typedef enum {
   ALGO_NSCRYPT,
   ALGO_X11,
   ALGO_X13,
+  ALGO_X14,
+  ALGO_X15,
   ALGO_KECCAK,
   ALGO_QUARK,
   ALGO_TWE,
diff --git a/algorithm/x14.c b/algorithm/x14.c
new file mode 100644
index 00000000..7855ecdd
--- /dev/null
+++ b/algorithm/x14.c
@@ -0,0 +1,247 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+
+/* Initialize all fourteen contexts once, outside the hashing loop; x14hash()
+ * then restores them with a single memcpy of this larger block instead of
+ * re-initializing each context per nonce. */
+typedef struct {
+  sph_blake512_context blake1;
+  sph_bmw512_context bmw1;
+  sph_groestl512_context groestl1;
+  sph_skein512_context skein1;
+  sph_jh512_context jh1;
+  sph_keccak512_context keccak1;
+  sph_luffa512_context luffa1;
+  sph_cubehash512_context cubehash1;
+  sph_shavite512_context shavite1;
+  sph_simd512_context simd1;
+  sph_echo512_context echo1;
+  sph_hamsi512_context hamsi1;
+  sph_fugue512_context fugue1;
+  sph_shabal512_context shabal1;
+} Xhash_context_holder;
+
+static Xhash_context_holder base_contexts;
+
+void init_X14hash_contexts()
+{
+  sph_blake512_init(&base_contexts.blake1);
+  sph_bmw512_init(&base_contexts.bmw1);
+  sph_groestl512_init(&base_contexts.groestl1);
+  sph_skein512_init(&base_contexts.skein1);
+  sph_jh512_init(&base_contexts.jh1);
+  sph_keccak512_init(&base_contexts.keccak1);
+  sph_luffa512_init(&base_contexts.luffa1);
+  sph_cubehash512_init(&base_contexts.cubehash1);
+  sph_shavite512_init(&base_contexts.shavite1);
+  sph_simd512_init(&base_contexts.simd1);
+  sph_echo512_init(&base_contexts.echo1);
+  sph_hamsi512_init(&base_contexts.hamsi1);
+  sph_fugue512_init(&base_contexts.fugue1);
+  sph_shabal512_init(&base_contexts.shabal1);
+}
+
+/*
+ * Encode a length-len vector of (uint32_t) into big-endian form, word by
+ * word (dst[i] = htobe32(src[i])).
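+ * Here it is used on the 80-byte block header: the first 19 words (76 bytes)
+ * are encoded in one call and the nonce in word 19 is set separately, e.g.
+ * src[0] = 0x01020304 is stored as the bytes 01 02 03 04 on any host.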
+ */
+static inline void be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+  uint32_t i;
+
+  for (i = 0; i < len; i++)
+    dst[i] = htobe32(src[i]);
+}
+
+
+inline void x14hash(void *state, const void *input)
+{
+  init_X14hash_contexts();
+
+  Xhash_context_holder ctx;
+
+  uint32_t hashA[16], hashB[16];
+  // blake-bmw-groestl-skein-jh-keccak-luffa-cubehash-shavite-simd-echo-hamsi-fugue-shabal
+  memcpy(&ctx, &base_contexts, sizeof(base_contexts));
+
+  sph_blake512 (&ctx.blake1, input, 80);
+  sph_blake512_close (&ctx.blake1, hashA);
+
+  sph_bmw512 (&ctx.bmw1, hashA, 64);
+  sph_bmw512_close(&ctx.bmw1, hashB);
+
+  sph_groestl512 (&ctx.groestl1, hashB, 64);
+  sph_groestl512_close(&ctx.groestl1, hashA);
+
+  sph_skein512 (&ctx.skein1, hashA, 64);
+  sph_skein512_close(&ctx.skein1, hashB);
+
+  sph_jh512 (&ctx.jh1, hashB, 64);
+  sph_jh512_close(&ctx.jh1, hashA);
+
+  sph_keccak512 (&ctx.keccak1, hashA, 64);
+  sph_keccak512_close(&ctx.keccak1, hashB);
+
+  sph_luffa512 (&ctx.luffa1, hashB, 64);
+  sph_luffa512_close (&ctx.luffa1, hashA);
+
+  sph_cubehash512 (&ctx.cubehash1, hashA, 64);
+  sph_cubehash512_close(&ctx.cubehash1, hashB);
+
+  sph_shavite512 (&ctx.shavite1, hashB, 64);
+  sph_shavite512_close(&ctx.shavite1, hashA);
+
+  sph_simd512 (&ctx.simd1, hashA, 64);
+  sph_simd512_close(&ctx.simd1, hashB);
+
+  sph_echo512 (&ctx.echo1, hashB, 64);
+  sph_echo512_close(&ctx.echo1, hashA);
+
+  sph_hamsi512 (&ctx.hamsi1, hashA, 64);
+  sph_hamsi512_close(&ctx.hamsi1, hashB);
+
+  sph_fugue512 (&ctx.fugue1, hashB, 64);
+  sph_fugue512_close(&ctx.fugue1, hashA);
+
+  sph_shabal512 (&ctx.shabal1, (const unsigned char*)hashA, 64);
+  sph_shabal512_close(&ctx.shabal1, hashB);
+
+  memcpy(state, hashB, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+/* Used externally as confirmation of correct OCL code */
+int x14_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+  uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t data[20], ohash[8];
+
+  be32enc_vect(data, (const uint32_t *)pdata, 19);
+  data[19] = htobe32(nonce);
+  x14hash(ohash, data);
+  tmp_hash7 = be32toh(ohash[7]);
+
+  applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+    (long unsigned int)Htarg,
+    (long unsigned int)diff1targ,
+    (long unsigned int)tmp_hash7);
+
+  if (tmp_hash7 > diff1targ)
+    return -1;
+
+  if (tmp_hash7 > Htarg)
+    return 0;
+
+  return 1;
+}
+
+void x14_regenhash(struct work *work)
+{
+  uint32_t data[20];
+  uint32_t *nonce = (uint32_t *)(work->data + 76);
+  uint32_t *ohash = (uint32_t *)(work->hash);
+
+  be32enc_vect(data, (const uint32_t *)work->data, 19);
+  data[19] = htobe32(*nonce);
+  x14hash(ohash, data);
+}
+
+static inline void be32enc(void *pp, uint32_t x)
+{
+  uint8_t *p = (uint8_t *)pp;
+  p[3] = x & 0xff;
+  p[2] = (x >> 8) & 0xff;
+  p[1] = (x >> 16) & 0xff;
+  p[0] = (x >> 24) & 0xff;
+}
+
+bool scanhash_x14(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+  unsigned char *pdata, unsigned char __maybe_unused *phash1,
+  unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+  uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+  uint32_t *nonce = (uint32_t *)(pdata + 76);
+  uint32_t data[20];
+  uint32_t tmp_hash7;
+  uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  bool ret = false;
+
+  be32enc_vect(data, (const uint32_t *)pdata, 19);
+
+  while(1)
+  {
+    uint32_t ostate[8];
+    *nonce = ++n;
+    data[19] = (n);
+    x14hash(ostate, data);
+    tmp_hash7 = (ostate[7]);
+
+    applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]);
+
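+    // a hash whose word 7 is at or below the target word is a candidate share
+    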
if(unlikely(tmp_hash7 <= Htarg)) + { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) + { + *last_nonce = n; + break; + } + } + + return ret; +} + + diff --git a/algorithm/x14.h b/algorithm/x14.h new file mode 100644 index 00000000..f4e92db6 --- /dev/null +++ b/algorithm/x14.h @@ -0,0 +1,10 @@ +#ifndef X14_H +#define X14_H + +#include "miner.h" + +extern int x14_test(unsigned char *pdata, const unsigned char *ptarget, + uint32_t nonce); +extern void x14_regenhash(struct work *work); + +#endif /* X14_H */ \ No newline at end of file diff --git a/kernel/bitblock.cl b/kernel/bitblock.cl index 610152de..54c885a1 100644 --- a/kernel/bitblock.cl +++ b/kernel/bitblock.cl @@ -72,13 +72,17 @@ typedef int sph_s32; #define SPH_SIMD_NOCOPY 0 #define SPH_KECCAK_NOCOPY 0 #define SPH_COMPACT_BLAKE_64 0 -#define SPH_LUFFA_PARALLEL 1 +#define SPH_LUFFA_PARALLEL 0 #define SPH_SMALL_FOOTPRINT_GROESTL 0 #define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_CUBEHASH_UNROLL 0 #define SPH_KECCAK_UNROLL 1 +#ifndef SPH_HAMSI_EXPAND_BIG #define SPH_HAMSI_EXPAND_BIG 1 -#define SPH_HAMSI_SHORT 1 +#endif +#ifndef SPH_HAMSI_SHORT + #define SPH_HAMSI_SHORT 1 +#endif #include "blake.cl" #include "bmw.cl" @@ -126,10 +130,10 @@ typedef union { __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global unsigned char* block, __global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // blake + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + // blake sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); @@ -139,11 +143,12 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) if ((T0 = SPH_T64(T0 + 1024)) < 1024) T1 = SPH_T64(T1 + 1); - + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; + M0 = DEC64BE(block + 0); M1 = DEC64BE(block + 8); M2 = DEC64BE(block + 16); @@ -174,25 +179,25 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) hash->h8[6] = H6; hash->h8[7] = H7; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search1(__global hash_t* hashes) { - uint gid = get_global_id(0); + uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); // bmw sph_u64 BMW_H[16]; - - #pragma unroll 16 + +#pragma unroll 16 for(unsigned u = 0; u < 16; u++) BMW_H[u] = BMW_IV512[u]; sph_u64 mv[16],q[32]; - sph_u64 tmp; - + sph_u64 tmp; + mv[0] = SWAP8(hash->h8[0]); mv[1] = SWAP8(hash->h8[1]); mv[2] = SWAP8(hash->h8[2]); @@ -211,7 +216,7 @@ __kernel void search1(__global hash_t* hashes) mv[15] = SPH_C64(512); tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); - q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ 
SPH_ROTL64(tmp, 43)) + BMW_H[2]; tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); @@ -221,7 +226,7 @@ __kernel void search1(__global hash_t* hashes) tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); - q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); @@ -231,7 +236,7 @@ __kernel void search1(__global hash_t* hashes) tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); - q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); @@ -242,63 +247,63 @@ __kernel void search1(__global hash_t* hashes) q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; - - #pragma unroll 2 + +#pragma unroll 2 for(int i=0;i<2;i++) { q[i+16] = - (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + - (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + - (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + - (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + - (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + - (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + - (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + - (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + - (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + - (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + - (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + - (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + - (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + - (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + - (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + - (SHR(q[i+15], 
1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } - - #pragma unroll 4 + +#pragma unroll 4 for(int i=2;i<6;i++) { q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } - #pragma unroll 3 +#pragma unroll 3 for(int i=6;i<9;i++) { q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); } - - #pragma unroll 4 + +#pragma unroll 4 for(int i=9;i<13;i++) { q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); } - - #pragma unroll 3 + +#pragma unroll 3 for(int i=13;i<16;i++) { q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); } - + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; @@ -320,7 +325,7 @@ __kernel void search1(__global hash_t* hashes) BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ 
q[14]); BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); - #pragma unroll 16 +#pragma unroll 16 for(int i=0;i<16;i++) { mv[i] = BMW_H[i]; @@ -328,7 +333,7 @@ __kernel void search1(__global hash_t* hashes) } tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); - q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); @@ -338,7 +343,7 @@ __kernel void search1(__global hash_t* hashes) tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); - q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); @@ -348,7 +353,7 @@ __kernel void search1(__global hash_t* hashes) tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); - q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); @@ -359,65 +364,66 @@ __kernel void search1(__global hash_t* hashes) q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; - - #pragma unroll 2 + +#pragma unroll 2 for(int i=0;i<2;i++) { q[i+16] = - (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + - (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + - (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + - (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + - (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + - (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + - (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + - 
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + - (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + - (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + - (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + - (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + - (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + - (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + - (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + - (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } - - #pragma unroll 4 + +#pragma unroll 4 for(int i=2;i<6;i++) { q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } - #pragma unroll 3 +#pragma unroll 3 for(int i=6;i<9;i++) { q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); } - #pragma unroll 4 +#pragma unroll 4 for(int i=9;i<13;i++) { q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) 
- SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); } - #pragma unroll 3 +#pragma unroll 3 for(int i=13;i<16;i++) { q[i+16] = CONST_EXP2 + - (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + - SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); } - + XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; - XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); @@ -444,8 +450,8 @@ __kernel void search1(__global hash_t* hashes) hash->h8[5] = SWAP8(BMW_H[13]); hash->h8[6] = SWAP8(BMW_H[14]); hash->h8[7] = SWAP8(BMW_H[15]); - - barrier(CLK_GLOBAL_MEM_FENCE); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -454,94 +460,67 @@ __kernel void search2(__global hash_t* hashes) uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - #if !SPH_SMALL_FOOTPRINT_GROESTL - __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; - __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; - #else - __local sph_u64 T0_C[256], T4_C[256]; - #endif + __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; int init = get_local_id(0); int step = get_local_size(0); for (int i = init; i < 256; i += step) { - T0_C[i] = T0[i]; - T4_C[i] = T4[i]; - #if !SPH_SMALL_FOOTPRINT_GROESTL - T1_C[i] = T1[i]; - T2_C[i] = T2[i]; - T3_C[i] = T3[i]; - T5_C[i] = T5[i]; - T6_C[i] = T6[i]; - T7_C[i] = T7[i]; - #endif + T0_L[i] = T0[i]; + T4_L[i] = T4[i]; + T1_L[i] = T1[i]; + T2_L[i] = T2[i]; + T3_L[i] = T3[i]; + T5_L[i] = T5[i]; + T6_L[i] = T6[i]; + T7_L[i] = T7[i]; } - barrier(CLK_LOCAL_MEM_FENCE); // groestl - - #define T0 T0_C - #define T1 T1_C - #define T2 T2_C - #define T3 T3_C - #define T4 T4_C - #define T5 T5_C - #define T6 T6_C - #define T7 T7_C + barrier(CLK_LOCAL_MEM_FENCE); + + #define T0 T0_L + #define T1 T1_L + #define T2 T2_L + #define T3 T3_L + #define T4 T4_L + #define T5 T5_L + #define T6 T6_L + #define T7 T7_L // groestl - - sph_u64 H[16]; - - for (unsigned int u = 0; u < 15; u ++) - H[u] = 0; - - #if USE_LE - H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); - #else - H[15] = (sph_u64)512; - #endif + sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; sph_u64 g[16], m[16]; - m[0] = DEC64E(hash->h8[0]); - m[1] = DEC64E(hash->h8[1]); - m[2] = DEC64E(hash->h8[2]); - m[3] = DEC64E(hash->h8[3]); - m[4] = DEC64E(hash->h8[4]); - m[5] = DEC64E(hash->h8[5]); - m[6] = DEC64E(hash->h8[6]); - m[7] = DEC64E(hash->h8[7]); - - for (unsigned int u = 0; u < 16; u ++) - g[u] = m[u] ^ H[u]; - - m[8] = 0x80; g[8] = m[8] ^ H[8]; - m[9] = 0; g[9] = m[9] ^ H[9]; - m[10] = 0; g[10] = m[10] ^ H[10]; - m[11] = 0; g[11] = m[11] ^ H[11]; - m[12] = 0; g[12] = m[12] ^ H[12]; - m[13] = 0; g[13] = m[13] ^ H[13]; - m[14] = 0; g[14] = m[14] ^ H[14]; - m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; + g[0] = m[0] = DEC64E(hash->h8[0]); + g[1] = m[1] = DEC64E(hash->h8[1]); + g[2] = m[2] = DEC64E(hash->h8[2]); + g[3] = m[3] = DEC64E(hash->h8[3]); + g[4] = m[4] = DEC64E(hash->h8[4]); + g[5] = 
m[5] = DEC64E(hash->h8[5]); + g[6] = m[6] = DEC64E(hash->h8[6]); + g[7] = m[7] = DEC64E(hash->h8[7]); + g[8] = m[8] = 0x80; + g[9] = m[9] = 0; + g[10] = m[10] = 0; + g[11] = m[11] = 0; + g[12] = m[12] = 0; + g[13] = m[13] = 0; + g[14] = m[14] = 0; + g[15] = 0x102000000000000; + m[15] = 0x100000000000000; PERM_BIG_P(g); PERM_BIG_Q(m); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= g[u] ^ m[u]; - sph_u64 xH[16]; - for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u]; - + xH[u] = H[u] ^= g[u] ^ m[u]; + PERM_BIG_P(xH); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= xH[u]; - - for (unsigned int u = 0; u < 8; u ++) - hash->h8[u] = DEC64E(H[u + 8]); + for (unsigned int u = 8; u < 16; u ++) + hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); barrier(CLK_GLOBAL_MEM_FENCE); } @@ -566,10 +545,14 @@ __kernel void search3(__global hash_t* hashes) m5 = SWAP8(hash->h8[5]); m6 = SWAP8(hash->h8[6]); m7 = SWAP8(hash->h8[7]); + UBI_BIG(480, 64); + bcount = 0; m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; + UBI_BIG(510, 8); + hash->h8[0] = SWAP8(h0); hash->h8[1] = SWAP8(h1); hash->h8[2] = SWAP8(h2); @@ -579,7 +562,7 @@ __kernel void search3(__global hash_t* hashes) hash->h8[6] = SWAP8(h6); hash->h8[7] = SWAP8(h7); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -588,7 +571,7 @@ __kernel void search4(__global hash_t* hashes) uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // jh + // jh sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); @@ -617,7 +600,7 @@ __kernel void search4(__global hash_t* hashes) h6l ^= DEC64E(hash->h8[5]); h7h ^= DEC64E(hash->h8[6]); h7l ^= DEC64E(hash->h8[7]); - + h0h ^= 0x80; h3l ^= 0x2000000000000; } @@ -635,7 +618,7 @@ __kernel void search4(__global hash_t* hashes) hash->h8[6] = DEC64E(h7h); hash->h8[7] = DEC64E(h7l); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -647,7 +630,7 @@ __kernel void search5(__global hash_t* hashes) // keccak sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; - sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; + sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; @@ -669,6 +652,7 @@ __kernel void search5(__global hash_t* hashes) a21 ^= SWAP8(hash->h8[7]); a31 ^= 0x8000000000000001; KECCAK_F_1600; + // Finalize the "lane complement" a10 = ~a10; a20 = ~a20; @@ -682,7 +666,7 @@ __kernel void search5(__global hash_t* hashes) hash->h8[6] = SWAP8(a11); hash->h8[7] = SWAP8(a21); - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -745,7 +729,7 @@ __kernel void search6(__global hash_t* hashes) hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; } } - + hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; @@ 
-755,7 +739,7 @@ __kernel void search6(__global hash_t* hashes) hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46; hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -799,7 +783,7 @@ __kernel void search7(__global hash_t* hashes) x6 ^= SWAP4(hash->h4[15]); x7 ^= SWAP4(hash->h4[14]); } - else if(i == 1) + else if(i == 1) x0 ^= 0x80; else if (i == 2) xv ^= SPH_C32(1); @@ -822,7 +806,7 @@ __kernel void search7(__global hash_t* hashes) hash->h4[14] = xe; hash->h4[15] = xf; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -830,11 +814,12 @@ __kernel void search8(__global hash_t* hashes) { uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; - + int init = get_local_id(0); int step = get_local_size(0); - + for (int i = init; i < 256; i += step) { AES0[i] = AES0_C[i]; @@ -842,7 +827,7 @@ __kernel void search8(__global hash_t* hashes) AES2[i] = AES2_C[i]; AES3[i] = AES3_C[i]; } - + barrier(CLK_LOCAL_MEM_FENCE); // shavite @@ -858,7 +843,7 @@ __kernel void search8(__global hash_t* hashes) sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; rk00 = hash->h4[0]; rk01 = hash->h4[1]; @@ -901,7 +886,7 @@ __kernel void search8(__global hash_t* hashes) hash->h4[14] = hE; hash->h4[15] = hF; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -913,10 +898,8 @@ __kernel void search9(__global hash_t* hashes) // simd s32 q[256]; unsigned char x[128]; - for(unsigned int i = 0; i < 64; i++) x[i] = hash->h1[i]; - for(unsigned int i = 64; i < 128; i++) x[i] = 0; @@ -926,14 +909,15 @@ __kernel void search9(__global hash_t* hashes) u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); FFT256(0, 1, 0, ll1); - for (int i = 0; i < 256; i ++) { - s32 tq; - - tq = q[i] + yoff_b_n[i]; - tq = REDS2(tq); - tq = REDS1(tq); - tq = REDS1(tq); - q[i] = (tq <= 128 ? tq : tq - 257); + for (int i = 0; i < 256; i ++) + { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); } A0 ^= hash->h4[0]; @@ -959,21 +943,24 @@ __kernel void search9(__global hash_t* hashes) ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); STEP_BIG( - C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), - C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), - IF, 4, 13, PP8_4_); + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + IF, 4, 13, PP8_4_); + STEP_BIG( - C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), - C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), - IF, 13, 10, PP8_5_); + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + IF, 13, 10, PP8_5_); + STEP_BIG( - C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), - C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), - IF, 10, 25, PP8_6_); + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + IF, 10, 25, PP8_6_); + STEP_BIG( - C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), - C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), - IF, 25, 4, PP8_0_); + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), + IF, 25, 4, PP8_0_); u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; @@ -988,22 +975,27 @@ __kernel void search9(__global hash_t* hashes) ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + STEP_BIG( - COPY_A0, COPY_A1, COPY_A2, COPY_A3, - COPY_A4, COPY_A5, COPY_A6, COPY_A7, - IF, 4, 13, PP8_4_); + COPY_A0, COPY_A1, COPY_A2, COPY_A3, + COPY_A4, COPY_A5, COPY_A6, COPY_A7, + IF, 4, 13, PP8_4_); + STEP_BIG( - COPY_B0, COPY_B1, COPY_B2, COPY_B3, - COPY_B4, COPY_B5, COPY_B6, COPY_B7, - IF, 13, 10, PP8_5_); + COPY_B0, COPY_B1, COPY_B2, COPY_B3, + COPY_B4, COPY_B5, COPY_B6, COPY_B7, + IF, 13, 10, PP8_5_); + STEP_BIG( - COPY_C0, COPY_C1, COPY_C2, COPY_C3, - COPY_C4, COPY_C5, COPY_C6, COPY_C7, - IF, 10, 25, PP8_6_); + COPY_C0, COPY_C1, COPY_C2, COPY_C3, + COPY_C4, COPY_C5, COPY_C6, COPY_C7, + IF, 10, 25, PP8_6_); + STEP_BIG( - COPY_D0, COPY_D1, COPY_D2, COPY_D3, - COPY_D4, COPY_D5, COPY_D6, COPY_D7, - IF, 25, 4, PP8_0_); + COPY_D0, COPY_D1, COPY_D2, COPY_D3, + COPY_D4, COPY_D5, COPY_D6, COPY_D7, + IF, 25, 4, PP8_0_); + #undef q hash->h4[0] = A0; @@ -1023,7 +1015,7 @@ __kernel void search9(__global hash_t* hashes) hash->h4[14] = B6; hash->h4[15] = B7; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -1114,49 +1106,52 @@ __kernel void search10(__global hash_t* hashes) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search11(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __local sph_u32 T512_L[1024]; + __constant const sph_u32 *T512_C = &T512[0][0]; + + int init = get_local_id(0); + int step = get_local_size(0); + for (int i = init; i < 1024; i += step) + 
T512_L[i] = T512_C[i]; - __local sph_u32 T512_L[1024]; - __constant const sph_u32 *T512_C = &T512[0][0]; - int init = get_local_id(0); - int step = get_local_size(0); - for (int i = init; i < 1024; i += step) - { - T512_L[i] = T512_C[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); - { - sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; - sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; - sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; - sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; - sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; - sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; - sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; - -#define buf(u) hash->h1[i + u] - for(int i = 0; i < 64; i += 8) { - INPUT_BIG_LOCAL; - P_BIG; - T_BIG; - } -#undef buf -#define buf(u) (u == 0 ? 0x80 : 0) - INPUT_BIG_LOCAL; - P_BIG; - T_BIG; -#undef buf -#define buf(u) (u == 6 ? 2 : 0) - INPUT_BIG_LOCAL; - PF_BIG; - T_BIG; - - for (unsigned u = 0; u < 16; u ++) - hash->h4[u] = h[u]; - } - barrier(CLK_GLOBAL_MEM_FENCE); + sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; + sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; + sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; + sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + + #define buf(u) hash->h1[i + u] + + for(int i = 0; i < 64; i += 8) { + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + } + + #undef buf + #define buf(u) (u == 0 ? 0x80 : 0) + + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + + #undef buf + #define buf(u) (u == 6 ? 
2 : 0) + + INPUT_BIG_LOCAL; + PF_BIG; + T_BIG; + + for (unsigned u = 0; u < 16; u ++) + hash->h4[u] = h[u]; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -1346,115 +1341,115 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo uint offset = get_global_offset(0); __global hash_t *hash = &(hashes[gid-offset]); - __local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256]; + __local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256]; - int init = get_local_id(0); - int step = get_local_size(0); + int init = get_local_id(0); + int step = get_local_size(0); - for (int i = init; i < 256; i += step) - { - LT0[i] = plain_T0[i]; - LT1[i] = plain_T1[i]; - LT2[i] = plain_T2[i]; - LT3[i] = plain_T3[i]; - LT4[i] = plain_T4[i]; - LT5[i] = plain_T5[i]; - LT6[i] = plain_T6[i]; - LT7[i] = plain_T7[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); + for (int i = init; i < 256; i += step) + { + LT0[i] = plain_T0[i]; + LT1[i] = plain_T1[i]; + LT2[i] = plain_T2[i]; + LT3[i] = plain_T3[i]; + LT4[i] = plain_T4[i]; + LT5[i] = plain_T5[i]; + LT6[i] = plain_T6[i]; + LT7[i] = plain_T7[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); // whirlpool - sph_u64 n0, n1, n2, n3, n4, n5, n6, n7; - sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; - sph_u64 state[8]; - - n0 = (hash->h8[0]); - n1 = (hash->h8[1]); - n2 = (hash->h8[2]); - n3 = (hash->h8[3]); - n4 = (hash->h8[4]); - n5 = (hash->h8[5]); - n6 = (hash->h8[6]); - n7 = (hash->h8[7]); - - h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0; - - n0 ^= h0; - n1 ^= h1; - n2 ^= h2; - n3 ^= h3; - n4 ^= h4; - n5 ^= h5; - n6 ^= h6; - n7 ^= h7; - - for (unsigned r = 0; r < 10; r ++) { - sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]); - TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); - TRANSFER(n, tmp); - } - - state[0] = n0 ^ (hash->h8[0]); - state[1] = n1 ^ (hash->h8[1]); - state[2] = n2 ^ (hash->h8[2]); - state[3] = n3 ^ (hash->h8[3]); - state[4] = n4 ^ (hash->h8[4]); - state[5] = n5 ^ (hash->h8[5]); - state[6] = n6 ^ (hash->h8[6]); - state[7] = n7 ^ (hash->h8[7]); - - n0 = 0x80; - n1 = n2 = n3 = n4 = n5 = n6 = 0; - n7 = 0x2000000000000; - - h0 = state[0]; - h1 = state[1]; - h2 = state[2]; - h3 = state[3]; - h4 = state[4]; - h5 = state[5]; - h6 = state[6]; - h7 = state[7]; - - n0 ^= h0; - n1 ^= h1; - n2 ^= h2; - n3 ^= h3; - n4 ^= h4; - n5 ^= h5; - n6 ^= h6; - n7 ^= h7; - - for (unsigned r = 0; r < 10; r ++) { - sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - ROUND_KSCHED(LT, h, tmp, plain_RC[r]); - TRANSFER(h, tmp); - ROUND_WENC(plain_T, n, h, tmp); - TRANSFER(n, tmp); - } + sph_u64 n0, n1, n2, n3, n4, n5, n6, n7; + sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; + sph_u64 state[8]; + + n0 = (hash->h8[0]); + n1 = (hash->h8[1]); + n2 = (hash->h8[2]); + n3 = (hash->h8[3]); + n4 = (hash->h8[4]); + n5 = (hash->h8[5]); + n6 = (hash->h8[6]); + n7 = (hash->h8[7]); + + h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0; + + n0 ^= h0; + n1 ^= h1; + n2 ^= h2; + n3 ^= h3; + n4 ^= h4; + n5 ^= h5; + n6 ^= h6; + n7 ^= h7; + + #pragma unroll 10 + for (unsigned r = 0; r < 10; r ++) + { + sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - state[0] ^= n0 ^ 0x80; - state[1] ^= n1; - state[2] ^= n2; - state[3] ^= n3; - state[4] ^= n4; - state[5] ^= n5; - state[6] ^= n6; - state[7] ^= n7 ^ 0x2000000000000; + ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]); + TRANSFER(h, tmp); + ROUND_WENC(plain_T, n, h, tmp); + TRANSFER(n, tmp); 
+ } - for (unsigned i = 0; i < 8; i ++) - hash->h8[i] = state[i]; + state[0] = n0 ^ (hash->h8[0]); + state[1] = n1 ^ (hash->h8[1]); + state[2] = n2 ^ (hash->h8[2]); + state[3] = n3 ^ (hash->h8[3]); + state[4] = n4 ^ (hash->h8[4]); + state[5] = n5 ^ (hash->h8[5]); + state[6] = n6 ^ (hash->h8[6]); + state[7] = n7 ^ (hash->h8[7]); + + n0 = 0x80; + n1 = n2 = n3 = n4 = n5 = n6 = 0; + n7 = 0x2000000000000; + + h0 = state[0]; + h1 = state[1]; + h2 = state[2]; + h3 = state[3]; + h4 = state[4]; + h5 = state[5]; + h6 = state[6]; + h7 = state[7]; + + n0 ^= h0; + n1 ^= h1; + n2 ^= h2; + n3 ^= h3; + n4 ^= h4; + n5 ^= h5; + n6 ^= h6; + n7 ^= h7; + + #pragma unroll 10 + for (unsigned r = 0; r < 10; r ++) + { + sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; -// for(uint i = 0; i < 8; i++) -// output[(NUMHASH * 9) * 15 + gid * 9 + i] = hash->h8[i]; + ROUND_KSCHED(LT, h, tmp, plain_RC[r]); + TRANSFER(h, tmp); + ROUND_WENC(plain_T, n, h, tmp); + TRANSFER(n, tmp); + } -// output[(NUMHASH * 9) * 15 + gid * 9 + 8] = nonce; + state[0] ^= n0 ^ 0x80; + state[1] ^= n1; + state[2] ^= n2; + state[3] ^= n3; + state[4] ^= n4; + state[5] ^= n5; + state[6] ^= n6; + state[7] ^= n7 ^ 0x2000000000000; + + for (unsigned i = 0; i < 8; i ++) + hash->h8[i] = state[i]; bool result = (hash->h8[3] <= target); if (result) diff --git a/kernel/bitblockold.cl b/kernel/bitblockold.cl index 6304bd63..9f584391 100644 --- a/kernel/bitblockold.cl +++ b/kernel/bitblockold.cl @@ -70,7 +70,7 @@ typedef long sph_s64; #define SPH_SIMD_NOCOPY 0 #define SPH_KECCAK_NOCOPY 0 #define SPH_COMPACT_BLAKE_64 0 -#define SPH_LUFFA_PARALLEL 1 +#define SPH_LUFFA_PARALLEL 0 #define SPH_SMALL_FOOTPRINT_GROESTL 0 #define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_CUBEHASH_UNROLL 0 @@ -115,1038 +115,1285 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) { uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // blake - sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); - sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); - sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); - sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179); - sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; - sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; - - if ((T0 = SPH_T64(T0 + 1024)) < 1024) - { - T1 = SPH_T64(T1 + 1); - } - sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; - sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; - sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; - sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; - M0 = DEC64BE(block + 0); - M1 = DEC64BE(block + 8); - M2 = DEC64BE(block + 16); - M3 = DEC64BE(block + 24); - M4 = DEC64BE(block + 32); - M5 = DEC64BE(block + 40); - M6 = DEC64BE(block + 48); - M7 = DEC64BE(block + 56); - M8 = DEC64BE(block + 64); - M9 = DEC64BE(block + 72); - M9 &= 0xFFFFFFFF00000000; - M9 ^= SWAP4(gid); - MA = 0x8000000000000000; - MB = 0; - MC = 0; - MD = 1; - ME = 0; - MF = 0x280; - - COMPRESS64; - - hash->h8[0] = H0; - hash->h8[1] = H1; - hash->h8[2] = H2; - hash->h8[3] = H3; - hash->h8[4] = H4; - hash->h8[5] = H5; - hash->h8[6] = H6; - hash->h8[7] = H7; - - barrier(CLK_GLOBAL_MEM_FENCE); + // blake + sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); + sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); + sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); + sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = 
SPH_C64(0x5BE0CD19137E2179); + sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; + sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; + + if ((T0 = SPH_T64(T0 + 1024)) < 1024) + T1 = SPH_T64(T1 + 1); + + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; + + M0 = DEC64BE(block + 0); + M1 = DEC64BE(block + 8); + M2 = DEC64BE(block + 16); + M3 = DEC64BE(block + 24); + M4 = DEC64BE(block + 32); + M5 = DEC64BE(block + 40); + M6 = DEC64BE(block + 48); + M7 = DEC64BE(block + 56); + M8 = DEC64BE(block + 64); + M9 = DEC64BE(block + 72); + M9 &= 0xFFFFFFFF00000000; + M9 ^= SWAP4(gid); + MA = 0x8000000000000000; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 0x280; + + COMPRESS64; + + hash->h8[0] = H0; + hash->h8[1] = H1; + hash->h8[2] = H2; + hash->h8[3] = H3; + hash->h8[4] = H4; + hash->h8[5] = H5; + hash->h8[6] = H6; + hash->h8[7] = H7; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search1(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // bmw - sph_u64 BMW_H[16]; - for(unsigned u = 0; u < 16; u++) - BMW_H[u] = BMW_IV512[u]; - - sph_u64 BMW_h1[16], BMW_h2[16]; - sph_u64 mv[16]; - - mv[ 0] = SWAP8(hash->h8[0]); - mv[ 1] = SWAP8(hash->h8[1]); - mv[ 2] = SWAP8(hash->h8[2]); - mv[ 3] = SWAP8(hash->h8[3]); - mv[ 4] = SWAP8(hash->h8[4]); - mv[ 5] = SWAP8(hash->h8[5]); - mv[ 6] = SWAP8(hash->h8[6]); - mv[ 7] = SWAP8(hash->h8[7]); - mv[ 8] = 0x80; - mv[ 9] = 0; - mv[10] = 0; - mv[11] = 0; - mv[12] = 0; - mv[13] = 0; - mv[14] = 0; - mv[15] = 0x200; -#define M(x) (mv[x]) -#define H(x) (BMW_H[x]) -#define dH(x) (BMW_h2[x]) - - FOLDb; - -#undef M -#undef H -#undef dH - -#define M(x) (BMW_h2[x]) -#define H(x) (final_b[x]) -#define dH(x) (BMW_h1[x]) - - FOLDb; - -#undef M -#undef H -#undef dH - - hash->h8[0] = SWAP8(BMW_h1[8]); - hash->h8[1] = SWAP8(BMW_h1[9]); - hash->h8[2] = SWAP8(BMW_h1[10]); - hash->h8[3] = SWAP8(BMW_h1[11]); - hash->h8[4] = SWAP8(BMW_h1[12]); - hash->h8[5] = SWAP8(BMW_h1[13]); - hash->h8[6] = SWAP8(BMW_h1[14]); - hash->h8[7] = SWAP8(BMW_h1[15]); - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // bmw + sph_u64 BMW_H[16]; + +#pragma unroll 16 + for(unsigned u = 0; u < 16; u++) + BMW_H[u] = BMW_IV512[u]; + + sph_u64 mv[16],q[32]; + sph_u64 tmp; + + mv[0] = SWAP8(hash->h8[0]); + mv[1] = SWAP8(hash->h8[1]); + mv[2] = SWAP8(hash->h8[2]); + mv[3] = SWAP8(hash->h8[3]); + mv[4] = SWAP8(hash->h8[4]); + mv[5] = SWAP8(hash->h8[5]); + mv[6] = SWAP8(hash->h8[6]); + mv[7] = SWAP8(hash->h8[7]); + mv[8] = 0x80; + mv[9] = 0; + mv[10] = 0; + mv[11] = 0; + mv[12] = 0; + mv[13] = 0; + mv[14] = 0; + mv[15] = SPH_C64(512); + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ 
BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ 
SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) + { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; + } + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ 
SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) 
+ + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + hash->h8[0] = SWAP8(BMW_H[8]); + hash->h8[1] = SWAP8(BMW_H[9]); + hash->h8[2] = SWAP8(BMW_H[10]); + hash->h8[3] = SWAP8(BMW_H[11]); + hash->h8[4] = SWAP8(BMW_H[12]); + hash->h8[5] = SWAP8(BMW_H[13]); + hash->h8[6] = SWAP8(BMW_H[14]); + hash->h8[7] = SWAP8(BMW_H[15]); + + 
barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search2(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - - int init = get_local_id(0); - int step = get_local_size(0); - - for (int i = init; i < 256; i += step) - { - T0_L[i] = T0[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T4_L[i] = T4[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - -#define T0 T0_L -#define T1 T1_L -#define T2 T2_L -#define T3 T3_L -#define T4 T4_L -#define T5 T5_L -#define T6 T6_L -#define T7 T7_L - - // groestl - - sph_u64 H[16]; - for (unsigned int u = 0; u < 15; u ++) - H[u] = 0; -#if USE_LE - H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); -#else - H[15] = (sph_u64)512; -#endif - - sph_u64 g[16], m[16]; - m[0] = DEC64E(hash->h8[0]); - m[1] = DEC64E(hash->h8[1]); - m[2] = DEC64E(hash->h8[2]); - m[3] = DEC64E(hash->h8[3]); - m[4] = DEC64E(hash->h8[4]); - m[5] = DEC64E(hash->h8[5]); - m[6] = DEC64E(hash->h8[6]); - m[7] = DEC64E(hash->h8[7]); - for (unsigned int u = 0; u < 16; u ++) - g[u] = m[u] ^ H[u]; - m[8] = 0x80; g[8] = m[8] ^ H[8]; - m[9] = 0; g[9] = m[9] ^ H[9]; - m[10] = 0; g[10] = m[10] ^ H[10]; - m[11] = 0; g[11] = m[11] ^ H[11]; - m[12] = 0; g[12] = m[12] ^ H[12]; - m[13] = 0; g[13] = m[13] ^ H[13]; - m[14] = 0; g[14] = m[14] ^ H[14]; - m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; - PERM_BIG_P(g); - PERM_BIG_Q(m); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= g[u] ^ m[u]; - sph_u64 xH[16]; - for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u]; - PERM_BIG_P(xH); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= xH[u]; - for (unsigned int u = 0; u < 8; u ++) - hash->h8[u] = DEC64E(H[u + 8]); - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + T0_L[i] = T0[i]; + T4_L[i] = T4[i]; + T1_L[i] = T1[i]; + T2_L[i] = T2[i]; + T3_L[i] = T3[i]; + T5_L[i] = T5[i]; + T6_L[i] = T6[i]; + T7_L[i] = T7[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + #define T0 T0_L + #define T1 T1_L + #define T2 T2_L + #define T3 T3_L + #define T4 T4_L + #define T5 T5_L + #define T6 T6_L + #define T7 T7_L + + // groestl + sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + + sph_u64 g[16], m[16]; + g[0] = m[0] = DEC64E(hash->h8[0]); + g[1] = m[1] = DEC64E(hash->h8[1]); + g[2] = m[2] = DEC64E(hash->h8[2]); + g[3] = m[3] = DEC64E(hash->h8[3]); + g[4] = m[4] = DEC64E(hash->h8[4]); + g[5] = m[5] = DEC64E(hash->h8[5]); + g[6] = m[6] = DEC64E(hash->h8[6]); + g[7] = m[7] = DEC64E(hash->h8[7]); + g[8] = m[8] = 0x80; + g[9] = m[9] = 0; + g[10] = m[10] = 0; + g[11] = m[11] = 0; + g[12] = m[12] = 0; + g[13] = m[13] = 0; + g[14] = m[14] = 0; + g[15] = 0x102000000000000; + m[15] = 0x100000000000000; + + PERM_BIG_P(g); + PERM_BIG_Q(m); + + sph_u64 xH[16]; + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u] ^= g[u] ^ m[u]; + + PERM_BIG_P(xH); + + for (unsigned int u = 8; u < 16; u ++) + hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); + + barrier(CLK_GLOBAL_MEM_FENCE); } 
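The rewritten search2 folds Groestl's output transformation into the permutation results it already has in hand: the H ^= g ^ m update also seeds the copy that goes through the final P permutation, and only the upper eight state words are decoded into the hash. A minimal host-side C sketch of that folding, assuming g and m already hold the PERM_BIG_P and PERM_BIG_Q results; perm_p here is an illustrative placeholder, not the kernel's real permutation:

    #include <stdint.h>

    /* Placeholder for the kernel's PERM_BIG_P; any in-place permutation
       of the 16-word state is enough to illustrate the data flow. */
    static void perm_p(uint64_t x[16])
    {
        for (int u = 0; u < 16; u++)
            x[u] = (x[u] << 1) | (x[u] >> 63);
    }

    /* Groestl-512 finalization as folded in the new kernel:
       H' = H ^ g ^ m, digest = upper half of H' ^ P(H'). */
    static void groestl_finalize(uint64_t H[16], const uint64_t g[16],
                                 const uint64_t m[16], uint64_t out[8])
    {
        uint64_t xH[16];
        for (int u = 0; u < 16; u++)
            xH[u] = H[u] ^= g[u] ^ m[u]; /* update H and seed xH in one pass */
        perm_p(xH);
        for (int u = 8; u < 16; u++)     /* words 8..15 become the digest */
            out[u - 8] = H[u] ^ xH[u];
    }

Word for word this produces the same output as the old search2; only the xH temporary and the final XOR loop over all 16 words are merged away.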
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search3(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // skein - - sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); - sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; - sph_u64 bcount = 0; - - m0 = SWAP8(hash->h8[0]); - m1 = SWAP8(hash->h8[1]); - m2 = SWAP8(hash->h8[2]); - m3 = SWAP8(hash->h8[3]); - m4 = SWAP8(hash->h8[4]); - m5 = SWAP8(hash->h8[5]); - m6 = SWAP8(hash->h8[6]); - m7 = SWAP8(hash->h8[7]); - UBI_BIG(480, 64); - bcount = 0; - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; - UBI_BIG(510, 8); - hash->h8[0] = SWAP8(h0); - hash->h8[1] = SWAP8(h1); - hash->h8[2] = SWAP8(h2); - hash->h8[3] = SWAP8(h3); - hash->h8[4] = SWAP8(h4); - hash->h8[5] = SWAP8(h5); - hash->h8[6] = SWAP8(h6); - hash->h8[7] = SWAP8(h7); - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // skein + + sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); + sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u64 bcount = 0; + + m0 = SWAP8(hash->h8[0]); + m1 = SWAP8(hash->h8[1]); + m2 = SWAP8(hash->h8[2]); + m3 = SWAP8(hash->h8[3]); + m4 = SWAP8(hash->h8[4]); + m5 = SWAP8(hash->h8[5]); + m6 = SWAP8(hash->h8[6]); + m7 = SWAP8(hash->h8[7]); + + UBI_BIG(480, 64); + + bcount = 0; + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; + + UBI_BIG(510, 8); + + hash->h8[0] = SWAP8(h0); + hash->h8[1] = SWAP8(h1); + hash->h8[2] = SWAP8(h2); + hash->h8[3] = SWAP8(h3); + hash->h8[4] = SWAP8(h4); + hash->h8[5] = SWAP8(h5); + hash->h8[6] = SWAP8(h6); + hash->h8[7] = SWAP8(h7); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search4(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // jh + // jh - sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); - sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); - sph_u64 tmp; + sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); + sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); + sph_u64 
tmp; - for(int i = 0; i < 2; i++) + for(int i = 0; i < 2; i++) + { + if (i == 0) + { + h0h ^= DEC64E(hash->h8[0]); + h0l ^= DEC64E(hash->h8[1]); + h1h ^= DEC64E(hash->h8[2]); + h1l ^= DEC64E(hash->h8[3]); + h2h ^= DEC64E(hash->h8[4]); + h2l ^= DEC64E(hash->h8[5]); + h3h ^= DEC64E(hash->h8[6]); + h3l ^= DEC64E(hash->h8[7]); + } + else if(i == 1) { - if (i == 0) { - h0h ^= DEC64E(hash->h8[0]); - h0l ^= DEC64E(hash->h8[1]); - h1h ^= DEC64E(hash->h8[2]); - h1l ^= DEC64E(hash->h8[3]); - h2h ^= DEC64E(hash->h8[4]); - h2l ^= DEC64E(hash->h8[5]); - h3h ^= DEC64E(hash->h8[6]); - h3l ^= DEC64E(hash->h8[7]); - } else if(i == 1) { - h4h ^= DEC64E(hash->h8[0]); - h4l ^= DEC64E(hash->h8[1]); - h5h ^= DEC64E(hash->h8[2]); - h5l ^= DEC64E(hash->h8[3]); - h6h ^= DEC64E(hash->h8[4]); - h6l ^= DEC64E(hash->h8[5]); - h7h ^= DEC64E(hash->h8[6]); - h7l ^= DEC64E(hash->h8[7]); - - h0h ^= 0x80; - h3l ^= 0x2000000000000; - } - E8; + h4h ^= DEC64E(hash->h8[0]); + h4l ^= DEC64E(hash->h8[1]); + h5h ^= DEC64E(hash->h8[2]); + h5l ^= DEC64E(hash->h8[3]); + h6h ^= DEC64E(hash->h8[4]); + h6l ^= DEC64E(hash->h8[5]); + h7h ^= DEC64E(hash->h8[6]); + h7l ^= DEC64E(hash->h8[7]); + + h0h ^= 0x80; + h3l ^= 0x2000000000000; } - h4h ^= 0x80; - h7l ^= 0x2000000000000; - - hash->h8[0] = DEC64E(h4h); - hash->h8[1] = DEC64E(h4l); - hash->h8[2] = DEC64E(h5h); - hash->h8[3] = DEC64E(h5l); - hash->h8[4] = DEC64E(h6h); - hash->h8[5] = DEC64E(h6l); - hash->h8[6] = DEC64E(h7h); - hash->h8[7] = DEC64E(h7l); - - barrier(CLK_GLOBAL_MEM_FENCE); + E8; + } + h4h ^= 0x80; + h7l ^= 0x2000000000000; + + hash->h8[0] = DEC64E(h4h); + hash->h8[1] = DEC64E(h4l); + hash->h8[2] = DEC64E(h5h); + hash->h8[3] = DEC64E(h5l); + hash->h8[4] = DEC64E(h6h); + hash->h8[5] = DEC64E(h6l); + hash->h8[6] = DEC64E(h7h); + hash->h8[7] = DEC64E(h7l); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search5(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // keccak - - sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; - sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; - sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; - sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; - sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; - - a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); - - a00 ^= SWAP8(hash->h8[0]); - a10 ^= SWAP8(hash->h8[1]); - a20 ^= SWAP8(hash->h8[2]); - a30 ^= SWAP8(hash->h8[3]); - a40 ^= SWAP8(hash->h8[4]); - a01 ^= SWAP8(hash->h8[5]); - a11 ^= SWAP8(hash->h8[6]); - a21 ^= SWAP8(hash->h8[7]); - a31 ^= 0x8000000000000001; - KECCAK_F_1600; - // Finalize the "lane complement" - a10 = ~a10; - a20 = ~a20; - - hash->h8[0] = SWAP8(a00); - hash->h8[1] = SWAP8(a10); - hash->h8[2] = SWAP8(a20); - hash->h8[3] = SWAP8(a30); - hash->h8[4] = SWAP8(a40); - hash->h8[5] = SWAP8(a01); - hash->h8[6] = SWAP8(a11); - hash->h8[7] = SWAP8(a21); - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // keccak + + sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; + sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; + sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; + sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; + sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; + + a10 = 
SPH_C64(0xFFFFFFFFFFFFFFFF); + a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); + + a00 ^= SWAP8(hash->h8[0]); + a10 ^= SWAP8(hash->h8[1]); + a20 ^= SWAP8(hash->h8[2]); + a30 ^= SWAP8(hash->h8[3]); + a40 ^= SWAP8(hash->h8[4]); + a01 ^= SWAP8(hash->h8[5]); + a11 ^= SWAP8(hash->h8[6]); + a21 ^= SWAP8(hash->h8[7]); + a31 ^= 0x8000000000000001; + KECCAK_F_1600; + + // Finalize the "lane complement" + a10 = ~a10; + a20 = ~a20; + + hash->h8[0] = SWAP8(a00); + hash->h8[1] = SWAP8(a10); + hash->h8[2] = SWAP8(a20); + hash->h8[3] = SWAP8(a30); + hash->h8[4] = SWAP8(a40); + hash->h8[5] = SWAP8(a01); + hash->h8[6] = SWAP8(a11); + hash->h8[7] = SWAP8(a21); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search6(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // luffa - - sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465), V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb); - sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3), V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581); - sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05), V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7); - sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67), V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce); - sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363), V44 = SPH_C32(0xf5df3999), V45 = SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea); - - DECL_TMP8(M); - - M0 = hash->h4[1]; - M1 = hash->h4[0]; - M2 = hash->h4[3]; - M3 = hash->h4[2]; - M4 = hash->h4[5]; - M5 = hash->h4[4]; - M6 = hash->h4[7]; - M7 = hash->h4[6]; - - for(uint i = 0; i < 5; i++) + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // luffa + + sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465), V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb); + sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3), V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581); + sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05), V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7); + sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67), V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce); + sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363), V44 = SPH_C32(0xf5df3999), V45 = 
SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea); + + DECL_TMP8(M); + + M0 = hash->h4[1]; + M1 = hash->h4[0]; + M2 = hash->h4[3]; + M3 = hash->h4[2]; + M4 = hash->h4[5]; + M5 = hash->h4[4]; + M6 = hash->h4[7]; + M7 = hash->h4[6]; + + for(uint i = 0; i < 5; i++) + { + MI5; + LUFFA_P5; + + if(i == 0) + { + M0 = hash->h4[9]; + M1 = hash->h4[8]; + M2 = hash->h4[11]; + M3 = hash->h4[10]; + M4 = hash->h4[13]; + M5 = hash->h4[12]; + M6 = hash->h4[15]; + M7 = hash->h4[14]; + } + else if(i == 1) { - MI5; - LUFFA_P5; - - if(i == 0) { - M0 = hash->h4[9]; - M1 = hash->h4[8]; - M2 = hash->h4[11]; - M3 = hash->h4[10]; - M4 = hash->h4[13]; - M5 = hash->h4[12]; - M6 = hash->h4[15]; - M7 = hash->h4[14]; - } else if(i == 1) { - M0 = 0x80000000; - M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; - } else if(i == 2) { - M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; - } else if(i == 3) { - hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; - hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; - hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; - hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43; - hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44; - hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45; - hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46; - hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; - } + M0 = 0x80000000; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + } + else if(i == 2) + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + else if(i == 3) + { + hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; } - - hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; - hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; - hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; - hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43; - hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44; - hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45; - hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46; - hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; - - barrier(CLK_GLOBAL_MEM_FENCE); + } + + hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search7(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // cubehash.h1 - - sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E); - sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695); - sph_u32 x8 = SPH_C32(0x4D42C787), x9 = SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537); - sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE); - sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = SPH_C32(0xB6444532); - sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9); - sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = 
SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576); - sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44); - - x0 ^= SWAP4(hash->h4[1]); - x1 ^= SWAP4(hash->h4[0]); - x2 ^= SWAP4(hash->h4[3]); - x3 ^= SWAP4(hash->h4[2]); - x4 ^= SWAP4(hash->h4[5]); - x5 ^= SWAP4(hash->h4[4]); - x6 ^= SWAP4(hash->h4[7]); - x7 ^= SWAP4(hash->h4[6]); - - for (int i = 0; i < 13; i ++) { - SIXTEEN_ROUNDS; - - if (i == 0) { - x0 ^= SWAP4(hash->h4[9]); - x1 ^= SWAP4(hash->h4[8]); - x2 ^= SWAP4(hash->h4[11]); - x3 ^= SWAP4(hash->h4[10]); - x4 ^= SWAP4(hash->h4[13]); - x5 ^= SWAP4(hash->h4[12]); - x6 ^= SWAP4(hash->h4[15]); - x7 ^= SWAP4(hash->h4[14]); - } else if(i == 1) { - x0 ^= 0x80; - } else if (i == 2) { - xv ^= SPH_C32(1); - } - } - - hash->h4[0] = x0; - hash->h4[1] = x1; - hash->h4[2] = x2; - hash->h4[3] = x3; - hash->h4[4] = x4; - hash->h4[5] = x5; - hash->h4[6] = x6; - hash->h4[7] = x7; - hash->h4[8] = x8; - hash->h4[9] = x9; - hash->h4[10] = xa; - hash->h4[11] = xb; - hash->h4[12] = xc; - hash->h4[13] = xd; - hash->h4[14] = xe; - hash->h4[15] = xf; - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // cubehash.h1 + + sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E); + sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695); + sph_u32 x8 = SPH_C32(0x4D42C787), x9 = SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537); + sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE); + sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = SPH_C32(0xB6444532); + sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9); + sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576); + sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44); + + x0 ^= SWAP4(hash->h4[1]); + x1 ^= SWAP4(hash->h4[0]); + x2 ^= SWAP4(hash->h4[3]); + x3 ^= SWAP4(hash->h4[2]); + x4 ^= SWAP4(hash->h4[5]); + x5 ^= SWAP4(hash->h4[4]); + x6 ^= SWAP4(hash->h4[7]); + x7 ^= SWAP4(hash->h4[6]); + + for (int i = 0; i < 13; i ++) + { + SIXTEEN_ROUNDS; + + if (i == 0) + { + x0 ^= SWAP4(hash->h4[9]); + x1 ^= SWAP4(hash->h4[8]); + x2 ^= SWAP4(hash->h4[11]); + x3 ^= SWAP4(hash->h4[10]); + x4 ^= SWAP4(hash->h4[13]); + x5 ^= SWAP4(hash->h4[12]); + x6 ^= SWAP4(hash->h4[15]); + x7 ^= SWAP4(hash->h4[14]); + } + else if(i == 1) + x0 ^= 0x80; + else if (i == 2) + xv ^= SPH_C32(1); + } + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + hash->h4[8] = x8; + hash->h4[9] = x9; + hash->h4[10] = xa; + hash->h4[11] = xb; + hash->h4[12] = xc; + hash->h4[13] = xd; + hash->h4[14] = xe; + hash->h4[15] = xf; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search8(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; - - int init = get_local_id(0); - int step = get_local_size(0); - - for (int i = init; i < 256; i += step) - { - AES0[i] = AES0_C[i]; - AES1[i] = 
AES1_C[i]; - AES2[i] = AES2_C[i]; - AES3[i] = AES3_C[i]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // shavite - // IV - sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC); - sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC); - sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47); - sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A); - - // state - sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; - sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; - sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; - sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - - sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; - - rk00 = hash->h4[0]; - rk01 = hash->h4[1]; - rk02 = hash->h4[2]; - rk03 = hash->h4[3]; - rk04 = hash->h4[4]; - rk05 = hash->h4[5]; - rk06 = hash->h4[6]; - rk07 = hash->h4[7]; - rk08 = hash->h4[8]; - rk09 = hash->h4[9]; - rk0A = hash->h4[10]; - rk0B = hash->h4[11]; - rk0C = hash->h4[12]; - rk0D = hash->h4[13]; - rk0E = hash->h4[14]; - rk0F = hash->h4[15]; - rk10 = 0x80; - rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0; - rk1B = 0x2000000; - rk1C = rk1D = rk1E = 0; - rk1F = 0x2000000; - - c512(buf); - - hash->h4[0] = h0; - hash->h4[1] = h1; - hash->h4[2] = h2; - hash->h4[3] = h3; - hash->h4[4] = h4; - hash->h4[5] = h5; - hash->h4[6] = h6; - hash->h4[7] = h7; - hash->h4[8] = h8; - hash->h4[9] = h9; - hash->h4[10] = hA; - hash->h4[11] = hB; - hash->h4[12] = hC; - hash->h4[13] = hD; - hash->h4[14] = hE; - hash->h4[15] = hF; - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // shavite + // IV + sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC); + sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC); + sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47); + sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A); + + // state + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + + sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + + rk00 = hash->h4[0]; + rk01 = hash->h4[1]; + rk02 = hash->h4[2]; + rk03 = hash->h4[3]; + rk04 = hash->h4[4]; + rk05 = hash->h4[5]; + rk06 = hash->h4[6]; + rk07 = hash->h4[7]; + rk08 = hash->h4[8]; + rk09 = hash->h4[9]; + rk0A = hash->h4[10]; + rk0B = hash->h4[11]; + rk0C = hash->h4[12]; + rk0D = hash->h4[13]; + rk0E = hash->h4[14]; + rk0F = hash->h4[15]; + rk10 = 0x80; + rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0; + rk1B = 0x2000000; + rk1C = rk1D = rk1E = 0; + rk1F = 0x2000000; + + c512(buf); + + hash->h4[0] = h0; + 
hash->h4[1] = h1; + hash->h4[2] = h2; + hash->h4[3] = h3; + hash->h4[4] = h4; + hash->h4[5] = h5; + hash->h4[6] = h6; + hash->h4[7] = h7; + hash->h4[8] = h8; + hash->h4[9] = h9; + hash->h4[10] = hA; + hash->h4[11] = hB; + hash->h4[12] = hC; + hash->h4[13] = hD; + hash->h4[14] = hE; + hash->h4[15] = hF; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search9(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // simd - s32 q[256]; - unsigned char x[128]; - for(unsigned int i = 0; i < 64; i++) - x[i] = hash->h1[i]; - for(unsigned int i = 64; i < 128; i++) - x[i] = 0; - - u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); - u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); - u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); - u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); - - FFT256(0, 1, 0, ll1); - for (int i = 0; i < 256; i ++) { - s32 tq; - - tq = q[i] + yoff_b_n[i]; - tq = REDS2(tq); - tq = REDS1(tq); - tq = REDS1(tq); - q[i] = (tq <= 128 ? tq : tq - 257); - } - - A0 ^= hash->h4[0]; - A1 ^= hash->h4[1]; - A2 ^= hash->h4[2]; - A3 ^= hash->h4[3]; - A4 ^= hash->h4[4]; - A5 ^= hash->h4[5]; - A6 ^= hash->h4[6]; - A7 ^= hash->h4[7]; - B0 ^= hash->h4[8]; - B1 ^= hash->h4[9]; - B2 ^= hash->h4[10]; - B3 ^= hash->h4[11]; - B4 ^= hash->h4[12]; - B5 ^= hash->h4[13]; - B6 ^= hash->h4[14]; - B7 ^= hash->h4[15]; - - ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); - ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); - ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); - ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); - - STEP_BIG( - C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), - C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), - IF, 4, 13, PP8_4_); - STEP_BIG( - C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), - C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), - IF, 13, 10, PP8_5_); - STEP_BIG( - C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), - C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), - IF, 10, 25, PP8_6_); - STEP_BIG( - C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), - C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), - IF, 25, 4, PP8_0_); - - u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; - u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; - u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; - u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; - - #define q SIMD_Q - - A0 ^= 0x200; - - ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); - ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); - ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); - ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); - STEP_BIG( - COPY_A0, COPY_A1, COPY_A2, COPY_A3, - 
COPY_A4, COPY_A5, COPY_A6, COPY_A7, - IF, 4, 13, PP8_4_); - STEP_BIG( - COPY_B0, COPY_B1, COPY_B2, COPY_B3, - COPY_B4, COPY_B5, COPY_B6, COPY_B7, - IF, 13, 10, PP8_5_); - STEP_BIG( - COPY_C0, COPY_C1, COPY_C2, COPY_C3, - COPY_C4, COPY_C5, COPY_C6, COPY_C7, - IF, 10, 25, PP8_6_); - STEP_BIG( - COPY_D0, COPY_D1, COPY_D2, COPY_D3, - COPY_D4, COPY_D5, COPY_D6, COPY_D7, - IF, 25, 4, PP8_0_); - #undef q - - hash->h4[0] = A0; - hash->h4[1] = A1; - hash->h4[2] = A2; - hash->h4[3] = A3; - hash->h4[4] = A4; - hash->h4[5] = A5; - hash->h4[6] = A6; - hash->h4[7] = A7; - hash->h4[8] = B0; - hash->h4[9] = B1; - hash->h4[10] = B2; - hash->h4[11] = B3; - hash->h4[12] = B4; - hash->h4[13] = B5; - hash->h4[14] = B6; - hash->h4[15] = B7; - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // simd + s32 q[256]; + unsigned char x[128]; + for(unsigned int i = 0; i < 64; i++) + x[i] = hash->h1[i]; + for(unsigned int i = 64; i < 128; i++) + x[i] = 0; + + u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); + u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); + u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); + u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); + + FFT256(0, 1, 0, ll1); + for (int i = 0; i < 256; i ++) + { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } + + A0 ^= hash->h4[0]; + A1 ^= hash->h4[1]; + A2 ^= hash->h4[2]; + A3 ^= hash->h4[3]; + A4 ^= hash->h4[4]; + A5 ^= hash->h4[5]; + A6 ^= hash->h4[6]; + A7 ^= hash->h4[7]; + B0 ^= hash->h4[8]; + B1 ^= hash->h4[9]; + B2 ^= hash->h4[10]; + B3 ^= hash->h4[11]; + B4 ^= hash->h4[12]; + B5 ^= hash->h4[13]; + B6 ^= hash->h4[14]; + B7 ^= hash->h4[15]; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + IF, 4, 13, PP8_4_); + + STEP_BIG( + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + IF, 13, 10, PP8_5_); + + STEP_BIG( + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + IF, 10, 25, PP8_6_); + + STEP_BIG( + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), + IF, 25, 4, PP8_0_); + + u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; + u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; + u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; + u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; + + #define q SIMD_Q + + A0 ^= 0x200; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + COPY_A0, COPY_A1, COPY_A2, COPY_A3, + COPY_A4, COPY_A5, COPY_A6, COPY_A7, + IF, 4, 13, PP8_4_); + + STEP_BIG( + COPY_B0, COPY_B1, COPY_B2, COPY_B3, + COPY_B4, COPY_B5, COPY_B6, COPY_B7, + IF, 13, 10, PP8_5_); + + STEP_BIG( + COPY_C0, COPY_C1, COPY_C2, COPY_C3, + COPY_C4, COPY_C5, COPY_C6, COPY_C7, + IF, 10, 25, PP8_6_); + + STEP_BIG( + COPY_D0, COPY_D1, COPY_D2, COPY_D3, + COPY_D4, COPY_D5, COPY_D6, COPY_D7, + IF, 25, 4, PP8_0_); + + #undef q + + hash->h4[0] = A0; + hash->h4[1] = A1; + hash->h4[2] = A2; + hash->h4[3] = A3; + hash->h4[4] = A4; + hash->h4[5] = A5; + hash->h4[6] = A6; + hash->h4[7] = A7; + hash->h4[8] = B0; + hash->h4[9] = B1; + hash->h4[10] = B2; + hash->h4[11] = B3; + hash->h4[12] = B4; + hash->h4[13] = B5; + hash->h4[14] = B6; + hash->h4[15] = B7; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search10(__global hash_t* hashes, __global uint* output, const ulong target) { - uint gid = get_global_id(0); - uint offset = get_global_offset(0); - hash_t hash; - - __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; - - int init = get_local_id(0); - int step = get_local_size(0); - - for (int i = init; i < 256; i += step) - { - AES0[i] = AES0_C[i]; - AES1[i] = AES1_C[i]; - AES2[i] = AES2_C[i]; - AES3[i] = AES3_C[i]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // mixtab - __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256]; - for (int i = init; i < 256; i += step) - { - mixtab0[i] = mixtab0_c[i]; - mixtab1[i] = mixtab1_c[i]; - mixtab2[i] = mixtab2_c[i]; - mixtab3[i] = mixtab3_c[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - - for 
(int i = 0; i < 8; i++) { - hash.h8[i] = hashes[gid-offset].h8[i]; - } - - // echo - - { - - sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; - sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; - Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; - Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; - - sph_u32 K0 = 512; - sph_u32 K1 = 0; - sph_u32 K2 = 0; - sph_u32 K3 = 0; - - W00 = Vb00; - W01 = Vb01; - W10 = Vb10; - W11 = Vb11; - W20 = Vb20; - W21 = Vb21; - W30 = Vb30; - W31 = Vb31; - W40 = Vb40; - W41 = Vb41; - W50 = Vb50; - W51 = Vb51; - W60 = Vb60; - W61 = Vb61; - W70 = Vb70; - W71 = Vb71; - W80 = hash.h8[0]; - W81 = hash.h8[1]; - W90 = hash.h8[2]; - W91 = hash.h8[3]; - WA0 = hash.h8[4]; - WA1 = hash.h8[5]; - WB0 = hash.h8[6]; - WB1 = hash.h8[7]; - WC0 = 0x80; - WC1 = 0; - WD0 = 0; - WD1 = 0; - WE0 = 0; - WE1 = 0x200000000000000; - WF0 = 0x200; - WF1 = 0; - - for (unsigned u = 0; u < 10; u ++) { - BIG_ROUND; - } - - hash.h8[0] ^= Vb00 ^ W00 ^ W80; - hash.h8[1] ^= Vb01 ^ W01 ^ W81; - hash.h8[2] ^= Vb10 ^ W10 ^ W90; - hash.h8[3] ^= Vb11 ^ W11 ^ W91; - hash.h8[4] ^= Vb20 ^ W20 ^ WA0; - hash.h8[5] ^= Vb21 ^ W21 ^ WA1; - hash.h8[6] ^= Vb30 ^ W30 ^ WB0; - hash.h8[7] ^= Vb31 ^ W31 ^ WB1; - - } - - // hamsi - - { - - sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; - sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; - sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; - sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; - sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; - sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; - sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; - -#define buf(u) hash.h1[i + u] - for(int i = 0; i < 64; i += 8) { - INPUT_BIG; - P_BIG; - T_BIG; - } -#undef buf -#define buf(u) (u == 0 ? 
0x80 : 0) - INPUT_BIG; + uint gid = get_global_id(0); + uint offset = get_global_offset(0); + __global hash_t *hash = &(hashes[gid-offset]); + + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + //mixtab + __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256]; + + for (int i = init; i < 256; i += step) + { + mixtab0[i] = mixtab0_c[i]; + mixtab1[i] = mixtab1_c[i]; + mixtab2[i] = mixtab2_c[i]; + mixtab3[i] = mixtab3_c[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + __local sph_u32 T512_L[1024]; + __constant const sph_u32 *T512_C = &T512[0][0]; + + for (int i = init; i < 1024; i += step) + T512_L[i] = T512_C[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + __local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256]; + + for (int i = init; i < 256; i += step) + { + LT0[i] = plain_T0[i]; + LT1[i] = plain_T1[i]; + LT2[i] = plain_T2[i]; + LT3[i] = plain_T3[i]; + LT4[i] = plain_T4[i]; + LT5[i] = plain_T5[i]; + LT6[i] = plain_T6[i]; + LT7[i] = plain_T7[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < 8; i++) + hash->h8[i] = hashes[gid-offset].h8[i]; + + // echo + sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; + sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; + Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; + Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; + + sph_u32 K0 = 512; + sph_u32 K1 = 0; + sph_u32 K2 = 0; + sph_u32 K3 = 0; + + W00 = Vb00; + W01 = Vb01; + W10 = Vb10; + W11 = Vb11; + W20 = Vb20; + W21 = Vb21; + W30 = Vb30; + W31 = Vb31; + W40 = Vb40; + W41 = Vb41; + W50 = Vb50; + W51 = Vb51; + W60 = Vb60; + W61 = Vb61; + W70 = Vb70; + W71 = Vb71; + W80 = hash->h8[0]; + W81 = hash->h8[1]; + W90 = hash->h8[2]; + W91 = hash->h8[3]; + WA0 = hash->h8[4]; + WA1 = hash->h8[5]; + WB0 = hash->h8[6]; + WB1 = hash->h8[7]; + WC0 = 0x80; + WC1 = 0; + WD0 = 0; + WD1 = 0; + WE0 = 0; + WE1 = 0x200000000000000; + WF0 = 0x200; + WF1 = 0; + + for (unsigned u = 0; u < 10; u ++) + BIG_ROUND; + + hash->h8[0] ^= Vb00 ^ W00 ^ W80; + hash->h8[1] ^= Vb01 ^ W01 ^ W81; + hash->h8[2] ^= Vb10 ^ W10 ^ W90; + hash->h8[3] ^= Vb11 ^ W11 ^ W91; + hash->h8[4] ^= Vb20 ^ W20 ^ WA0; + hash->h8[5] ^= Vb21 ^ W21 ^ WA1; + hash->h8[6] ^= Vb30 ^ W30 ^ WB0; + hash->h8[7] ^= Vb31 ^ W31 ^ WB1; + + // hamsi + sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; + sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; + sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; + sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + + #define buf(u) hash->h1[i + u] + + for(int i = 0; i < 64; i += 8) + { + INPUT_BIG_LOCAL; P_BIG; T_BIG; -#undef buf -#define buf(u) (u == 6 ? 
2 : 0) - INPUT_BIG; - PF_BIG; - T_BIG; - - for (unsigned u = 0; u < 16; u ++) - hash.h4[u] = h[u]; - - } - - // fugue - - { - - sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; - sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; - sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; - sph_u32 S30, S31, S32, S33, S34, S35; - - ulong fc_bit_count = (sph_u64) 64 << 3; - - S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; - S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); - S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); - S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); - S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); - - FUGUE512_3((hash.h4[0x0]), (hash.h4[0x1]), (hash.h4[0x2])); - FUGUE512_3((hash.h4[0x3]), (hash.h4[0x4]), (hash.h4[0x5])); - FUGUE512_3((hash.h4[0x6]), (hash.h4[0x7]), (hash.h4[0x8])); - FUGUE512_3((hash.h4[0x9]), (hash.h4[0xA]), (hash.h4[0xB])); - FUGUE512_3((hash.h4[0xC]), (hash.h4[0xD]), (hash.h4[0xE])); - FUGUE512_3((hash.h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x); - - // apply round shift if necessary - int i; - - for (i = 0; i < 32; i ++) { - ROR3; - CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); - SMIX(S00, S01, S02, S03); - } - for (i = 0; i < 13; i ++) { - S04 ^= S00; - S09 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S28 ^= S00; - ROR8; - SMIX(S00, S01, S02, S03); - } + } + + #undef buf + #define buf(u) (u == 0 ? 0x80 : 0) + + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + + #undef buf + #define buf(u) (u == 6 ? 
2 : 0) + + INPUT_BIG_LOCAL; + PF_BIG; + T_BIG; + + for (unsigned u = 0; u < 16; u ++) + hash->h4[u] = h[u]; + + // fugue + sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; + sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; + sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; + sph_u32 S30, S31, S32, S33, S34, S35; + + ulong fc_bit_count = (sph_u64) 0x200; + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); + S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); + S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); + S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); + + FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2])); + FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5])); + FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8])); + FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB])); + FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE])); + FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x); + + // apply round shift if necessary + int i; + + for (i = 0; i < 32; i ++) + { + ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + } + + for (i = 0; i < 13; i ++) + { S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; - - hash.h4[0] = SWAP4(S01); - hash.h4[1] = SWAP4(S02); - hash.h4[2] = SWAP4(S03); - hash.h4[3] = SWAP4(S04); - hash.h4[4] = SWAP4(S09); - hash.h4[5] = SWAP4(S10); - hash.h4[6] = SWAP4(S11); - hash.h4[7] = SWAP4(S12); - hash.h4[8] = SWAP4(S18); - hash.h4[9] = SWAP4(S19); - hash.h4[10] = SWAP4(S20); - hash.h4[11] = SWAP4(S21); - hash.h4[12] = SWAP4(S27); - hash.h4[13] = SWAP4(S28); - hash.h4[14] = SWAP4(S29); - hash.h4[15] = SWAP4(S30); - - } + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + ROR8; + SMIX(S00, S01, S02, S03); + } + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + hash->h4[0] = SWAP4(S01); + hash->h4[1] = SWAP4(S02); + hash->h4[2] = SWAP4(S03); + hash->h4[3] = SWAP4(S04); + hash->h4[4] = SWAP4(S09); + hash->h4[5] = SWAP4(S10); + hash->h4[6] = SWAP4(S11); + hash->h4[7] = SWAP4(S12); + hash->h4[8] = SWAP4(S18); + hash->h4[9] = SWAP4(S19); + hash->h4[10] = SWAP4(S20); + hash->h4[11] = SWAP4(S21); + hash->h4[12] = SWAP4(S27); + hash->h4[13] = SWAP4(S28); + hash->h4[14] = SWAP4(S29); + hash->h4[15] = SWAP4(S30); //shabal - { - sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[2], A03 = A_init_512[3], A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[6], A07 = A_init_512[7], - A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11]; - sph_u32 B0 = B_init_512[0], B1 = B_init_512[1], B2 = B_init_512[2], B3 = B_init_512[3], B4 = B_init_512[4], B5 = B_init_512[5], B6 = B_init_512[6], B7 = B_init_512[7], - B8 = B_init_512[8], B9 = B_init_512[9], BA = B_init_512[10], BB = B_init_512[11], BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15]; - sph_u32 C0 = C_init_512[0], C1 = 
-    sph_u32 C0 = C_init_512[0], C1 = C_init_512[1], C2 = C_init_512[2], C3 = C_init_512[3], C4 = C_init_512[4], C5 = C_init_512[5], C6 = C_init_512[6], C7 = C_init_512[7],
-      C8 = C_init_512[8], C9 = C_init_512[9], CA = C_init_512[10], CB = C_init_512[11], CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
-    sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
-    sph_u32 Wlow = 1, Whigh = 0;
-
-    M0 = hash.h4[0];
-    M1 = hash.h4[1];
-    M2 = hash.h4[2];
-    M3 = hash.h4[3];
-    M4 = hash.h4[4];
-    M5 = hash.h4[5];
-    M6 = hash.h4[6];
-    M7 = hash.h4[7];
-    M8 = hash.h4[8];
-    M9 = hash.h4[9];
-    MA = hash.h4[10];
-    MB = hash.h4[11];
-    MC = hash.h4[12];
-    MD = hash.h4[13];
-    ME = hash.h4[14];
-    MF = hash.h4[15];
-
-    INPUT_BLOCK_ADD;
-    XOR_W;
-    APPLY_P;
-    INPUT_BLOCK_SUB;
+  sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[2], A03 = A_init_512[3], A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[6], A07 = A_init_512[7],
+    A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
+  sph_u32 B0 = B_init_512[0], B1 = B_init_512[1], B2 = B_init_512[2], B3 = B_init_512[3], B4 = B_init_512[4], B5 = B_init_512[5], B6 = B_init_512[6], B7 = B_init_512[7],
+    B8 = B_init_512[8], B9 = B_init_512[9], BA = B_init_512[10], BB = B_init_512[11], BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15];
+  sph_u32 C0 = C_init_512[0], C1 = C_init_512[1], C2 = C_init_512[2], C3 = C_init_512[3], C4 = C_init_512[4], C5 = C_init_512[5], C6 = C_init_512[6], C7 = C_init_512[7],
+    C8 = C_init_512[8], C9 = C_init_512[9], CA = C_init_512[10], CB = C_init_512[11], CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
+  sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
+  sph_u32 Wlow = 1, Whigh = 0;
+
+  M0 = hash->h4[0];
+  M1 = hash->h4[1];
+  M2 = hash->h4[2];
+  M3 = hash->h4[3];
+  M4 = hash->h4[4];
+  M5 = hash->h4[5];
+  M6 = hash->h4[6];
+  M7 = hash->h4[7];
+  M8 = hash->h4[8];
+  M9 = hash->h4[9];
+  MA = hash->h4[10];
+  MB = hash->h4[11];
+  MC = hash->h4[12];
+  MD = hash->h4[13];
+  ME = hash->h4[14];
+  MF = hash->h4[15];
+
+  INPUT_BLOCK_ADD;
+  XOR_W;
+  APPLY_P;
+  INPUT_BLOCK_SUB;
+  SWAP_BC;
+  INCR_W;
+
+  M0 = 0x80;
+  M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
+
+  INPUT_BLOCK_ADD;
+  XOR_W;
+  APPLY_P;
+
+  for (unsigned i = 0; i < 3; i ++)
+  {
   SWAP_BC;
-    INCR_W;
-
-    M0 = 0x80;
-    M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
-
-    INPUT_BLOCK_ADD;
   XOR_W;
   APPLY_P;
-    for (unsigned i = 0; i < 3; i ++) {
-      SWAP_BC;
-      XOR_W;
-      APPLY_P;
-    }
-
-    hash.h4[0] = B0;
-    hash.h4[1] = B1;
-    hash.h4[2] = B2;
-    hash.h4[3] = B3;
-    hash.h4[4] = B4;
-    hash.h4[5] = B5;
-    hash.h4[6] = B6;
-    hash.h4[7] = B7;
-    hash.h4[8] = B8;
-    hash.h4[9] = B9;
-    hash.h4[10] = BA;
-    hash.h4[11] = BB;
-    hash.h4[12] = BC;
-    hash.h4[13] = BD;
-    hash.h4[14] = BE;
-    hash.h4[15] = BF;
-  }
-
-  //whirlpool
-  {
-    sph_u64 n0, n1, n2, n3, n4, n5, n6, n7;
-    sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;
-    sph_u64 state[8];
-
-    n0 = (hash.h8[0]);
-    n1 = (hash.h8[1]);
-    n2 = (hash.h8[2]);
-    n3 = (hash.h8[3]);
-    n4 = (hash.h8[4]);
-    n5 = (hash.h8[5]);
-    n6 = (hash.h8[6]);
-    n7 = (hash.h8[7]);
-
-    h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0;
-
-    n0 ^= h0;
-    n1 ^= h1;
-    n2 ^= h2;
-    n3 ^= h3;
-    n4 ^= h4;
-    n5 ^= h5;
-    n6 ^= h6;
-    n7 ^= h7;
-
-    for (unsigned r = 0; r < 10; r ++) {
-      sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-      ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
-      TRANSFER(h, tmp);
-      ROUND_WENC(plain_T, n, h, tmp);
-      TRANSFER(n, tmp);
-    }
-
-    state[0] = n0 ^ (hash.h8[0]);
-    state[1] = n1 ^ (hash.h8[1]);
-    state[2] = n2 ^ (hash.h8[2]);
-    state[3] = n3 ^ (hash.h8[3]);
-    state[4] = n4 ^ (hash.h8[4]);
-    state[5] = n5 ^ (hash.h8[5]);
-    state[6] = n6 ^ (hash.h8[6]);
-    state[7] = n7 ^ (hash.h8[7]);
-
-    n0 = 0x80;
-    n1 = n2 = n3 = n4 = n5 = n6 = 0;
-    n7 = 0x2000000000000;
-
-    h0 = state[0];
-    h1 = state[1];
-    h2 = state[2];
-    h3 = state[3];
-    h4 = state[4];
-    h5 = state[5];
-    h6 = state[6];
-    h7 = state[7];
-
-    n0 ^= h0;
-    n1 ^= h1;
-    n2 ^= h2;
-    n3 ^= h3;
-    n4 ^= h4;
-    n5 ^= h5;
-    n6 ^= h6;
-    n7 ^= h7;
-
-    for (unsigned r = 0; r < 10; r ++) {
-      sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-      ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
-      TRANSFER(h, tmp);
-      ROUND_WENC(plain_T, n, h, tmp);
-      TRANSFER(n, tmp);
-    }
-
-    state[0] ^= n0 ^ 0x80;
-    state[1] ^= n1;
-    state[2] ^= n2;
-    state[3] ^= n3;
-    state[4] ^= n4;
-    state[5] ^= n5;
-    state[6] ^= n6;
-    state[7] ^= n7 ^ 0x2000000000000;
-
-    for (unsigned i = 0; i < 8; i ++)
-      hash.h8[i] = state[i];
-  }
-
-  bool result = (hash.h8[3] <= target);
-  if (result)
-    output[atomic_inc(output+0xFF)] = SWAP4(gid);
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
+  }
+
+  hash->h4[0] = B0;
+  hash->h4[1] = B1;
+  hash->h4[2] = B2;
+  hash->h4[3] = B3;
+  hash->h4[4] = B4;
+  hash->h4[5] = B5;
+  hash->h4[6] = B6;
+  hash->h4[7] = B7;
+  hash->h4[8] = B8;
+  hash->h4[9] = B9;
+  hash->h4[10] = BA;
+  hash->h4[11] = BB;
+  hash->h4[12] = BC;
+  hash->h4[13] = BD;
+  hash->h4[14] = BE;
+  hash->h4[15] = BF;
+
+  // whirlpool
+  sph_u64 n0, n1, n2, n3, n4, n5, n6, n7;
+  sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;
+  sph_u64 state[8];
+
+  n0 = (hash->h8[0]);
+  n1 = (hash->h8[1]);
+  n2 = (hash->h8[2]);
+  n3 = (hash->h8[3]);
+  n4 = (hash->h8[4]);
+  n5 = (hash->h8[5]);
+  n6 = (hash->h8[6]);
+  n7 = (hash->h8[7]);
+
+  h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0;
+
+  n0 ^= h0;
+  n1 ^= h1;
+  n2 ^= h2;
+  n3 ^= h3;
+  n4 ^= h4;
+  n5 ^= h5;
+  n6 ^= h6;
+  n7 ^= h7;
+
+  #pragma unroll 10
+  for (unsigned r = 0; r < 10; r ++)
+  {
+    sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
+    TRANSFER(h, tmp);
+    ROUND_WENC(plain_T, n, h, tmp);
+    TRANSFER(n, tmp);
+  }
+
+  state[0] = n0 ^ (hash->h8[0]);
+  state[1] = n1 ^ (hash->h8[1]);
+  state[2] = n2 ^ (hash->h8[2]);
+  state[3] = n3 ^ (hash->h8[3]);
+  state[4] = n4 ^ (hash->h8[4]);
+  state[5] = n5 ^ (hash->h8[5]);
+  state[6] = n6 ^ (hash->h8[6]);
+  state[7] = n7 ^ (hash->h8[7]);
+
+  n0 = 0x80;
+  n1 = n2 = n3 = n4 = n5 = n6 = 0;
+  n7 = 0x2000000000000;
+
+  h0 = state[0];
+  h1 = state[1];
+  h2 = state[2];
+  h3 = state[3];
+  h4 = state[4];
+  h5 = state[5];
+  h6 = state[6];
+  h7 = state[7];
+
+  n0 ^= h0;
+  n1 ^= h1;
+  n2 ^= h2;
+  n3 ^= h3;
+  n4 ^= h4;
+  n5 ^= h5;
+  n6 ^= h6;
+  n7 ^= h7;
+
+  #pragma unroll 10
+  for (unsigned r = 0; r < 10; r ++)
+  {
+    sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+    ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
+    TRANSFER(h, tmp);
+    ROUND_WENC(plain_T, n, h, tmp);
+    TRANSFER(n, tmp);
+  }
+
+  state[0] ^= n0 ^ 0x80;
+  state[1] ^= n1;
+  state[2] ^= n2;
+  state[3] ^= n3;
+  state[4] ^= n4;
+  state[5] ^= n5;
+  state[6] ^= n6;
+  state[7] ^= n7 ^ 0x2000000000000;
+
+  for (unsigned i = 0; i < 8; i ++)
+    hash->h8[i] = state[i];
+
+  bool result = (hash->h8[3] <= target);
+  if (result)
+    output[atomic_inc(output+0xFF)] = SWAP4(gid);
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 #endif // BITBLOCK_CL
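A note on the length constants that recur throughout these kernels (an editorial aside, not part of the patch itself): every stage of the chain absorbs a 64-byte block, and 64 bytes is 512 bits, so `(64 << 3)`, `0x200`, and `SPH_C64(512)` are all the same bit count. Likewise, Whirlpool's `n7 = 0x2000000000000` and Groestl's `H[15] = 0x0002000000000000` are that same 512 byte-swapped into a big-endian 64-bit lane. A minimal host-side C sketch, assuming a GCC/Clang-style `__builtin_bswap64`:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* 64 bytes of hash input, expressed as a bit count */
        uint64_t bits = 64 << 3;                 /* 512 == 0x200 */

        /* the same count byte-swapped into a big-endian lane, the form
           the whirlpool/groestl finalizations above and below store */
        uint64_t lane = __builtin_bswap64(bits); /* 0x0002000000000000 */

        printf("%#llx %#llx\n", (unsigned long long)bits,
               (unsigned long long)lane);
        return 0;
    }

This is why the hunks below can replace `(64 << 3)` with `0x200` without changing behavior.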
diff --git a/kernel/darkcoin-mod.cl b/kernel/darkcoin-mod.cl
index 5b56fa21..caa10368 100644
--- a/kernel/darkcoin-mod.cl
+++ b/kernel/darkcoin-mod.cl
@@ -95,8 +95,6 @@
 #include "shavite.cl"
 #include "simd.cl"
 #include "echo.cl"
-#include "hamsi.cl"
-#include "fugue.cl"
 
 #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
 #define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
@@ -181,7 +179,7 @@ __kernel void search1(__global hash_t* hashes)
 {
-  uint gid = get_global_id(0);
+  uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

   // bmw
@@ -456,93 +454,69 @@ __kernel void search2(__global hash_t* hashes)
   uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  #if !SPH_SMALL_FOOTPRINT_GROESTL
-  __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
-  __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
-  #else
-  __local sph_u64 T0_C[256], T4_C[256];
-  #endif
-
+  __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
+
   int init = get_local_id(0);
   int step = get_local_size(0);
-
+
   for (int i = init; i < 256; i += step)
   {
-    T0_C[i] = T0[i];
-    T4_C[i] = T4[i];
-    #if !SPH_SMALL_FOOTPRINT_GROESTL
-    T1_C[i] = T1[i];
-    T2_C[i] = T2[i];
-    T3_C[i] = T3[i];
-    T5_C[i] = T5[i];
-    T6_C[i] = T6[i];
-    T7_C[i] = T7[i];
-    #endif
+    T0_L[i] = T0[i];
+    T4_L[i] = T4[i];
+    T1_L[i] = T1[i];
+    T2_L[i] = T2[i];
+    T3_L[i] = T3[i];
+    T5_L[i] = T5[i];
+    T6_L[i] = T6[i];
+    T7_L[i] = T7[i];
   }
-
-  barrier(CLK_LOCAL_MEM_FENCE);

   // groestl
-
-  #define T0 T0_C
-  #define T1 T1_C
-  #define T2 T2_C
-  #define T3 T3_C
-  #define T4 T4_C
-  #define T5 T5_C
-  #define T6 T6_C
-  #define T7 T7_C
-
-  sph_u64 H[16];
-
-  for (unsigned int u = 0; u < 15; u ++)
-    H[u] = 0;
-
-  #if USE_LE
-  H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
-  #else
-  H[15] = (sph_u64)512;
-  #endif
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  #define T0 T0_L
+  #define T1 T1_L
+  #define T2 T2_L
+  #define T3 T3_L
+  #define T4 T4_L
+  #define T5 T5_L
+  #define T6 T6_L
+  #define T7 T7_L
+
+  // groestl
+  sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};

   sph_u64 g[16], m[16];
-  m[0] = DEC64E(hash->h8[0]);
-  m[1] = DEC64E(hash->h8[1]);
-  m[2] = DEC64E(hash->h8[2]);
-  m[3] = DEC64E(hash->h8[3]);
-  m[4] = DEC64E(hash->h8[4]);
-  m[5] = DEC64E(hash->h8[5]);
-  m[6] = DEC64E(hash->h8[6]);
-  m[7] = DEC64E(hash->h8[7]);
-
-  for (unsigned int u = 0; u < 16; u ++)
-    g[u] = m[u] ^ H[u];
-
-  m[8] = 0x80; g[8] = m[8] ^ H[8];
-  m[9] = 0; g[9] = m[9] ^ H[9];
-  m[10] = 0; g[10] = m[10] ^ H[10];
-  m[11] = 0; g[11] = m[11] ^ H[11];
-  m[12] = 0; g[12] = m[12] ^ H[12];
-  m[13] = 0; g[13] = m[13] ^ H[13];
-  m[14] = 0; g[14] = m[14] ^ H[14];
-  m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
+  g[0] = m[0] = DEC64E(hash->h8[0]);
+  g[1] = m[1] = DEC64E(hash->h8[1]);
+  g[2] = m[2] = DEC64E(hash->h8[2]);
+  g[3] = m[3] = DEC64E(hash->h8[3]);
+  g[4] = m[4] = DEC64E(hash->h8[4]);
+  g[5] = m[5] = DEC64E(hash->h8[5]);
+  g[6] = m[6] = DEC64E(hash->h8[6]);
+  g[7] = m[7] = DEC64E(hash->h8[7]);
+  g[8] = m[8] = 0x80;
+  g[9] = m[9] = 0;
+  g[10] = m[10] = 0;
+  g[11] = m[11] = 0;
+  g[12] = m[12] = 0;
+  g[13] = m[13] = 0;
+  g[14] = m[14] = 0;
+  g[15] = 0x102000000000000;
+  m[15] = 0x100000000000000;

   PERM_BIG_P(g);
   PERM_BIG_Q(m);

-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= g[u] ^ m[u];
-
   sph_u64 xH[16];
-
   for (unsigned int u = 0; u < 16; u ++)
-    xH[u] = H[u];
+    xH[u] = H[u] ^= g[u] ^ m[u];
+
   PERM_BIG_P(xH);
-
-  for (unsigned int u = 0; u < 16; u ++)
-    H[u] ^= xH[u];
-
-  for (unsigned int u = 0; u < 8; u ++)
-    hash->h8[u] = DEC64E(H[u + 8]);
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
+  for (unsigned int u = 8; u < 16; u ++)
+    hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
 }

 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@@ -863,7 +837,7 @@ __kernel void search8(__global hash_t* hashes)
   sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
   sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;

-  sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
+  sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;

   rk00 = hash->h4[0];
   rk01 = hash->h4[1];
diff --git a/kernel/marucoin-mod.cl b/kernel/marucoin-mod.cl
index 37437374..0ad36706 100644
--- a/kernel/marucoin-mod.cl
+++ b/kernel/marucoin-mod.cl
@@ -70,12 +70,17 @@ typedef long sph_s64;
 #define SPH_SIMD_NOCOPY 0
 #define SPH_KECCAK_NOCOPY 0
 #define SPH_COMPACT_BLAKE_64 0
-#define SPH_LUFFA_PARALLEL 1
+#define SPH_LUFFA_PARALLEL 0
 #define SPH_SMALL_FOOTPRINT_GROESTL 0
 #define SPH_GROESTL_BIG_ENDIAN 0
 #define SPH_CUBEHASH_UNROLL 0
 #define SPH_KECCAK_UNROLL 1
-#define SPH_HAMSI_EXPAND_BIG 1
+#if !defined SPH_HAMSI_EXPAND_BIG
+  #define SPH_HAMSI_EXPAND_BIG 1
+#endif
+#ifndef SPH_HAMSI_SHORT
+  #define SPH_HAMSI_SHORT 1
+#endif

 #include "blake.cl"
 #include "bmw.cl"
@@ -113,116 +118,326 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
 {
   uint gid = get_global_id(0);
   __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);

-  // blake
-
-  sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
-  sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
-  sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
-  sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179);
-  sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0;
-  sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;;
-  if ((T0 = SPH_T64(T0 + 1024)) < 1024)
-  {
-    T1 = SPH_T64(T1 + 1);
-  }
-  sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
-  sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
-  sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
-  sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
-  M0 = DEC64BE(block + 0);
-  M1 = DEC64BE(block + 8);
-  M2 = DEC64BE(block + 16);
-  M3 = DEC64BE(block + 24);
-  M4 = DEC64BE(block + 32);
-  M5 = DEC64BE(block + 40);
-  M6 = DEC64BE(block + 48);
-  M7 = DEC64BE(block + 56);
-  M8 = DEC64BE(block + 64);
-  M9 = DEC64BE(block + 72);
-  M9 &= 0xFFFFFFFF00000000;
-  M9 ^= SWAP4(gid);
-  MA = 0x8000000000000000;
-  MB = 0;
-  MC = 0;
-  MD = 1;
-  ME = 0;
-  MF = 0x280;
-
-  COMPRESS64;
-
-  hash->h8[0] = H0;
-  hash->h8[1] = H1;
-  hash->h8[2] = H2;
-  hash->h8[3] = H3;
-  hash->h8[4] = H4;
-  hash->h8[5] = H5;
-  hash->h8[6] = H6;
-  hash->h8[7] = H7;
-
-
-  barrier(CLK_GLOBAL_MEM_FENCE);
+  // blake
+  sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
+  sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
+  sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
+  sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179);
+  sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0;
+  sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;
+
+  if ((T0 = SPH_T64(T0 + 1024)) < 1024)
+    T1 = SPH_T64(T1 + 1);
+
+  sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
+  sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
+  sph_u64 V0, V1, V2, V3, V4,
V5, V6, V7; + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; + + M0 = DEC64BE(block + 0); + M1 = DEC64BE(block + 8); + M2 = DEC64BE(block + 16); + M3 = DEC64BE(block + 24); + M4 = DEC64BE(block + 32); + M5 = DEC64BE(block + 40); + M6 = DEC64BE(block + 48); + M7 = DEC64BE(block + 56); + M8 = DEC64BE(block + 64); + M9 = DEC64BE(block + 72); + M9 &= 0xFFFFFFFF00000000; + M9 ^= SWAP4(gid); + MA = 0x8000000000000000; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 0x280; + + COMPRESS64; + + hash->h8[0] = H0; + hash->h8[1] = H1; + hash->h8[2] = H2; + hash->h8[3] = H3; + hash->h8[4] = H4; + hash->h8[5] = H5; + hash->h8[6] = H6; + hash->h8[7] = H7; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search1(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // bmw - sph_u64 BMW_H[16]; - for(unsigned u = 0; u < 16; u++) - BMW_H[u] = BMW_IV512[u]; - - sph_u64 BMW_h1[16], BMW_h2[16]; - sph_u64 mv[16]; - - mv[ 0] = SWAP8(hash->h8[0]); - mv[ 1] = SWAP8(hash->h8[1]); - mv[ 2] = SWAP8(hash->h8[2]); - mv[ 3] = SWAP8(hash->h8[3]); - mv[ 4] = SWAP8(hash->h8[4]); - mv[ 5] = SWAP8(hash->h8[5]); - mv[ 6] = SWAP8(hash->h8[6]); - mv[ 7] = SWAP8(hash->h8[7]); - mv[ 8] = 0x80; - mv[ 9] = 0; - mv[10] = 0; - mv[11] = 0; - mv[12] = 0; - mv[13] = 0; - mv[14] = 0; - mv[15] = 0x200; -#define M(x) (mv[x]) -#define H(x) (BMW_H[x]) -#define dH(x) (BMW_h2[x]) - - FOLDb; + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); -#undef M -#undef H -#undef dH + // bmw + sph_u64 BMW_H[16]; + +#pragma unroll 16 + for(unsigned u = 0; u < 16; u++) + BMW_H[u] = BMW_IV512[u]; -#define M(x) (BMW_h2[x]) -#define H(x) (final_b[x]) -#define dH(x) (BMW_h1[x]) + sph_u64 mv[16],q[32]; + sph_u64 tmp; + + mv[0] = SWAP8(hash->h8[0]); + mv[1] = SWAP8(hash->h8[1]); + mv[2] = SWAP8(hash->h8[2]); + mv[3] = SWAP8(hash->h8[3]); + mv[4] = SWAP8(hash->h8[4]); + mv[5] = SWAP8(hash->h8[5]); + mv[6] = SWAP8(hash->h8[6]); + mv[7] = SWAP8(hash->h8[7]); + mv[8] = 0x80; + mv[9] = 0; + mv[10] = 0; + mv[11] = 0; + mv[12] = 0; + mv[13] = 0; + mv[14] = 0; + mv[15] = SPH_C64(512); + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 
2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( 
((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } - FOLDb; +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) + { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; + } + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ 
BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ 
SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } #undef M -#undef H -#undef dH - - hash->h8[0] = SWAP8(BMW_h1[8]); - hash->h8[1] = SWAP8(BMW_h1[9]); - hash->h8[2] = SWAP8(BMW_h1[10]); - hash->h8[3] = SWAP8(BMW_h1[11]); - hash->h8[4] = SWAP8(BMW_h1[12]); - hash->h8[5] = SWAP8(BMW_h1[13]); - hash->h8[6] = SWAP8(BMW_h1[14]); - hash->h8[7] = SWAP8(BMW_h1[15]); - - barrier(CLK_GLOBAL_MEM_FENCE); + XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + hash->h8[0] = SWAP8(BMW_H[8]); + hash->h8[1] = SWAP8(BMW_H[9]); + hash->h8[2] = SWAP8(BMW_H[10]); + hash->h8[3] = SWAP8(BMW_H[11]); + hash->h8[4] = SWAP8(BMW_H[12]); + hash->h8[5] = SWAP8(BMW_H[13]); + hash->h8[6] = SWAP8(BMW_H[14]); + hash->h8[7] = SWAP8(BMW_H[15]); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -231,678 +446,697 @@ __kernel void search2(__global hash_t* hashes) uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; + __local sph_u64 
T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - int init = get_local_id(0); - int step = get_local_size(0); + int init = get_local_id(0); + int step = get_local_size(0); - for (int i = init; i < 256; i += step) - { - T0_L[i] = T0[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T4_L[i] = T4[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - -#define T0 T0_L -#define T1 T1_L -#define T2 T2_L -#define T3 T3_L -#define T4 T4_L -#define T5 T5_L -#define T6 T6_L -#define T7 T7_L - - // groestl - - sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; - - sph_u64 g[16], m[16]; - g[0] = m[0] = DEC64E(hash->h8[0]); - g[1] = m[1] = DEC64E(hash->h8[1]); - g[2] = m[2] = DEC64E(hash->h8[2]); - g[3] = m[3] = DEC64E(hash->h8[3]); - g[4] = m[4] = DEC64E(hash->h8[4]); - g[5] = m[5] = DEC64E(hash->h8[5]); - g[6] = m[6] = DEC64E(hash->h8[6]); - g[7] = m[7] = DEC64E(hash->h8[7]); - g[8] = m[8] = 0x80; - g[9] = m[9] = 0; - g[10] = m[10] = 0; - g[11] = m[11] = 0; - g[12] = m[12] = 0; - g[13] = m[13] = 0; - g[14] = m[14] = 0; - g[15] = 0x102000000000000; - m[15] = 0x100000000000000; - - PERM_BIG_P(g); - PERM_BIG_Q(m); - - sph_u64 xH[16]; - for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u] ^= g[u] ^ m[u]; - PERM_BIG_P(xH); - for (unsigned int u = 8; u < 16; u ++) - hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); - barrier(CLK_GLOBAL_MEM_FENCE); + for (int i = init; i < 256; i += step) + { + T0_L[i] = T0[i]; + T4_L[i] = T4[i]; + T1_L[i] = T1[i]; + T2_L[i] = T2[i]; + T3_L[i] = T3[i]; + T5_L[i] = T5[i]; + T6_L[i] = T6[i]; + T7_L[i] = T7[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + #define T0 T0_L + #define T1 T1_L + #define T2 T2_L + #define T3 T3_L + #define T4 T4_L + #define T5 T5_L + #define T6 T6_L + #define T7 T7_L + + // groestl + sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + + sph_u64 g[16], m[16]; + g[0] = m[0] = DEC64E(hash->h8[0]); + g[1] = m[1] = DEC64E(hash->h8[1]); + g[2] = m[2] = DEC64E(hash->h8[2]); + g[3] = m[3] = DEC64E(hash->h8[3]); + g[4] = m[4] = DEC64E(hash->h8[4]); + g[5] = m[5] = DEC64E(hash->h8[5]); + g[6] = m[6] = DEC64E(hash->h8[6]); + g[7] = m[7] = DEC64E(hash->h8[7]); + g[8] = m[8] = 0x80; + g[9] = m[9] = 0; + g[10] = m[10] = 0; + g[11] = m[11] = 0; + g[12] = m[12] = 0; + g[13] = m[13] = 0; + g[14] = m[14] = 0; + g[15] = 0x102000000000000; + m[15] = 0x100000000000000; + + PERM_BIG_P(g); + PERM_BIG_Q(m); + + sph_u64 xH[16]; + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u] ^= g[u] ^ m[u]; + + PERM_BIG_P(xH); + + for (unsigned int u = 8; u < 16; u ++) + hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search3(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // skein - - sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); - sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; - sph_u64 bcount = 0; - - m0 = SWAP8(hash->h8[0]); - m1 = SWAP8(hash->h8[1]); - m2 = SWAP8(hash->h8[2]); - m3 = SWAP8(hash->h8[3]); - m4 = SWAP8(hash->h8[4]); - m5 = SWAP8(hash->h8[5]); - m6 = SWAP8(hash->h8[6]); - m7 = SWAP8(hash->h8[7]); - UBI_BIG(480, 64); - bcount = 
0; - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; - UBI_BIG(510, 8); - hash->h8[0] = SWAP8(h0); - hash->h8[1] = SWAP8(h1); - hash->h8[2] = SWAP8(h2); - hash->h8[3] = SWAP8(h3); - hash->h8[4] = SWAP8(h4); - hash->h8[5] = SWAP8(h5); - hash->h8[6] = SWAP8(h6); - hash->h8[7] = SWAP8(h7); - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // skein + + sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); + sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u64 bcount = 0; + + m0 = SWAP8(hash->h8[0]); + m1 = SWAP8(hash->h8[1]); + m2 = SWAP8(hash->h8[2]); + m3 = SWAP8(hash->h8[3]); + m4 = SWAP8(hash->h8[4]); + m5 = SWAP8(hash->h8[5]); + m6 = SWAP8(hash->h8[6]); + m7 = SWAP8(hash->h8[7]); + + UBI_BIG(480, 64); + + bcount = 0; + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; + + UBI_BIG(510, 8); + + hash->h8[0] = SWAP8(h0); + hash->h8[1] = SWAP8(h1); + hash->h8[2] = SWAP8(h2); + hash->h8[3] = SWAP8(h3); + hash->h8[4] = SWAP8(h4); + hash->h8[5] = SWAP8(h5); + hash->h8[6] = SWAP8(h6); + hash->h8[7] = SWAP8(h7); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search4(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // jh + // jh - sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); - sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); - sph_u64 tmp; + sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); + sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); + sph_u64 tmp; - for(int i = 0; i < 2; i++) + for(int i = 0; i < 2; i++) + { + if (i == 0) { - if (i == 0) { - h0h ^= DEC64E(hash->h8[0]); - h0l ^= DEC64E(hash->h8[1]); - h1h ^= DEC64E(hash->h8[2]); - h1l ^= DEC64E(hash->h8[3]); - h2h ^= DEC64E(hash->h8[4]); - h2l ^= DEC64E(hash->h8[5]); - h3h ^= DEC64E(hash->h8[6]); - h3l ^= DEC64E(hash->h8[7]); - } else if(i == 1) { - h4h ^= DEC64E(hash->h8[0]); - h4l ^= DEC64E(hash->h8[1]); - h5h ^= DEC64E(hash->h8[2]); - h5l ^= DEC64E(hash->h8[3]); - h6h ^= DEC64E(hash->h8[4]); - h6l ^= DEC64E(hash->h8[5]); - h7h ^= DEC64E(hash->h8[6]); - h7l ^= DEC64E(hash->h8[7]); - - h0h ^= 0x80; - h3l ^= 0x2000000000000; - } - E8; + h0h ^= DEC64E(hash->h8[0]); + h0l ^= DEC64E(hash->h8[1]); + h1h ^= DEC64E(hash->h8[2]); + h1l ^= DEC64E(hash->h8[3]); + h2h ^= DEC64E(hash->h8[4]); + 
h2l ^= DEC64E(hash->h8[5]); + h3h ^= DEC64E(hash->h8[6]); + h3l ^= DEC64E(hash->h8[7]); + } + else if(i == 1) + { + h4h ^= DEC64E(hash->h8[0]); + h4l ^= DEC64E(hash->h8[1]); + h5h ^= DEC64E(hash->h8[2]); + h5l ^= DEC64E(hash->h8[3]); + h6h ^= DEC64E(hash->h8[4]); + h6l ^= DEC64E(hash->h8[5]); + h7h ^= DEC64E(hash->h8[6]); + h7l ^= DEC64E(hash->h8[7]); + + h0h ^= 0x80; + h3l ^= 0x2000000000000; } - h4h ^= 0x80; - h7l ^= 0x2000000000000; - - hash->h8[0] = DEC64E(h4h); - hash->h8[1] = DEC64E(h4l); - hash->h8[2] = DEC64E(h5h); - hash->h8[3] = DEC64E(h5l); - hash->h8[4] = DEC64E(h6h); - hash->h8[5] = DEC64E(h6l); - hash->h8[6] = DEC64E(h7h); - hash->h8[7] = DEC64E(h7l); - - barrier(CLK_GLOBAL_MEM_FENCE); + E8; + } + h4h ^= 0x80; + h7l ^= 0x2000000000000; + + hash->h8[0] = DEC64E(h4h); + hash->h8[1] = DEC64E(h4l); + hash->h8[2] = DEC64E(h5h); + hash->h8[3] = DEC64E(h5l); + hash->h8[4] = DEC64E(h6h); + hash->h8[5] = DEC64E(h6l); + hash->h8[6] = DEC64E(h7h); + hash->h8[7] = DEC64E(h7l); + + barrier(CLK_GLOBAL_MEM_FENCE); } - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search5(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // keccak - - sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; - sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; - sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; - sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; - sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; - - a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); - - a00 ^= SWAP8(hash->h8[0]); - a10 ^= SWAP8(hash->h8[1]); - a20 ^= SWAP8(hash->h8[2]); - a30 ^= SWAP8(hash->h8[3]); - a40 ^= SWAP8(hash->h8[4]); - a01 ^= SWAP8(hash->h8[5]); - a11 ^= SWAP8(hash->h8[6]); - a21 ^= SWAP8(hash->h8[7]); - a31 ^= 0x8000000000000001; - KECCAK_F_1600; - // Finalize the "lane complement" - a10 = ~a10; - a20 = ~a20; - - hash->h8[0] = SWAP8(a00); - hash->h8[1] = SWAP8(a10); - hash->h8[2] = SWAP8(a20); - hash->h8[3] = SWAP8(a30); - hash->h8[4] = SWAP8(a40); - hash->h8[5] = SWAP8(a01); - hash->h8[6] = SWAP8(a11); - hash->h8[7] = SWAP8(a21); - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // keccak + + sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; + sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; + sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; + sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; + sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; + + a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); + + a00 ^= SWAP8(hash->h8[0]); + a10 ^= SWAP8(hash->h8[1]); + a20 ^= SWAP8(hash->h8[2]); + a30 ^= SWAP8(hash->h8[3]); + a40 ^= SWAP8(hash->h8[4]); + a01 ^= SWAP8(hash->h8[5]); + a11 ^= SWAP8(hash->h8[6]); + a21 ^= SWAP8(hash->h8[7]); + a31 ^= 0x8000000000000001; + KECCAK_F_1600; + + // Finalize the "lane complement" + a10 = ~a10; + a20 = ~a20; + + hash->h8[0] = SWAP8(a00); + hash->h8[1] = SWAP8(a10); + hash->h8[2] = SWAP8(a20); + hash->h8[3] = SWAP8(a30); + hash->h8[4] = SWAP8(a40); + hash->h8[5] = SWAP8(a01); + hash->h8[6] = SWAP8(a11); + hash->h8[7] = SWAP8(a21); + + 
barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search6(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // luffa + // luffa - sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465), V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb); - sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3), V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581); - sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05), V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7); - sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67), V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce); - sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363), V44 = SPH_C32(0xf5df3999), V45 = SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea); + sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465), V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb); + sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3), V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581); + sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05), V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7); + sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67), V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce); + sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363), V44 = SPH_C32(0xf5df3999), V45 = SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea); - DECL_TMP8(M); + DECL_TMP8(M); - M0 = hash->h4[1]; - M1 = hash->h4[0]; - M2 = hash->h4[3]; - M3 = hash->h4[2]; - M4 = hash->h4[5]; - M5 = hash->h4[4]; - M6 = hash->h4[7]; - M7 = hash->h4[6]; + M0 = hash->h4[1]; + M1 = hash->h4[0]; + M2 = hash->h4[3]; + M3 = hash->h4[2]; + M4 = hash->h4[5]; + M5 = hash->h4[4]; + M6 = hash->h4[7]; + M7 = hash->h4[6]; - for(uint i = 0; i < 5; i++) + for(uint i = 0; i < 5; i++) + { + MI5; + LUFFA_P5; + + if(i == 0) + { + M0 = hash->h4[9]; + M1 = hash->h4[8]; + M2 = hash->h4[11]; + M3 = hash->h4[10]; + M4 = hash->h4[13]; + M5 = hash->h4[12]; + M6 = hash->h4[15]; + M7 = hash->h4[14]; + } + else if(i == 1) { - MI5; - LUFFA_P5; - - if(i == 0) { - M0 = hash->h4[9]; - M1 = hash->h4[8]; - M2 = hash->h4[11]; - M3 = hash->h4[10]; - M4 = hash->h4[13]; - M5 = hash->h4[12]; - M6 = hash->h4[15]; - M7 = hash->h4[14]; - } else if(i == 1) { - M0 = 0x80000000; - M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; - } else if(i == 2) { 
- M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; - } else if(i == 3) { - hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; - hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; - hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; - hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43; - hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44; - hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45; - hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46; - hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; - } + M0 = 0x80000000; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + } + else if(i == 2) + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + else if(i == 3) + { + hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; } - - hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; - hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; - hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; - hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43; - hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44; - hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45; - hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46; - hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; - - barrier(CLK_GLOBAL_MEM_FENCE); + } + + hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search7(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // cubehash.h1 + + sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E); + sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695); + sph_u32 x8 = SPH_C32(0x4D42C787), x9 = SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537); + sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE); + sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = SPH_C32(0xB6444532); + sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9); + sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576); + sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44); + + x0 ^= SWAP4(hash->h4[1]); + x1 ^= SWAP4(hash->h4[0]); + x2 ^= SWAP4(hash->h4[3]); + x3 ^= SWAP4(hash->h4[2]); + x4 ^= SWAP4(hash->h4[5]); + x5 ^= SWAP4(hash->h4[4]); + x6 ^= SWAP4(hash->h4[7]); + x7 ^= SWAP4(hash->h4[6]); + + for (int i = 0; i < 13; i ++) + { + SIXTEEN_ROUNDS; - // cubehash.h1 - - sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E); - sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695); - sph_u32 x8 = SPH_C32(0x4D42C787), x9 = 
SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537); - sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE); - sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = SPH_C32(0xB6444532); - sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9); - sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576); - sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44); - - x0 ^= SWAP4(hash->h4[1]); - x1 ^= SWAP4(hash->h4[0]); - x2 ^= SWAP4(hash->h4[3]); - x3 ^= SWAP4(hash->h4[2]); - x4 ^= SWAP4(hash->h4[5]); - x5 ^= SWAP4(hash->h4[4]); - x6 ^= SWAP4(hash->h4[7]); - x7 ^= SWAP4(hash->h4[6]); - - for (int i = 0; i < 13; i ++) { - SIXTEEN_ROUNDS; - - if (i == 0) { - x0 ^= SWAP4(hash->h4[9]); - x1 ^= SWAP4(hash->h4[8]); - x2 ^= SWAP4(hash->h4[11]); - x3 ^= SWAP4(hash->h4[10]); - x4 ^= SWAP4(hash->h4[13]); - x5 ^= SWAP4(hash->h4[12]); - x6 ^= SWAP4(hash->h4[15]); - x7 ^= SWAP4(hash->h4[14]); - } else if(i == 1) { - x0 ^= 0x80; - } else if (i == 2) { - xv ^= SPH_C32(1); - } - } + if (i == 0) + { + x0 ^= SWAP4(hash->h4[9]); + x1 ^= SWAP4(hash->h4[8]); + x2 ^= SWAP4(hash->h4[11]); + x3 ^= SWAP4(hash->h4[10]); + x4 ^= SWAP4(hash->h4[13]); + x5 ^= SWAP4(hash->h4[12]); + x6 ^= SWAP4(hash->h4[15]); + x7 ^= SWAP4(hash->h4[14]); + } + else if(i == 1) + x0 ^= 0x80; + else if (i == 2) + xv ^= SPH_C32(1); + } - hash->h4[0] = x0; - hash->h4[1] = x1; - hash->h4[2] = x2; - hash->h4[3] = x3; - hash->h4[4] = x4; - hash->h4[5] = x5; - hash->h4[6] = x6; - hash->h4[7] = x7; - hash->h4[8] = x8; - hash->h4[9] = x9; - hash->h4[10] = xa; - hash->h4[11] = xb; - hash->h4[12] = xc; - hash->h4[13] = xd; - hash->h4[14] = xe; - hash->h4[15] = xf; - - barrier(CLK_GLOBAL_MEM_FENCE); + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + hash->h4[8] = x8; + hash->h4[9] = x9; + hash->h4[10] = xa; + hash->h4[11] = xb; + hash->h4[12] = xc; + hash->h4[13] = xd; + hash->h4[14] = xe; + hash->h4[15] = xf; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search8(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; - - int init = get_local_id(0); - int step = get_local_size(0); - - for (int i = init; i < 256; i += step) - { - AES0[i] = AES0_C[i]; - AES1[i] = AES1_C[i]; - AES2[i] = AES2_C[i]; - AES3[i] = AES3_C[i]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // shavite - // IV - sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC); - sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC); - sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47); - sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A); - - // state - sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; - sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; - sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; - sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - - sph_u32 sc_count0 
= (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; - - rk00 = hash->h4[0]; - rk01 = hash->h4[1]; - rk02 = hash->h4[2]; - rk03 = hash->h4[3]; - rk04 = hash->h4[4]; - rk05 = hash->h4[5]; - rk06 = hash->h4[6]; - rk07 = hash->h4[7]; - rk08 = hash->h4[8]; - rk09 = hash->h4[9]; - rk0A = hash->h4[10]; - rk0B = hash->h4[11]; - rk0C = hash->h4[12]; - rk0D = hash->h4[13]; - rk0E = hash->h4[14]; - rk0F = hash->h4[15]; - rk10 = 0x80; - rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0; - rk1B = 0x2000000; - rk1C = rk1D = rk1E = 0; - rk1F = 0x2000000; - - c512(buf); - - hash->h4[0] = h0; - hash->h4[1] = h1; - hash->h4[2] = h2; - hash->h4[3] = h3; - hash->h4[4] = h4; - hash->h4[5] = h5; - hash->h4[6] = h6; - hash->h4[7] = h7; - hash->h4[8] = h8; - hash->h4[9] = h9; - hash->h4[10] = hA; - hash->h4[11] = hB; - hash->h4[12] = hC; - hash->h4[13] = hD; - hash->h4[14] = hE; - hash->h4[15] = hF; - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // shavite + // IV + sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC); + sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC); + sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47); + sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A); + + // state + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + + sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + + rk00 = hash->h4[0]; + rk01 = hash->h4[1]; + rk02 = hash->h4[2]; + rk03 = hash->h4[3]; + rk04 = hash->h4[4]; + rk05 = hash->h4[5]; + rk06 = hash->h4[6]; + rk07 = hash->h4[7]; + rk08 = hash->h4[8]; + rk09 = hash->h4[9]; + rk0A = hash->h4[10]; + rk0B = hash->h4[11]; + rk0C = hash->h4[12]; + rk0D = hash->h4[13]; + rk0E = hash->h4[14]; + rk0F = hash->h4[15]; + rk10 = 0x80; + rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0; + rk1B = 0x2000000; + rk1C = rk1D = rk1E = 0; + rk1F = 0x2000000; + + c512(buf); + + hash->h4[0] = h0; + hash->h4[1] = h1; + hash->h4[2] = h2; + hash->h4[3] = h3; + hash->h4[4] = h4; + hash->h4[5] = h5; + hash->h4[6] = h6; + hash->h4[7] = h7; + hash->h4[8] = h8; + hash->h4[9] = h9; + hash->h4[10] = hA; + hash->h4[11] = hB; + hash->h4[12] = hC; + hash->h4[13] = hD; + hash->h4[14] = hE; + hash->h4[15] = hF; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search9(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // simd + s32 q[256]; + unsigned char x[128]; + for(unsigned int i = 0; i < 64; i++) + x[i] = hash->h1[i]; + for(unsigned int i = 64; i < 128; i++) + x[i] = 0; + + u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), 
A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); + u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); + u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); + u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); + + FFT256(0, 1, 0, ll1); + for (int i = 0; i < 256; i ++) + { + s32 tq; - // simd - s32 q[256]; - unsigned char x[128]; - for(unsigned int i = 0; i < 64; i++) - x[i] = hash->h1[i]; - for(unsigned int i = 64; i < 128; i++) - x[i] = 0; - - u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); - u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); - u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); - u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); - - FFT256(0, 1, 0, ll1); - for (int i = 0; i < 256; i ++) { - s32 tq; - - tq = q[i] + yoff_b_n[i]; - tq = REDS2(tq); - tq = REDS1(tq); - tq = REDS1(tq); - q[i] = (tq <= 128 ? tq : tq - 257); - } + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } + + A0 ^= hash->h4[0]; + A1 ^= hash->h4[1]; + A2 ^= hash->h4[2]; + A3 ^= hash->h4[3]; + A4 ^= hash->h4[4]; + A5 ^= hash->h4[5]; + A6 ^= hash->h4[6]; + A7 ^= hash->h4[7]; + B0 ^= hash->h4[8]; + B1 ^= hash->h4[9]; + B2 ^= hash->h4[10]; + B3 ^= hash->h4[11]; + B4 ^= hash->h4[12]; + B5 ^= hash->h4[13]; + B6 ^= hash->h4[14]; + B7 ^= hash->h4[15]; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + IF, 4, 13, PP8_4_); + + STEP_BIG( + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + IF, 13, 10, PP8_5_); + + STEP_BIG( + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + IF, 10, 25, PP8_6_); + + STEP_BIG( + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), + IF, 25, 4, PP8_0_); + + u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; + u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; + u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; + u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; + + #define q SIMD_Q + + A0 ^= 0x200; - A0 ^= hash->h4[0]; - A1 ^= hash->h4[1]; - A2 ^= hash->h4[2]; - A3 ^= hash->h4[3]; - A4 ^= hash->h4[4]; - A5 ^= hash->h4[5]; - A6 ^= hash->h4[6]; - A7 ^= hash->h4[7]; - B0 ^= hash->h4[8]; - B1 ^= hash->h4[9]; - B2 ^= hash->h4[10]; - B3 ^= hash->h4[11]; - B4 ^= hash->h4[12]; - B5 ^= hash->h4[13]; - B6 ^= hash->h4[14]; - B7 ^= hash->h4[15]; - - ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); - ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); - ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); - ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); - - STEP_BIG( - C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), - C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), - IF, 4, 13, PP8_4_); - STEP_BIG( - C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), - C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), - IF, 13, 10, PP8_5_); - STEP_BIG( - C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), - C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), - IF, 10, 25, PP8_6_); - STEP_BIG( - C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), - C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), - IF, 25, 4, PP8_0_); - - u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; - u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; - u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; - u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; - - #define q SIMD_Q - - A0 ^= 0x200; - - ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); - ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); - ONE_ROUND_BIG(2_, 2, 
29, 9, 15, 5); - ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); - STEP_BIG( - COPY_A0, COPY_A1, COPY_A2, COPY_A3, - COPY_A4, COPY_A5, COPY_A6, COPY_A7, - IF, 4, 13, PP8_4_); - STEP_BIG( - COPY_B0, COPY_B1, COPY_B2, COPY_B3, - COPY_B4, COPY_B5, COPY_B6, COPY_B7, - IF, 13, 10, PP8_5_); - STEP_BIG( - COPY_C0, COPY_C1, COPY_C2, COPY_C3, - COPY_C4, COPY_C5, COPY_C6, COPY_C7, - IF, 10, 25, PP8_6_); - STEP_BIG( - COPY_D0, COPY_D1, COPY_D2, COPY_D3, - COPY_D4, COPY_D5, COPY_D6, COPY_D7, - IF, 25, 4, PP8_0_); - #undef q - - hash->h4[0] = A0; - hash->h4[1] = A1; - hash->h4[2] = A2; - hash->h4[3] = A3; - hash->h4[4] = A4; - hash->h4[5] = A5; - hash->h4[6] = A6; - hash->h4[7] = A7; - hash->h4[8] = B0; - hash->h4[9] = B1; - hash->h4[10] = B2; - hash->h4[11] = B3; - hash->h4[12] = B4; - hash->h4[13] = B5; - hash->h4[14] = B6; - hash->h4[15] = B7; - - barrier(CLK_GLOBAL_MEM_FENCE); + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + COPY_A0, COPY_A1, COPY_A2, COPY_A3, + COPY_A4, COPY_A5, COPY_A6, COPY_A7, + IF, 4, 13, PP8_4_); + + STEP_BIG( + COPY_B0, COPY_B1, COPY_B2, COPY_B3, + COPY_B4, COPY_B5, COPY_B6, COPY_B7, + IF, 13, 10, PP8_5_); + + STEP_BIG( + COPY_C0, COPY_C1, COPY_C2, COPY_C3, + COPY_C4, COPY_C5, COPY_C6, COPY_C7, + IF, 10, 25, PP8_6_); + + STEP_BIG( + COPY_D0, COPY_D1, COPY_D2, COPY_D3, + COPY_D4, COPY_D5, COPY_D6, COPY_D7, + IF, 25, 4, PP8_0_); + + #undef q + + hash->h4[0] = A0; + hash->h4[1] = A1; + hash->h4[2] = A2; + hash->h4[3] = A3; + hash->h4[4] = A4; + hash->h4[5] = A5; + hash->h4[6] = A6; + hash->h4[7] = A7; + hash->h4[8] = B0; + hash->h4[9] = B1; + hash->h4[10] = B2; + hash->h4[11] = B3; + hash->h4[12] = B4; + hash->h4[13] = B5; + hash->h4[14] = B6; + hash->h4[15] = B7; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search10(__global hash_t* hashes) { - uint gid = get_global_id(0); - - - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + uint gid = get_global_id(0); + uint offset = get_global_offset(0); + __global hash_t *hash = &(hashes[gid-offset]); - int init = get_local_id(0); - int step = get_local_size(0); + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; - for (int i = init; i < 256; i += step) - { - AES0[i] = AES0_C[i]; - AES1[i] = AES1_C[i]; - AES2[i] = AES2_C[i]; - AES3[i] = AES3_C[i]; - } + int init = get_local_id(0); + int step = get_local_size(0); - barrier(CLK_LOCAL_MEM_FENCE); - - - - - - // echo - sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; - sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; - Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; - Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; - - sph_u32 K0 = 512; - sph_u32 K1 = 0; - sph_u32 K2 = 0; - sph_u32 K3 = 0; - - W00 = Vb00; - W01 = Vb01; - W10 = Vb10; - W11 = Vb11; - W20 = Vb20; - W21 = Vb21; - W30 = Vb30; - W31 = Vb31; - W40 = Vb40; - W41 = Vb41; - W50 = Vb50; - W51 = Vb51; - W60 = Vb60; - W61 = Vb61; - W70 = Vb70; - W71 = Vb71; - W80 = hash->h8[0]; - W81 = hash->h8[1]; - W90 = hash->h8[2]; - W91 = hash->h8[3]; - WA0 = hash->h8[4]; - WA1 = hash->h8[5]; - WB0 = hash->h8[6]; - WB1 = hash->h8[7]; - WC0 = 0x80; - WC1 = 0; - WD0 = 0; - WD1 = 0; - WE0 = 0; - WE1 = 
0x200000000000000; - WF0 = 0x200; - WF1 = 0; - - for (unsigned u = 0; u < 10; u ++) { - BIG_ROUND; - } + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } - - - - - - - - + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < 8; i++) + hash->h8[i] = hashes[gid-offset].h8[i]; + + // echo + sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; + sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; + Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; + Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; + + sph_u32 K0 = 512; + sph_u32 K1 = 0; + sph_u32 K2 = 0; + sph_u32 K3 = 0; + + W00 = Vb00; + W01 = Vb01; + W10 = Vb10; + W11 = Vb11; + W20 = Vb20; + W21 = Vb21; + W30 = Vb30; + W31 = Vb31; + W40 = Vb40; + W41 = Vb41; + W50 = Vb50; + W51 = Vb51; + W60 = Vb60; + W61 = Vb61; + W70 = Vb70; + W71 = Vb71; + W80 = hash->h8[0]; + W81 = hash->h8[1]; + W90 = hash->h8[2]; + W91 = hash->h8[3]; + WA0 = hash->h8[4]; + WA1 = hash->h8[5]; + WB0 = hash->h8[6]; + WB1 = hash->h8[7]; + WC0 = 0x80; + WC1 = 0; + WD0 = 0; + WD1 = 0; + WE0 = 0; + WE1 = 0x200000000000000; + WF0 = 0x200; + WF1 = 0; + + for (unsigned u = 0; u < 10; u ++) + BIG_ROUND; + + hash->h8[0] ^= Vb00 ^ W00 ^ W80; + hash->h8[1] ^= Vb01 ^ W01 ^ W81; + hash->h8[2] ^= Vb10 ^ W10 ^ W90; + hash->h8[3] ^= Vb11 ^ W11 ^ W91; + hash->h8[4] ^= Vb20 ^ W20 ^ WA0; + hash->h8[5] ^= Vb21 ^ W21 ^ WA1; + hash->h8[6] ^= Vb30 ^ W30 ^ WB0; + hash->h8[7] ^= Vb31 ^ W31 ^ WB1; - hash->h8[0] = Vb00 ^ hash->h8[0] ^ W00 ^ W80; - hash->h8[1] = Vb01 ^ hash->h8[1] ^ W01 ^ W81; - hash->h8[2] = Vb10 ^ hash->h8[2] ^ W10 ^ W90; - hash->h8[3] = Vb11 ^ hash->h8[3] ^ W11 ^ W91; - hash->h8[4] = Vb20 ^ hash->h8[4] ^ W20 ^ WA0; - hash->h8[5] = Vb21 ^ hash->h8[5] ^ W21 ^ WA1; - hash->h8[6] = Vb30 ^ hash->h8[6] ^ W30 ^ WB0; - hash->h8[7] = Vb31 ^ hash->h8[7] ^ W31 ^ WB1; - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search11(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __local sph_u32 T512_L[1024]; + __constant const sph_u32 *T512_C = &T512[0][0]; + + int init = get_local_id(0); + int step = get_local_size(0); + for (int i = init; i < 1024; i += step) + T512_L[i] = T512_C[i]; - __local sph_u32 T512_L[1024]; - __constant const sph_u32 *T512_C = &T512[0][0]; - int init = get_local_id(0); - int step = get_local_size(0); - for (int i = init; i < 1024; i += step) - { - T512_L[i] = T512_C[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); - { - sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; - sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; - sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; - sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; - sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; - sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; - sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; - -#define buf(u) hash->h1[i + u] - for(int i = 0; i < 64; i += 
8) { - INPUT_BIG_LOCAL; - P_BIG; - T_BIG; - } -#undef buf -#define buf(u) (u == 0 ? 0x80 : 0) - INPUT_BIG_LOCAL; - P_BIG; - T_BIG; -#undef buf -#define buf(u) (u == 6 ? 2 : 0) - INPUT_BIG_LOCAL; - PF_BIG; - T_BIG; - - for (unsigned u = 0; u < 16; u ++) - hash->h4[u] = h[u]; - } - barrier(CLK_GLOBAL_MEM_FENCE); + sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; + sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; + sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; + sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + + #define buf(u) hash->h1[i + u] + + for(int i = 0; i < 64; i += 8) { + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + } + + #undef buf + #define buf(u) (u == 0 ? 0x80 : 0) + + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + + #undef buf + #define buf(u) (u == 6 ? 2 : 0) + + INPUT_BIG_LOCAL; + PF_BIG; + T_BIG; + + for (unsigned u = 0; u < 16; u ++) + hash->h4[u] = h[u]; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -931,7 +1165,7 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; sph_u32 S30, S31, S32, S33, S34, S35; - ulong fc_bit_count = (sph_u64) 64 << 3; + ulong fc_bit_count = (sph_u64) 0x200; S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); diff --git a/kernel/marucoin-modold.cl b/kernel/marucoin-modold.cl index 1df97fca..775a150b 100644 --- a/kernel/marucoin-modold.cl +++ b/kernel/marucoin-modold.cl @@ -75,8 +75,8 @@ typedef long sph_s64; #define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_CUBEHASH_UNROLL 0 #define SPH_KECCAK_UNROLL 0 -#if !defined SPH_HAMSI_EXPAND_BIG -#define SPH_HAMSI_EXPAND_BIG 4 +#ifndef SPH_HAMSI_EXPAND_BIG + #define SPH_HAMSI_EXPAND_BIG 4 #endif #include "blake.cl" @@ -115,790 +115,917 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) { uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // blake - sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); - sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); - sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); - sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179); - sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; - sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; - - if ((T0 = SPH_T64(T0 + 1024)) < 1024) - { - T1 = SPH_T64(T1 + 1); - } - sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; - sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; - sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; - sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; - M0 = DEC64BE(block + 0); - M1 = DEC64BE(block + 8); - M2 = DEC64BE(block + 16); - M3 = DEC64BE(block + 24); - M4 = DEC64BE(block + 32); - M5 = DEC64BE(block + 40); - M6 = DEC64BE(block + 48); - M7 = DEC64BE(block + 56); - M8 = DEC64BE(block + 64); - M9 = DEC64BE(block + 72); - M9 &= 0xFFFFFFFF00000000; - M9 ^= SWAP4(gid); - MA = 0x8000000000000000; - MB = 0; - MC = 0; - MD = 1; - ME = 0; - MF = 
0x280; - - COMPRESS64; - - hash->h8[0] = H0; - hash->h8[1] = H1; - hash->h8[2] = H2; - hash->h8[3] = H3; - hash->h8[4] = H4; - hash->h8[5] = H5; - hash->h8[6] = H6; - hash->h8[7] = H7; - - barrier(CLK_GLOBAL_MEM_FENCE); + // blake + sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); + sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); + sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); + sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179); + sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; + sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; + + if ((T0 = SPH_T64(T0 + 1024)) < 1024) + T1 = SPH_T64(T1 + 1); + + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; + + M0 = DEC64BE(block + 0); + M1 = DEC64BE(block + 8); + M2 = DEC64BE(block + 16); + M3 = DEC64BE(block + 24); + M4 = DEC64BE(block + 32); + M5 = DEC64BE(block + 40); + M6 = DEC64BE(block + 48); + M7 = DEC64BE(block + 56); + M8 = DEC64BE(block + 64); + M9 = DEC64BE(block + 72); + M9 &= 0xFFFFFFFF00000000; + M9 ^= SWAP4(gid); + MA = 0x8000000000000000; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 0x280; + + COMPRESS64; + + hash->h8[0] = H0; + hash->h8[1] = H1; + hash->h8[2] = H2; + hash->h8[3] = H3; + hash->h8[4] = H4; + hash->h8[5] = H5; + hash->h8[6] = H6; + hash->h8[7] = H7; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search1(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // bmw - sph_u64 BMW_H[16]; - for(unsigned u = 0; u < 16; u++) - BMW_H[u] = BMW_IV512[u]; - - sph_u64 BMW_h1[16], BMW_h2[16]; - sph_u64 mv[16]; - - mv[ 0] = SWAP8(hash->h8[0]); - mv[ 1] = SWAP8(hash->h8[1]); - mv[ 2] = SWAP8(hash->h8[2]); - mv[ 3] = SWAP8(hash->h8[3]); - mv[ 4] = SWAP8(hash->h8[4]); - mv[ 5] = SWAP8(hash->h8[5]); - mv[ 6] = SWAP8(hash->h8[6]); - mv[ 7] = SWAP8(hash->h8[7]); - mv[ 8] = 0x80; - mv[ 9] = 0; - mv[10] = 0; - mv[11] = 0; - mv[12] = 0; - mv[13] = 0; - mv[14] = 0; - mv[15] = 0x200; -#define M(x) (mv[x]) -#define H(x) (BMW_H[x]) -#define dH(x) (BMW_h2[x]) - - FOLDb; - -#undef M -#undef H -#undef dH - -#define M(x) (BMW_h2[x]) -#define H(x) (final_b[x]) -#define dH(x) (BMW_h1[x]) - - FOLDb; - -#undef M -#undef H -#undef dH - - hash->h8[0] = SWAP8(BMW_h1[8]); - hash->h8[1] = SWAP8(BMW_h1[9]); - hash->h8[2] = SWAP8(BMW_h1[10]); - hash->h8[3] = SWAP8(BMW_h1[11]); - hash->h8[4] = SWAP8(BMW_h1[12]); - hash->h8[5] = SWAP8(BMW_h1[13]); - hash->h8[6] = SWAP8(BMW_h1[14]); - hash->h8[7] = SWAP8(BMW_h1[15]); - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // bmw + sph_u64 BMW_H[16]; + +#pragma unroll 16 + for(unsigned u = 0; u < 16; u++) + BMW_H[u] = BMW_IV512[u]; + + sph_u64 mv[16],q[32]; + sph_u64 tmp; + + mv[0] = SWAP8(hash->h8[0]); + mv[1] = SWAP8(hash->h8[1]); + mv[2] = SWAP8(hash->h8[2]); + mv[3] = SWAP8(hash->h8[3]); + mv[4] = SWAP8(hash->h8[4]); + mv[5] = SWAP8(hash->h8[5]); + mv[6] = SWAP8(hash->h8[6]); + mv[7] = SWAP8(hash->h8[7]); + mv[8] = 0x80; + mv[9] = 0; + mv[10] = 0; + mv[11] = 0; + mv[12] = 0; + mv[13] = 0; + mv[14] = 0; + mv[15] = SPH_C64(512); + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 
1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + 
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + 
for(int i=0;i<16;i++) + { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; + } + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) 
^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + 
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + hash->h8[0] = SWAP8(BMW_H[8]); + hash->h8[1] = SWAP8(BMW_H[9]); + hash->h8[2] = SWAP8(BMW_H[10]); + hash->h8[3] = SWAP8(BMW_H[11]); + hash->h8[4] = SWAP8(BMW_H[12]); + hash->h8[5] = SWAP8(BMW_H[13]); + hash->h8[6] = SWAP8(BMW_H[14]); + hash->h8[7] = SWAP8(BMW_H[15]); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search2(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; + __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - int init = get_local_id(0); - int step = get_local_size(0); + int init = get_local_id(0); + int step = get_local_size(0); - for (int i = init; i < 256; i += step) - { - T0_L[i] = T0[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T4_L[i] = T4[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - -#define T0 T0_L -#define T1 T1_L -#define T2 T2_L -#define T3 T3_L -#define T4 T4_L -#define T5 T5_L -#define T6 T6_L -#define T7 T7_L - - // groestl - - sph_u64 H[16]; - for (unsigned int u = 0; u < 15; u ++) - H[u] = 0; -#if USE_LE - H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); -#else - H[15] = (sph_u64)512; -#endif + for (int i = init; i < 256; i += step) + { + T0_L[i] = T0[i]; + T4_L[i] = T4[i]; + T1_L[i] = T1[i]; + T2_L[i] = T2[i]; + T3_L[i] = T3[i]; + T5_L[i] = T5[i]; + T6_L[i] = T6[i]; + T7_L[i] = T7[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + #define T0 T0_L + #define T1 T1_L + #define T2 T2_L + #define T3 T3_L + #define T4 T4_L + #define T5 T5_L + #define T6 T6_L + #define T7 T7_L + + // groestl + sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + + sph_u64 g[16], m[16]; + g[0] = m[0] = DEC64E(hash->h8[0]); + g[1] = m[1] = DEC64E(hash->h8[1]); + g[2] = m[2] = DEC64E(hash->h8[2]); + g[3] = m[3] = DEC64E(hash->h8[3]); + g[4] = m[4] = DEC64E(hash->h8[4]); + g[5] = m[5] = DEC64E(hash->h8[5]); + g[6] = m[6] = DEC64E(hash->h8[6]); + g[7] = m[7] = DEC64E(hash->h8[7]); + g[8] = m[8] = 0x80; + g[9] = m[9] = 0; + g[10] = m[10] = 0; + g[11] = m[11] = 0; + g[12] = m[12] = 0; + g[13] = m[13] = 0; + g[14] = m[14] = 0; + g[15] = 0x102000000000000; + m[15] = 0x100000000000000; + + PERM_BIG_P(g); + PERM_BIG_Q(m); + + sph_u64 xH[16]; + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u] ^= g[u] ^ m[u]; + + PERM_BIG_P(xH); + + for (unsigned int u = 8; u < 16; u ++) + hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); - sph_u64 g[16], m[16]; - m[0] = DEC64E(hash->h8[0]); - m[1] = DEC64E(hash->h8[1]); - m[2] = DEC64E(hash->h8[2]); - m[3] = DEC64E(hash->h8[3]); - m[4] = DEC64E(hash->h8[4]); - m[5] = DEC64E(hash->h8[5]); - m[6] = DEC64E(hash->h8[6]); - m[7] = DEC64E(hash->h8[7]); - for (unsigned int u = 0; u < 16; u ++) - g[u] = m[u] ^ H[u]; - m[8] = 0x80; g[8] = m[8] ^ H[8]; - m[9] = 0; g[9] = m[9] ^ H[9]; - m[10] = 0; g[10] = m[10] ^ H[10]; - m[11] = 0; g[11] = m[11] ^ H[11]; - m[12] = 0; g[12] = m[12] ^ H[12]; - m[13] = 0; g[13] = m[13] ^ H[13]; - m[14] = 0; g[14] = m[14] ^ H[14]; - m[15] = 0x100000000000000; 
g[15] = m[15] ^ H[15]; - PERM_BIG_P(g); - PERM_BIG_Q(m); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= g[u] ^ m[u]; - sph_u64 xH[16]; - for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u]; - PERM_BIG_P(xH); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= xH[u]; - for (unsigned int u = 0; u < 8; u ++) - hash->h8[u] = DEC64E(H[u + 8]); - - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search3(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // skein - - sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); - sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; - sph_u64 bcount = 0; - - m0 = SWAP8(hash->h8[0]); - m1 = SWAP8(hash->h8[1]); - m2 = SWAP8(hash->h8[2]); - m3 = SWAP8(hash->h8[3]); - m4 = SWAP8(hash->h8[4]); - m5 = SWAP8(hash->h8[5]); - m6 = SWAP8(hash->h8[6]); - m7 = SWAP8(hash->h8[7]); - UBI_BIG(480, 64); - bcount = 0; - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; - UBI_BIG(510, 8); - hash->h8[0] = SWAP8(h0); - hash->h8[1] = SWAP8(h1); - hash->h8[2] = SWAP8(h2); - hash->h8[3] = SWAP8(h3); - hash->h8[4] = SWAP8(h4); - hash->h8[5] = SWAP8(h5); - hash->h8[6] = SWAP8(h6); - hash->h8[7] = SWAP8(h7); - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // skein + + sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); + sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u64 bcount = 0; + + m0 = SWAP8(hash->h8[0]); + m1 = SWAP8(hash->h8[1]); + m2 = SWAP8(hash->h8[2]); + m3 = SWAP8(hash->h8[3]); + m4 = SWAP8(hash->h8[4]); + m5 = SWAP8(hash->h8[5]); + m6 = SWAP8(hash->h8[6]); + m7 = SWAP8(hash->h8[7]); + + UBI_BIG(480, 64); + + bcount = 0; + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; + + UBI_BIG(510, 8); + + hash->h8[0] = SWAP8(h0); + hash->h8[1] = SWAP8(h1); + hash->h8[2] = SWAP8(h2); + hash->h8[3] = SWAP8(h3); + hash->h8[4] = SWAP8(h4); + hash->h8[5] = SWAP8(h5); + hash->h8[6] = SWAP8(h6); + hash->h8[7] = SWAP8(h7); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search4(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - // jh + // jh - sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); - sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); - sph_u64 tmp; + sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = 
C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); + sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); + sph_u64 tmp; - for(int i = 0; i < 2; i++) + for(int i = 0; i < 2; i++) + { + if (i == 0) + { + h0h ^= DEC64E(hash->h8[0]); + h0l ^= DEC64E(hash->h8[1]); + h1h ^= DEC64E(hash->h8[2]); + h1l ^= DEC64E(hash->h8[3]); + h2h ^= DEC64E(hash->h8[4]); + h2l ^= DEC64E(hash->h8[5]); + h3h ^= DEC64E(hash->h8[6]); + h3l ^= DEC64E(hash->h8[7]); + } + else if(i == 1) { - if (i == 0) { - h0h ^= DEC64E(hash->h8[0]); - h0l ^= DEC64E(hash->h8[1]); - h1h ^= DEC64E(hash->h8[2]); - h1l ^= DEC64E(hash->h8[3]); - h2h ^= DEC64E(hash->h8[4]); - h2l ^= DEC64E(hash->h8[5]); - h3h ^= DEC64E(hash->h8[6]); - h3l ^= DEC64E(hash->h8[7]); - } else if(i == 1) { - h4h ^= DEC64E(hash->h8[0]); - h4l ^= DEC64E(hash->h8[1]); - h5h ^= DEC64E(hash->h8[2]); - h5l ^= DEC64E(hash->h8[3]); - h6h ^= DEC64E(hash->h8[4]); - h6l ^= DEC64E(hash->h8[5]); - h7h ^= DEC64E(hash->h8[6]); - h7l ^= DEC64E(hash->h8[7]); - - h0h ^= 0x80; - h3l ^= 0x2000000000000; - } - E8; + h4h ^= DEC64E(hash->h8[0]); + h4l ^= DEC64E(hash->h8[1]); + h5h ^= DEC64E(hash->h8[2]); + h5l ^= DEC64E(hash->h8[3]); + h6h ^= DEC64E(hash->h8[4]); + h6l ^= DEC64E(hash->h8[5]); + h7h ^= DEC64E(hash->h8[6]); + h7l ^= DEC64E(hash->h8[7]); + + h0h ^= 0x80; + h3l ^= 0x2000000000000; } - h4h ^= 0x80; - h7l ^= 0x2000000000000; - - hash->h8[0] = DEC64E(h4h); - hash->h8[1] = DEC64E(h4l); - hash->h8[2] = DEC64E(h5h); - hash->h8[3] = DEC64E(h5l); - hash->h8[4] = DEC64E(h6h); - hash->h8[5] = DEC64E(h6l); - hash->h8[6] = DEC64E(h7h); - hash->h8[7] = DEC64E(h7l); - - barrier(CLK_GLOBAL_MEM_FENCE); + E8; + } + h4h ^= 0x80; + h7l ^= 0x2000000000000; + + hash->h8[0] = DEC64E(h4h); + hash->h8[1] = DEC64E(h4l); + hash->h8[2] = DEC64E(h5h); + hash->h8[3] = DEC64E(h5l); + hash->h8[4] = DEC64E(h6h); + hash->h8[5] = DEC64E(h6l); + hash->h8[6] = DEC64E(h7h); + hash->h8[7] = DEC64E(h7l); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search5(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - // keccak - - sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; - sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; - sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; - sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; - sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; - - a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); - - a00 ^= SWAP8(hash->h8[0]); - a10 ^= SWAP8(hash->h8[1]); - a20 ^= SWAP8(hash->h8[2]); - a30 ^= SWAP8(hash->h8[3]); - a40 ^= SWAP8(hash->h8[4]); - a01 ^= SWAP8(hash->h8[5]); - a11 ^= SWAP8(hash->h8[6]); - a21 ^= SWAP8(hash->h8[7]); - a31 ^= 0x8000000000000001; - KECCAK_F_1600; - // Finalize the "lane complement" - a10 = ~a10; - a20 = ~a20; - - hash->h8[0] = SWAP8(a00); - hash->h8[1] = SWAP8(a10); - hash->h8[2] = SWAP8(a20); - hash->h8[3] = SWAP8(a30); - hash->h8[4] = SWAP8(a40); - hash->h8[5] = SWAP8(a01); - hash->h8[6] = SWAP8(a11); - hash->h8[7] = SWAP8(a21); - - 
barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // keccak + + sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; + sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; + sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; + sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; + sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; + + a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); + + a00 ^= SWAP8(hash->h8[0]); + a10 ^= SWAP8(hash->h8[1]); + a20 ^= SWAP8(hash->h8[2]); + a30 ^= SWAP8(hash->h8[3]); + a40 ^= SWAP8(hash->h8[4]); + a01 ^= SWAP8(hash->h8[5]); + a11 ^= SWAP8(hash->h8[6]); + a21 ^= SWAP8(hash->h8[7]); + a31 ^= 0x8000000000000001; + KECCAK_F_1600; + + // Finalize the "lane complement" + a10 = ~a10; + a20 = ~a20; + + hash->h8[0] = SWAP8(a00); + hash->h8[1] = SWAP8(a10); + hash->h8[2] = SWAP8(a20); + hash->h8[3] = SWAP8(a30); + hash->h8[4] = SWAP8(a40); + hash->h8[5] = SWAP8(a01); + hash->h8[6] = SWAP8(a11); + hash->h8[7] = SWAP8(a21); + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search6(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // luffa - // luffa + sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465), V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb); + sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3), V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581); + sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05), V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7); + sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67), V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce); + sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363), V44 = SPH_C32(0xf5df3999), V45 = SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea); - sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465), V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb); - sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3), V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581); - sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05), V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7); - sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67), V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = 
SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce); - sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363), V44 = SPH_C32(0xf5df3999), V45 = SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea); + DECL_TMP8(M); - DECL_TMP8(M); + M0 = hash->h4[1]; + M1 = hash->h4[0]; + M2 = hash->h4[3]; + M3 = hash->h4[2]; + M4 = hash->h4[5]; + M5 = hash->h4[4]; + M6 = hash->h4[7]; + M7 = hash->h4[6]; - M0 = hash->h4[1]; - M1 = hash->h4[0]; - M2 = hash->h4[3]; - M3 = hash->h4[2]; - M4 = hash->h4[5]; - M5 = hash->h4[4]; - M6 = hash->h4[7]; - M7 = hash->h4[6]; + for(uint i = 0; i < 5; i++) + { + MI5; + LUFFA_P5; - for(uint i = 0; i < 5; i++) + if(i == 0) + { + M0 = hash->h4[9]; + M1 = hash->h4[8]; + M2 = hash->h4[11]; + M3 = hash->h4[10]; + M4 = hash->h4[13]; + M5 = hash->h4[12]; + M6 = hash->h4[15]; + M7 = hash->h4[14]; + } + else if(i == 1) { - MI5; - LUFFA_P5; - - if(i == 0) { - M0 = hash->h4[9]; - M1 = hash->h4[8]; - M2 = hash->h4[11]; - M3 = hash->h4[10]; - M4 = hash->h4[13]; - M5 = hash->h4[12]; - M6 = hash->h4[15]; - M7 = hash->h4[14]; - } else if(i == 1) { - M0 = 0x80000000; - M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; - } else if(i == 2) { - M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; - } else if(i == 3) { - hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; - hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; - hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; - hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43; - hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44; - hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45; - hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46; - hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; - } + M0 = 0x80000000; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + } + else if(i == 2) + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + else if(i == 3) + { + hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; } - - hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; - hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; - hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; - hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43; - hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44; - hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45; - hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46; - hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; - - barrier(CLK_GLOBAL_MEM_FENCE); + } + + hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search7(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // cubehash.h1 + + sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E); + sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695); + sph_u32 x8 = SPH_C32(0x4D42C787), x9 = 
SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537); + sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE); + sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = SPH_C32(0xB6444532); + sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9); + sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576); + sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44); + + x0 ^= SWAP4(hash->h4[1]); + x1 ^= SWAP4(hash->h4[0]); + x2 ^= SWAP4(hash->h4[3]); + x3 ^= SWAP4(hash->h4[2]); + x4 ^= SWAP4(hash->h4[5]); + x5 ^= SWAP4(hash->h4[4]); + x6 ^= SWAP4(hash->h4[7]); + x7 ^= SWAP4(hash->h4[6]); + + for (int i = 0; i < 13; i ++) + { + SIXTEEN_ROUNDS; - // cubehash.h1 - - sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E); - sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695); - sph_u32 x8 = SPH_C32(0x4D42C787), x9 = SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537); - sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE); - sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = SPH_C32(0xB6444532); - sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9); - sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576); - sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44); - - x0 ^= SWAP4(hash->h4[1]); - x1 ^= SWAP4(hash->h4[0]); - x2 ^= SWAP4(hash->h4[3]); - x3 ^= SWAP4(hash->h4[2]); - x4 ^= SWAP4(hash->h4[5]); - x5 ^= SWAP4(hash->h4[4]); - x6 ^= SWAP4(hash->h4[7]); - x7 ^= SWAP4(hash->h4[6]); - - for (int i = 0; i < 13; i ++) { - SIXTEEN_ROUNDS; - - if (i == 0) { - x0 ^= SWAP4(hash->h4[9]); - x1 ^= SWAP4(hash->h4[8]); - x2 ^= SWAP4(hash->h4[11]); - x3 ^= SWAP4(hash->h4[10]); - x4 ^= SWAP4(hash->h4[13]); - x5 ^= SWAP4(hash->h4[12]); - x6 ^= SWAP4(hash->h4[15]); - x7 ^= SWAP4(hash->h4[14]); - } else if(i == 1) { - x0 ^= 0x80; - } else if (i == 2) { - xv ^= SPH_C32(1); - } - } + if (i == 0) + { + x0 ^= SWAP4(hash->h4[9]); + x1 ^= SWAP4(hash->h4[8]); + x2 ^= SWAP4(hash->h4[11]); + x3 ^= SWAP4(hash->h4[10]); + x4 ^= SWAP4(hash->h4[13]); + x5 ^= SWAP4(hash->h4[12]); + x6 ^= SWAP4(hash->h4[15]); + x7 ^= SWAP4(hash->h4[14]); + } + else if(i == 1) + x0 ^= 0x80; + else if (i == 2) + xv ^= SPH_C32(1); + } - hash->h4[0] = x0; - hash->h4[1] = x1; - hash->h4[2] = x2; - hash->h4[3] = x3; - hash->h4[4] = x4; - hash->h4[5] = x5; - hash->h4[6] = x6; - hash->h4[7] = x7; - hash->h4[8] = x8; - hash->h4[9] = x9; - hash->h4[10] = xa; - hash->h4[11] = xb; - hash->h4[12] = xc; - hash->h4[13] = xd; - hash->h4[14] = xe; - hash->h4[15] = xf; - - barrier(CLK_GLOBAL_MEM_FENCE); + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + hash->h4[8] = x8; + hash->h4[9] = x9; + hash->h4[10] = xa; + hash->h4[11] = xb; + hash->h4[12] = xc; + hash->h4[13] = xd; + hash->h4[14] = xe; + hash->h4[15] = xf; + + barrier(CLK_GLOBAL_MEM_FENCE); } 
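Every searchN stage in the rewritten kernels above and below follows the same skeleton: one work-item per candidate nonce, a shared 64-byte hash_t slot indexed by gid - get_global_offset(0), an optional cooperative copy of lookup tables from __constant into __local memory guarded by a CLK_LOCAL_MEM_FENCE barrier, an in-place transform of the 512-bit state, and a closing CLK_GLOBAL_MEM_FENCE barrier. The following is a minimal sketch of that skeleton, not part of the patch: it assumes the WORKSIZE macro used by the kernels above, and TBL_C, search_stage and the trivial XOR transform are illustrative stand-ins for the real tables and hash rounds.

/* Illustrative stand-in for the real __constant tables (AES0_C..AES3_C, T512, ...) */
__constant uint TBL_C[256] = { 0 };

typedef union {
    unsigned char h1[64];   /* byte view   */
    uint          h4[16];   /* 32-bit view */
    ulong         h8[8];    /* 64-bit view */
} hash_t;                   /* same 64-byte layout the kernels above rely on */

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search_stage(__global hash_t* hashes)
{
    uint gid = get_global_id(0);

    /* One slot per work-item; subtracting the global offset lets the host
       reuse the same buffer across enqueues over a split nonce range. */
    __global hash_t *hash = &(hashes[gid - get_global_offset(0)]);

    /* Table-driven stages (groestl, shavite, echo, hamsi) first copy their
       lookup tables into __local memory, cooperatively across the group. */
    __local uint TBL[256];
    for (int i = get_local_id(0); i < 256; i += get_local_size(0))
        TBL[i] = TBL_C[i];
    barrier(CLK_LOCAL_MEM_FENCE);   /* tables complete before first use */

    /* Placeholder transform; a real stage runs blake/bmw/groestl/... here,
       reading and writing hash->h4[0..15] (or hash->h8[0..7]) in place. */
    for (int u = 0; u < 16; u++)
        hash->h4[u] ^= TBL[u];

    barrier(CLK_GLOBAL_MEM_FENCE);  /* results visible before the next stage */
}

The trailing CLK_GLOBAL_MEM_FENCE barrier is arguably redundant for inter-kernel ordering, which an in-order command queue already provides, but it matches the convention used throughout these kernels.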
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search8(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; - - int init = get_local_id(0); - int step = get_local_size(0); - - for (int i = init; i < 256; i += step) - { - AES0[i] = AES0_C[i]; - AES1[i] = AES1_C[i]; - AES2[i] = AES2_C[i]; - AES3[i] = AES3_C[i]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // shavite - // IV - sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC); - sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC); - sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47); - sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A); - - // state - sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; - sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; - sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; - sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - - sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; - - rk00 = hash->h4[0]; - rk01 = hash->h4[1]; - rk02 = hash->h4[2]; - rk03 = hash->h4[3]; - rk04 = hash->h4[4]; - rk05 = hash->h4[5]; - rk06 = hash->h4[6]; - rk07 = hash->h4[7]; - rk08 = hash->h4[8]; - rk09 = hash->h4[9]; - rk0A = hash->h4[10]; - rk0B = hash->h4[11]; - rk0C = hash->h4[12]; - rk0D = hash->h4[13]; - rk0E = hash->h4[14]; - rk0F = hash->h4[15]; - rk10 = 0x80; - rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0; - rk1B = 0x2000000; - rk1C = rk1D = rk1E = 0; - rk1F = 0x2000000; - - c512(buf); - - hash->h4[0] = h0; - hash->h4[1] = h1; - hash->h4[2] = h2; - hash->h4[3] = h3; - hash->h4[4] = h4; - hash->h4[5] = h5; - hash->h4[6] = h6; - hash->h4[7] = h7; - hash->h4[8] = h8; - hash->h4[9] = h9; - hash->h4[10] = hA; - hash->h4[11] = hB; - hash->h4[12] = hC; - hash->h4[13] = hD; - hash->h4[14] = hE; - hash->h4[15] = hF; - - barrier(CLK_GLOBAL_MEM_FENCE); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // shavite + // IV + sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC); + sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC); + sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47); + sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A); + + // state + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + + sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + + rk00 = hash->h4[0]; + rk01 = hash->h4[1]; + rk02 = hash->h4[2]; + rk03 = hash->h4[3]; + rk04 = hash->h4[4]; + rk05 = hash->h4[5]; + rk06 = hash->h4[6]; + 
rk07 = hash->h4[7]; + rk08 = hash->h4[8]; + rk09 = hash->h4[9]; + rk0A = hash->h4[10]; + rk0B = hash->h4[11]; + rk0C = hash->h4[12]; + rk0D = hash->h4[13]; + rk0E = hash->h4[14]; + rk0F = hash->h4[15]; + rk10 = 0x80; + rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0; + rk1B = 0x2000000; + rk1C = rk1D = rk1E = 0; + rk1F = 0x2000000; + + c512(buf); + + hash->h4[0] = h0; + hash->h4[1] = h1; + hash->h4[2] = h2; + hash->h4[3] = h3; + hash->h4[4] = h4; + hash->h4[5] = h5; + hash->h4[6] = h6; + hash->h4[7] = h7; + hash->h4[8] = h8; + hash->h4[9] = h9; + hash->h4[10] = hA; + hash->h4[11] = hB; + hash->h4[12] = hC; + hash->h4[13] = hD; + hash->h4[14] = hE; + hash->h4[15] = hF; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search9(__global hash_t* hashes) { - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // simd + s32 q[256]; + unsigned char x[128]; + for(unsigned int i = 0; i < 64; i++) + x[i] = hash->h1[i]; + for(unsigned int i = 64; i < 128; i++) + x[i] = 0; + + u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); + u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); + u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); + u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); + + FFT256(0, 1, 0, ll1); + for (int i = 0; i < 256; i ++) + { + s32 tq; - // simd - s32 q[256]; - unsigned char x[128]; - for(unsigned int i = 0; i < 64; i++) - x[i] = hash->h1[i]; - for(unsigned int i = 64; i < 128; i++) - x[i] = 0; - - u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); - u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); - u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); - u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); - - FFT256(0, 1, 0, ll1); - for (int i = 0; i < 256; i ++) { - s32 tq; - - tq = q[i] + yoff_b_n[i]; - tq = REDS2(tq); - tq = REDS1(tq); - tq = REDS1(tq); - q[i] = (tq <= 128 ? tq : tq - 257); - } + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } - A0 ^= hash->h4[0]; - A1 ^= hash->h4[1]; - A2 ^= hash->h4[2]; - A3 ^= hash->h4[3]; - A4 ^= hash->h4[4]; - A5 ^= hash->h4[5]; - A6 ^= hash->h4[6]; - A7 ^= hash->h4[7]; - B0 ^= hash->h4[8]; - B1 ^= hash->h4[9]; - B2 ^= hash->h4[10]; - B3 ^= hash->h4[11]; - B4 ^= hash->h4[12]; - B5 ^= hash->h4[13]; - B6 ^= hash->h4[14]; - B7 ^= hash->h4[15]; - - ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); - ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); - ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); - ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); - - STEP_BIG( - C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), - C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), - IF, 4, 13, PP8_4_); - STEP_BIG( - C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), - C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), - IF, 13, 10, PP8_5_); - STEP_BIG( - C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), - C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), - IF, 10, 25, PP8_6_); - STEP_BIG( - C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), - C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), - IF, 25, 4, PP8_0_); - - u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; - u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; - u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; - u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; - - #define q SIMD_Q - - A0 ^= 0x200; - - ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); - ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); - ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); - ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); - STEP_BIG( - COPY_A0, COPY_A1, COPY_A2, COPY_A3, - COPY_A4, COPY_A5, COPY_A6, COPY_A7, - IF, 4, 13, PP8_4_); - STEP_BIG( - COPY_B0, COPY_B1, COPY_B2, COPY_B3, - COPY_B4, COPY_B5, COPY_B6, COPY_B7, - IF, 13, 10, PP8_5_); - STEP_BIG( - COPY_C0, COPY_C1, COPY_C2, COPY_C3, - COPY_C4, COPY_C5, COPY_C6, COPY_C7, - IF, 10, 25, PP8_6_); - STEP_BIG( - COPY_D0, COPY_D1, COPY_D2, COPY_D3, - COPY_D4, COPY_D5, COPY_D6, COPY_D7, - IF, 25, 4, PP8_0_); - #undef q - - hash->h4[0] = A0; - hash->h4[1] = A1; - hash->h4[2] = A2; - hash->h4[3] = A3; - hash->h4[4] = A4; - hash->h4[5] = A5; - hash->h4[6] = A6; - hash->h4[7] = A7; - hash->h4[8] = B0; - hash->h4[9] = B1; - hash->h4[10] = B2; - hash->h4[11] = B3; - hash->h4[12] = B4; - hash->h4[13] = B5; - hash->h4[14] = B6; - hash->h4[15] = B7; - - barrier(CLK_GLOBAL_MEM_FENCE); + A0 ^= hash->h4[0]; + A1 ^= hash->h4[1]; + A2 ^= hash->h4[2]; + A3 ^= hash->h4[3]; + A4 ^= hash->h4[4]; + A5 ^= hash->h4[5]; + A6 ^= hash->h4[6]; + A7 ^= hash->h4[7]; + B0 ^= hash->h4[8]; + B1 ^= hash->h4[9]; + B2 ^= hash->h4[10]; + B3 ^= hash->h4[11]; + B4 ^= hash->h4[12]; + B5 ^= hash->h4[13]; + B6 ^= hash->h4[14]; + B7 ^= hash->h4[15]; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + IF, 4, 13, PP8_4_); + + STEP_BIG( + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + IF, 13, 10, 
PP8_5_); + + STEP_BIG( + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + IF, 10, 25, PP8_6_); + + STEP_BIG( + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), + IF, 25, 4, PP8_0_); + + u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; + u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; + u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; + u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; + + #define q SIMD_Q + + A0 ^= 0x200; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + COPY_A0, COPY_A1, COPY_A2, COPY_A3, + COPY_A4, COPY_A5, COPY_A6, COPY_A7, + IF, 4, 13, PP8_4_); + + STEP_BIG( + COPY_B0, COPY_B1, COPY_B2, COPY_B3, + COPY_B4, COPY_B5, COPY_B6, COPY_B7, + IF, 13, 10, PP8_5_); + + STEP_BIG( + COPY_C0, COPY_C1, COPY_C2, COPY_C3, + COPY_C4, COPY_C5, COPY_C6, COPY_C7, + IF, 10, 25, PP8_6_); + + STEP_BIG( + COPY_D0, COPY_D1, COPY_D2, COPY_D3, + COPY_D4, COPY_D5, COPY_D6, COPY_D7, + IF, 25, 4, PP8_0_); + + #undef q + + hash->h4[0] = A0; + hash->h4[1] = A1; + hash->h4[2] = A2; + hash->h4[3] = A3; + hash->h4[4] = A4; + hash->h4[5] = A5; + hash->h4[6] = A6; + hash->h4[7] = A7; + hash->h4[8] = B0; + hash->h4[9] = B1; + hash->h4[10] = B2; + hash->h4[11] = B3; + hash->h4[12] = B4; + hash->h4[13] = B5; + hash->h4[14] = B6; + hash->h4[15] = B7; + + barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search10(__global hash_t* hashes, __global uint* output, const ulong target) { - uint gid = get_global_id(0); - uint offset = get_global_offset(0); - hash_t hash; + uint gid = get_global_id(0); + uint offset = get_global_offset(0); + __global hash_t *hash = &(hashes[gid-offset]); - __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; - int init = get_local_id(0); - int step = get_local_size(0); + int init = get_local_id(0); + int step = get_local_size(0); - for (int i = init; i < 256; i += step) - { - AES0[i] = AES0_C[i]; - AES1[i] = AES1_C[i]; - AES2[i] = AES2_C[i]; - AES3[i] = AES3_C[i]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (int i = 0; i < 8; i++) { - hash.h8[i] = hashes[gid-offset].h8[i]; - } - - // echo - - { - - sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; - sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; - Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; - Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; - - sph_u32 K0 = 512; - sph_u32 K1 = 0; - sph_u32 K2 = 0; - sph_u32 K3 = 0; - - W00 = Vb00; - W01 = Vb01; - W10 = Vb10; - W11 = Vb11; - W20 = Vb20; - W21 = Vb21; - W30 = Vb30; - W31 = Vb31; - W40 = Vb40; - W41 = Vb41; - W50 = Vb50; - W51 = Vb51; - W60 = Vb60; - W61 = Vb61; - W70 = Vb70; - W71 = Vb71; - W80 = hash.h8[0]; - W81 = hash.h8[1]; - W90 = hash.h8[2]; - W91 = hash.h8[3]; - WA0 = hash.h8[4]; - WA1 = hash.h8[5]; 
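Note: in search9 above, the COPY_A0..COPY_D7 registers snapshot the state before the last SIMD compression, A0 ^= 0x200 folds the 512-bit message length into that final block, and the four closing STEP_BIG calls mix the snapshot back in as a feed-forward. A toy model of that shape in plain C, with a made-up stand-in permutation (not the real ONE_ROUND_BIG/STEP_BIG logic):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the permutation rounds; illustrative only. */
static uint32_t toy_perm(uint32_t x)
{
    return (x << 7 | x >> 25) ^ 0x9E3779B9u;
}

int main(void)
{
    uint32_t state = 0xDEADBEEFu;
    uint32_t saved = state;   /* COPY_* registers                       */
    state ^= 0x200;           /* processed length in bits, as A0 above  */
    state = toy_perm(state);  /* ONE_ROUND_BIG x 4                      */
    state ^= saved;           /* feed-forward, as in STEP_BIG(COPY_...) */
    printf("toy final state: 0x%08X\n", (unsigned)state);
    return 0;
}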
- WB0 = hash.h8[6]; - WB1 = hash.h8[7]; - WC0 = 0x80; - WC1 = 0; - WD0 = 0; - WD1 = 0; - WE0 = 0; - WE1 = 0x200000000000000; - WF0 = 0x200; - WF1 = 0; - - for (unsigned u = 0; u < 10; u ++) { - BIG_ROUND; - } - - hash.h8[0] ^= Vb00 ^ W00 ^ W80; - hash.h8[1] ^= Vb01 ^ W01 ^ W81; - hash.h8[2] ^= Vb10 ^ W10 ^ W90; - hash.h8[3] ^= Vb11 ^ W11 ^ W91; - hash.h8[4] ^= Vb20 ^ W20 ^ WA0; - hash.h8[5] ^= Vb21 ^ W21 ^ WA1; - hash.h8[6] ^= Vb30 ^ W30 ^ WB0; - hash.h8[7] ^= Vb31 ^ W31 ^ WB1; - - } - - // hamsi - - { - - sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; - sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; - sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; - sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; - sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; - sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; - sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; - -#define buf(u) hash.h1[i + u] - for(int i = 0; i < 64; i += 8) { - INPUT_BIG; - P_BIG; - T_BIG; - } -#undef buf -#define buf(u) (u == 0 ? 0x80 : 0) - INPUT_BIG; - P_BIG; - T_BIG; -#undef buf -#define buf(u) (u == 6 ? 2 : 0) - INPUT_BIG; - PF_BIG; - T_BIG; - - for (unsigned u = 0; u < 16; u ++) - hash.h4[u] = h[u]; + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } - } + barrier(CLK_LOCAL_MEM_FENCE); //mixtab __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256]; - init = get_local_id(0); - step = get_local_size(0); + for (int i = init; i < 256; i += step) { mixtab0[i] = mixtab0_c[i]; @@ -906,95 +1033,196 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo mixtab2[i] = mixtab2_c[i]; mixtab3[i] = mixtab3_c[i]; } - barrier(CLK_GLOBAL_MEM_FENCE); - // fugue - - { + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < 8; i++) + hash->h8[i] = hashes[gid-offset].h8[i]; + + // echo + sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; + sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; + Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; + Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; + + sph_u32 K0 = 512; + sph_u32 K1 = 0; + sph_u32 K2 = 0; + sph_u32 K3 = 0; + + W00 = Vb00; + W01 = Vb01; + W10 = Vb10; + W11 = Vb11; + W20 = Vb20; + W21 = Vb21; + W30 = Vb30; + W31 = Vb31; + W40 = Vb40; + W41 = Vb41; + W50 = Vb50; + W51 = Vb51; + W60 = Vb60; + W61 = Vb61; + W70 = Vb70; + W71 = Vb71; + W80 = hash->h8[0]; + W81 = hash->h8[1]; + W90 = hash->h8[2]; + W91 = hash->h8[3]; + WA0 = hash->h8[4]; + WA1 = hash->h8[5]; + WB0 = hash->h8[6]; + WB1 = hash->h8[7]; + WC0 = 0x80; + WC1 = 0; + WD0 = 0; + WD1 = 0; + WE0 = 0; + WE1 = 0x200000000000000; + WF0 = 0x200; + WF1 = 0; + + for (unsigned u = 0; u < 10; u ++) + BIG_ROUND; + + hash->h8[0] ^= Vb00 ^ W00 ^ W80; + hash->h8[1] ^= Vb01 ^ W01 ^ W81; + hash->h8[2] ^= Vb10 ^ W10 ^ W90; + hash->h8[3] ^= Vb11 ^ W11 ^ W91; + hash->h8[4] ^= Vb20 ^ W20 ^ WA0; + hash->h8[5] ^= Vb21 ^ W21 ^ WA1; + hash->h8[6] ^= Vb30 ^ W30 ^ WB0; + hash->h8[7] ^= Vb31 ^ W31 ^ WB1; + + // hamsi + __local sph_u32 T512_L[1024]; + __constant const sph_u32 *T512_C = &T512[0][0]; + + for (int i = init; i < 1024; i += step) + 
T512_L[i] = T512_C[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; + sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; + sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; + sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + + #define buf(u) hash->h1[i + u] + + for(int i = 0; i < 64; i += 8) + { + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + } + + #undef buf + #define buf(u) (u == 0 ? 0x80 : 0) + + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + + #undef buf + #define buf(u) (u == 6 ? 2 : 0) + + INPUT_BIG_LOCAL; + PF_BIG; + T_BIG; + + for (unsigned u = 0; u < 16; u ++) + hash->h4[u] = h[u]; - sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; - sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; - sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; - sph_u32 S30, S31, S32, S33, S34, S35; - - ulong fc_bit_count = (sph_u64) 64 << 3; - - S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; - S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); - S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); - S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); - S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); - - FUGUE512_3((hash.h4[0x0]), (hash.h4[0x1]), (hash.h4[0x2])); - FUGUE512_3((hash.h4[0x3]), (hash.h4[0x4]), (hash.h4[0x5])); - FUGUE512_3((hash.h4[0x6]), (hash.h4[0x7]), (hash.h4[0x8])); - FUGUE512_3((hash.h4[0x9]), (hash.h4[0xA]), (hash.h4[0xB])); - FUGUE512_3((hash.h4[0xC]), (hash.h4[0xD]), (hash.h4[0xE])); - FUGUE512_3((hash.h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x); - - // apply round shift if necessary - int i; - - for (i = 0; i < 32; i ++) { - ROR3; - CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); - SMIX(S00, S01, S02, S03); - } - for (i = 0; i < 13; i ++) { - S04 ^= S00; - S09 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S18 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S27 ^= S00; - ROR9; - SMIX(S00, S01, S02, S03); - S04 ^= S00; - S10 ^= S00; - S19 ^= S00; - S28 ^= S00; - ROR8; - SMIX(S00, S01, S02, S03); - } + // fugue + sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; + sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; + sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; + sph_u32 S30, S31, S32, S33, S34, S35; + + ulong fc_bit_count = (sph_u64) 0x200; + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); + S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); + S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); + S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); 
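Note: the three successive buf(u) definitions above stage hamsi's input: first the 64 hash bytes, then an 8-byte block beginning with the 0x80 padding byte, then a final block whose bytes 6 and 7 carry the message bit length (0x0200 = 512, big-endian). A small check of that length encoding, assuming only the byte layout the macros show:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Final hamsi block per the last buf(u) macro: all zero except
     * byte 6, which is 2, so bytes 6..7 read 0x0200 big-endian.    */
    uint8_t final_block[8] = {0};
    final_block[6] = 2;
    unsigned bitlen = ((unsigned)final_block[6] << 8) | final_block[7];
    printf("hamsi final block encodes %u message bits\n", bitlen);
    return 0;
}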
S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); + + FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2])); + FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5])); + FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8])); + FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB])); + FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE])); + FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x); + + // apply round shift if necessary + int i; + + for (i = 0; i < 32; i ++) + { + ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + } + + for (i = 0; i < 13; i ++) + { S04 ^= S00; S09 ^= S00; S18 ^= S00; S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + ROR8; + SMIX(S00, S01, S02, S03); + } + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + hash->h4[0] = SWAP4(S01); + hash->h4[1] = SWAP4(S02); + hash->h4[2] = SWAP4(S03); + hash->h4[3] = SWAP4(S04); + hash->h4[4] = SWAP4(S09); + hash->h4[5] = SWAP4(S10); + hash->h4[6] = SWAP4(S11); + hash->h4[7] = SWAP4(S12); + hash->h4[8] = SWAP4(S18); + hash->h4[9] = SWAP4(S19); + hash->h4[10] = SWAP4(S20); + hash->h4[11] = SWAP4(S21); + hash->h4[12] = SWAP4(S27); + hash->h4[13] = SWAP4(S28); + hash->h4[14] = SWAP4(S29); + hash->h4[15] = SWAP4(S30); + + bool result = (hash->h8[3] <= target); + if (result) + output[atomic_inc(output+0xFF)] = SWAP4(gid); - hash.h4[0] = SWAP4(S01); - hash.h4[1] = SWAP4(S02); - hash.h4[2] = SWAP4(S03); - hash.h4[3] = SWAP4(S04); - hash.h4[4] = SWAP4(S09); - hash.h4[5] = SWAP4(S10); - hash.h4[6] = SWAP4(S11); - hash.h4[7] = SWAP4(S12); - hash.h4[8] = SWAP4(S18); - hash.h4[9] = SWAP4(S19); - hash.h4[10] = SWAP4(S20); - hash.h4[11] = SWAP4(S21); - hash.h4[12] = SWAP4(S27); - hash.h4[13] = SWAP4(S28); - hash.h4[14] = SWAP4(S29); - hash.h4[15] = SWAP4(S30); - - } - - bool result = (hash.h8[3] <= target); - if (result) - output[atomic_inc(output+0xFF)] = SWAP4(gid); - - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); } #endif // X13MOD_CL diff --git a/kernel/x14.cl b/kernel/x14.cl new file mode 100644 index 00000000..796ebc05 --- /dev/null +++ b/kernel/x14.cl @@ -0,0 +1,1338 @@ +/* + * X14 kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 phm + * Copyright (c) 2014 Girino Vey + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author phm + */ + +#ifndef X14_CL +#define X14_CL + +#define DEBUG(x) + +#if __ENDIAN_LITTLE__ + #define SPH_LITTLE_ENDIAN 1 +#else + #define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ + typedef unsigned long long sph_u64; + typedef long long sph_s64; +#else + typedef unsigned long sph_u64; + typedef long sph_s64; +#endif + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +#define SPH_ECHO_64 1 +#define SPH_KECCAK_64 1 +#define SPH_JH_64 1 +#define SPH_SIMD_NOCOPY 0 +#define SPH_KECCAK_NOCOPY 0 +#define SPH_COMPACT_BLAKE_64 0 +#define SPH_LUFFA_PARALLEL 0 +#define SPH_SMALL_FOOTPRINT_GROESTL 0 +#define SPH_GROESTL_BIG_ENDIAN 0 +#define SPH_CUBEHASH_UNROLL 0 +#define SPH_KECCAK_UNROLL 1 +#ifndef SPH_HAMSI_EXPAND_BIG + #define SPH_HAMSI_EXPAND_BIG 1 +#endif +#ifndef SPH_HAMSI_SHORT + #define SPH_HAMSI_SHORT 1 +#endif + +#include "blake.cl" +#include "bmw.cl" +#include "groestl.cl" +#include "jh.cl" +#include "keccak.cl" +#include "skein.cl" +#include "luffa.cl" +#include "cubehash.cl" +#include "shavite.cl" +#include "simd.cl" +#include "echo.cl" +#include "hamsi.cl" +#include "fugue.cl" +#include "shabal.cl" + +#define SWAP4(x) as_uint(as_uchar4(x).wzyx) +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) + +#if SPH_BIG_ENDIAN + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)); + #define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x)); +#else + #define DEC64E(x) SWAP8(x) + #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC32LE(x) (*(const __global sph_u32 *) (x)); +#endif + +#define SHL(x, n) ((x) << (n)) +#define SHR(x, n) ((x) >> (n)) + +#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ + q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ + q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ + q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + +typedef union { + unsigned char h1[64]; + uint h4[16]; + ulong h8[8]; +} hash_t; + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global unsigned char* block, __global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // blake + sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); + sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); + sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); + sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179); + sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; + sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF; + + if
((T0 = SPH_T64(T0 + 1024)) < 1024) + T1 = SPH_T64(T1 + 1); + + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; + + M0 = DEC64BE(block + 0); + M1 = DEC64BE(block + 8); + M2 = DEC64BE(block + 16); + M3 = DEC64BE(block + 24); + M4 = DEC64BE(block + 32); + M5 = DEC64BE(block + 40); + M6 = DEC64BE(block + 48); + M7 = DEC64BE(block + 56); + M8 = DEC64BE(block + 64); + M9 = DEC64BE(block + 72); + M9 &= 0xFFFFFFFF00000000; + M9 ^= SWAP4(gid); + MA = 0x8000000000000000; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 0x280; + + COMPRESS64; + + hash->h8[0] = H0; + hash->h8[1] = H1; + hash->h8[2] = H2; + hash->h8[3] = H3; + hash->h8[4] = H4; + hash->h8[5] = H5; + hash->h8[6] = H6; + hash->h8[7] = H7; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // bmw + sph_u64 BMW_H[16]; + +#pragma unroll 16 + for(unsigned u = 0; u < 16; u++) + BMW_H[u] = BMW_IV512[u]; + + sph_u64 mv[16],q[32]; + sph_u64 tmp; + + mv[0] = SWAP8(hash->h8[0]); + mv[1] = SWAP8(hash->h8[1]); + mv[2] = SWAP8(hash->h8[2]); + mv[3] = SWAP8(hash->h8[3]); + mv[4] = SWAP8(hash->h8[4]); + mv[5] = SWAP8(hash->h8[5]); + mv[6] = SWAP8(hash->h8[6]); + mv[7] = SWAP8(hash->h8[7]); + mv[8] = 0x80; + mv[9] = 0; + mv[10] = 0; + mv[11] = 0; + mv[12] = 0; + mv[13] = 0; + mv[14] = 0; + mv[15] = SPH_C64(512); + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ 
tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = 
CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) + { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; + } + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ 
BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int 
i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + hash->h8[0] = SWAP8(BMW_H[8]); + hash->h8[1] = SWAP8(BMW_H[9]); + hash->h8[2] = SWAP8(BMW_H[10]); + hash->h8[3] = SWAP8(BMW_H[11]); + hash->h8[4] = SWAP8(BMW_H[12]); + hash->h8[5] = SWAP8(BMW_H[13]); + hash->h8[6] = SWAP8(BMW_H[14]); + hash->h8[7] = SWAP8(BMW_H[15]); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + T0_L[i] = T0[i]; + T4_L[i] = T4[i]; + T1_L[i] = T1[i]; + T2_L[i] = T2[i]; + T3_L[i] = T3[i]; + T5_L[i] = T5[i]; + T6_L[i] = T6[i]; + T7_L[i] = T7[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + #define T0 T0_L + #define T1 T1_L + #define T2 T2_L + #define T3 T3_L + #define T4 T4_L + #define T5 T5_L + #define T6 T6_L + #define T7 T7_L + + // groestl + sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + + sph_u64 g[16], m[16]; + g[0] = m[0] = DEC64E(hash->h8[0]); + g[1] = m[1] = DEC64E(hash->h8[1]); + g[2] = m[2] = DEC64E(hash->h8[2]); + g[3] = m[3] = DEC64E(hash->h8[3]); + g[4] = m[4] = DEC64E(hash->h8[4]); + g[5] = m[5] = DEC64E(hash->h8[5]); + g[6] = m[6] = DEC64E(hash->h8[6]); + g[7] = m[7] = DEC64E(hash->h8[7]); + g[8] = m[8] = 0x80; + g[9] = m[9] = 0; + g[10] = m[10] = 0; + g[11] = m[11] = 0; + g[12] = m[12] = 0; + g[13] = m[13] = 0; + g[14] = m[14] 
= 0; + g[15] = 0x102000000000000; + m[15] = 0x100000000000000; + + PERM_BIG_P(g); + PERM_BIG_Q(m); + + sph_u64 xH[16]; + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u] ^= g[u] ^ m[u]; + + PERM_BIG_P(xH); + + for (unsigned int u = 8; u < 16; u ++) + hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // skein + + sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); + sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u64 bcount = 0; + + m0 = SWAP8(hash->h8[0]); + m1 = SWAP8(hash->h8[1]); + m2 = SWAP8(hash->h8[2]); + m3 = SWAP8(hash->h8[3]); + m4 = SWAP8(hash->h8[4]); + m5 = SWAP8(hash->h8[5]); + m6 = SWAP8(hash->h8[6]); + m7 = SWAP8(hash->h8[7]); + + UBI_BIG(480, 64); + + bcount = 0; + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; + + UBI_BIG(510, 8); + + hash->h8[0] = SWAP8(h0); + hash->h8[1] = SWAP8(h1); + hash->h8[2] = SWAP8(h2); + hash->h8[3] = SWAP8(h3); + hash->h8[4] = SWAP8(h4); + hash->h8[5] = SWAP8(h5); + hash->h8[6] = SWAP8(h6); + hash->h8[7] = SWAP8(h7); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // jh + + sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); + sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); + sph_u64 tmp; + + for(int i = 0; i < 2; i++) + { + if (i == 0) + { + h0h ^= DEC64E(hash->h8[0]); + h0l ^= DEC64E(hash->h8[1]); + h1h ^= DEC64E(hash->h8[2]); + h1l ^= DEC64E(hash->h8[3]); + h2h ^= DEC64E(hash->h8[4]); + h2l ^= DEC64E(hash->h8[5]); + h3h ^= DEC64E(hash->h8[6]); + h3l ^= DEC64E(hash->h8[7]); + } + else if(i == 1) + { + h4h ^= DEC64E(hash->h8[0]); + h4l ^= DEC64E(hash->h8[1]); + h5h ^= DEC64E(hash->h8[2]); + h5l ^= DEC64E(hash->h8[3]); + h6h ^= DEC64E(hash->h8[4]); + h6l ^= DEC64E(hash->h8[5]); + h7h ^= DEC64E(hash->h8[6]); + h7l ^= DEC64E(hash->h8[7]); + + h0h ^= 0x80; + h3l ^= 0x2000000000000; + } + E8; + } + h4h ^= 0x80; + h7l ^= 0x2000000000000; + + hash->h8[0] = DEC64E(h4h); + hash->h8[1] = DEC64E(h4l); + hash->h8[2] = DEC64E(h5h); + hash->h8[3] = DEC64E(h5l); + hash->h8[4] = DEC64E(h6h); + hash->h8[5] = DEC64E(h6l); + hash->h8[6] = DEC64E(h7h); + hash->h8[7] = DEC64E(h7l); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // keccak + + sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; + sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; + sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; + 
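Note: search2 above has the standard Groestl shape: the compression is H' = P(H ^ m) ^ Q(m) ^ H (the g array already holds H ^ m, which is why g[15] = 0x102000000000000 while the padded message word is m[15] = 0x100000000000000), and the output transform is the truncated P(H') ^ H' computed in the xH pass; the lone nonzero IV word, 0x0002000000000000, is the byte-order-adjusted encoding of the 512-bit output length. A toy model of that structure in plain C, with made-up stand-in permutations:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for Groestl's P and Q permutations. */
static uint64_t toy_P(uint64_t x) { return x * 0x9E3779B97F4A7C15ull + 1; }
static uint64_t toy_Q(uint64_t x) { return x ^ (x >> 29) ^ 0x0123456789ABCDEFull; }

int main(void)
{
    uint64_t H = 0x0002000000000000ull;         /* length-encoding IV word */
    uint64_t m = 0x1122334455667788ull;         /* one message word        */
    uint64_t Hp  = toy_P(H ^ m) ^ toy_Q(m) ^ H; /* compression             */
    uint64_t out = toy_P(Hp) ^ Hp;              /* output transform, then truncated */
    printf("toy groestl out: %016llX\n", (unsigned long long)out);
    return 0;
}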
sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; + sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; + + a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); + + a00 ^= SWAP8(hash->h8[0]); + a10 ^= SWAP8(hash->h8[1]); + a20 ^= SWAP8(hash->h8[2]); + a30 ^= SWAP8(hash->h8[3]); + a40 ^= SWAP8(hash->h8[4]); + a01 ^= SWAP8(hash->h8[5]); + a11 ^= SWAP8(hash->h8[6]); + a21 ^= SWAP8(hash->h8[7]); + a31 ^= 0x8000000000000001; + KECCAK_F_1600; + + // Finalize the "lane complement" + a10 = ~a10; + a20 = ~a20; + + hash->h8[0] = SWAP8(a00); + hash->h8[1] = SWAP8(a10); + hash->h8[2] = SWAP8(a20); + hash->h8[3] = SWAP8(a30); + hash->h8[4] = SWAP8(a40); + hash->h8[5] = SWAP8(a01); + hash->h8[6] = SWAP8(a11); + hash->h8[7] = SWAP8(a21); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search6(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // luffa + + sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465), V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb); + sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3), V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581); + sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05), V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7); + sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67), V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce); + sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363), V44 = SPH_C32(0xf5df3999), V45 = SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea); + + DECL_TMP8(M); + + M0 = hash->h4[1]; + M1 = hash->h4[0]; + M2 = hash->h4[3]; + M3 = hash->h4[2]; + M4 = hash->h4[5]; + M5 = hash->h4[4]; + M6 = hash->h4[7]; + M7 = hash->h4[6]; + + for(uint i = 0; i < 5; i++) + { + MI5; + LUFFA_P5; + + if(i == 0) + { + M0 = hash->h4[9]; + M1 = hash->h4[8]; + M2 = hash->h4[11]; + M3 = hash->h4[10]; + M4 = hash->h4[13]; + M5 = hash->h4[12]; + M6 = hash->h4[15]; + M7 = hash->h4[14]; + } + else if(i == 1) + { + M0 = 0x80000000; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + } + else if(i == 2) + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + else if(i == 3) + { + hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; + } + } + + hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[15] = V06 ^ V16 
^ V26 ^ V36 ^ V46; + hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search7(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // cubehash.h1 + + sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E); + sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695); + sph_u32 x8 = SPH_C32(0x4D42C787), x9 = SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537); + sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE); + sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = SPH_C32(0xB6444532); + sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9); + sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576); + sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44); + + x0 ^= SWAP4(hash->h4[1]); + x1 ^= SWAP4(hash->h4[0]); + x2 ^= SWAP4(hash->h4[3]); + x3 ^= SWAP4(hash->h4[2]); + x4 ^= SWAP4(hash->h4[5]); + x5 ^= SWAP4(hash->h4[4]); + x6 ^= SWAP4(hash->h4[7]); + x7 ^= SWAP4(hash->h4[6]); + + for (int i = 0; i < 13; i ++) + { + SIXTEEN_ROUNDS; + + if (i == 0) + { + x0 ^= SWAP4(hash->h4[9]); + x1 ^= SWAP4(hash->h4[8]); + x2 ^= SWAP4(hash->h4[11]); + x3 ^= SWAP4(hash->h4[10]); + x4 ^= SWAP4(hash->h4[13]); + x5 ^= SWAP4(hash->h4[12]); + x6 ^= SWAP4(hash->h4[15]); + x7 ^= SWAP4(hash->h4[14]); + } + else if(i == 1) + x0 ^= 0x80; + else if (i == 2) + xv ^= SPH_C32(1); + } + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + hash->h4[8] = x8; + hash->h4[9] = x9; + hash->h4[10] = xa; + hash->h4[11] = xb; + hash->h4[12] = xc; + hash->h4[13] = xd; + hash->h4[14] = xe; + hash->h4[15] = xf; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search8(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // shavite + // IV + sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC); + sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC); + sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47); + sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A); + + // state + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + + sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + + rk00 = hash->h4[0]; + rk01 = 
hash->h4[1]; + rk02 = hash->h4[2]; + rk03 = hash->h4[3]; + rk04 = hash->h4[4]; + rk05 = hash->h4[5]; + rk06 = hash->h4[6]; + rk07 = hash->h4[7]; + rk08 = hash->h4[8]; + rk09 = hash->h4[9]; + rk0A = hash->h4[10]; + rk0B = hash->h4[11]; + rk0C = hash->h4[12]; + rk0D = hash->h4[13]; + rk0E = hash->h4[14]; + rk0F = hash->h4[15]; + rk10 = 0x80; + rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0; + rk1B = 0x2000000; + rk1C = rk1D = rk1E = 0; + rk1F = 0x2000000; + + c512(buf); + + hash->h4[0] = h0; + hash->h4[1] = h1; + hash->h4[2] = h2; + hash->h4[3] = h3; + hash->h4[4] = h4; + hash->h4[5] = h5; + hash->h4[6] = h6; + hash->h4[7] = h7; + hash->h4[8] = h8; + hash->h4[9] = h9; + hash->h4[10] = hA; + hash->h4[11] = hB; + hash->h4[12] = hC; + hash->h4[13] = hD; + hash->h4[14] = hE; + hash->h4[15] = hF; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search9(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // simd + s32 q[256]; + unsigned char x[128]; + for(unsigned int i = 0; i < 64; i++) + x[i] = hash->h1[i]; + for(unsigned int i = 64; i < 128; i++) + x[i] = 0; + + u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); + u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); + u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); + u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); + + FFT256(0, 1, 0, ll1); + for (int i = 0; i < 256; i ++) + { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } + + A0 ^= hash->h4[0]; + A1 ^= hash->h4[1]; + A2 ^= hash->h4[2]; + A3 ^= hash->h4[3]; + A4 ^= hash->h4[4]; + A5 ^= hash->h4[5]; + A6 ^= hash->h4[6]; + A7 ^= hash->h4[7]; + B0 ^= hash->h4[8]; + B1 ^= hash->h4[9]; + B2 ^= hash->h4[10]; + B3 ^= hash->h4[11]; + B4 ^= hash->h4[12]; + B5 ^= hash->h4[13]; + B6 ^= hash->h4[14]; + B7 ^= hash->h4[15]; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + IF, 4, 13, PP8_4_); + + STEP_BIG( + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + IF, 13, 10, PP8_5_); + + STEP_BIG( + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + IF, 10, 25, PP8_6_); + + STEP_BIG( + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), + IF, 25, 4, PP8_0_); + + u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; + u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; + u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; + u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; + + #define q SIMD_Q + + A0 ^= 0x200; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + COPY_A0, COPY_A1, COPY_A2, COPY_A3, + COPY_A4, COPY_A5, COPY_A6, COPY_A7, + IF, 4, 13, PP8_4_); + + STEP_BIG( + COPY_B0, COPY_B1, COPY_B2, COPY_B3, + COPY_B4, COPY_B5, COPY_B6, COPY_B7, + IF, 13, 10, PP8_5_); + + STEP_BIG( + COPY_C0, COPY_C1, COPY_C2, COPY_C3, + COPY_C4, COPY_C5, COPY_C6, COPY_C7, + IF, 10, 25, PP8_6_); + + STEP_BIG( + COPY_D0, COPY_D1, COPY_D2, COPY_D3, + COPY_D4, COPY_D5, COPY_D6, COPY_D7, + IF, 25, 4, PP8_0_); + + #undef q + + hash->h4[0] = A0; + hash->h4[1] = A1; + hash->h4[2] = A2; + hash->h4[3] = A3; + hash->h4[4] = A4; + hash->h4[5] = A5; + hash->h4[6] = A6; + hash->h4[7] = A7; + hash->h4[8] = B0; + hash->h4[9] = B1; + hash->h4[10] = B2; + hash->h4[11] = B3; + hash->h4[12] = B4; + hash->h4[13] = B5; + hash->h4[14] = B6; + hash->h4[15] = B7; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search10(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + uint offset = get_global_offset(0); + hash_t hash; + __global hash_t *hashp = &(hashes[gid-offset]); + + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < 8; i++) + hash.h8[i] = hashes[gid-offset].h8[i]; + + // echo + sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; + sph_u64 Vb00, Vb01, 
Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; + Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; + Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; + + sph_u32 K0 = 512; + sph_u32 K1 = 0; + sph_u32 K2 = 0; + sph_u32 K3 = 0; + + W00 = Vb00; + W01 = Vb01; + W10 = Vb10; + W11 = Vb11; + W20 = Vb20; + W21 = Vb21; + W30 = Vb30; + W31 = Vb31; + W40 = Vb40; + W41 = Vb41; + W50 = Vb50; + W51 = Vb51; + W60 = Vb60; + W61 = Vb61; + W70 = Vb70; + W71 = Vb71; + W80 = hash.h8[0]; + W81 = hash.h8[1]; + W90 = hash.h8[2]; + W91 = hash.h8[3]; + WA0 = hash.h8[4]; + WA1 = hash.h8[5]; + WB0 = hash.h8[6]; + WB1 = hash.h8[7]; + WC0 = 0x80; + WC1 = 0; + WD0 = 0; + WD1 = 0; + WE0 = 0; + WE1 = 0x200000000000000; + WF0 = 0x200; + WF1 = 0; + + for (unsigned u = 0; u < 10; u ++) + BIG_ROUND; + + hashp->h8[0] = hash.h8[0] ^ Vb00 ^ W00 ^ W80; + hashp->h8[1] = hash.h8[1] ^ Vb01 ^ W01 ^ W81; + hashp->h8[2] = hash.h8[2] ^ Vb10 ^ W10 ^ W90; + hashp->h8[3] = hash.h8[3] ^ Vb11 ^ W11 ^ W91; + hashp->h8[4] = hash.h8[4] ^ Vb20 ^ W20 ^ WA0; + hashp->h8[5] = hash.h8[5] ^ Vb21 ^ W21 ^ WA1; + hashp->h8[6] = hash.h8[6] ^ Vb30 ^ W30 ^ WB0; + hashp->h8[7] = hash.h8[7] ^ Vb31 ^ W31 ^ WB1; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search11(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __local sph_u32 T512_L[1024]; + __constant const sph_u32 *T512_C = &T512[0][0]; + + int init = get_local_id(0); + int step = get_local_size(0); + for (int i = init; i < 1024; i += step) + T512_L[i] = T512_C[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; + sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; + sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; + sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + + #define buf(u) hash->h1[i + u] + + for(int i = 0; i < 64; i += 8) { + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + } + + #undef buf + #define buf(u) (u == 0 ? 0x80 : 0) + + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + + #undef buf + #define buf(u) (u == 6 ? 
2 : 0) + + INPUT_BIG_LOCAL; + PF_BIG; + T_BIG; + + for (unsigned u = 0; u < 16; u ++) + hash->h4[u] = h[u]; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search12(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + uint offset = get_global_offset(0); + __global hash_t *hash = &(hashes[gid-offset]); + + // mixtab + __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256]; + int init = get_local_id(0); + int step = get_local_size(0); + for (int i = init; i < 256; i += step) + { + mixtab0[i] = mixtab0_c[i]; + mixtab1[i] = mixtab1_c[i]; + mixtab2[i] = mixtab2_c[i]; + mixtab3[i] = mixtab3_c[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // fugue + sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; + sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; + sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; + sph_u32 S30, S31, S32, S33, S34, S35; + + ulong fc_bit_count = (sph_u64) 64 << 3; + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); + S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); + S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); + S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); + + FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2])); + FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5])); + FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8])); + FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB])); + FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE])); + FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x); + + // apply round shift if necessary + int i; + + for (i = 0; i < 32; i ++) + { + ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + } + + for (i = 0; i < 13; i ++) + { + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + ROR8; + SMIX(S00, S01, S02, S03); + } + + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + hash->h4[0] = SWAP4(S01); + hash->h4[1] = SWAP4(S02); + hash->h4[2] = SWAP4(S03); + hash->h4[3] = SWAP4(S04); + hash->h4[4] = SWAP4(S09); + hash->h4[5] = SWAP4(S10); + hash->h4[6] = SWAP4(S11); + hash->h4[7] = SWAP4(S12); + hash->h4[8] = SWAP4(S18); + hash->h4[9] = SWAP4(S19); + hash->h4[10] = SWAP4(S20); + hash->h4[11] = SWAP4(S21); + hash->h4[12] = SWAP4(S27); + hash->h4[13] = SWAP4(S28); + hash->h4[14] = SWAP4(S29); + hash->h4[15] = SWAP4(S30); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search13(__global hash_t* hashes, __global uint* output, const ulong target) +{ + uint gid = get_global_id(0); + uint offset = get_global_offset(0); + __global hash_t *hash = &(hashes[gid-offset]); + + // shabal + sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[2], A03 = A_init_512[3], A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[6], A07 = A_init_512[7], + A08 =
A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11]; + sph_u32 B0 = B_init_512[0], B1 = B_init_512[1], B2 = B_init_512[2], B3 = B_init_512[3], B4 = B_init_512[4], B5 = B_init_512[5], B6 = B_init_512[6], B7 = B_init_512[7], + B8 = B_init_512[8], B9 = B_init_512[9], BA = B_init_512[10], BB = B_init_512[11], BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15]; + sph_u32 C0 = C_init_512[0], C1 = C_init_512[1], C2 = C_init_512[2], C3 = C_init_512[3], C4 = C_init_512[4], C5 = C_init_512[5], C6 = C_init_512[6], C7 = C_init_512[7], + C8 = C_init_512[8], C9 = C_init_512[9], CA = C_init_512[10], CB = C_init_512[11], CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15]; + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + sph_u32 Wlow = 1, Whigh = 0; + + M0 = hash->h4[0]; + M1 = hash->h4[1]; + M2 = hash->h4[2]; + M3 = hash->h4[3]; + M4 = hash->h4[4]; + M5 = hash->h4[5]; + M6 = hash->h4[6]; + M7 = hash->h4[7]; + M8 = hash->h4[8]; + M9 = hash->h4[9]; + MA = hash->h4[10]; + MB = hash->h4[11]; + MC = hash->h4[12]; + MD = hash->h4[13]; + ME = hash->h4[14]; + MF = hash->h4[15]; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + + M0 = 0x80; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + + for (unsigned i = 0; i < 3; i ++) + { + SWAP_BC; + XOR_W; + APPLY_P; + } + + hash->h4[0] = B0; + hash->h4[1] = B1; + hash->h4[2] = B2; + hash->h4[3] = B3; + hash->h4[4] = B4; + hash->h4[5] = B5; + hash->h4[6] = B6; + hash->h4[7] = B7; + hash->h4[8] = B8; + hash->h4[9] = B9; + hash->h4[10] = BA; + hash->h4[11] = BB; + hash->h4[12] = BC; + hash->h4[13] = BD; + hash->h4[14] = BE; + hash->h4[15] = BF; + + bool result = (hash->h8[3] <= target); + if (result) + output[atomic_inc(output+0xFF)] = SWAP4(gid); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +#endif // X14_CL \ No newline at end of file diff --git a/kernel/x14old.cl b/kernel/x14old.cl new file mode 100644 index 00000000..315c3b58 --- /dev/null +++ b/kernel/x14old.cl @@ -0,0 +1,1294 @@ +/* + * X14 kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 phm + * Copyright (c) 2014 Girino Vey + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author phm + */ + +#ifndef X14OLD_CL +#define X14OLD_CL + +#if __ENDIAN_LITTLE__ +#define SPH_LITTLE_ENDIAN 1 +#else +#define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ +typedef unsigned long long sph_u64; +typedef long long sph_s64; +#else +typedef unsigned long sph_u64; +typedef long sph_s64; +#endif + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +#define SPH_ECHO_64 1 +#define SPH_KECCAK_64 1 +#define SPH_JH_64 1 +#define SPH_SIMD_NOCOPY 0 +#define SPH_KECCAK_NOCOPY 0 +#define SPH_COMPACT_BLAKE_64 0 +#define SPH_LUFFA_PARALLEL 0 +#define SPH_SMALL_FOOTPRINT_GROESTL 0 +#define SPH_GROESTL_BIG_ENDIAN 0 +#define SPH_CUBEHASH_UNROLL 0 +#define SPH_KECCAK_UNROLL 1 +#ifndef SPH_HAMSI_EXPAND_BIG + #define SPH_HAMSI_EXPAND_BIG 4 +#endif + +#include "blake.cl" +#include "bmw.cl" +#include "groestl.cl" +#include "jh.cl" +#include "keccak.cl" +#include "skein.cl" +#include "luffa.cl" +#include "cubehash.cl" +#include "shavite.cl" +#include "simd.cl" +#include "echo.cl" +#include "hamsi.cl" +#include "fugue.cl" +#include "shabal.cl" + +#define SWAP4(x) as_uint(as_uchar4(x).wzyx) +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) + +#if SPH_BIG_ENDIAN + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)) +#else + #define DEC64E(x) SWAP8(x) + #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)) +#endif + +typedef union { + unsigned char h1[64]; + uint h4[16]; + ulong h8[8]; +} hash_t; + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global unsigned char* block, __global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // blake + sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); + sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); + sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); + sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179); + sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; + sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + + if ((T0 = SPH_T64(T0 + 1024)) < 1024) + T1 = SPH_T64(T1 + 1); + + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; + + M0 = DEC64BE(block + 0); + M1 = DEC64BE(block + 8); + M2 = DEC64BE(block + 16); + M3 = DEC64BE(block + 24); + M4 = DEC64BE(block + 32); + M5 = DEC64BE(block + 40); + M6 = DEC64BE(block + 48); + M7 = DEC64BE(block + 56); + M8 = DEC64BE(block + 64); + M9 = DEC64BE(block + 72); + M9 &= 0xFFFFFFFF00000000; + M9 ^= SWAP4(gid); + MA = 0x8000000000000000; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 0x280; + + COMPRESS64; + + hash->h8[0] = H0; + hash->h8[1] = H1; + hash->h8[2] = H2; + hash->h8[3] = H3; + hash->h8[4] = H4; + hash->h8[5] = H5; + hash->h8[6] = H6; + hash->h8[7] = H7; + + barrier(CLK_GLOBAL_MEM_FENCE); +} +
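Every stage below follows the pattern established by search(): read the previous stage's 64-byte digest out of the shared hashes buffer through whichever hash_t view suits the algorithm (h1 bytes, h4 32-bit words, h8 64-bit words), then overwrite it in place. A minimal host-side C sketch of that union aliasing, illustrative only and not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same layout as the kernel-side hash_t: three views of one 64-byte digest. */
typedef union {
    unsigned char h1[64];
    uint32_t h4[16];
    uint64_t h8[8];
} hash_t;

int main(void)
{
    hash_t h;
    memset(&h, 0, sizeof h);
    h.h8[0] = 0x1122334455667788ULL;           /* write one 64-bit lane */
    /* On a little-endian host the 32-bit halves come back low word first. */
    printf("%08x %08x\n", h.h4[0], h.h4[1]);   /* prints 55667788 11223344 */
    return 0;
}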
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // bmw + sph_u64 BMW_H[16]; + +#pragma unroll 16 + for(unsigned u = 0; u < 16; u++) + BMW_H[u] = BMW_IV512[u]; + + sph_u64 mv[16],q[32]; + sph_u64 tmp; + + mv[0] = SWAP8(hash->h8[0]); + mv[1] = SWAP8(hash->h8[1]); + mv[2] = SWAP8(hash->h8[2]); + mv[3] = SWAP8(hash->h8[3]); + mv[4] = SWAP8(hash->h8[4]); + mv[5] = SWAP8(hash->h8[5]); + mv[6] = SWAP8(hash->h8[6]); + mv[7] = SWAP8(hash->h8[7]); + mv[8] = 0x80; + mv[9] = 0; + mv[10] = 0; + mv[11] = 0; + mv[12] = 0; + mv[13] = 0; + mv[14] = 0; + mv[15] = SPH_C64(512); + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp 
= (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) 
^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) + { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; + } + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) 
+ BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ 
mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + hash->h8[0] = SWAP8(BMW_H[8]); + hash->h8[1] = SWAP8(BMW_H[9]); + hash->h8[2] = SWAP8(BMW_H[10]); + hash->h8[3] = SWAP8(BMW_H[11]); + hash->h8[4] = SWAP8(BMW_H[12]); + hash->h8[5] = SWAP8(BMW_H[13]); + hash->h8[6] = SWAP8(BMW_H[14]); + hash->h8[7] = SWAP8(BMW_H[15]); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + T0_L[i] = T0[i]; + T4_L[i] = T4[i]; + T1_L[i] = T1[i]; + T2_L[i] = T2[i]; + T3_L[i] = T3[i]; + T5_L[i] = T5[i]; + T6_L[i] = T6[i]; + T7_L[i] = T7[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + #define T0 T0_L + #define T1 T1_L + #define T2 T2_L + #define T3 T3_L + #define T4 T4_L + #define T5 T5_L + #define T6 T6_L + #define T7 T7_L + + // groestl + sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + + sph_u64 g[16], m[16]; + g[0] = m[0] = DEC64E(hash->h8[0]); + g[1] = m[1] = DEC64E(hash->h8[1]); + g[2] = m[2] = DEC64E(hash->h8[2]); + g[3] = m[3] = DEC64E(hash->h8[3]); + g[4] = m[4] = DEC64E(hash->h8[4]); + g[5] = m[5] = DEC64E(hash->h8[5]); + g[6] = m[6] = DEC64E(hash->h8[6]); + g[7] = m[7] = DEC64E(hash->h8[7]); + g[8] = m[8] = 0x80; + g[9] = m[9] = 0; + g[10] = m[10] = 0; + g[11] = m[11] = 0; + g[12] = m[12] = 0; + g[13] = m[13] = 0; + g[14] = m[14] = 0; + g[15] = 0x102000000000000; + m[15] = 0x100000000000000; + + PERM_BIG_P(g); + PERM_BIG_Q(m); + + sph_u64 xH[16]; + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u] ^= g[u] ^ m[u]; + + PERM_BIG_P(xH); + + for (unsigned int u = 8; u < 16; u ++) + hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // skein + + sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); + sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; + 
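// Skein-512 over the 64-byte digest: the first UBI_BIG call absorbs the single message block, the second (with bcount and the message words zeroed) is the output transform. +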
sph_u64 bcount = 0; + + m0 = SWAP8(hash->h8[0]); + m1 = SWAP8(hash->h8[1]); + m2 = SWAP8(hash->h8[2]); + m3 = SWAP8(hash->h8[3]); + m4 = SWAP8(hash->h8[4]); + m5 = SWAP8(hash->h8[5]); + m6 = SWAP8(hash->h8[6]); + m7 = SWAP8(hash->h8[7]); + + UBI_BIG(480, 64); + + bcount = 0; + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; + + UBI_BIG(510, 8); + + hash->h8[0] = SWAP8(h0); + hash->h8[1] = SWAP8(h1); + hash->h8[2] = SWAP8(h2); + hash->h8[3] = SWAP8(h3); + hash->h8[4] = SWAP8(h4); + hash->h8[5] = SWAP8(h5); + hash->h8[6] = SWAP8(h6); + hash->h8[7] = SWAP8(h7); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // jh + + sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); + sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); + sph_u64 tmp; + + for(int i = 0; i < 2; i++) + { + if (i == 0) + { + h0h ^= DEC64E(hash->h8[0]); + h0l ^= DEC64E(hash->h8[1]); + h1h ^= DEC64E(hash->h8[2]); + h1l ^= DEC64E(hash->h8[3]); + h2h ^= DEC64E(hash->h8[4]); + h2l ^= DEC64E(hash->h8[5]); + h3h ^= DEC64E(hash->h8[6]); + h3l ^= DEC64E(hash->h8[7]); + } + else if(i == 1) + { + h4h ^= DEC64E(hash->h8[0]); + h4l ^= DEC64E(hash->h8[1]); + h5h ^= DEC64E(hash->h8[2]); + h5l ^= DEC64E(hash->h8[3]); + h6h ^= DEC64E(hash->h8[4]); + h6l ^= DEC64E(hash->h8[5]); + h7h ^= DEC64E(hash->h8[6]); + h7l ^= DEC64E(hash->h8[7]); + + h0h ^= 0x80; + h3l ^= 0x2000000000000; + } + E8; + } + h4h ^= 0x80; + h7l ^= 0x2000000000000; + + hash->h8[0] = DEC64E(h4h); + hash->h8[1] = DEC64E(h4l); + hash->h8[2] = DEC64E(h5h); + hash->h8[3] = DEC64E(h5l); + hash->h8[4] = DEC64E(h6h); + hash->h8[5] = DEC64E(h6l); + hash->h8[6] = DEC64E(h7h); + hash->h8[7] = DEC64E(h7l); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // keccak + + sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; + sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; + sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; + sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; + sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; + + a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); + + a00 ^= SWAP8(hash->h8[0]); + a10 ^= SWAP8(hash->h8[1]); + a20 ^= SWAP8(hash->h8[2]); + a30 ^= SWAP8(hash->h8[3]); + a40 ^= SWAP8(hash->h8[4]); + a01 ^= SWAP8(hash->h8[5]); + a11 ^= SWAP8(hash->h8[6]); + a21 ^= SWAP8(hash->h8[7]); + a31 ^= 0x8000000000000001; + KECCAK_F_1600; + + // Finalize the "lane complement" + a10 = ~a10; + a20 = ~a20; + + hash->h8[0] = SWAP8(a00); + hash->h8[1] = SWAP8(a10); + hash->h8[2] = SWAP8(a20); + hash->h8[3] = SWAP8(a30); + hash->h8[4] = SWAP8(a40); + hash->h8[5] = SWAP8(a01); + hash->h8[6] = SWAP8(a11); + 
hash->h8[7] = SWAP8(a21); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search6(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // luffa + + sph_u32 V00 = SPH_C32(0x6d251e69), V01 = SPH_C32(0x44b051e0), V02 = SPH_C32(0x4eaa6fb4), V03 = SPH_C32(0xdbf78465), V04 = SPH_C32(0x6e292011), V05 = SPH_C32(0x90152df4), V06 = SPH_C32(0xee058139), V07 = SPH_C32(0xdef610bb); + sph_u32 V10 = SPH_C32(0xc3b44b95), V11 = SPH_C32(0xd9d2f256), V12 = SPH_C32(0x70eee9a0), V13 = SPH_C32(0xde099fa3), V14 = SPH_C32(0x5d9b0557), V15 = SPH_C32(0x8fc944b3), V16 = SPH_C32(0xcf1ccf0e), V17 = SPH_C32(0x746cd581); + sph_u32 V20 = SPH_C32(0xf7efc89d), V21 = SPH_C32(0x5dba5781), V22 = SPH_C32(0x04016ce5), V23 = SPH_C32(0xad659c05), V24 = SPH_C32(0x0306194f), V25 = SPH_C32(0x666d1836), V26 = SPH_C32(0x24aa230a), V27 = SPH_C32(0x8b264ae7); + sph_u32 V30 = SPH_C32(0x858075d5), V31 = SPH_C32(0x36d79cce), V32 = SPH_C32(0xe571f7d7), V33 = SPH_C32(0x204b1f67), V34 = SPH_C32(0x35870c6a), V35 = SPH_C32(0x57e9e923), V36 = SPH_C32(0x14bcb808), V37 = SPH_C32(0x7cde72ce); + sph_u32 V40 = SPH_C32(0x6c68e9be), V41 = SPH_C32(0x5ec41e22), V42 = SPH_C32(0xc825b7c7), V43 = SPH_C32(0xaffb4363), V44 = SPH_C32(0xf5df3999), V45 = SPH_C32(0x0fc688f1), V46 = SPH_C32(0xb07224cc), V47 = SPH_C32(0x03e86cea); + + DECL_TMP8(M); + + M0 = hash->h4[1]; + M1 = hash->h4[0]; + M2 = hash->h4[3]; + M3 = hash->h4[2]; + M4 = hash->h4[5]; + M5 = hash->h4[4]; + M6 = hash->h4[7]; + M7 = hash->h4[6]; + + for(uint i = 0; i < 5; i++) + { + MI5; + LUFFA_P5; + + if(i == 0) + { + M0 = hash->h4[9]; + M1 = hash->h4[8]; + M2 = hash->h4[11]; + M3 = hash->h4[10]; + M4 = hash->h4[13]; + M5 = hash->h4[12]; + M6 = hash->h4[15]; + M7 = hash->h4[14]; + } + else if(i == 1) + { + M0 = 0x80000000; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + } + else if(i == 2) + M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; + else if(i == 3) + { + hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[2] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[5] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[4] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[7] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[6] = V07 ^ V17 ^ V27 ^ V37 ^ V47; + } + } + + hash->h4[9] = V00 ^ V10 ^ V20 ^ V30 ^ V40; + hash->h4[8] = V01 ^ V11 ^ V21 ^ V31 ^ V41; + hash->h4[11] = V02 ^ V12 ^ V22 ^ V32 ^ V42; + hash->h4[10] = V03 ^ V13 ^ V23 ^ V33 ^ V43; + hash->h4[13] = V04 ^ V14 ^ V24 ^ V34 ^ V44; + hash->h4[12] = V05 ^ V15 ^ V25 ^ V35 ^ V45; + hash->h4[15] = V06 ^ V16 ^ V26 ^ V36 ^ V46; + hash->h4[14] = V07 ^ V17 ^ V27 ^ V37 ^ V47; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search7(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // cubehash.h1 + + sph_u32 x0 = SPH_C32(0x2AEA2A61), x1 = SPH_C32(0x50F494D4), x2 = SPH_C32(0x2D538B8B), x3 = SPH_C32(0x4167D83E); + sph_u32 x4 = SPH_C32(0x3FEE2313), x5 = SPH_C32(0xC701CF8C), x6 = SPH_C32(0xCC39968E), x7 = SPH_C32(0x50AC5695); + sph_u32 x8 = SPH_C32(0x4D42C787), x9 = SPH_C32(0xA647A8B3), xa = SPH_C32(0x97CF0BEF), xb = SPH_C32(0x825B4537); + sph_u32 xc = SPH_C32(0xEEF864D2), xd = SPH_C32(0xF22090C4), xe = SPH_C32(0xD0E5CD33), xf = SPH_C32(0xA23911AE); + sph_u32 xg = SPH_C32(0xFCD398D9), xh = SPH_C32(0x148FE485), xi = SPH_C32(0x1B017BEF), xj = 
SPH_C32(0xB6444532); + sph_u32 xk = SPH_C32(0x6A536159), xl = SPH_C32(0x2FF5781C), xm = SPH_C32(0x91FA7934), xn = SPH_C32(0x0DBADEA9); + sph_u32 xo = SPH_C32(0xD65C8A2B), xp = SPH_C32(0xA5A70E75), xq = SPH_C32(0xB1C62456), xr = SPH_C32(0xBC796576); + sph_u32 xs = SPH_C32(0x1921C8F7), xt = SPH_C32(0xE7989AF1), xu = SPH_C32(0x7795D246), xv = SPH_C32(0xD43E3B44); + + x0 ^= SWAP4(hash->h4[1]); + x1 ^= SWAP4(hash->h4[0]); + x2 ^= SWAP4(hash->h4[3]); + x3 ^= SWAP4(hash->h4[2]); + x4 ^= SWAP4(hash->h4[5]); + x5 ^= SWAP4(hash->h4[4]); + x6 ^= SWAP4(hash->h4[7]); + x7 ^= SWAP4(hash->h4[6]); + + for (int i = 0; i < 13; i ++) + { + SIXTEEN_ROUNDS; + + if (i == 0) + { + x0 ^= SWAP4(hash->h4[9]); + x1 ^= SWAP4(hash->h4[8]); + x2 ^= SWAP4(hash->h4[11]); + x3 ^= SWAP4(hash->h4[10]); + x4 ^= SWAP4(hash->h4[13]); + x5 ^= SWAP4(hash->h4[12]); + x6 ^= SWAP4(hash->h4[15]); + x7 ^= SWAP4(hash->h4[14]); + } + else if(i == 1) + x0 ^= 0x80; + else if (i == 2) + xv ^= SPH_C32(1); + } + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + hash->h4[8] = x8; + hash->h4[9] = x9; + hash->h4[10] = xa; + hash->h4[11] = xb; + hash->h4[12] = xc; + hash->h4[13] = xd; + hash->h4[14] = xe; + hash->h4[15] = xf; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search8(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // shavite + // IV + sph_u32 h0 = SPH_C32(0x72FCCDD8), h1 = SPH_C32(0x79CA4727), h2 = SPH_C32(0x128A077B), h3 = SPH_C32(0x40D55AEC); + sph_u32 h4 = SPH_C32(0xD1901A06), h5 = SPH_C32(0x430AE307), h6 = SPH_C32(0xB29F5CD1), h7 = SPH_C32(0xDF07FBFC); + sph_u32 h8 = SPH_C32(0x8E45D73D), h9 = SPH_C32(0x681AB538), hA = SPH_C32(0xBDE86578), hB = SPH_C32(0xDD577E47); + sph_u32 hC = SPH_C32(0xE275EADE), hD = SPH_C32(0x502D9FCD), hE = SPH_C32(0xB9357178), hF = SPH_C32(0x022A4B9A); + + // state + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + + sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + + rk00 = hash->h4[0]; + rk01 = hash->h4[1]; + rk02 = hash->h4[2]; + rk03 = hash->h4[3]; + rk04 = hash->h4[4]; + rk05 = hash->h4[5]; + rk06 = hash->h4[6]; + rk07 = hash->h4[7]; + rk08 = hash->h4[8]; + rk09 = hash->h4[9]; + rk0A = hash->h4[10]; + rk0B = hash->h4[11]; + rk0C = hash->h4[12]; + rk0D = hash->h4[13]; + rk0E = hash->h4[14]; + rk0F = hash->h4[15]; + rk10 = 0x80; + rk11 = rk12 = rk13 = rk14 = rk15 = rk16 = rk17 = rk18 = rk19 = rk1A = 0; + rk1B = 0x2000000; + rk1C = rk1D = rk1E = 0; + rk1F = 0x2000000; + + c512(buf); + + hash->h4[0] = h0; + hash->h4[1] = h1; + hash->h4[2] = h2; + hash->h4[3] = h3; + hash->h4[4] = h4; + hash->h4[5] = h5; + hash->h4[6] = h6; + hash->h4[7] = h7; + hash->h4[8] = h8; + hash->h4[9] = h9; + hash->h4[10] = hA; + hash->h4[11] = hB; + hash->h4[12] = hC; + hash->h4[13] = hD; + hash->h4[14] = hE; + hash->h4[15] = hF; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + 
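search8 above, like the echo stage in x14.cl, stages its AES lookup tables cooperatively: each work-item copies a strided slice of the __constant tables into __local memory, and the work-group synchronizes before the first table lookup. The idiom in isolation, as a sketch with hypothetical names:

/* Work-items 0..N-1 fill dst[0..255] in strides of N, then synchronize. */
__kernel void stage_table(__constant uint *src, __local uint *dst)
{
    for (int i = get_local_id(0); i < 256; i += get_local_size(0))
        dst[i] = src[i];
    barrier(CLK_LOCAL_MEM_FENCE);  /* no lookups until every slice is written */
}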
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search9(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + // simd + s32 q[256]; + unsigned char x[128]; + for(unsigned int i = 0; i < 64; i++) + x[i] = hash->h1[i]; + for(unsigned int i = 64; i < 128; i++) + x[i] = 0; + + u32 A0 = C32(0x0BA16B95), A1 = C32(0x72F999AD), A2 = C32(0x9FECC2AE), A3 = C32(0xBA3264FC), A4 = C32(0x5E894929), A5 = C32(0x8E9F30E5), A6 = C32(0x2F1DAA37), A7 = C32(0xF0F2C558); + u32 B0 = C32(0xAC506643), B1 = C32(0xA90635A5), B2 = C32(0xE25B878B), B3 = C32(0xAAB7878F), B4 = C32(0x88817F7A), B5 = C32(0x0A02892B), B6 = C32(0x559A7550), B7 = C32(0x598F657E); + u32 C0 = C32(0x7EEF60A1), C1 = C32(0x6B70E3E8), C2 = C32(0x9C1714D1), C3 = C32(0xB958E2A8), C4 = C32(0xAB02675E), C5 = C32(0xED1C014F), C6 = C32(0xCD8D65BB), C7 = C32(0xFDB7A257); + u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); + + FFT256(0, 1, 0, ll1); + for (int i = 0; i < 256; i ++) + { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + + A0 ^= hash->h4[0]; + A1 ^= hash->h4[1]; + A2 ^= hash->h4[2]; + A3 ^= hash->h4[3]; + A4 ^= hash->h4[4]; + A5 ^= hash->h4[5]; + A6 ^= hash->h4[6]; + A7 ^= hash->h4[7]; + B0 ^= hash->h4[8]; + B1 ^= hash->h4[9]; + B2 ^= hash->h4[10]; + B3 ^= hash->h4[11]; + B4 ^= hash->h4[12]; + B5 ^= hash->h4[13]; + B6 ^= hash->h4[14]; + B7 ^= hash->h4[15]; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + IF, 4, 13, PP8_4_); + + STEP_BIG( + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + IF, 13, 10, PP8_5_); + + STEP_BIG( + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + IF, 10, 25, PP8_6_); + + STEP_BIG( + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), + IF, 25, 4, PP8_0_); + + u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; + u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; + u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; + u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; + + #define q SIMD_Q + + A0 ^= 0x200; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + COPY_A0, COPY_A1, COPY_A2, COPY_A3, + COPY_A4, COPY_A5, COPY_A6, COPY_A7, + IF, 4, 13, PP8_4_); + + STEP_BIG( + COPY_B0, COPY_B1, COPY_B2, COPY_B3, + COPY_B4, COPY_B5, COPY_B6, COPY_B7, + IF, 13, 10, PP8_5_); + + STEP_BIG( + COPY_C0, COPY_C1, COPY_C2, COPY_C3, + COPY_C4, COPY_C5, COPY_C6, COPY_C7, + IF, 10, 25, PP8_6_); + + STEP_BIG( + COPY_D0, COPY_D1, COPY_D2, 
COPY_D3, + COPY_D4, COPY_D5, COPY_D6, COPY_D7, + IF, 25, 4, PP8_0_); + + #undef q + + hash->h4[0] = A0; + hash->h4[1] = A1; + hash->h4[2] = A2; + hash->h4[3] = A3; + hash->h4[4] = A4; + hash->h4[5] = A5; + hash->h4[6] = A6; + hash->h4[7] = A7; + hash->h4[8] = B0; + hash->h4[9] = B1; + hash->h4[10] = B2; + hash->h4[11] = B3; + hash->h4[12] = B4; + hash->h4[13] = B5; + hash->h4[14] = B6; + hash->h4[15] = B7; + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search10(__global hash_t* hashes, __global uint* output, const ulong target) +{ + uint gid = get_global_id(0); + uint offset = get_global_offset(0); + __global hash_t *hash = &(hashes[gid-offset]); + + __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; + + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + AES0[i] = AES0_C[i]; + AES1[i] = AES1_C[i]; + AES2[i] = AES2_C[i]; + AES3[i] = AES3_C[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + //mixtab + __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256]; + + for (int i = init; i < 256; i += step) + { + mixtab0[i] = mixtab0_c[i]; + mixtab1[i] = mixtab1_c[i]; + mixtab2[i] = mixtab2_c[i]; + mixtab3[i] = mixtab3_c[i]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + __local sph_u32 T512_L[1024]; + __constant const sph_u32 *T512_C = &T512[0][0]; + + for (int i = init; i < 1024; i += step) + T512_L[i] = T512_C[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (int i = 0; i < 8; i++) + hash->h8[i] = hashes[gid-offset].h8[i]; + + // echo + sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; + sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; + Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; + Vb01 = Vb11 = Vb21 = Vb31 = Vb41 = Vb51 = Vb61 = Vb71 = 0; + + sph_u32 K0 = 512; + sph_u32 K1 = 0; + sph_u32 K2 = 0; + sph_u32 K3 = 0; + + W00 = Vb00; + W01 = Vb01; + W10 = Vb10; + W11 = Vb11; + W20 = Vb20; + W21 = Vb21; + W30 = Vb30; + W31 = Vb31; + W40 = Vb40; + W41 = Vb41; + W50 = Vb50; + W51 = Vb51; + W60 = Vb60; + W61 = Vb61; + W70 = Vb70; + W71 = Vb71; + W80 = hash->h8[0]; + W81 = hash->h8[1]; + W90 = hash->h8[2]; + W91 = hash->h8[3]; + WA0 = hash->h8[4]; + WA1 = hash->h8[5]; + WB0 = hash->h8[6]; + WB1 = hash->h8[7]; + WC0 = 0x80; + WC1 = 0; + WD0 = 0; + WD1 = 0; + WE0 = 0; + WE1 = 0x200000000000000; + WF0 = 0x200; + WF1 = 0; + + for (unsigned u = 0; u < 10; u ++) + BIG_ROUND; + + hash->h8[0] ^= Vb00 ^ W00 ^ W80; + hash->h8[1] ^= Vb01 ^ W01 ^ W81; + hash->h8[2] ^= Vb10 ^ W10 ^ W90; + hash->h8[3] ^= Vb11 ^ W11 ^ W91; + hash->h8[4] ^= Vb20 ^ W20 ^ WA0; + hash->h8[5] ^= Vb21 ^ W21 ^ WA1; + hash->h8[6] ^= Vb30 ^ W30 ^ WB0; + hash->h8[7] ^= Vb31 ^ W31 ^ WB1; + + // hamsi + sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; + sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; + sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; + sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15]; + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + + #define buf(u) hash->h1[i + u] + + for(int i = 0; i < 64; i += 8) + { + 
INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + } + + #undef buf + #define buf(u) (u == 0 ? 0x80 : 0) + + INPUT_BIG_LOCAL; + P_BIG; + T_BIG; + + #undef buf + #define buf(u) (u == 6 ? 2 : 0) + + INPUT_BIG_LOCAL; + PF_BIG; + T_BIG; + + for (unsigned u = 0; u < 16; u ++) + hash->h4[u] = h[u]; + + // fugue + sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; + sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; + sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; + sph_u32 S30, S31, S32, S33, S34, S35; + + ulong fc_bit_count = (sph_u64) 0x200; + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); + S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); + S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); + S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); + + FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2])); + FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5])); + FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8])); + FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB])); + FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE])); + FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x); + + // apply round shift if necessary + int i; + + for (i = 0; i < 32; i ++) + { + ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + } + + for (i = 0; i < 13; i ++) + { + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + ROR8; + SMIX(S00, S01, S02, S03); + } + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + hash->h4[0] = SWAP4(S01); + hash->h4[1] = SWAP4(S02); + hash->h4[2] = SWAP4(S03); + hash->h4[3] = SWAP4(S04); + hash->h4[4] = SWAP4(S09); + hash->h4[5] = SWAP4(S10); + hash->h4[6] = SWAP4(S11); + hash->h4[7] = SWAP4(S12); + hash->h4[8] = SWAP4(S18); + hash->h4[9] = SWAP4(S19); + hash->h4[10] = SWAP4(S20); + hash->h4[11] = SWAP4(S21); + hash->h4[12] = SWAP4(S27); + hash->h4[13] = SWAP4(S28); + hash->h4[14] = SWAP4(S29); + hash->h4[15] = SWAP4(S30); + + //shabal + sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[2], A03 = A_init_512[3], A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[6], A07 = A_init_512[7], + A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11]; + sph_u32 B0 = B_init_512[0], B1 = B_init_512[1], B2 = B_init_512[2], B3 = B_init_512[3], B4 = B_init_512[4], B5 = B_init_512[5], B6 = B_init_512[6], B7 = B_init_512[7], + B8 = B_init_512[8], B9 = B_init_512[9], BA = B_init_512[10], BB = B_init_512[11], BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15]; + sph_u32 C0 = C_init_512[0], C1 = C_init_512[1], C2 = C_init_512[2], C3 = C_init_512[3], C4 = C_init_512[4], C5 = C_init_512[5], C6 = C_init_512[6], C7 = C_init_512[7], + C8 = C_init_512[8], C9 = C_init_512[9], CA = C_init_512[10], CB = C_init_512[11], CC = C_init_512[12], CD = C_init_512[13], CE = 
C_init_512[14], CF = C_init_512[15]; + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + sph_u32 Wlow = 1, Whigh = 0; + + M0 = hash->h4[0]; + M1 = hash->h4[1]; + M2 = hash->h4[2]; + M3 = hash->h4[3]; + M4 = hash->h4[4]; + M5 = hash->h4[5]; + M6 = hash->h4[6]; + M7 = hash->h4[7]; + M8 = hash->h4[8]; + M9 = hash->h4[9]; + MA = hash->h4[10]; + MB = hash->h4[11]; + MC = hash->h4[12]; + MD = hash->h4[13]; + ME = hash->h4[14]; + MF = hash->h4[15]; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + + M0 = 0x80; + M1 = M2 = M3 = M4 = M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + + for (unsigned i = 0; i < 3; i ++) + { + SWAP_BC; + XOR_W; + APPLY_P; + } + + hash->h4[0] = B0; + hash->h4[1] = B1; + hash->h4[2] = B2; + hash->h4[3] = B3; + hash->h4[4] = B4; + hash->h4[5] = B5; + hash->h4[6] = B6; + hash->h4[7] = B7; + hash->h4[8] = B8; + hash->h4[9] = B9; + hash->h4[10] = BA; + hash->h4[11] = BB; + hash->h4[12] = BC; + hash->h4[13] = BD; + hash->h4[14] = BE; + hash->h4[15] = BF; + + bool result = (hash->h8[3] <= target); + if (result) + output[atomic_inc(output+0xFF)] = SWAP4(gid); + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +#endif // X14OLD_CL \ No newline at end of file diff --git a/miner.h b/miner.h index fa980c4c..cac35f2a 100644 --- a/miner.h +++ b/miner.h @@ -1033,6 +1033,7 @@ extern int swork_id; extern int opt_tcp_keepalive; extern bool opt_incognito; extern int opt_hamsi_expand_big; +extern bool opt_hamsi_short; #if LOCK_TRACKING extern pthread_mutex_t lockstat_lock; diff --git a/sgminer.c b/sgminer.c index 0debd9bc..7ea59118 100644 --- a/sgminer.c +++ b/sgminer.c @@ -192,6 +192,7 @@ int nDevs; int opt_dynamic_interval = 7; int opt_g_threads = -1; int opt_hamsi_expand_big = 4; +bool opt_hamsi_short = false; bool opt_restart = true; struct list_head scan_devices; @@ -1459,7 +1460,10 @@ struct opt_table opt_config_table[] = { "Set GPU lookup gap for scrypt mining, comma separated"), OPT_WITH_ARG("--hamsi-expand-big", set_int_1_to_10, opt_show_intval, &opt_hamsi_expand_big, - "Set SPH_HAMSI_EXPAND_BIG for X13 algorithms (1 or 4 are common)"), + "Set SPH_HAMSI_EXPAND_BIG for X13-derived algorithms (1 or 4 are common)"), + OPT_WITHOUT_ARG("--hamsi-short", + opt_set_bool, &opt_hamsi_short, + "Set SPH_HAMSI_SHORT for X13-derived algorithms (can give a better hashrate on some GPUs)"), #ifdef HAVE_CURSES OPT_WITHOUT_ARG("--incognito", opt_set_bool, &opt_incognito, diff --git a/winbuild/sgminer.vcxproj b/winbuild/sgminer.vcxproj index c2d7c06d..b73afae1 100644 --- a/winbuild/sgminer.vcxproj +++ b/winbuild/sgminer.vcxproj @@ -263,6 +263,7 @@ +    <ClCompile Include="..\algorithm\x14.c" /> @@ -329,6 +330,7 @@ +    <ClInclude Include="..\algorithm\x14.h" /> diff --git a/winbuild/sgminer.vcxproj.filters b/winbuild/sgminer.vcxproj.filters index a41d4849..05e877c4 100644 --- a/winbuild/sgminer.vcxproj.filters +++ b/winbuild/sgminer.vcxproj.filters @@ -227,6 +227,9 @@ <Filter>Source Files\algorithm</Filter> +    <ClCompile Include="..\algorithm\x14.c"> +      <Filter>Source Files\algorithm</Filter> +    </ClCompile> @@ -412,6 +415,9 @@ <Filter>Header Files\algorithm</Filter> +    <ClInclude Include="..\algorithm\x14.h"> +      <Filter>Header Files\algorithm</Filter> +    </ClInclude>
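For reference, a hypothetical invocation exercising what this patch adds; the pool URL and credentials are placeholders:

sgminer --algorithm x14 --hamsi-expand-big 4 --hamsi-short -o stratum+tcp://pool.example.com:3333 -u worker -p password

--hamsi-short is the new toggle wired through opt_hamsi_short above; whether it helps depends on the GPU, so benchmark with and without it.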