From 4ba8a68043a14f9f3d12c6ae3838cc802ddadd65 Mon Sep 17 00:00:00 2001 From: ystarnaud Date: Wed, 3 Dec 2014 10:09:55 -0500 Subject: [PATCH] Merged develop branch with master +updates Moved the develop code to master. Moving forward all updates will be done on master unless it's work on a major feature. This update contains all previous develop code as well as a few new ones that weren't pushed yet: * Added neoscrypt compatibility for xintensity/rawintensity * Neoscrypt now uses correct TC if not specified or set to 0 * Reworked the application of pool settings on algorithm switch which should resolve TC/Intensity changes between algos such as X11 and neoscrypt --- Makefile.am | 1 + algorithm.c | 105 +- algorithm.h | 1 + algorithm/neoscrypt.c | 5 +- algorithm/neoscrypt.h | 3 +- algorithm/whirlcoin.c | 171 +++ algorithm/whirlcoin.h | 9 + config_parser.c | 197 ++- configure.ac | 4 +- driver-opencl.c | 59 +- findnonce.c | 2 +- kernel/animecoin.cl | 530 +++++++- kernel/arebyp.cl | 993 +++++++++++++++ kernel/bitblock.cl | 119 +- kernel/darkcoin-mod.cl | 125 +- kernel/darkcoin.cl | 21 +- kernel/diamond.cl | 1853 +++++++++++++++++++++++++++ kernel/groestl.cl | 2 +- kernel/groestlcoin.cl | 2038 ++++++++++++++++++++++++++---- kernel/inkcoin.cl | 47 +- kernel/marucoin-mod.cl | 121 +- kernel/marucoin.cl | 8 +- kernel/myriadcoin-groestl.cl | 88 +- kernel/neoscrypt.cl | 986 +++++++++------ kernel/quarkcoin.cl | 1128 ++++++++++++----- kernel/qubitcoin.cl | 24 +- kernel/sifcoin.cl | 408 ++++-- kernel/talkcoin-mod.cl | 84 +- kernel/twecoin.cl | 18 +- kernel/whirlcoin.cl | 1358 ++++++++++++++++++++ kernel/x14.cl | 119 +- logging.c | 17 +- miner.h | 19 +- ocl.c | 131 +- pool.c | 19 +- sgminer.c | 655 +++++++--- util.c | 97 +- winbuild/sgminer.vcxproj | 4 +- winbuild/sgminer.vcxproj.filters | 8 +- 39 files changed, 9736 insertions(+), 1841 deletions(-) create mode 100644 algorithm/whirlcoin.c create mode 100644 algorithm/whirlcoin.h create mode 100644 kernel/arebyp.cl create mode 
100644 kernel/diamond.cl create mode 100644 kernel/whirlcoin.cl diff --git a/Makefile.am b/Makefile.am index a91fd19c..f0e63de9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -66,6 +66,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h sgminer_SOURCES += algorithm/x14.c algorithm/x14.h sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h +sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h bin_SCRIPTS = $(top_srcdir)/kernel/*.cl diff --git a/algorithm.c b/algorithm.c index 69994d5a..2f549496 100644 --- a/algorithm.c +++ b/algorithm.c @@ -29,6 +29,7 @@ #include "algorithm/bitblock.h" #include "algorithm/x14.h" #include "algorithm/fresh.h" +#include "algorithm/whirlcoin.h" #include "algorithm/neoscrypt.h" #include "compat.h" @@ -49,7 +50,9 @@ const char *algorithm_type_str[] = { "Twecoin", "Fugue256", "NIST", - "Fresh" + "Fresh", + "Whirlcoin", + "Neoscrypt" }; void sha256(const unsigned char *message, unsigned int len, unsigned char *digest) @@ -96,11 +99,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm) { char buf[255]; - sprintf(buf, " -D MAX_GLOBAL_THREADS=%u", - (unsigned int)cgpu->thread_concurrency); + sprintf(buf, " %s-D MAX_GLOBAL_THREADS=%lu ", + ((cgpu->lookup_gap > 0)?" 
-D LOOKUP_GAP=2 ":""), (unsigned long)cgpu->thread_concurrency); strcat(data->compiler_options, buf); - - sprintf(buf, "tc%u", (unsigned int)cgpu->thread_concurrency); + + sprintf(buf, "%stc%lu", ((cgpu->lookup_gap > 0)?"lg":""), (unsigned long)cgpu->thread_concurrency); strcat(data->binary_filename, buf); } @@ -158,21 +161,20 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb unsigned int num = 0; cl_uint le_target; cl_int status = 0; - + /* This looks like a unnecessary double cast, but to make sure, that * the target's most significant entry is adressed as a 32-bit value * and not accidently by something else the double cast seems wise. - * The compiler will get rid of it anyway. - */ + * The compiler will get rid of it anyway. */ le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]); memcpy(clState->cldata, blk->work->data, 80); - status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); - + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); + CL_SET_ARG(clState->CLbuffer0); CL_SET_ARG(clState->outputBuffer); CL_SET_ARG(clState->padbuffer8); CL_SET_ARG(le_target); - + return status; } @@ -600,6 +602,34 @@ static cl_int queue_fresh_kernel(struct __clState *clState, struct _dev_blk_ctx return status; } +static cl_int queue_whirlcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel; + cl_ulong le_target; + cl_int status = 0; + + le_target = *(cl_ulong *)(blk->work->device_target + 24); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); + + //clbuffer, hashes + kernel = &clState->kernel; + CL_SET_ARG_N(0,clState->CLbuffer0); + CL_SET_ARG_N(1,clState->padbuffer8); + + kernel = clState->extra_kernels; + 
CL_SET_ARG_N(0,clState->padbuffer8); + + CL_NEXTKERNEL_SET_ARG_N(0,clState->padbuffer8); + + //hashes, output, target + CL_NEXTKERNEL_SET_ARG_N(0,clState->padbuffer8); + CL_SET_ARG_N(1,clState->outputBuffer); + CL_SET_ARG_N(2,le_target); + + return status; +} + typedef struct _algorithm_settings_t { const char *name; /* Human-readable identifier */ algorithm_type_t type; //common algorithm type @@ -624,23 +654,24 @@ typedef struct _algorithm_settings_t { static algorithm_settings_t algos[] = { // kernels starting from this will have difficulty calculated by using litecoin algorithm #define A_SCRYPT(a) \ - { a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options} + { a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options} A_SCRYPT( "ckolivas" ), A_SCRYPT( "alexkarnew" ), A_SCRYPT( "alexkarnold" ), A_SCRYPT( "bufius" ), A_SCRYPT( "psw" ), A_SCRYPT( "zuikkis" ), + A_SCRYPT( "arebyp" ), #undef A_SCRYPT #define A_NEOSCRYPT(a) \ - { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options} + { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options} A_NEOSCRYPT("neoscrypt"), #undef A_NEOSCRYPT // kernels starting from this will have difficulty calculated by using quarkcoin algorithm #define A_QUARK(a, b) \ - { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, 
queue_sph_kernel, gen_hash, append_x11_compiler_options } + { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options} A_QUARK( "quarkcoin", quarkcoin_regenhash), A_QUARK( "qubitcoin", qubitcoin_regenhash), A_QUARK( "animecoin", animecoin_regenhash), @@ -649,40 +680,43 @@ static algorithm_settings_t algos[] = { // kernels starting from this will have difficulty calculated by using bitcoin algorithm #define A_DARK(a, b) \ - { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } + { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options} A_DARK( "darkcoin", darkcoin_regenhash), A_DARK( "inkcoin", inkcoin_regenhash), A_DARK( "myriadcoin-groestl", myriadcoin_groestl_regenhash), #undef A_DARK { "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, queue_sph_kernel, sha256, NULL}, - { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL }, + { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL}, - { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, + { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options}, - { "marucoin", ALGO_X13, 
"", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options }, - { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options }, - { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options }, + { "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options}, + { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options}, + { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options}, - { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options }, - { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options }, + { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options}, + { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options}, - { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, 
gen_hash, append_x13_compiler_options }, - { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options }, + { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options}, + { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options}, - { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, + { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options}, - { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL }, + { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL}, // kernels starting from this will have difficulty calculated by using fuguecoin algorithm -#define A_FUGUE(a, b) \ - { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL } - A_FUGUE( "fuguecoin", fuguecoin_regenhash), - A_FUGUE( "groestlcoin", groestlcoin_regenhash), -#undef A_FUGUE +#define A_FUGUE(a, b, c) \ + { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, c, NULL} + A_FUGUE("fuguecoin", fuguecoin_regenhash, sha256), + A_FUGUE("groestlcoin", groestlcoin_regenhash, sha256), + A_FUGUE("diamond", 
groestlcoin_regenhash, gen_hash), + #undef A_FUGUE + + { "whirlcoin", ALGO_WHIRL, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 3, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, whirlcoin_regenhash, queue_whirlcoin_kernel, sha256, NULL}, // Terminator (do not remove) - { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL } + { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL} }; void copy_algorithm_settings(algorithm_t* dest, const char* algo) @@ -695,7 +729,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo) if (strcmp(src->name, algo) == 0) { strcpy(dest->name, src->name); - dest->kernelfile = src->kernelfile; + dest->kernelfile = src->kernelfile; dest->type = src->type; dest->diff_multiplier1 = src->diff_multiplier1; @@ -751,6 +785,7 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa ALGO_ALIAS("x15old", "bitblockold"); ALGO_ALIAS("nist5", "talkcoin-mod"); ALGO_ALIAS("keccak", "maxcoin"); + ALGO_ALIAS("whirlpool", "whirlcoin"); #undef ALGO_ALIAS #undef ALGO_ALIAS_NF @@ -760,7 +795,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa void set_algorithm(algorithm_t* algo, const char* newname_alias) { - const char* newname; + const char *newname; + //load previous algorithm nfactor in case nfactor was applied before algorithm... or default to 10 uint8_t old_nfactor = ((algo->nfactor)?algo->nfactor:0); //load previous kernel file name if was applied before algorithm... 
@@ -811,5 +847,6 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor) bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2) { + // return (strcmp(algo1->name, algo2->name) == 0) && (algo1->nfactor == algo2->nfactor); return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor)); } diff --git a/algorithm.h b/algorithm.h index f9e2af1e..bbe63beb 100644 --- a/algorithm.h +++ b/algorithm.h @@ -24,6 +24,7 @@ typedef enum { ALGO_FUGUE, ALGO_NIST, ALGO_FRESH, + ALGO_WHIRL, ALGO_NEOSCRYPT } algorithm_type_t; diff --git a/algorithm/neoscrypt.c b/algorithm/neoscrypt.c index cec6e5a2..8cccc2c9 100644 --- a/algorithm/neoscrypt.c +++ b/algorithm/neoscrypt.c @@ -1177,9 +1177,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { r = (1 << ((profile >> 5) & 0x7)); } - uchar *stack; - stack =(uchar*)malloc((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align); - + uchar stack[(N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align]; /* X = r * 2 * SCRYPT_BLOCK_SIZE */ X = (uint *) &stack[stack_align & ~(stack_align - 1)]; /* Z is a copy of X for ChaCha */ @@ -1287,7 +1285,6 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { } - free(stack); } void neoscrypt_regenhash(struct work *work) diff --git a/algorithm/neoscrypt.h b/algorithm/neoscrypt.h index 5337dac8..086bdf0c 100644 --- a/algorithm/neoscrypt.h +++ b/algorithm/neoscrypt.h @@ -1,11 +1,10 @@ #ifndef NEOSCRYPT_H #define NEOSCRYPT_H - + #include "miner.h" /* The neoscrypt scratch buffer needs 32kBytes memory. */ #define NEOSCRYPT_SCRATCHBUF_SIZE (32 * 1024) - /* These routines are always available. 
*/ extern void neoscrypt_regenhash(struct work *work); extern void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile); diff --git a/algorithm/whirlcoin.c b/algorithm/whirlcoin.c new file mode 100644 index 00000000..7aa61218 --- /dev/null +++ b/algorithm/whirlcoin.c @@ -0,0 +1,171 @@ +/*- + * Copyright 2009 Colin Percival, 2011 ArtForz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. 
+ */ + +#include "config.h" +#include "miner.h" + +#include +#include +#include + +#include "sph/sph_whirlpool.h" + +/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */ +typedef struct { + sph_whirlpool1_context whirlpool1; + sph_whirlpool1_context whirlpool2; + sph_whirlpool1_context whirlpool3; + sph_whirlpool1_context whirlpool4; +} Whash_context_holder; + +Whash_context_holder base_contexts; + + +void init_whirlcoin_hash_contexts() +{ + sph_whirlpool1_init(&base_contexts.whirlpool1); + sph_whirlpool1_init(&base_contexts.whirlpool2); + sph_whirlpool1_init(&base_contexts.whirlpool3); + sph_whirlpool1_init(&base_contexts.whirlpool4); +} + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + + +inline void whirlcoin_hash(void *state, const void *input) +{ + init_whirlcoin_hash_contexts(); + + Whash_context_holder ctx; + uint32_t hashA[16], hashB[16]; + + memcpy(&ctx, &base_contexts, sizeof(base_contexts)); + + sph_whirlpool1 (&ctx.whirlpool1, input, 80); + sph_whirlpool1_close (&ctx.whirlpool1, hashA); + + sph_whirlpool1(&ctx.whirlpool2, hashA, 64); + sph_whirlpool1_close(&ctx.whirlpool2, hashB); + + sph_whirlpool1(&ctx.whirlpool3, hashB, 64); + sph_whirlpool1_close(&ctx.whirlpool3, hashA); + + sph_whirlpool1(&ctx.whirlpool4, hashA, 64); + sph_whirlpool1_close(&ctx.whirlpool4, hashB); + + memcpy(state, hashB, 32); +} + +static const uint32_t diff1targ = 0x0000ffff; + + +/* Used externally as confirmation of correct OCL code */ +int whirlcoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, 
(const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + + whirlcoin_hash(ohash, data); + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + if (tmp_hash7 > diff1targ) + return -1; + if (tmp_hash7 > Htarg) + return 0; + return 1; +} + +void whirlcoin_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + whirlcoin_hash(ohash, data); +} + +bool scanhash_whirlcoin(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while(1) { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + whirlcoin_hash(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", + (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) { + *last_nonce = n; + break; + } + } + + return ret; +} diff --git a/algorithm/whirlcoin.h b/algorithm/whirlcoin.h new file mode 100644 index 00000000..8c9f9733 --- /dev/null +++ b/algorithm/whirlcoin.h @@ -0,0 +1,9 @@ +#ifndef W_H +#define W_H + +#include "miner.h" + +extern int whirlcoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void whirlcoin_regenhash(struct work *work); + +#endif /* W_H */ \ No 
newline at end of file diff --git a/config_parser.c b/config_parser.c index 018e1bd9..9175785f 100644 --- a/config_parser.c +++ b/config_parser.c @@ -201,7 +201,7 @@ char *set_default_kernelfile(const char *arg) { applog(LOG_INFO, "Set default kernel file to %s", arg); default_profile.algorithm.kernelfile = arg; - + return NULL; } @@ -324,10 +324,10 @@ char *set_profile_devices(const char *arg) char *set_profile_kernelfile(const char *arg) { struct profile *profile = get_current_profile(); - + applog(LOG_DEBUG, "Setting profile %s algorithm kernel file to %s", profile->name, arg); profile->algorithm.kernelfile = arg; - + return NULL; } @@ -598,8 +598,10 @@ static struct opt_table *opt_find(struct opt_table *tbl, char *optname) //set url curl_easy_setopt(curl, CURLOPT_URL, url); //set write callback and fileinfo - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fetch_remote_config_cb); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); // fail on 404 or other 4xx http codes + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30); // timeout after 30 secs to prevent being stuck + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file); // stream to write data to + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fetch_remote_config_cb); // callback function to write to config file if((res = curl_easy_perform(curl)) != CURLE_OK) applog(LOG_ERR, "Fetch remote file failed: %s", curl_easy_strerror(res)); @@ -715,17 +717,20 @@ char *parse_config(json_t *val, const char *key, const char *parentkey, bool fil if((opt = opt_find(opt_config_table, optname)) != NULL) { //strings - if ((opt->type & OPT_HASARG) && json_is_string(val)) + if ((opt->type & OPT_HASARG) && json_is_string(val)) { err = opt->cb_arg(json_string_value(val), opt->u.arg); + } //boolean values - else if ((opt->type & OPT_NOARG) && json_is_true(val)) + else if ((opt->type & OPT_NOARG) && json_is_true(val)) { err = opt->cb(opt->u.arg); - else + } + else { err = "Invalid value"; + } 
} - else + else { err = "Invalid option"; - + } break; } @@ -756,37 +761,71 @@ char *load_config(const char *arg, const char *parentkey, void __maybe_unused *u json_t *config; #ifdef HAVE_LIBCURL - //if detected as url - if((strstr(arg, "http://") != NULL) || (strstr(arg, "https://") != NULL) || (strstr(arg, "ftp://") != NULL)) - { - //download config file locally and reset arg to it so we can parse it - if((arg = fetch_remote_config(arg)) == NULL) - return NULL; + int retry = opt_remoteconf_retry; + const char *url; + + // if detected as url + if ((strstr(arg, "http://") != NULL) || (strstr(arg, "https://") != NULL) || (strstr(arg, "ftp://") != NULL)) { + url = strdup(arg); + + do { + // wait for next retry + if (retry < opt_remoteconf_retry) { + sleep(opt_remoteconf_wait); + } + + // download config file locally and reset arg to it so we can parse it + if ((arg = fetch_remote_config(url)) != NULL) { + break; + } + + --retry; + } while (retry); + + // file not downloaded... abort + if (arg == NULL) { + // if we should use last downloaded copy... + if (opt_remoteconf_usecache) { + char *p; + + // extract filename out of url + if ((p = (char *)strrchr(url, '/')) == NULL) { + quit(1, "%s: invalid URL.", url); + } + + arg = p+1; + } else { + quit(1, "%s: unable to download config file.", url); + } + } } #endif - //most likely useless but leaving it here for now... - if(!cnfbuf) + // most likely useless but leaving it here for now... + if (!cnfbuf) { cnfbuf = strdup(arg); + } - //no need to restrict the number of includes... if it causes problems, restore it later + // no need to restrict the number of includes... 
if it causes problems, restore it later /*if(++include_count > JSON_MAX_DEPTH) return JSON_MAX_DEPTH_ERR; */ - //check if the file exists - if(access(arg, F_OK) == -1) + // check if the file exists + if (access(arg, F_OK) == -1) { quit(1, "%s: file not found.", arg); + } -#if JANSSON_MAJOR_VERSION > 1 - config = json_load_file(arg, 0, &err); -#else - config = json_load_file(arg, &err); -#endif + #if JANSSON_MAJOR_VERSION > 1 + config = json_load_file(arg, 0, &err); + #else + config = json_load_file(arg, &err); + #endif - //if json root is not an object, error out - if(!json_is_object(config)) + // if json root is not an object, error out + if (!json_is_object(config)) { return set_last_json_error("Error: JSON decode of file \"%s\" failed:\n %s", arg, err.text); + } config_loaded = true; @@ -1006,13 +1045,12 @@ void apply_pool_profile(struct pool *pool) if (empty_string(pool->algorithm.kernelfile)) { // ...but profile does, apply it to the pool if (!empty_string(profile->algorithm.kernelfile)) { - pool->algorithm.kernelfile = profile->algorithm.kernelfile; - applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile); - // ...or default profile does, apply it to the pool - } - else if (!empty_string(default_profile.algorithm.kernelfile)) { - pool->algorithm.kernelfile = default_profile.algorithm.kernelfile; - applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile); + pool->algorithm.kernelfile = profile->algorithm.kernelfile; + applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile); + // ...or default profile does, apply it to the pool + } else if (!empty_string(default_profile.algorithm.kernelfile)) { + pool->algorithm.kernelfile = default_profile.algorithm.kernelfile; + applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile); } } @@ -1034,39 +1072,76 @@ void apply_pool_profile(struct pool *pool) } 
applog(LOG_DEBUG, "Pool %i lookup gap set to \"%s\"", pool->pool_no, pool->lookup_gap); - if(pool_cmp(pool->intensity, default_profile.intensity)) - { - if(!empty_string(profile->intensity)) - pool->intensity = profile->intensity; - else - pool->intensity = default_profile.intensity; - } - applog(LOG_DEBUG, "Pool %i Intensity set to \"%s\"", pool->pool_no, pool->intensity); + int int_type = 0; - if(pool_cmp(pool->xintensity, default_profile.xintensity)) - { - if(!empty_string(profile->xintensity)) - pool->xintensity = profile->xintensity; - else + // FIXME: ifs from hell... + // First look for an existing intensity on pool + if (!empty_string(pool->rawintensity)) { + int_type = 2; + } + else if (!empty_string(pool->xintensity)) { + int_type = 1; + } + else if (!empty_string(pool->intensity)) { + int_type = 0; + } + else { + //no intensity found on pool... check if the profile has one and use it... + if (!empty_string(profile->rawintensity)) { + int_type = 2; + pool->rawintensity = profile->rawintensity; + } + else if (!empty_string(profile->xintensity)) { + int_type = 1; + pool->xintensity = profile->xintensity; + } + else if (!empty_string(profile->intensity)) { + int_type = 0; + pool->intensity = profile->intensity; + } + else { + //nothing in profile... check default profile/globals + if (!empty_string(default_profile.rawintensity)) { + int_type = 2; + pool->rawintensity = default_profile.rawintensity; + } + else if (!empty_string(default_profile.xintensity)) { + int_type = 1; pool->xintensity = default_profile.xintensity; + } + else if (!empty_string(default_profile.intensity)) { + int_type = 0; + pool->intensity = default_profile.intensity; + } + else { + //nothing anywhere? 
default to sgminer default of 8 + int_type = 0; + pool->intensity = strdup("8"); + } + } } - applog(LOG_DEBUG, "Pool %i XIntensity set to \"%s\"", pool->pool_no, pool->xintensity); - if(pool_cmp(pool->rawintensity, default_profile.rawintensity)) - { - if(!empty_string(profile->rawintensity)) - pool->rawintensity = profile->rawintensity; - else - pool->rawintensity = default_profile.rawintensity; + switch(int_type) { + case 2: + applog(LOG_DEBUG, "Pool %d Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity); + break; + + case 1: + applog(LOG_DEBUG, "Pool %d XIntensity set to \"%s\"", pool->pool_no, pool->xintensity); + break; + + default: + applog(LOG_DEBUG, "Pool %d Intensity set to \"%s\"", pool->pool_no, pool->intensity); + break; } - applog(LOG_DEBUG, "Pool %i Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity); if(pool_cmp(pool->thread_concurrency, default_profile.thread_concurrency)) { - if(!empty_string(profile->thread_concurrency)) - pool->thread_concurrency = profile->thread_concurrency; - else - pool->thread_concurrency = default_profile.thread_concurrency; + /* allow empty string TC + if(!empty_string(profile->thread_concurrency))*/ + pool->thread_concurrency = profile->thread_concurrency; +/* else + pool->thread_concurrency = default_profile.thread_concurrency;*/ } applog(LOG_DEBUG, "Pool %i Thread Concurrency set to \"%s\"", pool->pool_no, pool->thread_concurrency); @@ -1303,7 +1378,7 @@ static json_t *build_pool_json() // devices if (!build_pool_json_add(obj, "device", pool->devices, profile->devices, default_profile.devices, pool->pool_no)) return NULL; - + // kernelfile if (!build_pool_json_add(obj, "kernelfile", pool->algorithm.kernelfile, profile->algorithm.kernelfile, default_profile.algorithm.kernelfile, pool->pool_no)) return NULL; diff --git a/configure.ac b/configure.ac index 37cac960..c49e016e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,8 +1,8 @@ 
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [5]) -m4_define([v_min], [0]) -m4_define([v_mic], [1]) +m4_define([v_min], [1]) +m4_define([v_mic], [0]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_ver], [v_maj.v_min.v_mic]) m4_define([lt_rev], m4_eval(v_maj + v_min)) diff --git a/driver-opencl.c b/driver-opencl.c index 1f9af9e2..48ffc517 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -176,30 +176,42 @@ char *set_lookup_gap(char *arg) return NULL; } -char *set_thread_concurrency(const char *_arg) +char *set_thread_concurrency(const char *arg) { - int i, val = 0, device = 0; + int i, device = 0; + size_t val = 0; + char *tmpstr = strdup(arg); char *nextptr; - char *arg = (char *)alloca(strlen(_arg) + 1); - strcpy(arg, _arg); - nextptr = strtok(arg, ","); - if (nextptr == NULL) - return "Invalid parameters for set thread concurrency"; - val = atoi(nextptr); + // empty string - use 0 and let algo autodetect the TC + if (empty_string(tmpstr)) { + applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", device, val); + gpus[device++].opt_tc = val; + } + // not empty string + else { + if ((nextptr = strtok(tmpstr, ",")) == NULL) { + free(tmpstr); + return "Invalid parameters for set_thread_concurrency"; + } - gpus[device++].opt_tc = val; + do { + val = (unsigned long)atol(nextptr); - while ((nextptr = strtok(NULL, ",")) != NULL) { - val = atoi(nextptr); - - gpus[device++].opt_tc = val; + applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", device, val); + gpus[device++].opt_tc = val; + } while ((nextptr = strtok(NULL, ",")) != NULL); } + + // if only 1 TC was passed, assign the same worksize for all remaining GPUs if (device == 1) { - for (i = device; i < MAX_GPUDEVICES; i++) + for (i = device; i < total_devices; ++i) { gpus[i].opt_tc = gpus[0].opt_tc; + applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", i, 
gpus[i].opt_tc); + } } + free(tmpstr); return NULL; } @@ -1020,21 +1032,24 @@ static void set_threads_hashes(unsigned int vectors, unsigned int compute_shader { unsigned int threads = 0; while (threads < minthreads) { + if (*rawintensity > 0) { threads = *rawintensity; - } else if (*xintensity > 0) { - if (algorithm->xintensity_shift) - threads = compute_shaders * (1 << (algorithm->xintensity_shift + *xintensity)); - else - threads = compute_shaders * *xintensity; - } else { + } + else if (*xintensity > 0) { + threads = compute_shaders * ((algorithm->xintensity_shift)?(1 << (algorithm->xintensity_shift + *xintensity)):*xintensity); + } + else { threads = 1 << (algorithm->intensity_shift + *intensity); } + if (threads < minthreads) { - if (likely(*intensity < MAX_INTENSITY)) + if (likely(*intensity < MAX_INTENSITY)) { (*intensity)++; - else + } + else { threads = minthreads; + } } } diff --git a/findnonce.c b/findnonce.c index 6b87be7e..be9ba0df 100644 --- a/findnonce.c +++ b/findnonce.c @@ -202,7 +202,7 @@ static void *postcalc_hash(void *userdata) if (found == 0x0F) nonce = swab32(nonce); - applog(LOG_DEBUG, "OCL NONCE %u found in slot %d", nonce, entry); + applog(LOG_DEBUG, "[THR%d] OCL NONCE %08x (%lu) found in slot %d (found = %d)", thr->id, nonce, nonce, entry, found); submit_nonce(thr, pcd->work, nonce); } diff --git a/kernel/animecoin.cl b/kernel/animecoin.cl index 174d0941..2d33d3c6 100644 --- a/kernel/animecoin.cl +++ b/kernel/animecoin.cl @@ -4,7 +4,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -12,10 +12,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + 
* * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -92,6 +92,14 @@ typedef long sph_s64; #define SWAP4(x) as_uint(as_uchar4(x).wzyx) #define SWAP8(x) as_ulong(as_uchar8(x).s76543210) +#define SHL(x, n) ((x) << (n)) +#define SHR(x, n) ((x) >> (n)) + +#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ + q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ + q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ + q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + #if SPH_BIG_ENDIAN #define DEC64E(x) (x) #define DEC64BE(x) (*(const __global sph_u64 *) (x)); @@ -118,8 +126,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp for(unsigned u = 0; u < 16; u++) BMW_H[u] = BMW_IV512[u]; - sph_u64 BMW_h1[16], BMW_h2[16]; - sph_u64 mv[16]; + sph_u64 mv[16],q[32]; + sph_u64 tmp; mv[0] = DEC64LE(block + 0); mv[1] = DEC64LE(block + 8); @@ -139,34 +147,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp mv[13] = 0; mv[14] = 0; mv[15] = 0x280; -#define M(x) (mv[x]) -#define H(x) (BMW_H[x]) -#define dH(x) (BMW_h2[x]) - FOLDb; + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) 
- (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ 
BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 
53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( 
XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) + { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; + } -#undef M -#undef H -#undef dH + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ 
BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + 
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } -#define M(x) (BMW_h2[x]) -#define H(x) (final_b[x]) -#define dH(x) (BMW_h1[x]) +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); 
+ } - FOLDb; +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } -#undef M -#undef H -#undef dH +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } - hash.h8[0] = SWAP8(BMW_h1[8]); - hash.h8[1] = SWAP8(BMW_h1[9]); - hash.h8[2] = SWAP8(BMW_h1[10]); - hash.h8[3] = SWAP8(BMW_h1[11]); - hash.h8[4] = SWAP8(BMW_h1[12]); - hash.h8[5] = SWAP8(BMW_h1[13]); - hash.h8[6] = SWAP8(BMW_h1[14]); - hash.h8[7] = SWAP8(BMW_h1[15]); +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + 
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + hash.h8[0] = SWAP8(BMW_H[8]); + hash.h8[1] = SWAP8(BMW_H[9]); + hash.h8[2] = SWAP8(BMW_H[10]); + hash.h8[3] = SWAP8(BMW_H[11]); + hash.h8[4] = SWAP8(BMW_H[12]); + hash.h8[5] = SWAP8(BMW_H[13]); + hash.h8[6] = SWAP8(BMW_H[14]); + hash.h8[7] = SWAP8(BMW_H[15]); } // blake @@ -218,7 +434,6 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp bool dec = ((hash.h1[7] & 0x8) != 0); { // groestl - sph_u64 H[16]; for (unsigned int u = 0; u < 15; u ++) H[u] = 0; @@ -427,14 +642,13 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp hash.h8[7] = (dec ? 
H7 : hash.h8[7]); } { - // bmw sph_u64 BMW_H[16]; for(unsigned u = 0; u < 16; u++) BMW_H[u] = BMW_IV512[u]; - sph_u64 BMW_h1[16], BMW_h2[16]; - sph_u64 mv[16]; + sph_u64 mv[16],q[32]; + sph_u64 tmp; mv[ 0] = SWAP8(hash.h8[0]); mv[ 1] = SWAP8(hash.h8[1]); @@ -452,34 +666,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp mv[13] = 0; mv[14] = 0; mv[15] = 0x200; - #define M(x) (mv[x]) - #define H(x) (BMW_H[x]) - #define dH(x) (BMW_h2[x]) - FOLDb; + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - 
(mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + 
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } - #undef M - #undef H - #undef dH +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } - #define M(x) (BMW_h2[x]) - #define H(x) (final_b[x]) - #define dH(x) (BMW_h1[x]) +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + 
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } - FOLDb; +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } - #undef M - #undef H - #undef dH + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) + { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; + } + + tmp 
= (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ 
BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) 
^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } - hash.h8[0] = (!dec ? SWAP8(BMW_h1[8]) : hash.h8[0]); - hash.h8[1] = (!dec ? SWAP8(BMW_h1[9]) : hash.h8[1]); - hash.h8[2] = (!dec ? SWAP8(BMW_h1[10]) : hash.h8[2]); - hash.h8[3] = (!dec ? SWAP8(BMW_h1[11]) : hash.h8[3]); - hash.h8[4] = (!dec ? SWAP8(BMW_h1[12]) : hash.h8[4]); - hash.h8[5] = (!dec ? 
SWAP8(BMW_h1[13]) : hash.h8[5]); - hash.h8[6] = (!dec ? SWAP8(BMW_h1[14]) : hash.h8[6]); - hash.h8[7] = (!dec ? SWAP8(BMW_h1[15]) : hash.h8[7]); + XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + hash.h8[0] = (!dec ? SWAP8(BMW_H[8]) : hash.h8[0]); + hash.h8[1] = (!dec ? SWAP8(BMW_H[9]) : hash.h8[1]); + hash.h8[2] = (!dec ? SWAP8(BMW_H[10]) : hash.h8[2]); + hash.h8[3] = (!dec ? SWAP8(BMW_H[11]) : hash.h8[3]); + hash.h8[4] = (!dec ? SWAP8(BMW_H[12]) : hash.h8[4]); + hash.h8[5] = (!dec ? 
SWAP8(BMW_H[13]) : hash.h8[5]); + hash.h8[6] = (!dec ? SWAP8(BMW_H[14]) : hash.h8[6]); + hash.h8[7] = (!dec ? SWAP8(BMW_H[15]) : hash.h8[7]); } @@ -642,4 +1064,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp output[output[0xFF]++] = SWAP4(gid); } -#endif // ANIMECOIN_CL +#endif // ANIMECOIN_CL \ No newline at end of file diff --git a/kernel/arebyp.cl b/kernel/arebyp.cl new file mode 100644 index 00000000..2e00e2cd --- /dev/null +++ b/kernel/arebyp.cl @@ -0,0 +1,993 @@ + /*- + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt, + * 2012-2013 Con Kolivas, 2013 Alexey Karimov. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + + /* N (nfactor), CPU/Memory cost parameter */ +__constant uint N[] = { + 0x00000001U, /* never used, padding */ + 0x00000002U, + 0x00000004U, + 0x00000008U, + 0x00000010U, + 0x00000020U, + 0x00000040U, + 0x00000080U, + 0x00000100U, + 0x00000200U, + 0x00000400U, /* 2^10 == 1024, Litecoin scrypt default */ + 0x00000800U, + 0x00001000U, + 0x00002000U, + 0x00004000U, + 0x00008000U, + 0x00010000U, + 0x00020000U, + 0x00040000U, + 0x00080000U, + 0x00100000U +}; + +/* Backwards compatibility, if NFACTOR not defined, default to 10 for scrypt */ +#ifndef NFACTOR +#define NFACTOR 10 +#endif + + +__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 }; +__constant uint K[] = { + 0x428a2f98U, + 0x71374491U, + 0xb5c0fbcfU, + 0xe9b5dba5U, + 0x3956c25bU, + 0x59f111f1U, + 0x923f82a4U, + 0xab1c5ed5U, + 0xd807aa98U, + 0x12835b01U, + 0x243185beU, // 10 + 0x550c7dc3U, + 0x72be5d74U, + 0x80deb1feU, + 0x9bdc06a7U, + 0xe49b69c1U, + 0xefbe4786U, + 0x0fc19dc6U, + 0x240ca1ccU, + 0x2de92c6fU, + 0x4a7484aaU, // 20 + 0x5cb0a9dcU, + 0x76f988daU, + 0x983e5152U, + 0xa831c66dU, + 0xb00327c8U, + 0xbf597fc7U, + 0xc6e00bf3U, + 0xd5a79147U, + 0x06ca6351U, + 0x14292967U, // 30 + 0x27b70a85U, + 0x2e1b2138U, + 0x4d2c6dfcU, + 0x53380d13U, + 0x650a7354U, + 0x766a0abbU, + 0x81c2c92eU, + 0x92722c85U, + 0xa2bfe8a1U, + 0xa81a664bU, // 40 + 0xc24b8b70U, + 0xc76c51a3U, + 0xd192e819U, + 0xd6990624U, + 0xf40e3585U, + 0x106aa070U, + 0x19a4c116U, + 0x1e376c08U, + 0x2748774cU, + 0x34b0bcb5U, // 50 + 0x391c0cb3U, + 0x4ed8aa4aU, + 0x5b9cca4fU, + 0x682e6ff3U, + 0x748f82eeU, + 0x78a5636fU, + 0x84c87814U, + 0x8cc70208U, + 0x90befffaU, + 0xa4506cebU, // 60 + 0xbef9a3f7U, + 0xc67178f2U, + 0x98c7e2a2U, + 0xfc08884dU, + 0xcd2a11aeU, + 0x510e527fU, + 0x9b05688cU, + 0xC3910C8EU, + 0xfb6feee7U, + 0x2a01a605U, // 70 + 0x0c2e12e0U, + 0x4498517BU, + 0x6a09e667U, + 0xa4ce148bU, + 0x95F61999U, + 0xc19bf174U, + 0xBB67AE85U, + 
0x3C6EF372U, + 0xA54FF53AU, + 0x1F83D9ABU, // 80 + 0x5BE0CD19U, + 0x5C5C5C5CU, + 0x36363636U, + 0x80000000U, + 0x000003FFU, + 0x00000280U, + 0x000004a0U, + 0x00000300U +}; + +#define rotl(x,y) rotate(x,y) +#define Ch(x,y,z) bitselect(z,y,x) +#define Maj(x,y,z) Ch((x^z),y,z) + +#define EndianSwap(n) (rotl(n & ES[0], 24U)|rotl(n & ES[1], 8U)) + +#define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U)) +#define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U)) +#define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U)) +#define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U)) + +#define RND(a, b, c, d, e, f, g, h, k) \ + h += Tr1(e); \ + h += Ch(e, f, g); \ + h += k; \ + d += h; \ + h += Tr2(a); \ + h += Maj(a, b, c); + +void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) +{ + uint4 S0 = *state0; + uint4 S1 = *state1; + +#define A S0.x +#define B S0.y +#define C S0.z +#define D S0.w +#define E S1.x +#define F S1.y +#define G S1.z +#define H S1.w + + uint4 W[4]; + + W[ 0].x = block0.x; + RND(A,B,C,D,E,F,G,H, W[0].x+ K[0]); + W[ 0].y = block0.y; + RND(H,A,B,C,D,E,F,G, W[0].y+ K[1]); + W[ 0].z = block0.z; + RND(G,H,A,B,C,D,E,F, W[0].z+ K[2]); + W[ 0].w = block0.w; + RND(F,G,H,A,B,C,D,E, W[0].w+ K[3]); + + W[ 1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]); + W[ 1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]); + W[ 1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]); + W[ 1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]); + + W[ 2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]); + W[ 2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]); + W[ 2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]); + W[ 2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]); + + W[ 3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]); + W[ 3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]); + W[ 3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]); + W[ 3].w = block3.w; + 
RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]); + + W[ 
1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); 
+ RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + + *state0 += S0; + *state1 += S1; +} + +void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) +{ +#define A (*state0).x +#define B (*state0).y +#define C (*state0).z +#define D (*state0).w +#define E (*state1).x +#define F (*state1).y +#define G (*state1).z +#define H (*state1).w + + uint4 W[4]; + + W[0].x = block0.x; + D= K[63] +W[0].x; + H= K[64] +W[0].x; + + W[0].y = block0.y; + C= K[65] +Tr1(D)+Ch(D, K[66], K[67])+W[0].y; + G= K[68] +C+Tr2(H)+Ch(H, K[69] ,K[70]); + + W[0].z = block0.z; + B= K[71] +Tr1(C)+Ch(C,D,K[66])+W[0].z; + F= K[72] +B+Tr2(G)+Maj(G,H, K[73]); + + W[0].w = block0.w; + A= K[74] +Tr1(B)+Ch(B,C,D)+W[0].w; + E= K[75] +A+Tr2(F)+Maj(F,G,H); + + W[1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]); + W[1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]); + W[1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]); + W[1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]); + + W[2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]); + W[2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]); + W[2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]); + W[2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]); + + W[3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]); + W[3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]); + W[3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+ 
K[14]); + W[3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]); + + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]); + + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]); + + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]); + + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]); + + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + 
RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]); + + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]); + + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]); + + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]); + + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]); + + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]); + + 
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]); + + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + + *state0 += (uint4)(K[73], K[77], K[78], K[79]); + *state1 += (uint4)(K[66], K[67], K[80], K[81]); +} + +__constant uint fixedW[64] = +{ + 0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, + 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794, + 0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f, + 0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c, + 0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa, + 0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012, + 0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4, + 0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848, +}; + +void SHA256_fixed(uint4*restrict state0,uint4*restrict state1) +{ + uint4 S0 = *state0; + uint4 S1 = *state1; + +#define A S0.x +#define B S0.y +#define C S0.z +#define D S0.w +#define E S1.x +#define F S1.y +#define G S1.z +#define H S1.w + + RND(A,B,C,D,E,F,G,H, fixedW[0]); + RND(H,A,B,C,D,E,F,G, fixedW[1]); + RND(G,H,A,B,C,D,E,F, fixedW[2]); + RND(F,G,H,A,B,C,D,E, fixedW[3]); + RND(E,F,G,H,A,B,C,D, fixedW[4]); + RND(D,E,F,G,H,A,B,C, fixedW[5]); + RND(C,D,E,F,G,H,A,B, fixedW[6]); + RND(B,C,D,E,F,G,H,A, fixedW[7]); + RND(A,B,C,D,E,F,G,H, fixedW[8]); + RND(H,A,B,C,D,E,F,G, fixedW[9]); + RND(G,H,A,B,C,D,E,F, fixedW[10]); + RND(F,G,H,A,B,C,D,E, fixedW[11]); + RND(E,F,G,H,A,B,C,D, 
fixedW[12]); + RND(D,E,F,G,H,A,B,C, fixedW[13]); + RND(C,D,E,F,G,H,A,B, fixedW[14]); + RND(B,C,D,E,F,G,H,A, fixedW[15]); + RND(A,B,C,D,E,F,G,H, fixedW[16]); + RND(H,A,B,C,D,E,F,G, fixedW[17]); + RND(G,H,A,B,C,D,E,F, fixedW[18]); + RND(F,G,H,A,B,C,D,E, fixedW[19]); + RND(E,F,G,H,A,B,C,D, fixedW[20]); + RND(D,E,F,G,H,A,B,C, fixedW[21]); + RND(C,D,E,F,G,H,A,B, fixedW[22]); + RND(B,C,D,E,F,G,H,A, fixedW[23]); + RND(A,B,C,D,E,F,G,H, fixedW[24]); + RND(H,A,B,C,D,E,F,G, fixedW[25]); + RND(G,H,A,B,C,D,E,F, fixedW[26]); + RND(F,G,H,A,B,C,D,E, fixedW[27]); + RND(E,F,G,H,A,B,C,D, fixedW[28]); + RND(D,E,F,G,H,A,B,C, fixedW[29]); + RND(C,D,E,F,G,H,A,B, fixedW[30]); + RND(B,C,D,E,F,G,H,A, fixedW[31]); + RND(A,B,C,D,E,F,G,H, fixedW[32]); + RND(H,A,B,C,D,E,F,G, fixedW[33]); + RND(G,H,A,B,C,D,E,F, fixedW[34]); + RND(F,G,H,A,B,C,D,E, fixedW[35]); + RND(E,F,G,H,A,B,C,D, fixedW[36]); + RND(D,E,F,G,H,A,B,C, fixedW[37]); + RND(C,D,E,F,G,H,A,B, fixedW[38]); + RND(B,C,D,E,F,G,H,A, fixedW[39]); + RND(A,B,C,D,E,F,G,H, fixedW[40]); + RND(H,A,B,C,D,E,F,G, fixedW[41]); + RND(G,H,A,B,C,D,E,F, fixedW[42]); + RND(F,G,H,A,B,C,D,E, fixedW[43]); + RND(E,F,G,H,A,B,C,D, fixedW[44]); + RND(D,E,F,G,H,A,B,C, fixedW[45]); + RND(C,D,E,F,G,H,A,B, fixedW[46]); + RND(B,C,D,E,F,G,H,A, fixedW[47]); + RND(A,B,C,D,E,F,G,H, fixedW[48]); + RND(H,A,B,C,D,E,F,G, fixedW[49]); + RND(G,H,A,B,C,D,E,F, fixedW[50]); + RND(F,G,H,A,B,C,D,E, fixedW[51]); + RND(E,F,G,H,A,B,C,D, fixedW[52]); + RND(D,E,F,G,H,A,B,C, fixedW[53]); + RND(C,D,E,F,G,H,A,B, fixedW[54]); + RND(B,C,D,E,F,G,H,A, fixedW[55]); + RND(A,B,C,D,E,F,G,H, fixedW[56]); + RND(H,A,B,C,D,E,F,G, fixedW[57]); + RND(G,H,A,B,C,D,E,F, fixedW[58]); + RND(F,G,H,A,B,C,D,E, fixedW[59]); + RND(E,F,G,H,A,B,C,D, fixedW[60]); + RND(D,E,F,G,H,A,B,C, fixedW[61]); + RND(C,D,E,F,G,H,A,B, fixedW[62]); + RND(B,C,D,E,F,G,H,A, fixedW[63]); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + *state0 += S0; + *state1 += S1; +} + +void shittify(uint4 B[8]) +{ 
+ uint4 tmp[4]; + tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w); + tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w); + tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w); + tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i] = EndianSwap(tmp[i]); + + tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w); + tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w); + tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w); + tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] = EndianSwap(tmp[i]); +} + +void unshittify(uint4 B[8]) +{ + uint4 tmp[4]; + tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w); + tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w); + tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w); + tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i] = EndianSwap(tmp[i]); + + tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w); + tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w); + tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w); + tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] = EndianSwap(tmp[i]); +} + +#define SALSAUNROLLED +#ifdef SALSAUNROLLED +void salsa(uint4 B[8]) +{ + uint4 w[4]; + + w[0] = (B[0]^=B[4]); + w[1] = (B[1]^=B[5]); + w[2] = (B[2]^=B[6]); + w[3] = (B[3]^=B[7]); + + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + 
w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + + w[0] = (B[4]^=(B[0]+=w[0])); + w[1] = (B[5]^=(B[1]+=w[1])); + w[2] = (B[6]^=(B[2]+=w[2])); + w[3] = (B[7]^=(B[3]+=w[3])); + + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + + B[4] += w[0]; + B[5] += w[1]; + B[6] += w[2]; + B[7] += w[3]; +} +#else +void salsa(uint4 B[8]) +{ + uint4 w[4]; + +#pragma unroll + for(uint i=0; i<4; ++i) + 
w[i] = (B[i]^=B[i+4]); + +#pragma unroll + for(uint i=0; i<4; ++i) + { + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + } + +#pragma unroll + for(uint i=0; i<4; ++i) + w[i] = (B[i+4]^=(B[i]+=w[i])); + +#pragma unroll + for(uint i=0; i<4; ++i) + { + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + } + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] += w[i]; +} +#endif + + +#if (LOOKUP_GAP != 2) +wrong lookup gap! +#endif + +void scrypt_core(uint4 X[8], __global uint4*restrict lookup) +{ + const uint zSIZE = 8; + const uint ySIZE = N[NFACTOR] / 2; + const uint xSIZE = CONCURRENT_THREADS; + uint4 V[8]; + uint x = get_global_id(0) % xSIZE; + uint z; + uint y; + uint i; + uint CO; + ushort progress; // Progress state + ushort state; + + CO = 8 * x; + for (y = 0; y < ySIZE; ++y, CO += (xSIZE - 1) * (zSIZE)) + { +#pragma unroll + for (z = 0; z < zSIZE; ++z, CO++) + lookup[CO] = X[z]; + // Next salsa + salsa(X); + salsa(X); + } + //------------------------------------------------------------------------------------------------------------ + uint cotmp = x * zSIZE; + progress = 0; + for (i = 0; i < N[NFACTOR] + 512 + 42; i++) + { + //if (progress < 2 * N[NFACTOR]) + { + y = X[7].x & (N[NFACTOR]-1); + CO = cotmp + (y / LOOKUP_GAP) * (xSIZE) * zSIZE; + + state = ((progress & 1) << 1) | (y & 1); + if (state != 3) + { +#pragma unroll + for (z = 0; z < zSIZE; ++z, CO++) + V[z] = lookup[CO]; + } + if (state != 1) + { +#pragma unroll + for (z = 0; z < zSIZE; ++z) + V[z] ^= X[z]; + } + 
salsa(V); + ushort cond = (state != 1) && (progress < 2 * N[NFACTOR]); + if (cond) + { +#pragma unroll + for (z = 0; z < zSIZE; ++z) + X[z] = V[z]; + } + // S/y + // 00 +2 + // 01 +1 + // 11 +1 + // 10 error + progress += (state == 0)? 2 : 1; + } + } +} + + + + +#define SCRYPT_FOUND (0xFF) +#define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uint4 * restrict input, +volatile __global uint*restrict output, __global uint4*restrict padcache, +const uint4 midstate0, const uint4 midstate16, const uint target) +{ + uint4 X[8]; + uint4 tstate0, tstate1, ostate0, ostate1; + uint4 tmp0, tmp1; + uint4 data = (uint4)(input[4].x,input[4].y,input[4].z, get_global_id(0)); + uint4 pad0 = midstate0, pad1 = midstate16; + + SHA256(&pad0,&pad1, data, (uint4)(K[84],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[86])); + SHA256_fresh(&ostate0,&ostate1, pad0^ K[82], pad1^ K[82], K[82], K[82]); + SHA256_fresh(&tstate0,&tstate1, pad0^ K[83], pad1^ K[83], K[83], K[83]); + + tmp0 = tstate0; + tmp1 = tstate1; + SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]); + +#pragma unroll + for (uint i=0; i<4; i++) + { + pad0 = tstate0; + pad1 = tstate1; + X[rotl(i,1U) ] = ostate0; + X[rotl(i,1U)+1] = ostate1; + + SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87])); + SHA256(X+rotl(i,1U),X+rotl(i,1U)+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88])); + } + shittify(X); + scrypt_core(X,padcache); + unshittify(X); + + SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]); + SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]); + SHA256_fixed(&tmp0,&tmp1); + SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88])); + + bool result = (EndianSwap(ostate1.w) <= target); + if (result) + SETFOUND(get_global_id(0)); +} \ No newline at end of file diff --git a/kernel/bitblock.cl b/kernel/bitblock.cl index 
3603a160..7ebdf351 100644 --- a/kernel/bitblock.cl +++ b/kernel/bitblock.cl @@ -464,69 +464,92 @@ __kernel void search2(__global hash_t* hashes) uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; +#else + __local sph_u64 T0_C[256], T4_C[256]; +#endif int init = get_local_id(0); int step = get_local_size(0); for (int i = init; i < 256; i += step) { - T0_L[i] = T0[i]; - T4_L[i] = T4[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif } - - barrier(CLK_LOCAL_MEM_FENCE); - - #define T0 T0_L - #define T1 T1_L - #define T2 T2_L - #define T3 T3_L - #define T4 T4_L - #define T5 T5_L - #define T6 T6_L - #define T7 T7_L - - // groestl - sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C + + + sph_u64 H[16]; +//#pragma unroll 15 + for (unsigned int u = 0; u < 15; u ++) + H[u] = 0; +#if USE_LE + H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); +#else + H[15] = (sph_u64)512; +#endif sph_u64 g[16], m[16]; - g[0] = m[0] = DEC64E(hash->h8[0]); - g[1] = m[1] = DEC64E(hash->h8[1]); - g[2] = m[2] = DEC64E(hash->h8[2]); - g[3] = m[3] = DEC64E(hash->h8[3]); - g[4] = m[4] = DEC64E(hash->h8[4]); - g[5] = m[5] = DEC64E(hash->h8[5]); - g[6] = m[6] = DEC64E(hash->h8[6]); - g[7] = m[7] = 
DEC64E(hash->h8[7]); - g[8] = m[8] = 0x80; - g[9] = m[9] = 0; - g[10] = m[10] = 0; - g[11] = m[11] = 0; - g[12] = m[12] = 0; - g[13] = m[13] = 0; - g[14] = m[14] = 0; - g[15] = 0x102000000000000; - m[15] = 0x100000000000000; - + m[0] = DEC64E(hash->h8[0]); + m[1] = DEC64E(hash->h8[1]); + m[2] = DEC64E(hash->h8[2]); + m[3] = DEC64E(hash->h8[3]); + m[4] = DEC64E(hash->h8[4]); + m[5] = DEC64E(hash->h8[5]); + m[6] = DEC64E(hash->h8[6]); + m[7] = DEC64E(hash->h8[7]); + +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + g[u] = m[u] ^ H[u]; + m[8] = 0x80; g[8] = m[8] ^ H[8]; + m[9] = 0; g[9] = m[9] ^ H[9]; + m[10] = 0; g[10] = m[10] ^ H[10]; + m[11] = 0; g[11] = m[11] ^ H[11]; + m[12] = 0; g[12] = m[12] ^ H[12]; + m[13] = 0; g[13] = m[13] ^ H[13]; + m[14] = 0; g[14] = m[14] ^ H[14]; + m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; PERM_BIG_P(g); PERM_BIG_Q(m); - sph_u64 xH[16]; +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u] ^= g[u] ^ m[u]; + H[u] ^= g[u] ^ m[u]; + sph_u64 xH[16]; +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u]; PERM_BIG_P(xH); - for (unsigned int u = 8; u < 16; u ++) - hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + H[u] ^= xH[u]; + +//#pragma unroll 8 + for (unsigned int u = 0; u < 8; u ++) + hash->h8[u] = DEC64E(H[u + 8]); + barrier(CLK_GLOBAL_MEM_FENCE); - barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -1466,4 +1489,4 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo barrier(CLK_GLOBAL_MEM_FENCE); } -#endif // BITBLOCK_CL +#endif // BITBLOCK_CL \ No newline at end of file diff --git a/kernel/darkcoin-mod.cl b/kernel/darkcoin-mod.cl index d4240369..89553f80 100644 --- a/kernel/darkcoin-mod.cl +++ b/kernel/darkcoin-mod.cl @@ -193,7 +193,7 @@ __kernel void search1(__global hash_t* hashes) BMW_H[u] = BMW_IV512[u]; sph_u64 mv[16],q[32]; - sph_u64 tmp; + 
sph_u64 tmp; mv[0] = SWAP8(hash->h8[0]); mv[1] = SWAP8(hash->h8[1]); @@ -457,74 +457,97 @@ __kernel void search2(__global hash_t* hashes) uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; +#else + __local sph_u64 T0_C[256], T4_C[256]; +#endif int init = get_local_id(0); int step = get_local_size(0); for (int i = init; i < 256; i += step) { - T0_L[i] = T0[i]; - T4_L[i] = T4[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif } - - barrier(CLK_LOCAL_MEM_FENCE); - - #define T0 T0_L - #define T1 T1_L - #define T2 T2_L - #define T3 T3_L - #define T4 T4_L - #define T5 T5_L - #define T6 T6_L - #define T7 T7_L - - // groestl - sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C + + + sph_u64 H[16]; +//#pragma unroll 15 + for (unsigned int u = 0; u < 15; u ++) + H[u] = 0; +#if USE_LE + H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); +#else + H[15] = (sph_u64)512; +#endif sph_u64 g[16], m[16]; - g[0] = m[0] = DEC64E(hash->h8[0]); - g[1] = m[1] = DEC64E(hash->h8[1]); - g[2] = m[2] = DEC64E(hash->h8[2]); - g[3] = m[3] = DEC64E(hash->h8[3]); - g[4] = m[4] = DEC64E(hash->h8[4]); - g[5] = m[5] = DEC64E(hash->h8[5]); - g[6] = m[6] = DEC64E(hash->h8[6]); - g[7] = m[7] = DEC64E(hash->h8[7]); - 
g[8] = m[8] = 0x80; - g[9] = m[9] = 0; - g[10] = m[10] = 0; - g[11] = m[11] = 0; - g[12] = m[12] = 0; - g[13] = m[13] = 0; - g[14] = m[14] = 0; - g[15] = 0x102000000000000; - m[15] = 0x100000000000000; - + m[0] = DEC64E(hash->h8[0]); + m[1] = DEC64E(hash->h8[1]); + m[2] = DEC64E(hash->h8[2]); + m[3] = DEC64E(hash->h8[3]); + m[4] = DEC64E(hash->h8[4]); + m[5] = DEC64E(hash->h8[5]); + m[6] = DEC64E(hash->h8[6]); + m[7] = DEC64E(hash->h8[7]); + +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + g[u] = m[u] ^ H[u]; + m[8] = 0x80; g[8] = m[8] ^ H[8]; + m[9] = 0; g[9] = m[9] ^ H[9]; + m[10] = 0; g[10] = m[10] ^ H[10]; + m[11] = 0; g[11] = m[11] ^ H[11]; + m[12] = 0; g[12] = m[12] ^ H[12]; + m[13] = 0; g[13] = m[13] ^ H[13]; + m[14] = 0; g[14] = m[14] ^ H[14]; + m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; PERM_BIG_P(g); PERM_BIG_Q(m); - sph_u64 xH[16]; +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u] ^= g[u] ^ m[u]; + H[u] ^= g[u] ^ m[u]; + sph_u64 xH[16]; +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u]; PERM_BIG_P(xH); - for (unsigned int u = 8; u < 16; u ++) - hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + H[u] ^= xH[u]; - barrier(CLK_GLOBAL_MEM_FENCE); -} +//#pragma unroll 8 + for (unsigned int u = 0; u < 8; u ++) + hash->h8[u] = DEC64E(H[u + 8]); + barrier(CLK_GLOBAL_MEM_FENCE); +} __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search3(__global hash_t* hashes) { + uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); @@ -840,7 +863,7 @@ __kernel void search8(__global hash_t* hashes) sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; rk00 = hash->h4[0]; rk01 = hash->h4[1]; @@ 
-1101,4 +1124,4 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo output[atomic_inc(output+0xFF)] = SWAP4(gid); } -#endif// DARKCOIN_MOD_CL +#endif// DARKCOIN_MOD_CL \ No newline at end of file diff --git a/kernel/darkcoin.cl b/kernel/darkcoin.cl index 3620bd52..4474bdd9 100644 --- a/kernel/darkcoin.cl +++ b/kernel/darkcoin.cl @@ -4,7 +4,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -12,10 +12,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
@@ -71,16 +71,9 @@ typedef long sph_s64; #define SPH_SMALL_FOOTPRINT_GROESTL 0 #define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_CUBEHASH_UNROLL 0 - -#ifndef SPH_COMPACT_BLAKE_64 - #define SPH_COMPACT_BLAKE_64 0 -#endif -#ifndef SPH_LUFFA_PARALLEL - #define SPH_LUFFA_PARALLEL 0 -#endif -#ifndef SPH_KECCAK_UNROLL - #define SPH_KECCAK_UNROLL 0 -#endif +#define SPH_COMPACT_BLAKE_64 0 +#define SPH_LUFFA_PARALLEL 0 +#define SPH_KECCAK_UNROLL 0 #include "blake.cl" #include "bmw.cl" @@ -736,4 +729,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp } } -#endif // DARKCOIN_CL +#endif // DARKCOIN_CL \ No newline at end of file diff --git a/kernel/diamond.cl b/kernel/diamond.cl new file mode 100644 index 00000000..b02edd7a --- /dev/null +++ b/kernel/diamond.cl @@ -0,0 +1,1853 @@ +/* + * ==========================(LICENSE BEGIN)============================ + * + * GroestlCoin kernel implementation: Copyright (c) 2014 pallas + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * GroestlCoin kernel implementation: @author pallas + * Forum thread: http://bitcointalk.org/index.php?topic=779598 + * Donations to: BTC 1H7qC5uHuGX2d5s9Kuw3k7Wm7xMQzL16SN + */ + +#ifndef DIAMOND_CL +#define DIAMOND_CL + +#define DC64(x) ((ulong)(x ## UL)) +#define DEC64E(x) (*(const __global ulong *) (x)); +#define H15 (((ulong)(512 & 0xFF) << 56) | ((ulong)(512 & 0xFF00) << 40)) +#define M15 0x100000000000000 +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) // rotate + +#define C64e(x) ((DC64(x) >> 56) \ + | ((DC64(x) >> 40) & DC64(0x000000000000FF00)) \ + | ((DC64(x) >> 24) & DC64(0x0000000000FF0000)) \ + | ((DC64(x) >> 8) & DC64(0x00000000FF000000)) \ + | ((DC64(x) << 8) & DC64(0x000000FF00000000)) \ + | ((DC64(x) << 24) & DC64(0x0000FF0000000000)) \ + | ((DC64(x) << 40) & DC64(0x00FF000000000000)) \ + | ((DC64(x) << 56))) +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define PC64(j, r) ((ulong)((j) + (r))) +#define QC64(j, r) (((ulong)(r) << 56) ^ ~((ulong)(j) << 56)) + +__constant static const ulong T0[] = { + C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8), + C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), + C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), + C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), + C64e(0x6090f050f0c05060), C64e(0x0207050305040302), + C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), + 
C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), + C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), + C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), + C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa), + C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), + C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), + C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), + C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), + C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), + C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), + C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), + C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), + C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), + C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), + C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), + C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), + C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), + C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), + C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), + C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), + C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), + C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), + C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), + C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf), + C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), + C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), + C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), + C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), + C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), + C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), + C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), + C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), + C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), + C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), + C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), + C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), + C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), + 
C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), + C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), + C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), + C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498), + C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), + C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), + C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), + C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), + C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), + C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), + C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), + C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), + C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), + C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), + C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), + C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), + C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), + C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), + C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), + C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), + C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), + C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), + C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), + C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), + C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), + C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), + C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), + C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), + C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), + C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), + C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), + C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), + C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b), + C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), + C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), + C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), + C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), + 
C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), + C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), + C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), + C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), + C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), + C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), + C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), + C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), + C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), + C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), + C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), + C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), + C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), + C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), + C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), + C64e(0x476720e9208ee947), C64e(0x1038281828201810), + C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), + C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), + C64e(0x38546c246c702438), C64e(0x575f08f108aef157), + C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), + C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), + C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), + C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), + C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), + C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), + C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), + C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), + C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), + C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), + C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), + C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), + C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), + C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), + C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), + C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), + C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), + C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), + 
C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), + C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), + C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), + C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), + C64e(0x09929b809b128009), C64e(0x1a2339173934171a), + C64e(0x651075da75cada65), C64e(0xd784533153b531d7), + C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), + C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), + C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), + C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), + C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) +}; + +__constant static const ulong T1[] = { + C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), + C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), + C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), + C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), + C64e(0x606090f050f0c050), C64e(0x0202070503050403), + C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), + C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), + C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), + C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), + C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), + C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), + C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), + C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), + C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), + C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), + C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), + C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), + C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), + C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), + C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), + C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), + C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), + C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), + C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), + C64e(0x08081c140c14100c), C64e(0x959563f652f63152), + 
C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), + C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), + C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), + C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), + C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), + C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), + C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), + C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), + C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), + C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), + C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), + C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), + C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), + C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), + C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), + C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), + C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), + C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), + C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), + C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), + C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), + C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), + C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), + C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), + C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), + C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), + C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), + C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), + C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), + C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), + C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), + C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), + C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), + C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), + C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), + C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), + C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), + 
C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), + C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), + C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), + C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), + C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), + C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), + C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), + C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), + C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), + C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), + C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), + C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), + C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), + C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), + C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), + C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c), + C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), + C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), + C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), + C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), + C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), + C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), + C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), + C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), + C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), + C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), + C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), + C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), + C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), + C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), + C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), + C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), + C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), + C64e(0x47476720e9208ee9), C64e(0x1010382818282018), + C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), + C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), + C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), + 
C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), + C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), + C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), + C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), + C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), + C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), + C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), + C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), + C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), + C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), + C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), + C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), + C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), + C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), + C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), + C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), + C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), + C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), + C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), + C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), + C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), + C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), + C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), + C64e(0x65651075da75cada), C64e(0xd7d784533153b531), + C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), + C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), + C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), + C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), + C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) +}; +/* +__constant static const ulong T2_G[] = { + C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), + C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), + C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), + C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), + C64e(0x50606090f050f0c0), C64e(0x0302020705030504), + C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), + C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), + 
C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), + C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), + C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), + C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), + C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), + C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), + C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), + C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6), + C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), + C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), + C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), + C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), + C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), + C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), + C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), + C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), + C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), + C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), + C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), + C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), + C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), + C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), + C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), + C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), + C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), + C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), + C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), + C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), + C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), + C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), + C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), + C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), + C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), + C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), + C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), + C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), + C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), + 
C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), + C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), + C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), + C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), + C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), + C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), + C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), + C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), + C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), + C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), + C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), + C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), + C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), + C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), + C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), + C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), + C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), + C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), + C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), + C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), + C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), + C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), + C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), + C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), + C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), + C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), + C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), + C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), + C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), + C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), + C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), + C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), + C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), + C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), + C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), + C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), + C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), + 
C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), + C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), + C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), + C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), + C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), + C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), + C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), + C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), + C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), + C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), + C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), + C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), + C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), + C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), + C64e(0xe947476720e9208e), C64e(0x1810103828182820), + C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), + C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), + C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), + C64e(0xc773732152c752e6), C64e(0x51979764f351f335), + C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), + C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), + C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), + C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), + C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), + C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), + C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), + C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), + C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), + C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), + C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), + C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), + C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), + C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), + C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), + C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), + C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), + C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), + 
C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), + C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), + C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), + C64e(0x800909929b809b12), C64e(0x171a1a2339173934), + C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), + C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), + C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), + C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), + C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), + C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) +}; + +__constant static const ulong T3_G[] = { + C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), + C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), + C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), + C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), + C64e(0xc050606090f050f0), C64e(0x0403020207050305), + C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), + C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), + C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), + C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), + C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), + C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), + C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), + C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), + C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), + C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), + C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), + C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), + C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), + C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), + C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), + C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), + C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), + C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), + C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), + C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), + C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), + 
C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), + C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), + C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), + C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), + C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), + C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), + C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), + C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), + C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), + C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), + C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), + C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), + C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), + C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), + C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), + C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), + C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), + C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), + C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), + C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), + C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), + C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), + C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), + C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), + C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), + C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), + C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), + C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), + C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), + C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), + C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), + C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), + C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), + C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), + C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), + C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), + C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), + 
C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), + C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), + C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), + C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), + C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), + C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), + C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), + C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), + C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), + C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), + C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), + C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), + C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), + C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), + C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), + C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), + C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), + C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), + C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), + C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e), + C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), + C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), + C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), + C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), + C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), + C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), + C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), + C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), + C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), + C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), + C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), + C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), + C64e(0x8ee947476720e920), C64e(0x2018101038281828), + C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), + C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), + C64e(0x70243838546c246c), C64e(0xaef157575f08f108), + C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), + 
C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), + C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), + C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), + C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), + C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), + C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), + C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), + C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), + C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), + C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), + C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), + C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), + C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), + C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), + C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), + C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), + C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), + C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), + C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), + C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), + C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), + C64e(0x12800909929b809b), C64e(0x34171a1a23391739), + C64e(0xcada65651075da75), C64e(0xb531d7d784533153), + C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), + C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), + C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), + C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f), + C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) +}; + +__constant static const ulong T4_G[] = { + C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), + C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), + C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), + C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), + C64e(0xf0c050606090f050), C64e(0x0504030202070503), + C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), + C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), + C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), + 
C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), + C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), + C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), + C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), + C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), + C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), + C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), + C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), + C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), + C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), + C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), + C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), + C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), + C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), + C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), + C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), + C64e(0x14100c08081c140c), C64e(0xf63152959563f652), + C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), + C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), + C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), + C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), + C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), + C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), + C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), + C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), + C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), + C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), + C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), + C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), + C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), + C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), + C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), + C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), + C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), + C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), + C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), + C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), + 
C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), + C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), + C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), + C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), + C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), + C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), + C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), + C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), + C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), + C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), + C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), + C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), + C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), + C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), + C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), + C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), + C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), + C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), + C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), + C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), + C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), + C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), + C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), + C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), + C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), + C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), + C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), + C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), + C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), + C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), + C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), + C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), + C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), + C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), + C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), + C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), + C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), + 
C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), + C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), + C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e), + C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), + C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), + C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), + C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), + C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), + C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), + C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), + C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), + C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), + C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), + C64e(0x208ee947476720e9), C64e(0x2820181010382818), + C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), + C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), + C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), + C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), + C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), + C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), + C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), + C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), + C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), + C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), + C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), + C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), + C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), + C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), + C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), + C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), + C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), + C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), + C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), + C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), + C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), + C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), + C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), + 
C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), + C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), + C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), + C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), + C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), + C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), + C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), + C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), + C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) +}; + +__constant static const ulong T5_G[] = { + C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), + C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), + C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), + C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), + C64e(0x50f0c050606090f0), C64e(0x0305040302020705), + C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), + C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), + C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), + C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), + C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), + C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), + C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), + C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), + C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), + C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), + C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), + C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), + C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), + C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), + C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), + C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), + C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), + C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), + C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), + C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), + C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), + C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), + 
C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), + C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), + C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), + C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), + C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), + C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), + C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), + C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), + C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), + C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), + C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), + C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), + C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), + C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), + C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), + C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), + C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), + C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), + C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), + C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), + C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), + C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), + C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), + C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), + C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), + C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), + C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), + C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), + C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), + C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), + C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), + C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), + C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), + C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), + C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), + C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), + C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), + 
C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), + C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), + C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd), + C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), + C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), + C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), + C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), + C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), + C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), + C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), + C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), + C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), + C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), + C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), + C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), + C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), + C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), + C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), + C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), + C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), + C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), + C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), + C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7), + C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), + C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), + C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), + C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), + C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), + C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), + C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), + C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), + C64e(0xe9208ee947476720), C64e(0x1828201810103828), + C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), + C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), + C64e(0x246c70243838546c), C64e(0xf108aef157575f08), + C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), + C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), + 
C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), + C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), + C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), + C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), + C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), + C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), + C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), + C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), + C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), + C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), + C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), + C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), + C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), + C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), + C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), + C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), + C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), + C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), + C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), + C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), + C64e(0x809b12800909929b), C64e(0x173934171a1a2339), + C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), + C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), + C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), + C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), + C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), + C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) +}; + +__constant static const ulong T6_G[] = { + C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), + C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), + C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), + C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), + C64e(0xf050f0c050606090), C64e(0x0503050403020207), + C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), + C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), + C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), + C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), + 
C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), + C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), + C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), + C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), + C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), + C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), + C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), + C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), + C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), + C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), + C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), + C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), + C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), + C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), + C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b), + C64e(0x140c14100c08081c), C64e(0xf652f63152959563), + C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), + C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), + C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), + C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), + C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), + C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), + C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), + C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), + C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), + C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), + C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), + C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), + C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), + C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), + C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), + C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), + C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), + C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), + C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), + C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), + C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), + 
C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), + C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), + C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), + C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), + C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), + C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), + C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), + C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), + C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), + C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), + C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), + C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), + C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), + C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), + C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), + C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), + C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), + C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), + C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), + C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), + C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), + C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), + C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), + C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), + C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), + C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), + C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), + C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), + C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), + C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), + C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), + C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), + C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), + C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), + C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), + C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), + C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), + 
C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), + C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), + C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), + C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), + C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), + C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), + C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), + C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), + C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), + C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), + C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), + C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), + C64e(0x20e9208ee9474767), C64e(0x2818282018101038), + C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), + C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), + C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), + C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), + C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), + C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), + C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), + C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), + C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), + C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), + C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), + C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), + C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), + C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), + C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), + C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), + C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), + C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), + C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), + C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), + C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), + C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), + C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), + C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), + 
C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), + C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), + C64e(0x75da75cada656510), C64e(0x533153b531d7d784), + C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), + C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), + C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), + C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), + C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) +}; + +__constant static const ulong T7_G[] = { + C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), + C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), + C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), + C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), + C64e(0x90f050f0c0506060), C64e(0x0705030504030202), + C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), + C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), + C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), + C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), + C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), + C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), + C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), + C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), + C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), + C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), + C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), + C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), + C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), + C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), + C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), + C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), + C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), + C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), + C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), + C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), + C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), + C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), + C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), + 
C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), + C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), + C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), + C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), + C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), + C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), + C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), + C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), + C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), + C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), + C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), + C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), + C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), + C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), + C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), + C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), + C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), + C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), + C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), + C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), + C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), + C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), + C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), + C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), + C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), + C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), + C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), + C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), + C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), + C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), + C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), + C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), + C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), + C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), + C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), + C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), + C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), + 
C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), + C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), + C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), + C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), + C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), + C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), + C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), + C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), + C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), + C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), + C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), + C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), + C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), + C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), + C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), + C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), + C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), + C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), + C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), + C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), + C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), + C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), + C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), + C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), + C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), + C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), + C64e(0xf16dd26d23d29c9c), C64e(0x723be03b92e04949), + C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), + C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), + C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), + C64e(0x6720e9208ee94747), C64e(0x3828182820181010), + C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), + C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), + C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), + C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), + C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), + C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), + 
C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), + C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), + C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), + C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), + C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), + C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), + C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), + C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), + C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), + C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), + C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), + C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), + C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), + C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), + C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), + C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), + C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), + C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), + C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), + C64e(0x929b809b12800909), C64e(0x2339173934171a1a), + C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), + C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), + C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), + C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), + C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), + C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) +}; +*/ +#define RBTT(d, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + d = T0[t0[b0]] \ + ^ T1[t1[b1]] \ + ^ T2[t2[b2]] \ + ^ T3[t3[b3]] \ + ^ T4[t4[b4]] \ + ^ T5[t5[b5]] \ + ^ T6[t6[b6]] \ + ^ T7[t7[b7]]; \ + } while (0) + +#define ROUND_BIG_P(a, r) do { \ + t0[0x0] = B64_0(a[0x0]) ^ PC64(0x00, r); \ + t1[0x0] = B64_1(a[0x0]); \ + t2[0x0] = B64_2(a[0x0]); \ + t3[0x0] = B64_3(a[0x0]); \ + t4[0x0] = B64_4(a[0x0]); \ + t5[0x0] = B64_5(a[0x0]); \ + t6[0x0] = B64_6(a[0x0]); \ + t7[0x0] = B64_7(a[0x0]); \ + t0[0x1] = B64_0(a[0x1]) ^ PC64(0x10, r); \ + t1[0x1] = B64_1(a[0x1]); \ + t2[0x1] = 
B64_2(a[0x1]); \ + t3[0x1] = B64_3(a[0x1]); \ + t4[0x1] = B64_4(a[0x1]); \ + t5[0x1] = B64_5(a[0x1]); \ + t6[0x1] = B64_6(a[0x1]); \ + t7[0x1] = B64_7(a[0x1]); \ + t0[0x2] = B64_0(a[0x2]) ^ PC64(0x20, r); \ + t1[0x2] = B64_1(a[0x2]); \ + t2[0x2] = B64_2(a[0x2]); \ + t3[0x2] = B64_3(a[0x2]); \ + t4[0x2] = B64_4(a[0x2]); \ + t5[0x2] = B64_5(a[0x2]); \ + t6[0x2] = B64_6(a[0x2]); \ + t7[0x2] = B64_7(a[0x2]); \ + t0[0x3] = B64_0(a[0x3]) ^ PC64(0x30, r); \ + t1[0x3] = B64_1(a[0x3]); \ + t2[0x3] = B64_2(a[0x3]); \ + t3[0x3] = B64_3(a[0x3]); \ + t4[0x3] = B64_4(a[0x3]); \ + t5[0x3] = B64_5(a[0x3]); \ + t6[0x3] = B64_6(a[0x3]); \ + t7[0x3] = B64_7(a[0x3]); \ + t0[0x4] = B64_0(a[0x4]) ^ PC64(0x40, r); \ + t1[0x4] = B64_1(a[0x4]); \ + t2[0x4] = B64_2(a[0x4]); \ + t3[0x4] = B64_3(a[0x4]); \ + t4[0x4] = B64_4(a[0x4]); \ + t5[0x4] = B64_5(a[0x4]); \ + t6[0x4] = B64_6(a[0x4]); \ + t7[0x4] = B64_7(a[0x4]); \ + t0[0x5] = B64_0(a[0x5]) ^ PC64(0x50, r); \ + t1[0x5] = B64_1(a[0x5]); \ + t2[0x5] = B64_2(a[0x5]); \ + t3[0x5] = B64_3(a[0x5]); \ + t4[0x5] = B64_4(a[0x5]); \ + t5[0x5] = B64_5(a[0x5]); \ + t6[0x5] = B64_6(a[0x5]); \ + t7[0x5] = B64_7(a[0x5]); \ + t0[0x6] = B64_0(a[0x6]) ^ PC64(0x60, r); \ + t1[0x6] = B64_1(a[0x6]); \ + t2[0x6] = B64_2(a[0x6]); \ + t3[0x6] = B64_3(a[0x6]); \ + t4[0x6] = B64_4(a[0x6]); \ + t5[0x6] = B64_5(a[0x6]); \ + t6[0x6] = B64_6(a[0x6]); \ + t7[0x6] = B64_7(a[0x6]); \ + t0[0x7] = B64_0(a[0x7]) ^ PC64(0x70, r); \ + t1[0x7] = B64_1(a[0x7]); \ + t2[0x7] = B64_2(a[0x7]); \ + t3[0x7] = B64_3(a[0x7]); \ + t4[0x7] = B64_4(a[0x7]); \ + t5[0x7] = B64_5(a[0x7]); \ + t6[0x7] = B64_6(a[0x7]); \ + t7[0x7] = B64_7(a[0x7]); \ + t0[0x8] = B64_0(a[0x8]) ^ PC64(0x80, r); \ + t1[0x8] = B64_1(a[0x8]); \ + t2[0x8] = B64_2(a[0x8]); \ + t3[0x8] = B64_3(a[0x8]); \ + t4[0x8] = B64_4(a[0x8]); \ + t5[0x8] = B64_5(a[0x8]); \ + t6[0x8] = B64_6(a[0x8]); \ + t7[0x8] = B64_7(a[0x8]); \ + t0[0x9] = B64_0(a[0x9]) ^ PC64(0x90, r); \ + t1[0x9] = B64_1(a[0x9]); \ + t2[0x9] = B64_2(a[0x9]); 
\ + t3[0x9] = B64_3(a[0x9]); \ + t4[0x9] = B64_4(a[0x9]); \ + t5[0x9] = B64_5(a[0x9]); \ + t6[0x9] = B64_6(a[0x9]); \ + t7[0x9] = B64_7(a[0x9]); \ + t0[0xA] = B64_0(a[0xA]) ^ PC64(0xA0, r); \ + t1[0xA] = B64_1(a[0xA]); \ + t2[0xA] = B64_2(a[0xA]); \ + t3[0xA] = B64_3(a[0xA]); \ + t4[0xA] = B64_4(a[0xA]); \ + t5[0xA] = B64_5(a[0xA]); \ + t6[0xA] = B64_6(a[0xA]); \ + t7[0xA] = B64_7(a[0xA]); \ + t0[0xB] = B64_0(a[0xB]) ^ PC64(0xB0, r); \ + t1[0xB] = B64_1(a[0xB]); \ + t2[0xB] = B64_2(a[0xB]); \ + t3[0xB] = B64_3(a[0xB]); \ + t4[0xB] = B64_4(a[0xB]); \ + t5[0xB] = B64_5(a[0xB]); \ + t6[0xB] = B64_6(a[0xB]); \ + t7[0xB] = B64_7(a[0xB]); \ + t0[0xC] = B64_0(a[0xC]) ^ PC64(0xC0, r); \ + t1[0xC] = B64_1(a[0xC]); \ + t2[0xC] = B64_2(a[0xC]); \ + t3[0xC] = B64_3(a[0xC]); \ + t4[0xC] = B64_4(a[0xC]); \ + t5[0xC] = B64_5(a[0xC]); \ + t6[0xC] = B64_6(a[0xC]); \ + t7[0xC] = B64_7(a[0xC]); \ + t0[0xD] = B64_0(a[0xD]) ^ PC64(0xD0, r); \ + t1[0xD] = B64_1(a[0xD]); \ + t2[0xD] = B64_2(a[0xD]); \ + t3[0xD] = B64_3(a[0xD]); \ + t4[0xD] = B64_4(a[0xD]); \ + t5[0xD] = B64_5(a[0xD]); \ + t6[0xD] = B64_6(a[0xD]); \ + t7[0xD] = B64_7(a[0xD]); \ + t0[0xE] = B64_0(a[0xE]) ^ PC64(0xE0, r); \ + t1[0xE] = B64_1(a[0xE]); \ + t2[0xE] = B64_2(a[0xE]); \ + t3[0xE] = B64_3(a[0xE]); \ + t4[0xE] = B64_4(a[0xE]); \ + t5[0xE] = B64_5(a[0xE]); \ + t6[0xE] = B64_6(a[0xE]); \ + t7[0xE] = B64_7(a[0xE]); \ + t0[0xF] = B64_0(a[0xF]) ^ PC64(0xF0, r); \ + t1[0xF] = B64_1(a[0xF]); \ + t2[0xF] = B64_2(a[0xF]); \ + t3[0xF] = B64_3(a[0xF]); \ + t4[0xF] = B64_4(a[0xF]); \ + t5[0xF] = B64_5(a[0xF]); \ + t6[0xF] = B64_6(a[0xF]); \ + t7[0xF] = B64_7(a[0xF]); \ + RBTT(a[0x0], 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ + RBTT(a[0x1], 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ + RBTT(a[0x2], 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ + RBTT(a[0x3], 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ + RBTT(a[0x4], 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ + RBTT(a[0x5], 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ + RBTT(a[0x6], 
0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \
+    RBTT(a[0x7], 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \
+    RBTT(a[0x8], 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \
+    RBTT(a[0x9], 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \
+    RBTT(a[0xA], 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \
+    RBTT(a[0xB], 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \
+    RBTT(a[0xC], 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \
+    RBTT(a[0xD], 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \
+    RBTT(a[0xE], 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \
+    RBTT(a[0xF], 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \
+  } while (0)
+
+/*
+ * Byte-split helper for ROUND_BIG_Q: stores the result of B64_K(a[i]) in
+ * tK[i] for K = 0..7. Expects the arrays t0..t7 to be in scope where the
+ * macro is expanded. Deliberately not wrapped in do/while so that it expands
+ * to exactly the eight assignments ROUND_BIG_Q used to spell out per word.
+ */
+#define BIG_Q_SPLIT(a, i) \
+    t0[i] = B64_0(a[i]); \
+    t1[i] = B64_1(a[i]); \
+    t2[i] = B64_2(a[i]); \
+    t3[i] = B64_3(a[i]); \
+    t4[i] = B64_4(a[i]); \
+    t5[i] = B64_5(a[i]); \
+    t6[i] = B64_6(a[i]); \
+    t7[i] = B64_7(a[i])
+
+/*
+ * One Q-type round over the 16-word state `a` with round index `r`:
+ *   1. XOR the constant QC64(0x10 * i, r) into every word a[i];
+ *   2. split every word into bytes t0[i]..t7[i];
+ *   3. rebuild every word with RBTT; word i draws from columns
+ *      i+1, i+3, i+5, i+11, i, i+2, i+4, i+6 (all modulo 16).
+ * Every split is done before the first RBTT, so all RBTT calls read the
+ * bytes of the state as it was after step 1.
+ * NOTE(review): RBTT is defined outside this excerpt; it presumably combines
+ * T0..T7 table lookups of the selected bytes - confirm at its definition.
+ */
+#define ROUND_BIG_Q(a, r) do { \
+    a[0x0] ^= QC64(0x00, r); \
+    a[0x1] ^= QC64(0x10, r); \
+    a[0x2] ^= QC64(0x20, r); \
+    a[0x3] ^= QC64(0x30, r); \
+    a[0x4] ^= QC64(0x40, r); \
+    a[0x5] ^= QC64(0x50, r); \
+    a[0x6] ^= QC64(0x60, r); \
+    a[0x7] ^= QC64(0x70, r); \
+    a[0x8] ^= QC64(0x80, r); \
+    a[0x9] ^= QC64(0x90, r); \
+    a[0xA] ^= QC64(0xA0, r); \
+    a[0xB] ^= QC64(0xB0, r); \
+    a[0xC] ^= QC64(0xC0, r); \
+    a[0xD] ^= QC64(0xD0, r); \
+    a[0xE] ^= QC64(0xE0, r); \
+    a[0xF] ^= QC64(0xF0, r); \
+    BIG_Q_SPLIT(a, 0x0); \
+    BIG_Q_SPLIT(a, 0x1); \
+    BIG_Q_SPLIT(a, 0x2); \
+    BIG_Q_SPLIT(a, 0x3); \
+    BIG_Q_SPLIT(a, 0x4); \
+    BIG_Q_SPLIT(a, 0x5); \
+    BIG_Q_SPLIT(a, 0x6); \
+    BIG_Q_SPLIT(a, 0x7); \
+    BIG_Q_SPLIT(a, 0x8); \
+    BIG_Q_SPLIT(a, 0x9); \
+    BIG_Q_SPLIT(a, 0xA); \
+    BIG_Q_SPLIT(a, 0xB); \
+    BIG_Q_SPLIT(a, 0xC); \
+    BIG_Q_SPLIT(a, 0xD); \
+    BIG_Q_SPLIT(a, 0xE); \
+    BIG_Q_SPLIT(a, 0xF); \
+    RBTT(a[0x0], 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \
+    RBTT(a[0x1], 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \
+    RBTT(a[0x2], 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \
+    RBTT(a[0x3], 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \
+    RBTT(a[0x4], 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \
+    RBTT(a[0x5], 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \
+    RBTT(a[0x6], 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \
+    RBTT(a[0x7], 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \
+    RBTT(a[0x8], 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \
+    RBTT(a[0x9], 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \
+    RBTT(a[0xA], 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \
+    RBTT(a[0xB], 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \
+    RBTT(a[0xC], 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \
+    RBTT(a[0xD], 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \
+    RBTT(a[0xE], 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \
+    RBTT(a[0xF], 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \
+  } while (0)
+
+/*
+ * Run ROUND_BIG_P on `a` for round indices start, start+1, ..., end-1.
+ * Uses the variable `u` of the expansion site as the loop counter.
+ */
+#define PERM_BIG_P(a, start, end) do { \
+    for (u = start; u < end; ++u) \
+      ROUND_BIG_P(a, u); \
+  } while (0)
+
+#define PERM_BIG_Q(a) do { \
+    /* for (ulong u = 0; u < (14UL << 56); u += (1UL << 56)) { */ \
+    for (u = 0; u < 14; u++) { \
+ 
ROUND_BIG_Q(a, u); \
+    } \
+  } while (0)
+
+
+/*
+ * Split state word g[i] into bytes for the search kernel: tK[i] receives
+ * B64_K(g[i]) for K = 0..7. Expands to the same eight assignments the kernel
+ * previously wrote out by hand; relies on g and t0..t7 declared in search().
+ */
+#define SEARCH_SPLIT(i) do { \
+    t0[i] = B64_0(g[i]); \
+    t1[i] = B64_1(g[i]); \
+    t2[i] = B64_2(g[i]); \
+    t3[i] = B64_3(g[i]); \
+    t4[i] = B64_4(g[i]); \
+    t5[i] = B64_5(g[i]); \
+    t6[i] = B64_6(g[i]); \
+    t7[i] = B64_7(g[i]); \
+  } while (0)
+
+/*
+ * search - hash one nonce candidate per work-item and record hits.
+ *
+ * block:  input bytes; ten 64-bit words (80 bytes) are read, and the upper
+ *         32 bits of word 9 are replaced by this work-item's global id.
+ * output: result buffer; output[0xFF] holds the number of stored hits and
+ *         each hit stores the byte-swapped global id at the next free index.
+ * target: a hit is recorded when (g[11] ^ m[3]) <= target after the last pass.
+ *
+ * Control flow: `flag` counts how often the tail at `round` has completed.
+ *   flag 0: first pass from `perm`; afterwards g[8..15] is folded into m,
+ *           m[8..10] are reset to 0x80/0/0 and the code jumps back to `perm`
+ *           with r = 12.
+ *   flag 1: second pass; `round` runs with r = 12, then once more with r = 13.
+ *   flag 2: last visit to `round`; only g[0xB] is rebuilt because it is the
+ *           only word the final comparison reads.
+ */
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) {
+  __local ulong T2[256], T3[256], T4[256], T5[256], T6[256], T7[256];
+  uint u;
+
+  // Create the other tables from T0 by rotation: avoids keeping them in the
+  // kernel. The strided loop fills all 256 entries for any work-group size;
+  // indexing with get_local_id(0) alone is complete (and in bounds) only when
+  // WORKSIZE is exactly 256. T1 is not copied; the T1 used below is the table
+  // defined elsewhere in this file.
+  for (u = get_local_id(0); u < 256; u += get_local_size(0)) {
+    T2[u] = ROTL64(T0[u], 16UL);
+    T3[u] = ROTL64(T0[u], 24UL);
+    T4[u] = ROTL64(T0[u], 32UL);
+    T5[u] = ROTL64(T0[u], 40UL);
+    T6[u] = ROTL64(T0[u], 48UL);
+    T7[u] = ROTL64(T0[u], 56UL);
+  }
+  // Every work-item must see the complete tables before the first lookup.
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  ulong g[16], m[16], t0[16], t1[16], t2[16], t3[16], t4[16], t5[16], t6[16], t7[16];
+  uint flag = 0, gid = get_global_id(0), r = 13;
+
+  m[0] = DEC64E(block + 0 * 8);
+  m[1] = DEC64E(block + 1 * 8);
+  m[2] = DEC64E(block + 2 * 8);
+  m[3] = DEC64E(block + 3 * 8);
+  m[4] = DEC64E(block + 4 * 8);
+  m[5] = DEC64E(block + 5 * 8);
+  m[6] = DEC64E(block + 6 * 8);
+  m[7] = DEC64E(block + 7 * 8);
+  m[8] = DEC64E(block + 8 * 8);
+  m[9] = DEC64E(block + 9 * 8);
+  // Keep the low 32 bits of word 9 and put the global id in the high 32 bits.
+  m[9] &= 0x00000000FFFFFFFF;
+  m[9] |= ((ulong) gid << 32);
+  m[10] = 0x80;
+
+perm:
+  m[11] = 0;
+  m[12] = 0;
+  m[13] = 0;
+  m[14] = 0;
+  m[15] = M15;
+
+#pragma unroll
+  for (u = 0; u < 15; u++) g[u] = m[u];
+  g[15] = M15 ^ H15;
+
+  // Round 0 of the P-type permutation, written out by hand. g[0xB..0xE] are
+  // zero at this point (copied from m[11..14]), so they are assigned the
+  // round constant directly instead of XOR-ing it in.
+  g[0x0] ^= PC64(0x00, 0);
+  g[0x1] ^= PC64(0x10, 0);
+  g[0x2] ^= PC64(0x20, 0);
+  g[0x3] ^= PC64(0x30, 0);
+  g[0x4] ^= PC64(0x40, 0);
+  g[0x5] ^= PC64(0x50, 0);
+  g[0x6] ^= PC64(0x60, 0);
+  g[0x7] ^= PC64(0x70, 0);
+  g[0x8] ^= PC64(0x80, 0);
+  g[0x9] ^= PC64(0x90, 0);
+  g[0xA] ^= PC64(0xA0, 0);
+  g[0xB] = PC64(0xB0, 0);
+  g[0xC] = PC64(0xC0, 0);
+  g[0xD] = PC64(0xD0, 0);
+  g[0xE] = PC64(0xE0, 0);
+  g[0xF] ^= PC64(0xF0, 0);
+  SEARCH_SPLIT(0x0);
+  SEARCH_SPLIT(0x1);
+  SEARCH_SPLIT(0x2);
+  SEARCH_SPLIT(0x3);
+  SEARCH_SPLIT(0x4);
+  SEARCH_SPLIT(0x5);
+  SEARCH_SPLIT(0x6);
+  SEARCH_SPLIT(0x7);
+  SEARCH_SPLIT(0x8);
+  SEARCH_SPLIT(0x9);
+  SEARCH_SPLIT(0xA);
+  SEARCH_SPLIT(0xB);
+  SEARCH_SPLIT(0xC);
+  SEARCH_SPLIT(0xD);
+  SEARCH_SPLIT(0xE);
+  SEARCH_SPLIT(0xF);
+  // NOTE(review): the C64e(...) literals stand in for table lookups that the
+  // generic round would perform here; presumably they are the table values
+  // for bytes known to be zero - confirm against T0 before changing them.
+  g[0x0] = T0[t0[0x0]] ^ T1[t1[0x1]] ^ T2[t2[0x2]] ^ T3[t3[0x3]] ^ T4[t4[0x4]] ^ T5[t5[0x5]] ^ T6[t6[0x6]] ^ C64e(0x32f4a5f497a5c6c6);
+  g[0x1] = T0[t0[0x1]] ^ T1[t1[0x2]] ^ T2[t2[0x3]] ^ T3[t3[0x4]] ^ T4[t4[0x5]] ^ T5[t5[0x6]] ^ T6[t6[0x7]] ^ C64e(0x32f4a5f497a5c6c6);
+  g[0x2] = T0[t0[0x2]] ^ T1[t1[0x3]] ^ T2[t2[0x4]] ^ T3[t3[0x5]] ^ T4[t4[0x6]] ^ T5[t5[0x7]] ^ T6[t6[0x8]] ^ C64e(0x32f4a5f497a5c6c6);
+  g[0x3] = T0[t0[0x3]] ^ T1[t1[0x4]] ^ T2[t2[0x5]] ^ T3[t3[0x6]] ^ T4[t4[0x7]] ^ T5[t5[0x8]] ^ T6[t6[0x9]] ^ C64e(0x32f4a5f497a5c6c6);
+  g[0x4] = T0[t0[0x4]] ^ T1[t1[0x5]] ^ T2[t2[0x6]] ^ T3[t3[0x7]] ^ T4[t4[0x8]] ^ T5[t5[0x9]] ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0xF]];
+  g[0x5] = T0[t0[0x5]] ^ T1[t1[0x6]] ^ T2[t2[0x7]] ^ T3[t3[0x8]] ^ T4[t4[0x9]] ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x0]];
+  g[0x6] = T0[t0[0x6]] ^ T1[t1[0x7]] ^ T2[t2[0x8]] ^ T3[t3[0x9]] ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x1]];
+  g[0x7] = T0[t0[0x7]] ^ T1[t1[0x8]] ^ T2[t2[0x9]] ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x2]];
+  g[0x8] = T0[t0[0x8]] ^ T1[t1[0x9]] ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x3]];
+  g[0x9] = T0[t0[0x9]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ T6[t6[0xF]] ^ T7[t7[0x4]];
+  g[0xA] = T0[t0[0xA]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ T5[t5[0xF]] ^ T6[t6[0x0]] ^ T7[t7[0x5]];
+  g[0xB] = T0[t0[0xB]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ T4[t4[0xF]] ^ T5[t5[0x0]] ^ T6[t6[0x1]] ^ T7[t7[0x6]];
+  g[0xC] = T0[t0[0xC]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ T3[t3[0xF]] ^ T4[t4[0x0]] ^ T5[t5[0x1]] ^ T6[t6[0x2]] ^ T7[t7[0x7]];
+  g[0xD] = T0[t0[0xD]] ^ C64e(0xc6c632f4a5f497a5) ^ T2[t2[0xF]] ^ T3[t3[0x0]] ^ T4[t4[0x1]] ^ T5[t5[0x2]] ^ T6[t6[0x3]] ^ T7[t7[0x8]];
+  g[0xE] = T0[t0[0xE]] ^ T1[t1[0xF]] ^ T2[t2[0x0]] ^ T3[t3[0x1]] ^ T4[t4[0x2]] ^ T5[t5[0x3]] ^ T6[t6[0x4]] ^ T7[t7[0x9]];
+  g[0xF] = T0[t0[0xF]] ^ T1[t1[0x0]] ^ T2[t2[0x1]] ^ T3[t3[0x2]] ^ T4[t4[0x3]] ^ T5[t5[0x4]] ^ T6[t6[0x5]] ^ T7[t7[0xA]];
+
+  // Remaining P rounds 1..13 on g, and all 14 Q rounds on m.
+  PERM_BIG_P(g, 1, 14);
+  PERM_BIG_Q(m);
+
+#pragma unroll
+  for (u = 0; u < 16; u++) g[u] ^= m[u];
+#pragma unroll
+  for (u = 0; u < 8; u++) m[u] = g[u + 8];
+  g[15] ^= H15;
+  // Rounds 0..r-1 run through the generic macro; round r is finished below.
+  PERM_BIG_P(g, 0, r);
+
+round:
+  // Author's note: moving the ^= down into the first-byte extraction was
+  // tried and turned out slower.
+  g[0x0] ^= PC64(0x00, r);
+  g[0x1] ^= PC64(0x10, r);
+  g[0x6] ^= PC64(0x60, r);
+  g[0xB] ^= PC64(0xB0, r);
+  g[0xC] ^= PC64(0xC0, r);
+  g[0xD] ^= PC64(0xD0, r);
+  g[0xE] ^= PC64(0xE0, r);
+  g[0xF] ^= PC64(0xF0, r);
+  SEARCH_SPLIT(0x0);
+  SEARCH_SPLIT(0x1);
+  SEARCH_SPLIT(0x6);
+  SEARCH_SPLIT(0xB);
+  SEARCH_SPLIT(0xC);
+  SEARCH_SPLIT(0xD);
+  SEARCH_SPLIT(0xE);
+  SEARCH_SPLIT(0xF);
+
+  if (flag < 2) {
+    g[0x2] ^= PC64(0x20, r);
+    g[0x3] ^= PC64(0x30, r);
+    g[0x4] ^= PC64(0x40, r);
+    g[0x5] ^= PC64(0x50, r);
+    g[0x7] ^= PC64(0x70, r);
+    g[0x8] ^= PC64(0x80, r);
+    g[0x9] ^= PC64(0x90, r);
+    g[0xA] ^= PC64(0xA0, r);
+    SEARCH_SPLIT(0x2);
+    SEARCH_SPLIT(0x3);
+    SEARCH_SPLIT(0x4);
+    SEARCH_SPLIT(0x5);
+    SEARCH_SPLIT(0x7);
+    SEARCH_SPLIT(0x8);
+    SEARCH_SPLIT(0x9);
+    SEARCH_SPLIT(0xA);
+    if (flag == 0) {
+      RBTT(g[0x8], 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3);
+      RBTT(g[0x9], 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4);
+      RBTT(g[0xA], 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5);
+    } else {
+      RBTT(g[0x0], 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB);
+      RBTT(g[0x1], 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC);
+      RBTT(g[0x6], 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1);
+    }
+    RBTT(g[0xC], 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7);
+    RBTT(g[0xD], 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8);
+    RBTT(g[0xE], 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9);
+    RBTT(g[0xF], 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA);
+  }
+  RBTT(g[0xB], 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6);
+
+  if (flag == 2) goto end;
+  if (flag++ == 1) {
+    r = 13;
+    goto round;
+  }
+
+  // End of the first pass: fold the upper half of g into m, reset the tail
+  // words and run the whole permutation again with r = 12.
+  r = 12;
+#pragma unroll
+  for (u = 0; u < 8; u++) m[u] ^= g[u + 8];
+  m[7] ^= H15;
+  m[8] = 0x80;
+  m[9] = 0;
+  m[10] = 0;
+  goto perm;
+
+end:
+  // atomic_inc hands every hit its own slot; a plain output[0xFF]++ can give
+  // the same index to two work-items that hit during the same launch.
+  if ((g[3 + 8] ^ m[3]) <= target) output[atomic_inc(output + 0xFF)] = as_uint(as_uchar4(gid).wzyx);
+}
+
+#endif // DIAMOND_CL
\ No newline at end of file
diff --git a/kernel/groestl.cl b/kernel/groestl.cl
index 80827c65..bf93cd0b 100644
--- a/kernel/groestl.cl
+++ b/kernel/groestl.cl
@@ -1429,4 +1429,4 @@ __constant static const sph_u64 T7[] = {
     for (r = 0; r < 14; ++r) { \
       ROUND_BIG_Q(a, r); \
     } \
-  } while (0)
+  } while (0)
\ No newline at end of file
diff --git a/kernel/groestlcoin.cl b/kernel/groestlcoin.cl
index 153f0606..54420d12 100644
--- a/kernel/groestlcoin.cl
+++ b/kernel/groestlcoin.cl
@@ -1,10 +1,8 @@
 /*
- * GroestlCoin kernel implementation.
- * * ==========================(LICENSE BEGIN)============================ * - * Copyright (c) 2014 phm - * + * GroestlCoin kernel implementation: Copyright (c) 2014 pallas + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -12,10 +10,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -26,230 +24,1830 @@ * * ===========================(LICENSE END)============================= * - * @author phm + * GroestlCoin kernel implementation: @author pallas + * Forum thread: http://bitcointalk.org/index.php?topic=779598 + * Donations to: BTC 1H7qC5uHuGX2d5s9Kuw3k7Wm7xMQzL16SN */ #ifndef GROESTLCOIN_CL #define GROESTLCOIN_CL -#if __ENDIAN_LITTLE__ -#define SPH_LITTLE_ENDIAN 1 -#else -#define SPH_BIG_ENDIAN 1 -#endif - -#define SPH_UPTR sph_u64 - -typedef unsigned int sph_u32; -typedef int sph_s32; -#ifndef __OPENCL_VERSION__ -typedef unsigned long long sph_u64; -typedef long long sph_s64; -#else -typedef unsigned long sph_u64; -typedef long sph_s64; -#endif - -#define SPH_64 1 -#define SPH_64_TRUE 1 - -#define SPH_C32(x) ((sph_u32)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -#define SPH_C64(x) ((sph_u64)(x ## UL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) 
SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
-#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
-
-#define SPH_ECHO_64 1
-#define SPH_SIMD_NOCOPY 0
-#define SPH_CUBEHASH_UNROLL 0
-
-#ifndef SPH_LUFFA_PARALLEL
- #define SPH_LUFFA_PARALLEL 0
-#endif
-
-#include "groestl.cl"
-
-#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
-#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
-
-#if SPH_BIG_ENDIAN
- #define ENC64E(x) SWAP8(x)
- #define DEC64E(x) SWAP8(*(const __global sph_u64 *) (x));
-#else
- #define ENC64E(x) (x)
- #define DEC64E(x) (*(const __global sph_u64 *) (x));
-#endif
-
-#define ROL32(x, n) rotate(x, (uint) n)
-#define SHR(x, n) ((x) >> n)
-#define SWAP32(a) (as_uint(as_uchar4(a).wzyx))
-
-#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3))
-#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10))
-
-#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10))
-#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7))
-
-#define P(a,b,c,d,e,f,g,h,x,K) \
-{ \
- temp1 = h + S3(e) + F1(e,f,g) + (K + x); \
- d += temp1; h = temp1 + S2(a) + F0(a,b,c); \
-}
+#define DC64(x) ((ulong)(x ## UL)) // paste UL onto a literal and cast it to ulong
+#define DEC64E(x) (*(const __global ulong *) (x)) // read one ulong through a __global pointer; no ';' here so the macro also works inside expressions and if/else
+#define H15 (((ulong)(512 & 0xFF) << 56) | ((ulong)(512 & 0xFF00) << 40)) // evaluates to 0x0002000000000000
+#define M15 0x100000000000000 // value 1 in the most significant byte
+#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) // rotate left; n must be in 1..63
 
-#define PLAST(a,b,c,d,e,f,g,h,x,K) \
-{ \
- d += h + S3(e) + F1(e,f,g) + (x + K); \
-}
+#define C64e(x) ((DC64(x) >> 56) /* byte-reverse a 64-bit literal */ \
+ | ((DC64(x) >> 40) & DC64(0x000000000000FF00)) \
+ | ((DC64(x) >> 24) & DC64(0x0000000000FF0000)) \
+ | ((DC64(x) >> 8) & DC64(0x00000000FF000000)) \
+ | ((DC64(x) << 8) & DC64(0x000000FF00000000)) \
+ | ((DC64(x) << 24) & DC64(0x0000FF0000000000)) \
+ | ((DC64(x) << 40) & DC64(0x00FF000000000000)) \
+ | ((DC64(x) << 56)))
+#define B64_0(x) ((x) & 0xFF) // B64_k(x): byte k of x, k = 0 is the least significant byte
+#define B64_1(x) (((x) >> 8) & 0xFF)
+#define B64_2(x) (((x) >> 16) & 0xFF)
+#define B64_3(x) (((x) >> 24) & 0xFF)
+#define B64_4(x) (((x) >> 32) & 0xFF)
+#define B64_5(x) (((x) >> 40) & 0xFF)
+#define B64_6(x) (((x) >> 48) & 0xFF)
+#define B64_7(x) ((x) >> 56) // no mask: assumes x is an unsigned 64-bit value
+#define PC64(j, r) ((ulong)((j) + (r))) // P round constant: j + r in the low bits
+#define QC64(j, r) (((ulong)(r) << 56) ^ ~((ulong)(j) << 56)) // Q round constant: r in the top byte XOR the complement of (j << 56)
+
+__constant static const ulong T0[] = {
+  C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8),
+  C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6),
+  C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6),
+  C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491),
+  C64e(0x6090f050f0c05060), C64e(0x0207050305040302),
+  C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56),
+  C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5),
+  C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec),
+  C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f),
+  C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa),
+  C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2),
+  C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb),
+  C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3),
+  C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45),
+  C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753),
+  C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b),
+  C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1),
+  C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c),
+  C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e),
+  C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83),
+  C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451),
+  C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9),
+  C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab),
+  C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a),
+  C64e(0x081c140c14100c08), C64e(0x9563f652f6315295),
+  C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d),
+  C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137),
+  C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f),
+  C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624),
+  C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf),
+  C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e),
+  C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea),
+  C64e(0x123f2d1b2d241b12),
C64e(0x1da4b99eb93a9e1d), + C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), + C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), + C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), + C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), + C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), + C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), + C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), + C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), + C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), + C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), + C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), + C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), + C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), + C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498), + C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), + C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), + C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), + C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), + C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), + C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), + C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), + C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), + C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), + C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), + C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), + C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), + C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), + C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), + C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), + C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), + C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), + C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), + C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), + C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), + C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), + C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), + C64e(0xfc619d829de382fc), 
C64e(0x7ab3c947c9f4477a), + C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), + C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), + C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), + C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), + C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), + C64e(0x3bdde6abe676ab3b), C64e(0x0b959e839e16830b), + C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), + C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), + C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), + C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), + C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), + C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), + C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), + C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), + C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), + C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), + C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), + C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), + C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), + C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), + C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), + C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), + C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), + C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), + C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), + C64e(0x476720e9208ee947), C64e(0x1038281828201810), + C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), + C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), + C64e(0x38546c246c702438), C64e(0x575f08f108aef157), + C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), + C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), + C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), + C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), + C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), + C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), + C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), + C64e(0x90e373d8733bd890), 
C64e(0x06090f050f0c0506), + C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), + C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), + C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), + C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), + C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), + C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), + C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), + C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), + C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), + C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), + C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), + C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), + C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), + C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), + C64e(0x09929b809b128009), C64e(0x1a2339173934171a), + C64e(0x651075da75cada65), C64e(0xd784533153b531d7), + C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), + C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), + C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), + C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), + C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) +}; + +__constant static const ulong T1[] = { + C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), + C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), + C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), + C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), + C64e(0x606090f050f0c050), C64e(0x0202070503050403), + C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), + C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), + C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), + C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), + C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), + C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), + C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), + C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), + C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), + C64e(0x2323f9dabfda46bf), 
C64e(0x53535102f702a6f7), + C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), + C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), + C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), + C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), + C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), + C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), + C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), + C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), + C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), + C64e(0x08081c140c14100c), C64e(0x959563f652f63152), + C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), + C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), + C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), + C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), + C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), + C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), + C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), + C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), + C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), + C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), + C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), + C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), + C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), + C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), + C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), + C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), + C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), + C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), + C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), + C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), + C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), + C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), + C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), + C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), + C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), + C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), + C64e(0x666699ff55ffcc55), 
C64e(0x1111b6a794a72294), + C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), + C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), + C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), + C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), + C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), + C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), + C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), + C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), + C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), + C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), + C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), + C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), + C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), + C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), + C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), + C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), + C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), + C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), + C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), + C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), + C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), + C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), + C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), + C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), + C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), + C64e(0x6b6b056ed36ed6d3), C64e(0x28286c443c44503c), + C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), + C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), + C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), + C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), + C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), + C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), + C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), + C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), + C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), + C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), + C64e(0xd5d583563256b132), 
C64e(0x8b8b4ec543c50d43), + C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), + C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), + C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), + C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), + C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), + C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), + C64e(0x47476720e9208ee9), C64e(0x1010382818282018), + C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), + C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), + C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), + C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), + C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), + C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), + C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), + C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), + C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), + C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), + C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), + C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), + C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), + C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), + C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), + C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), + C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), + C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), + C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), + C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), + C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), + C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), + C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), + C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), + C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), + C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), + C64e(0x65651075da75cada), C64e(0xd7d784533153b531), + C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), + C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), + C64e(0x5a5ac3997799b477), 
C64e(0x1e1e2d3311333c11), + C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), + C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) +}; +/* +__constant static const ulong T2_G[] = { + C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), + C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), + C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), + C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), + C64e(0x50606090f050f0c0), C64e(0x0302020705030504), + C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), + C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), + C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), + C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), + C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), + C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), + C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), + C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), + C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), + C64e(0xbf2323f9dabfda46), C64e(0xf753535102f702a6), + C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), + C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), + C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), + C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), + C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), + C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), + C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), + C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), + C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), + C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), + C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), + C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), + C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), + C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), + C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), + C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), + C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), + C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), + C64e(0x745858c49c749cb0), 
C64e(0x2e343446722e7268), + C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), + C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), + C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), + C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), + C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), + C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), + C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), + C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), + C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), + C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), + C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), + C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), + C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), + C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), + C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), + C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), + C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), + C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), + C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), + C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), + C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), + C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), + C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), + C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), + C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), + C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), + C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), + C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), + C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), + C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), + C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), + C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), + C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), + C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), + C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), + C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), + C64e(0xacc8c827efacef8b), 
C64e(0xe7baba8832e7326f), + C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), + C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), + C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), + C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), + C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), + C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), + C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), + C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), + C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), + C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), + C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), + C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), + C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), + C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), + C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), + C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), + C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), + C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), + C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), + C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), + C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), + C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), + C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), + C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), + C64e(0xe947476720e9208e), C64e(0x1810103828182820), + C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), + C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), + C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), + C64e(0xc773732152c752e6), C64e(0x51979764f351f335), + C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), + C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), + C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), + C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), + C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), + C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), + C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), + C64e(0x01f7f7f4030103f5), 
C64e(0x121c1c2a36123638), + C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), + C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), + C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), + C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), + C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), + C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), + C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), + C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), + C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), + C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), + C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), + C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), + C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), + C64e(0x800909929b809b12), C64e(0x171a1a2339173934), + C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), + C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), + C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), + C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), + C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), + C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) +}; + +__constant static const ulong T3_G[] = { + C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), + C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), + C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), + C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), + C64e(0xc050606090f050f0), C64e(0x0403020207050305), + C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), + C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), + C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), + C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), + C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), + C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), + C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), + C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), + C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), + C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), + C64e(0xd396e4e445a196a1), 
C64e(0x2d5b9b9b76ed5bed), + C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), + C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), + C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), + C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), + C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), + C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), + C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), + C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), + C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), + C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), + C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), + C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), + C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), + C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), + C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), + C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), + C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), + C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), + C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), + C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), + C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), + C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), + C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), + C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), + C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), + C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), + C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), + C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), + C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), + C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), + C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), + C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), + C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), + C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), + C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), + C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), + C64e(0x0fcf8a8ac04acf4a), 
C64e(0xc910e9e9d9301030), + C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), + C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), + C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), + C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), + C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), + C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), + C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), + C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), + C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), + C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), + C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), + C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), + C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), + C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), + C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), + C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), + C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), + C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), + C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), + C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), + C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), + C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), + C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), + C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), + C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), + C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), + C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), + C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), + C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), + C64e(0x3fdb9292e476db76), C64e(0x180a0c0c121e0a1e), + C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), + C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), + C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), + C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), + C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), + C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), + C64e(0xdc596e6e85eb59eb), 
C64e(0xafb7dada18c2b7c2), + C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), + C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), + C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), + C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), + C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), + C64e(0x8ee947476720e920), C64e(0x2018101038281828), + C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), + C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), + C64e(0x70243838546c246c), C64e(0xaef157575f08f108), + C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), + C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), + C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), + C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), + C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), + C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), + C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), + C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), + C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), + C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), + C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), + C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), + C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), + C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), + C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), + C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), + C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), + C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), + C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), + C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), + C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), + C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), + C64e(0x12800909929b809b), C64e(0x34171a1a23391739), + C64e(0xcada65651075da75), C64e(0xb531d7d784533153), + C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), + C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), + C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), + C64e(0xf6cb7b7b3d46cb46), 
C64e(0x4bfca8a8b71ffc1f), + C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) +}; + +__constant static const ulong T4_G[] = { + C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), + C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), + C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), + C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), + C64e(0xf0c050606090f050), C64e(0x0504030202070503), + C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), + C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), + C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), + C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), + C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), + C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), + C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), + C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), + C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), + C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), + C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), + C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), + C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), + C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), + C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), + C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), + C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), + C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), + C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), + C64e(0x14100c08081c140c), C64e(0xf63152959563f652), + C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), + C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), + C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), + C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), + C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), + C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), + C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), + C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), + C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), + C64e(0x776c2d363641772d), 
C64e(0xcda3b2dcdc11cdb2), + C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), + C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), + C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), + C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), + C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), + C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), + C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), + C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), + C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), + C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), + C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), + C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), + C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), + C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), + C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), + C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), + C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), + C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), + C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), + C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), + C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), + C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), + C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), + C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), + C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), + C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), + C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), + C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), + C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), + C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), + C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), + C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), + C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), + C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), + C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), + C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), + C64e(0x7d642b32324f7d2b), 
C64e(0xa4d795e6e642a495), + C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), + C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), + C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), + C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), + C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), + C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), + C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), + C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), + C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), + C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), + C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), + C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), + C64e(0xe7255d9f9f78e75d), C64e(0xb2616ebdbd0fb26e), + C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), + C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), + C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), + C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), + C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), + C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), + C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), + C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), + C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), + C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), + C64e(0x208ee947476720e9), C64e(0x2820181010382818), + C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), + C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), + C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), + C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), + C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), + C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), + C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), + C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), + C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), + C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), + C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), + C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), + C64e(0xfe9fa3c2c23cfea3), 
C64e(0xe1d45f6a6a8be15f), + C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), + C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), + C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), + C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), + C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), + C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), + C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), + C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), + C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), + C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), + C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), + C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), + C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), + C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), + C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), + C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), + C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), + C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), + C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) +}; + +__constant static const ulong T5_G[] = { + C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), + C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), + C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), + C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), + C64e(0x50f0c050606090f0), C64e(0x0305040302020705), + C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), + C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), + C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), + C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), + C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), + C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), + C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), + C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), + C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), + C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), + C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), + C64e(0xc25deac27575285d), 
C64e(0x1c24d91ce1e1c524), + C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), + C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), + C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), + C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), + C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), + C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), + C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), + C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), + C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), + C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), + C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), + C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), + C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), + C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), + C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), + C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), + C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), + C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), + C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), + C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), + C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), + C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), + C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), + C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), + C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), + C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), + C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), + C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), + C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), + C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), + C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), + C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), + C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), + C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), + C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), + C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), + C64e(0x060a080604040e0a), 
C64e(0x8198e781fefe6698), + C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), + C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), + C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), + C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), + C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), + C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), + C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), + C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), + C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), + C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), + C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), + C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), + C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd), + C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), + C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), + C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), + C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), + C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), + C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), + C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), + C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), + C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), + C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), + C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), + C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), + C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), + C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), + C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), + C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), + C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), + C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), + C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), + C64e(0xa8e372a83939dae3), C64e(0xa4f762a43131c6f7), + C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), + C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), + C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), + C64e(0x8c8f028c01018e8f), 
C64e(0x64ac7964b1b11dac), + C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), + C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), + C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), + C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), + C64e(0xe9208ee947476720), C64e(0x1828201810103828), + C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), + C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), + C64e(0x246c70243838546c), C64e(0xf108aef157575f08), + C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), + C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), + C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), + C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), + C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), + C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), + C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), + C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), + C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), + C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), + C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), + C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), + C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), + C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), + C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), + C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), + C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), + C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), + C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), + C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), + C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), + C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), + C64e(0x809b12800909929b), C64e(0x173934171a1a2339), + C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), + C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), + C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), + C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), + C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), + C64e(0xd661dad66d6d0c61), 
C64e(0x3a4e583a2c2c624e) +}; + +__constant static const ulong T6_G[] = { + C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), + C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), + C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), + C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), + C64e(0xf050f0c050606090), C64e(0x0503050403020207), + C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), + C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), + C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), + C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), + C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), + C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), + C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), + C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), + C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), + C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), + C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), + C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), + C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), + C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), + C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), + C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), + C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), + C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), + C64e(0xf553f5c453626297), C64e(0x413f41543f2a2a6b), + C64e(0x140c14100c08081c), C64e(0xf652f63152959563), + C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), + C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), + C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), + C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), + C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), + C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), + C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), + C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), + C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), + C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), + C64e(0x29ee2973eeb4b49d), 
C64e(0x16fb16b6fb5b5b4d), + C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), + C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), + C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), + C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), + C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), + C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), + C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), + C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), + C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), + C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), + C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), + C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), + C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), + C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), + C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), + C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), + C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), + C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), + C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), + C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), + C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), + C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), + C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), + C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), + C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), + C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), + C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), + C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), + C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), + C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), + C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), + C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), + C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), + C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), + C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), + C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), + C64e(0xfba0fb9ba0c0c03b), 
C64e(0xb398b332981919aa), + C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), + C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), + C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), + C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), + C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), + C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), + C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), + C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), + C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), + C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), + C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), + C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), + C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), + C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), + C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), + C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), + C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), + C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), + C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), + C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), + C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), + C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), + C64e(0x20e9208ee9474767), C64e(0x2818282018101038), + C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), + C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), + C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), + C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), + C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), + C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), + C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), + C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), + C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), + C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), + C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), + C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), + C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), + C64e(0x10f91047f9aeaebe), 
C64e(0x6bd06bd2d0696902), + C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), + C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), + C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), + C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), + C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), + C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), + C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), + C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), + C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), + C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), + C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), + C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), + C64e(0x75da75cada656510), C64e(0x533153b531d7d784), + C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), + C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), + C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), + C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), + C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) +}; + +__constant static const ulong T7_G[] = { + C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), + C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), + C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), + C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), + C64e(0x90f050f0c0506060), C64e(0x0705030504030202), + C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), + C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), + C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), + C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), + C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), + C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), + C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), + C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), + C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), + C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), + C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), + C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), + C64e(0xd4e9aee97aae3d3d), 
C64e(0xf2be6abe986a4c4c), + C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), + C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), + C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), + C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), + C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), + C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), + C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), + C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), + C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), + C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), + C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), + C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), + C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), + C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), + C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), + C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), + C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), + C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), + C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), + C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), + C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), + C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), + C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), + C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), + C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), + C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), + C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), + C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), + C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), + C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), + C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), + C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), + C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), + C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), + C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), + C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), + C64e(0xab0bf00b5bf0a0a0), 
C64e(0xb4cc44ccf0447878), + C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), + C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), + C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), + C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), + C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), + C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), + C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), + C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), + C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), + C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), + C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), + C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), + C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), + C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), + C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), + C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), + C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), + C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), + C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), + C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), + C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), + C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), + C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), + C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), + C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), + C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), + C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), + C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), + C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), + C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), + C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), + C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), + C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), + C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), + C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), + C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), + C64e(0xf16dd26d23d29c9c), 
C64e(0x723be03b92e04949), + C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), + C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), + C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), + C64e(0x6720e9208ee94747), C64e(0x3828182820181010), + C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), + C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), + C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), + C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), + C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), + C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), + C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), + C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), + C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), + C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), + C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), + C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), + C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), + C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), + C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), + C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), + C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), + C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), + C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), + C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), + C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), + C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), + C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), + C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), + C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), + C64e(0x929b809b12800909), C64e(0x2339173934171a1a), + C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), + C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), + C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), + C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), + C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), + C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) +}; +*/ +#define RBTT(d, b0, 
b1, b2, b3, b4, b5, b6, b7) do { \ + d = T0[t0[b0]] \ + ^ T1[t1[b1]] \ + ^ T2[t2[b2]] \ + ^ T3[t3[b3]] \ + ^ T4[t4[b4]] \ + ^ T5[t5[b5]] \ + ^ T6[t6[b6]] \ + ^ T7[t7[b7]]; \ + } while (0) + /* RBTT (directly above) sets d to the XOR of T0[t0[b0]] through T7[t7[b7]]. ROUND_BIG_P(a, r) performs one round over the 16-word state a: every a[i] is split into t0[i]..t7[i] with B64_0..B64_7, and PC64(i << 4, r) is XORed into the B64_0 part only. Each a[i] is then rebuilt by RBTT from columns i+0, i+1, i+2, i+3, i+4, i+5, i+6 and i+11 (mod 16). The arrays t0..t7 and the tables T0..T7 are not declared here; they must be visible where the macro is expanded. */ +#define ROUND_BIG_P(a, r) do { \ + t0[0x0] = B64_0(a[0x0]) ^ PC64(0x00, r); \ + t1[0x0] = B64_1(a[0x0]); \ + t2[0x0] = B64_2(a[0x0]); \ + t3[0x0] = B64_3(a[0x0]); \ + t4[0x0] = B64_4(a[0x0]); \ + t5[0x0] = B64_5(a[0x0]); \ + t6[0x0] = B64_6(a[0x0]); \ + t7[0x0] = B64_7(a[0x0]); \ + t0[0x1] = B64_0(a[0x1]) ^ PC64(0x10, r); \ + t1[0x1] = B64_1(a[0x1]); \ + t2[0x1] = B64_2(a[0x1]); \ + t3[0x1] = B64_3(a[0x1]); \ + t4[0x1] = B64_4(a[0x1]); \ + t5[0x1] = B64_5(a[0x1]); \ + t6[0x1] = B64_6(a[0x1]); \ + t7[0x1] = B64_7(a[0x1]); \ + t0[0x2] = B64_0(a[0x2]) ^ PC64(0x20, r); \ + t1[0x2] = B64_1(a[0x2]); \ + t2[0x2] = B64_2(a[0x2]); \ + t3[0x2] = B64_3(a[0x2]); \ + t4[0x2] = B64_4(a[0x2]); \ + t5[0x2] = B64_5(a[0x2]); \ + t6[0x2] = B64_6(a[0x2]); \ + t7[0x2] = B64_7(a[0x2]); \ + t0[0x3] = B64_0(a[0x3]) ^ PC64(0x30, r); \ + t1[0x3] = B64_1(a[0x3]); \ + t2[0x3] = B64_2(a[0x3]); \ + t3[0x3] = B64_3(a[0x3]); \ + t4[0x3] = B64_4(a[0x3]); \ + t5[0x3] = B64_5(a[0x3]); \ + t6[0x3] = B64_6(a[0x3]); \ + t7[0x3] = B64_7(a[0x3]); \ + t0[0x4] = B64_0(a[0x4]) ^ PC64(0x40, r); \ + t1[0x4] = B64_1(a[0x4]); \ + t2[0x4] = B64_2(a[0x4]); \ + t3[0x4] = B64_3(a[0x4]); \ + t4[0x4] = B64_4(a[0x4]); \ + t5[0x4] = B64_5(a[0x4]); \ + t6[0x4] = B64_6(a[0x4]); \ + t7[0x4] = B64_7(a[0x4]); \ + t0[0x5] = B64_0(a[0x5]) ^ PC64(0x50, r); \ + t1[0x5] = B64_1(a[0x5]); \ + t2[0x5] = B64_2(a[0x5]); \ + t3[0x5] = B64_3(a[0x5]); \ + t4[0x5] = B64_4(a[0x5]); \ + t5[0x5] = B64_5(a[0x5]); \ + t6[0x5] = B64_6(a[0x5]); \ + t7[0x5] = B64_7(a[0x5]); \ + t0[0x6] = B64_0(a[0x6]) ^ PC64(0x60, r); \ + t1[0x6] = B64_1(a[0x6]); \ + t2[0x6] = B64_2(a[0x6]); \ + t3[0x6] = B64_3(a[0x6]); \ + t4[0x6] = B64_4(a[0x6]); \ + t5[0x6] = B64_5(a[0x6]); \ + t6[0x6] = B64_6(a[0x6]); \ + t7[0x6] = B64_7(a[0x6]); \ + t0[0x7] = B64_0(a[0x7]) ^ PC64(0x70,
r); \ + t1[0x7] = B64_1(a[0x7]); \ + t2[0x7] = B64_2(a[0x7]); \ + t3[0x7] = B64_3(a[0x7]); \ + t4[0x7] = B64_4(a[0x7]); \ + t5[0x7] = B64_5(a[0x7]); \ + t6[0x7] = B64_6(a[0x7]); \ + t7[0x7] = B64_7(a[0x7]); \ + t0[0x8] = B64_0(a[0x8]) ^ PC64(0x80, r); \ + t1[0x8] = B64_1(a[0x8]); \ + t2[0x8] = B64_2(a[0x8]); \ + t3[0x8] = B64_3(a[0x8]); \ + t4[0x8] = B64_4(a[0x8]); \ + t5[0x8] = B64_5(a[0x8]); \ + t6[0x8] = B64_6(a[0x8]); \ + t7[0x8] = B64_7(a[0x8]); \ + t0[0x9] = B64_0(a[0x9]) ^ PC64(0x90, r); \ + t1[0x9] = B64_1(a[0x9]); \ + t2[0x9] = B64_2(a[0x9]); \ + t3[0x9] = B64_3(a[0x9]); \ + t4[0x9] = B64_4(a[0x9]); \ + t5[0x9] = B64_5(a[0x9]); \ + t6[0x9] = B64_6(a[0x9]); \ + t7[0x9] = B64_7(a[0x9]); \ + t0[0xA] = B64_0(a[0xA]) ^ PC64(0xA0, r); \ + t1[0xA] = B64_1(a[0xA]); \ + t2[0xA] = B64_2(a[0xA]); \ + t3[0xA] = B64_3(a[0xA]); \ + t4[0xA] = B64_4(a[0xA]); \ + t5[0xA] = B64_5(a[0xA]); \ + t6[0xA] = B64_6(a[0xA]); \ + t7[0xA] = B64_7(a[0xA]); \ + t0[0xB] = B64_0(a[0xB]) ^ PC64(0xB0, r); \ + t1[0xB] = B64_1(a[0xB]); \ + t2[0xB] = B64_2(a[0xB]); \ + t3[0xB] = B64_3(a[0xB]); \ + t4[0xB] = B64_4(a[0xB]); \ + t5[0xB] = B64_5(a[0xB]); \ + t6[0xB] = B64_6(a[0xB]); \ + t7[0xB] = B64_7(a[0xB]); \ + t0[0xC] = B64_0(a[0xC]) ^ PC64(0xC0, r); \ + t1[0xC] = B64_1(a[0xC]); \ + t2[0xC] = B64_2(a[0xC]); \ + t3[0xC] = B64_3(a[0xC]); \ + t4[0xC] = B64_4(a[0xC]); \ + t5[0xC] = B64_5(a[0xC]); \ + t6[0xC] = B64_6(a[0xC]); \ + t7[0xC] = B64_7(a[0xC]); \ + t0[0xD] = B64_0(a[0xD]) ^ PC64(0xD0, r); \ + t1[0xD] = B64_1(a[0xD]); \ + t2[0xD] = B64_2(a[0xD]); \ + t3[0xD] = B64_3(a[0xD]); \ + t4[0xD] = B64_4(a[0xD]); \ + t5[0xD] = B64_5(a[0xD]); \ + t6[0xD] = B64_6(a[0xD]); \ + t7[0xD] = B64_7(a[0xD]); \ + t0[0xE] = B64_0(a[0xE]) ^ PC64(0xE0, r); \ + t1[0xE] = B64_1(a[0xE]); \ + t2[0xE] = B64_2(a[0xE]); \ + t3[0xE] = B64_3(a[0xE]); \ + t4[0xE] = B64_4(a[0xE]); \ + t5[0xE] = B64_5(a[0xE]); \ + t6[0xE] = B64_6(a[0xE]); \ + t7[0xE] = B64_7(a[0xE]); \ + t0[0xF] = B64_0(a[0xF]) ^ PC64(0xF0, r); \ + t1[0xF]
= B64_1(a[0xF]); \ + t2[0xF] = B64_2(a[0xF]); \ + t3[0xF] = B64_3(a[0xF]); \ + t4[0xF] = B64_4(a[0xF]); \ + t5[0xF] = B64_5(a[0xF]); \ + t6[0xF] = B64_6(a[0xF]); \ + t7[0xF] = B64_7(a[0xF]); \ + RBTT(a[0x0], 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ + RBTT(a[0x1], 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ + RBTT(a[0x2], 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ + RBTT(a[0x3], 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ + RBTT(a[0x4], 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ + RBTT(a[0x5], 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ + RBTT(a[0x6], 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \ + RBTT(a[0x7], 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \ + RBTT(a[0x8], 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \ + RBTT(a[0x9], 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \ + RBTT(a[0xA], 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \ + RBTT(a[0xB], 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \ + RBTT(a[0xC], 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \ + RBTT(a[0xD], 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \ + RBTT(a[0xE], 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \ + RBTT(a[0xF], 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \ + } while (0) + /* ROUND_BIG_Q(a, r) performs one round over the 16-word state a. It first XORs QC64(i << 4, r) into every a[i], then splits each a[i] into t0[i]..t7[i] with B64_0..B64_7, and finally rebuilds each a[i] through RBTT from columns i+1, i+3, i+5, i+11, i+0, i+2, i+4 and i+6 (mod 16). The arrays t0..t7 and the tables T0..T7 are not declared here; they must be visible where the macro is expanded. */ +#define ROUND_BIG_Q(a, r) do { \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + t0[0x0] = B64_0(a[0x0]); \ + t1[0x0] = B64_1(a[0x0]); \ + t2[0x0] = B64_2(a[0x0]); \ + t3[0x0] = B64_3(a[0x0]); \ + t4[0x0] = B64_4(a[0x0]); \ + t5[0x0] = B64_5(a[0x0]); \ + t6[0x0] = B64_6(a[0x0]); \ + t7[0x0] = B64_7(a[0x0]); \ + t0[0x1] = B64_0(a[0x1]); \ + t1[0x1] = B64_1(a[0x1]); \ + t2[0x1] = B64_2(a[0x1]); \ + t3[0x1] = B64_3(a[0x1]); \ + t4[0x1] =
B64_4(a[0x1]); \ + t5[0x1] = B64_5(a[0x1]); \ + t6[0x1] = B64_6(a[0x1]); \ + t7[0x1] = B64_7(a[0x1]); \ + t0[0x2] = B64_0(a[0x2]); \ + t1[0x2] = B64_1(a[0x2]); \ + t2[0x2] = B64_2(a[0x2]); \ + t3[0x2] = B64_3(a[0x2]); \ + t4[0x2] = B64_4(a[0x2]); \ + t5[0x2] = B64_5(a[0x2]); \ + t6[0x2] = B64_6(a[0x2]); \ + t7[0x2] = B64_7(a[0x2]); \ + t0[0x3] = B64_0(a[0x3]); \ + t1[0x3] = B64_1(a[0x3]); \ + t2[0x3] = B64_2(a[0x3]); \ + t3[0x3] = B64_3(a[0x3]); \ + t4[0x3] = B64_4(a[0x3]); \ + t5[0x3] = B64_5(a[0x3]); \ + t6[0x3] = B64_6(a[0x3]); \ + t7[0x3] = B64_7(a[0x3]); \ + t0[0x4] = B64_0(a[0x4]); \ + t1[0x4] = B64_1(a[0x4]); \ + t2[0x4] = B64_2(a[0x4]); \ + t3[0x4] = B64_3(a[0x4]); \ + t4[0x4] = B64_4(a[0x4]); \ + t5[0x4] = B64_5(a[0x4]); \ + t6[0x4] = B64_6(a[0x4]); \ + t7[0x4] = B64_7(a[0x4]); \ + t0[0x5] = B64_0(a[0x5]); \ + t1[0x5] = B64_1(a[0x5]); \ + t2[0x5] = B64_2(a[0x5]); \ + t3[0x5] = B64_3(a[0x5]); \ + t4[0x5] = B64_4(a[0x5]); \ + t5[0x5] = B64_5(a[0x5]); \ + t6[0x5] = B64_6(a[0x5]); \ + t7[0x5] = B64_7(a[0x5]); \ + t0[0x6] = B64_0(a[0x6]); \ + t1[0x6] = B64_1(a[0x6]); \ + t2[0x6] = B64_2(a[0x6]); \ + t3[0x6] = B64_3(a[0x6]); \ + t4[0x6] = B64_4(a[0x6]); \ + t5[0x6] = B64_5(a[0x6]); \ + t6[0x6] = B64_6(a[0x6]); \ + t7[0x6] = B64_7(a[0x6]); \ + t0[0x7] = B64_0(a[0x7]); \ + t1[0x7] = B64_1(a[0x7]); \ + t2[0x7] = B64_2(a[0x7]); \ + t3[0x7] = B64_3(a[0x7]); \ + t4[0x7] = B64_4(a[0x7]); \ + t5[0x7] = B64_5(a[0x7]); \ + t6[0x7] = B64_6(a[0x7]); \ + t7[0x7] = B64_7(a[0x7]); \ + t0[0x8] = B64_0(a[0x8]); \ + t1[0x8] = B64_1(a[0x8]); \ + t2[0x8] = B64_2(a[0x8]); \ + t3[0x8] = B64_3(a[0x8]); \ + t4[0x8] = B64_4(a[0x8]); \ + t5[0x8] = B64_5(a[0x8]); \ + t6[0x8] = B64_6(a[0x8]); \ + t7[0x8] = B64_7(a[0x8]); \ + t0[0x9] = B64_0(a[0x9]); \ + t1[0x9] = B64_1(a[0x9]); \ + t2[0x9] = B64_2(a[0x9]); \ + t3[0x9] = B64_3(a[0x9]); \ + t4[0x9] = B64_4(a[0x9]); \ + t5[0x9] = B64_5(a[0x9]); \ + t6[0x9] = B64_6(a[0x9]); \ + t7[0x9] = B64_7(a[0x9]); \ + t0[0xA] = B64_0(a[0xA]); \ + t1[0xA]
= B64_1(a[0xA]); \ + t2[0xA] = B64_2(a[0xA]); \ + t3[0xA] = B64_3(a[0xA]); \ + t4[0xA] = B64_4(a[0xA]); \ + t5[0xA] = B64_5(a[0xA]); \ + t6[0xA] = B64_6(a[0xA]); \ + t7[0xA] = B64_7(a[0xA]); \ + t0[0xB] = B64_0(a[0xB]); \ + t1[0xB] = B64_1(a[0xB]); \ + t2[0xB] = B64_2(a[0xB]); \ + t3[0xB] = B64_3(a[0xB]); \ + t4[0xB] = B64_4(a[0xB]); \ + t5[0xB] = B64_5(a[0xB]); \ + t6[0xB] = B64_6(a[0xB]); \ + t7[0xB] = B64_7(a[0xB]); \ + t0[0xC] = B64_0(a[0xC]); \ + t1[0xC] = B64_1(a[0xC]); \ + t2[0xC] = B64_2(a[0xC]); \ + t3[0xC] = B64_3(a[0xC]); \ + t4[0xC] = B64_4(a[0xC]); \ + t5[0xC] = B64_5(a[0xC]); \ + t6[0xC] = B64_6(a[0xC]); \ + t7[0xC] = B64_7(a[0xC]); \ + t0[0xD] = B64_0(a[0xD]); \ + t1[0xD] = B64_1(a[0xD]); \ + t2[0xD] = B64_2(a[0xD]); \ + t3[0xD] = B64_3(a[0xD]); \ + t4[0xD] = B64_4(a[0xD]); \ + t5[0xD] = B64_5(a[0xD]); \ + t6[0xD] = B64_6(a[0xD]); \ + t7[0xD] = B64_7(a[0xD]); \ + t0[0xE] = B64_0(a[0xE]); \ + t1[0xE] = B64_1(a[0xE]); \ + t2[0xE] = B64_2(a[0xE]); \ + t3[0xE] = B64_3(a[0xE]); \ + t4[0xE] = B64_4(a[0xE]); \ + t5[0xE] = B64_5(a[0xE]); \ + t6[0xE] = B64_6(a[0xE]); \ + t7[0xE] = B64_7(a[0xE]); \ + t0[0xF] = B64_0(a[0xF]); \ + t1[0xF] = B64_1(a[0xF]); \ + t2[0xF] = B64_2(a[0xF]); \ + t3[0xF] = B64_3(a[0xF]); \ + t4[0xF] = B64_4(a[0xF]); \ + t5[0xF] = B64_5(a[0xF]); \ + t6[0xF] = B64_6(a[0xF]); \ + t7[0xF] = B64_7(a[0xF]); \ + RBTT(a[0x0], 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \ + RBTT(a[0x1], 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \ + RBTT(a[0x2], 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \ + RBTT(a[0x3], 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \ + RBTT(a[0x4], 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \ + RBTT(a[0x5], 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \ + RBTT(a[0x6], 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \ + RBTT(a[0x7], 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \ + RBTT(a[0x8], 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \ + RBTT(a[0x9], 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \ + RBTT(a[0xA], 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \ +
RBTT(a[0xB], 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \ + RBTT(a[0xC], 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \ + RBTT(a[0xD], 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \ + RBTT(a[0xE], 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \ + RBTT(a[0xF], 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \ + } while (0) + /* PERM_BIG_P(a, start, end) applies ROUND_BIG_P(a, u) for u = start up to end - 1, so a caller can run only part of the round sequence. The loop counter u is not declared by the macro; it must already exist where the macro is expanded. */ +#define PERM_BIG_P(a, start, end) do { \ + for (u = start; u < end; u++) { \ + ROUND_BIG_P(a, u); \ + } \ + } while (0) + /* PERM_BIG_Q(a) applies ROUND_BIG_Q(a, u) for u = 0 up to 13 (14 rounds). The loop counter u is not declared by the macro; it must already exist where the macro is expanded. */ +#define PERM_BIG_Q(a) do { \ + /* for (ulong u = 0; u < (14UL << 56); u += (1UL << 56)) { */ \ + for (u = 0; u < 14; u++) { \ + ROUND_BIG_Q(a, u); \ + } \ + } while (0) -#define F0(y, x, z) bitselect(z, y, z ^ x) -#define F1(x, y, z) bitselect(z, y, x) - -#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) -#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) -#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) -#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) -#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) -#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) -#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) -#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) -#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) -#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) -#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) -#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) -#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) -#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) -#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) -#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) - -#define RD14 (S1(W12) + W7 + S0(W15) + W14) -#define RD15 (S1(W13) + W8 + S0(W0) + W15) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) -{ - uint gid = get_global_id(0); - union { - unsigned char h1[64]; - uint h4[16]; - ulong h8[8]; - } hash; - - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - int init = get_local_id(0); - int step = get_local_size(0); - for (int i = init; i < 256; i
+= step) - { - T0_L[i] = T0[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T4_L[i] = T4[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; - } +__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) { + __local ulong T2[256], T3[256], T4[256], T5[256], T6[256], T7[256]; + uint u; + + // for (u = get_local_id(0); u < 256; u += get_local_size(0)) { + u = get_local_id(0); + /* + T1[u] = T1_G[u]; + T2[u] = T2_G[u]; + T3[u] = T3_G[u]; + T4[u] = T4_G[u]; + T5[u] = T5_G[u]; + T6[u] = T6_G[u]; + T7[u] = T7_G[u]; + */ + // create other tables based on T0: avoids keeping them in the kernel. +// T1[u] = ROTL64(T0[u], 8UL); + T2[u] = ROTL64(T0[u], 16UL); + T3[u] = ROTL64(T0[u], 24UL); + T4[u] = ROTL64(T0[u], 32UL); + T5[u] = ROTL64(T0[u], 40UL); + T6[u] = ROTL64(T0[u], 48UL); + T7[u] = ROTL64(T0[u], 56UL); barrier(CLK_LOCAL_MEM_FENCE); -#define T0 T0_L -#define T1 T1_L -#define T2 T2_L -#define T3 T3_L -#define T4 T4_L -#define T5 T5_L -#define T6 T6_L -#define T7 T7_L - - // groestl - { - sph_u64 H[16]; - for (unsigned int u = 0; u < 15; u ++) - H[u] = 0; - #if USE_LE - H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); - #else - H[15] = (sph_u64)512; - #endif - - sph_u64 g[16], m[16]; - m[0] = DEC64E(block + 0 * 8); - m[1] = DEC64E(block + 1 * 8); - m[2] = DEC64E(block + 2 * 8); - m[3] = DEC64E(block + 3 * 8); - m[4] = DEC64E(block + 4 * 8); - m[5] = DEC64E(block + 5 * 8); - m[6] = DEC64E(block + 6 * 8); - m[7] = DEC64E(block + 7 * 8); - m[8] = DEC64E(block + 8 * 8); - m[9] = DEC64E(block + 9 * 8); - m[9] &= 0x00000000FFFFFFFF; - m[9] |= ((sph_u64) gid << 32); - m[10] = 0x80; - m[11] = 0; - m[12] = 0; - m[13] = 0; - m[14] = 0; - m[15] = 0x100000000000000; - for (unsigned int u = 0; u < 16; u ++) - g[u] = m[u] ^ H[u]; - PERM_BIG_P(g); - PERM_BIG_Q(m); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= g[u] ^ m[u]; - sph_u64 xH[16]; - for (unsigned int u = 0; u < 16; u ++) - xH[u] 
= H[u]; - PERM_BIG_P(xH); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= xH[u]; - for (unsigned int u = 0; u < 8; u ++) - hash.h8[u] = ENC64E(H[u + 8]); - - for (unsigned int u = 0; u < 15; u ++) - H[u] = 0; - #if USE_LE - H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); - #else - H[15] = (sph_u64)512; - #endif - - m[0] = hash.h8[0]; - m[1] = hash.h8[1]; - m[2] = hash.h8[2]; - m[3] = hash.h8[3]; - m[4] = hash.h8[4]; - m[5] = hash.h8[5]; - m[6] = hash.h8[6]; - m[7] = hash.h8[7]; - for (unsigned int u = 0; u < 16; u ++) - g[u] = m[u] ^ H[u]; - m[8] = 0x80; g[8] = m[8] ^ H[8]; - m[9] = 0; g[9] = m[9] ^ H[9]; - m[10] = 0; g[10] = m[10] ^ H[10]; - m[11] = 0; g[11] = m[11] ^ H[11]; - m[12] = 0; g[12] = m[12] ^ H[12]; - m[13] = 0; g[13] = m[13] ^ H[13]; - m[14] = 0; g[14] = m[14] ^ H[14]; - m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; - PERM_BIG_P(g); - PERM_BIG_Q(m); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= g[u] ^ m[u]; - for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u]; - PERM_BIG_P(xH); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= xH[u]; - for (unsigned int u = 0; u < 8; u ++) - hash.h8[u] = H[u + 8]; - } - - bool result = (hash.h8[3] <= target); - if (result) - output[output[0xFF]++] = SWAP4(gid); + ulong g[16], m[16], t0[16], t1[16], t2[16], t3[16], t4[16], t5[16], t6[16], t7[16]; + uint flag = 0, gid = get_global_id(0), r = 13; + + m[0] = DEC64E(block + 0 * 8); + m[1] = DEC64E(block + 1 * 8); + m[2] = DEC64E(block + 2 * 8); + m[3] = DEC64E(block + 3 * 8); + m[4] = DEC64E(block + 4 * 8); + m[5] = DEC64E(block + 5 * 8); + m[6] = DEC64E(block + 6 * 8); + m[7] = DEC64E(block + 7 * 8); + m[8] = DEC64E(block + 8 * 8); + m[9] = DEC64E(block + 9 * 8); + m[9] &= 0x00000000FFFFFFFF; + m[9] |= ((ulong) gid << 32); + m[10] = 0x80; + +perm: + m[11] = 0; + m[12] = 0; + m[13] = 0; + m[14] = 0; + m[15] = M15; + +#pragma unroll + for (u = 0; u < 15; u++) g[u] = m[u]; + g[15] = M15 ^ H15; + + + g[0x0] ^= PC64(0x00, 0); + g[0x1] ^= 
PC64(0x10, 0); + g[0x2] ^= PC64(0x20, 0); + g[0x3] ^= PC64(0x30, 0); + g[0x4] ^= PC64(0x40, 0); + g[0x5] ^= PC64(0x50, 0); + g[0x6] ^= PC64(0x60, 0); + g[0x7] ^= PC64(0x70, 0); + g[0x8] ^= PC64(0x80, 0); + g[0x9] ^= PC64(0x90, 0); + g[0xA] ^= PC64(0xA0, 0); + g[0xB] = PC64(0xB0, 0); + g[0xC] = PC64(0xC0, 0); + g[0xD] = PC64(0xD0, 0); + g[0xE] = PC64(0xE0, 0); + g[0xF] ^= PC64(0xF0, 0); + t0[0x0] = B64_0(g[0x0]); + t1[0x0] = B64_1(g[0x0]); + t2[0x0] = B64_2(g[0x0]); + t3[0x0] = B64_3(g[0x0]); + t4[0x0] = B64_4(g[0x0]); + t5[0x0] = B64_5(g[0x0]); + t6[0x0] = B64_6(g[0x0]); + t7[0x0] = B64_7(g[0x0]); + t0[0x1] = B64_0(g[0x1]); + t1[0x1] = B64_1(g[0x1]); + t2[0x1] = B64_2(g[0x1]); + t3[0x1] = B64_3(g[0x1]); + t4[0x1] = B64_4(g[0x1]); + t5[0x1] = B64_5(g[0x1]); + t6[0x1] = B64_6(g[0x1]); + t7[0x1] = B64_7(g[0x1]); + t0[0x2] = B64_0(g[0x2]); + t1[0x2] = B64_1(g[0x2]); + t2[0x2] = B64_2(g[0x2]); + t3[0x2] = B64_3(g[0x2]); + t4[0x2] = B64_4(g[0x2]); + t5[0x2] = B64_5(g[0x2]); + t6[0x2] = B64_6(g[0x2]); + t7[0x2] = B64_7(g[0x2]); + t0[0x3] = B64_0(g[0x3]); + t1[0x3] = B64_1(g[0x3]); + t2[0x3] = B64_2(g[0x3]); + t3[0x3] = B64_3(g[0x3]); + t4[0x3] = B64_4(g[0x3]); + t5[0x3] = B64_5(g[0x3]); + t6[0x3] = B64_6(g[0x3]); + t7[0x3] = B64_7(g[0x3]); + t0[0x4] = B64_0(g[0x4]); + t1[0x4] = B64_1(g[0x4]); + t2[0x4] = B64_2(g[0x4]); + t3[0x4] = B64_3(g[0x4]); + t4[0x4] = B64_4(g[0x4]); + t5[0x4] = B64_5(g[0x4]); + t6[0x4] = B64_6(g[0x4]); + t7[0x4] = B64_7(g[0x4]); + t0[0x5] = B64_0(g[0x5]); + t1[0x5] = B64_1(g[0x5]); + t2[0x5] = B64_2(g[0x5]); + t3[0x5] = B64_3(g[0x5]); + t4[0x5] = B64_4(g[0x5]); + t5[0x5] = B64_5(g[0x5]); + t6[0x5] = B64_6(g[0x5]); + t7[0x5] = B64_7(g[0x5]); + t0[0x6] = B64_0(g[0x6]); + t1[0x6] = B64_1(g[0x6]); + t2[0x6] = B64_2(g[0x6]); + t3[0x6] = B64_3(g[0x6]); + t4[0x6] = B64_4(g[0x6]); + t5[0x6] = B64_5(g[0x6]); + t6[0x6] = B64_6(g[0x6]); + t7[0x6] = B64_7(g[0x6]); + t0[0x7] = B64_0(g[0x7]); + t1[0x7] = B64_1(g[0x7]); + t2[0x7] = B64_2(g[0x7]); + t3[0x7] = 
B64_3(g[0x7]); + t4[0x7] = B64_4(g[0x7]); + t5[0x7] = B64_5(g[0x7]); + t6[0x7] = B64_6(g[0x7]); + t7[0x7] = B64_7(g[0x7]); + t0[0x8] = B64_0(g[0x8]); + t1[0x8] = B64_1(g[0x8]); + t2[0x8] = B64_2(g[0x8]); + t3[0x8] = B64_3(g[0x8]); + t4[0x8] = B64_4(g[0x8]); + t5[0x8] = B64_5(g[0x8]); + t6[0x8] = B64_6(g[0x8]); + t7[0x8] = B64_7(g[0x8]); + t0[0x9] = B64_0(g[0x9]); + t1[0x9] = B64_1(g[0x9]); + t2[0x9] = B64_2(g[0x9]); + t3[0x9] = B64_3(g[0x9]); + t4[0x9] = B64_4(g[0x9]); + t5[0x9] = B64_5(g[0x9]); + t6[0x9] = B64_6(g[0x9]); + t7[0x9] = B64_7(g[0x9]); + t0[0xA] = B64_0(g[0xA]); + t1[0xA] = B64_1(g[0xA]); + t2[0xA] = B64_2(g[0xA]); + t3[0xA] = B64_3(g[0xA]); + t4[0xA] = B64_4(g[0xA]); + t5[0xA] = B64_5(g[0xA]); + t6[0xA] = B64_6(g[0xA]); + t7[0xA] = B64_7(g[0xA]); + t0[0xB] = B64_0(g[0xB]); + t1[0xB] = B64_1(g[0xB]); + t2[0xB] = B64_2(g[0xB]); + t3[0xB] = B64_3(g[0xB]); + t4[0xB] = B64_4(g[0xB]); + t5[0xB] = B64_5(g[0xB]); + t6[0xB] = B64_6(g[0xB]); + t7[0xB] = B64_7(g[0xB]); + t0[0xC] = B64_0(g[0xC]); + t1[0xC] = B64_1(g[0xC]); + t2[0xC] = B64_2(g[0xC]); + t3[0xC] = B64_3(g[0xC]); + t4[0xC] = B64_4(g[0xC]); + t5[0xC] = B64_5(g[0xC]); + t6[0xC] = B64_6(g[0xC]); + t7[0xC] = B64_7(g[0xC]); + t0[0xD] = B64_0(g[0xD]); + t1[0xD] = B64_1(g[0xD]); + t2[0xD] = B64_2(g[0xD]); + t3[0xD] = B64_3(g[0xD]); + t4[0xD] = B64_4(g[0xD]); + t5[0xD] = B64_5(g[0xD]); + t6[0xD] = B64_6(g[0xD]); + t7[0xD] = B64_7(g[0xD]); + t0[0xE] = B64_0(g[0xE]); + t1[0xE] = B64_1(g[0xE]); + t2[0xE] = B64_2(g[0xE]); + t3[0xE] = B64_3(g[0xE]); + t4[0xE] = B64_4(g[0xE]); + t5[0xE] = B64_5(g[0xE]); + t6[0xE] = B64_6(g[0xE]); + t7[0xE] = B64_7(g[0xE]); + t0[0xF] = B64_0(g[0xF]); + t1[0xF] = B64_1(g[0xF]); + t2[0xF] = B64_2(g[0xF]); + t3[0xF] = B64_3(g[0xF]); + t4[0xF] = B64_4(g[0xF]); + t5[0xF] = B64_5(g[0xF]); + t6[0xF] = B64_6(g[0xF]); + t7[0xF] = B64_7(g[0xF]); + g[0x0] = T0[t0[0x0]] ^ T1[t1[0x1]] ^ T2[t2[0x2]] ^ T3[t3[0x3]] ^ T4[t4[0x4]] ^ T5[t5[0x5]] ^ T6[t6[0x6]] ^ C64e(0x32f4a5f497a5c6c6); + g[0x1] = 
T0[t0[0x1]] ^ T1[t1[0x2]] ^ T2[t2[0x3]] ^ T3[t3[0x4]] ^ T4[t4[0x5]] ^ T5[t5[0x6]] ^ T6[t6[0x7]] ^ C64e(0x32f4a5f497a5c6c6); + g[0x2] = T0[t0[0x2]] ^ T1[t1[0x3]] ^ T2[t2[0x4]] ^ T3[t3[0x5]] ^ T4[t4[0x6]] ^ T5[t5[0x7]] ^ T6[t6[0x8]] ^ C64e(0x32f4a5f497a5c6c6); + g[0x3] = T0[t0[0x3]] ^ T1[t1[0x4]] ^ T2[t2[0x5]] ^ T3[t3[0x6]] ^ T4[t4[0x7]] ^ T5[t5[0x8]] ^ T6[t6[0x9]] ^ C64e(0x32f4a5f497a5c6c6); + g[0x4] = T0[t0[0x4]] ^ T1[t1[0x5]] ^ T2[t2[0x6]] ^ T3[t3[0x7]] ^ T4[t4[0x8]] ^ T5[t5[0x9]] ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0xF]]; + g[0x5] = T0[t0[0x5]] ^ T1[t1[0x6]] ^ T2[t2[0x7]] ^ T3[t3[0x8]] ^ T4[t4[0x9]] ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x0]]; + g[0x6] = T0[t0[0x6]] ^ T1[t1[0x7]] ^ T2[t2[0x8]] ^ T3[t3[0x9]] ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x1]]; + g[0x7] = T0[t0[0x7]] ^ T1[t1[0x8]] ^ T2[t2[0x9]] ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x2]]; + g[0x8] = T0[t0[0x8]] ^ T1[t1[0x9]] ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ C64e(0xf4a5f497a5c6c632) ^ T7[t7[0x3]]; + g[0x9] = T0[t0[0x9]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ C64e(0xa5f497a5c6c632f4) ^ T6[t6[0xF]] ^ T7[t7[0x4]]; + g[0xA] = T0[t0[0xA]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ C64e(0xf497a5c6c632f4a5) ^ T5[t5[0xF]] ^ T6[t6[0x0]] ^ T7[t7[0x5]]; + g[0xB] = T0[t0[0xB]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ C64e(0x97a5c6c632f4a5f4) ^ T4[t4[0xF]] ^ T5[t5[0x0]] ^ T6[t6[0x1]] ^ T7[t7[0x6]]; + g[0xC] = T0[t0[0xC]] ^ C64e(0xc6c632f4a5f497a5) ^ C64e(0xa5c6c632f4a5f497) ^ T3[t3[0xF]] ^ T4[t4[0x0]] ^ T5[t5[0x1]] ^ T6[t6[0x2]] ^ T7[t7[0x7]]; + g[0xD] = T0[t0[0xD]] ^ C64e(0xc6c632f4a5f497a5) ^ T2[t2[0xF]] ^ T3[t3[0x0]] ^ T4[t4[0x1]] ^ T5[t5[0x2]] ^ T6[t6[0x3]] ^ 
T7[t7[0x8]]; + g[0xE] = T0[t0[0xE]] ^ T1[t1[0xF]] ^ T2[t2[0x0]] ^ T3[t3[0x1]] ^ T4[t4[0x2]] ^ T5[t5[0x3]] ^ T6[t6[0x4]] ^ T7[t7[0x9]]; + g[0xF] = T0[t0[0xF]] ^ T1[t1[0x0]] ^ T2[t2[0x1]] ^ T3[t3[0x2]] ^ T4[t4[0x3]] ^ T5[t5[0x4]] ^ T6[t6[0x5]] ^ T7[t7[0xA]]; + + + PERM_BIG_P(g, 1, 14); + PERM_BIG_Q(m); + +#pragma unroll + for (u = 0; u < 16; u++) g[u] ^= m[u]; +#pragma unroll + for (u = 0; u < 8; u++) m[u] = g[u + 8]; + g[15] ^= H15; + PERM_BIG_P(g, 0, r); + +round: +// move the ^= to the relevant first byte down here? tried that, was slower?!?!? + g[0x0] ^= PC64(0x00, r); + g[0x1] ^= PC64(0x10, r); + g[0x6] ^= PC64(0x60, r); + g[0xB] ^= PC64(0xB0, r); + g[0xC] ^= PC64(0xC0, r); + g[0xD] ^= PC64(0xD0, r); + g[0xE] ^= PC64(0xE0, r); + g[0xF] ^= PC64(0xF0, r); + t0[0x0] = B64_0(g[0x0]); + t1[0x0] = B64_1(g[0x0]); + t2[0x0] = B64_2(g[0x0]); + t3[0x0] = B64_3(g[0x0]); + t4[0x0] = B64_4(g[0x0]); + t5[0x0] = B64_5(g[0x0]); + t6[0x0] = B64_6(g[0x0]); + t7[0x0] = B64_7(g[0x0]); + t0[0x1] = B64_0(g[0x1]); + t1[0x1] = B64_1(g[0x1]); + t2[0x1] = B64_2(g[0x1]); + t3[0x1] = B64_3(g[0x1]); + t4[0x1] = B64_4(g[0x1]); + t5[0x1] = B64_5(g[0x1]); + t6[0x1] = B64_6(g[0x1]); + t7[0x1] = B64_7(g[0x1]); + t0[0x6] = B64_0(g[0x6]); + t1[0x6] = B64_1(g[0x6]); + t2[0x6] = B64_2(g[0x6]); + t3[0x6] = B64_3(g[0x6]); + t4[0x6] = B64_4(g[0x6]); + t5[0x6] = B64_5(g[0x6]); + t6[0x6] = B64_6(g[0x6]); + t7[0x6] = B64_7(g[0x6]); + t0[0xB] = B64_0(g[0xB]); + t1[0xB] = B64_1(g[0xB]); + t2[0xB] = B64_2(g[0xB]); + t3[0xB] = B64_3(g[0xB]); + t4[0xB] = B64_4(g[0xB]); + t5[0xB] = B64_5(g[0xB]); + t6[0xB] = B64_6(g[0xB]); + t7[0xB] = B64_7(g[0xB]); + t0[0xC] = B64_0(g[0xC]); + t1[0xC] = B64_1(g[0xC]); + t2[0xC] = B64_2(g[0xC]); + t3[0xC] = B64_3(g[0xC]); + t4[0xC] = B64_4(g[0xC]); + t5[0xC] = B64_5(g[0xC]); + t6[0xC] = B64_6(g[0xC]); + t7[0xC] = B64_7(g[0xC]); + t0[0xD] = B64_0(g[0xD]); + t1[0xD] = B64_1(g[0xD]); + t2[0xD] = B64_2(g[0xD]); + t3[0xD] = B64_3(g[0xD]); + t4[0xD] = B64_4(g[0xD]); + t5[0xD] = 
B64_5(g[0xD]); + t6[0xD] = B64_6(g[0xD]); + t7[0xD] = B64_7(g[0xD]); + t0[0xE] = B64_0(g[0xE]); + t1[0xE] = B64_1(g[0xE]); + t2[0xE] = B64_2(g[0xE]); + t3[0xE] = B64_3(g[0xE]); + t4[0xE] = B64_4(g[0xE]); + t5[0xE] = B64_5(g[0xE]); + t6[0xE] = B64_6(g[0xE]); + t7[0xE] = B64_7(g[0xE]); + t0[0xF] = B64_0(g[0xF]); + t1[0xF] = B64_1(g[0xF]); + t2[0xF] = B64_2(g[0xF]); + t3[0xF] = B64_3(g[0xF]); + t4[0xF] = B64_4(g[0xF]); + t5[0xF] = B64_5(g[0xF]); + t6[0xF] = B64_6(g[0xF]); + t7[0xF] = B64_7(g[0xF]); + + if (flag < 2) { + g[0x2] ^= PC64(0x20, r); + g[0x3] ^= PC64(0x30, r); + g[0x4] ^= PC64(0x40, r); + g[0x5] ^= PC64(0x50, r); + g[0x7] ^= PC64(0x70, r); + g[0x8] ^= PC64(0x80, r); + g[0x9] ^= PC64(0x90, r); + g[0xA] ^= PC64(0xA0, r); + t0[0x2] = B64_0(g[0x2]); + t1[0x2] = B64_1(g[0x2]); + t2[0x2] = B64_2(g[0x2]); + t3[0x2] = B64_3(g[0x2]); + t4[0x2] = B64_4(g[0x2]); + t5[0x2] = B64_5(g[0x2]); + t6[0x2] = B64_6(g[0x2]); + t7[0x2] = B64_7(g[0x2]); + t0[0x3] = B64_0(g[0x3]); + t1[0x3] = B64_1(g[0x3]); + t2[0x3] = B64_2(g[0x3]); + t3[0x3] = B64_3(g[0x3]); + t4[0x3] = B64_4(g[0x3]); + t5[0x3] = B64_5(g[0x3]); + t6[0x3] = B64_6(g[0x3]); + t7[0x3] = B64_7(g[0x3]); + t0[0x4] = B64_0(g[0x4]); + t1[0x4] = B64_1(g[0x4]); + t2[0x4] = B64_2(g[0x4]); + t3[0x4] = B64_3(g[0x4]); + t4[0x4] = B64_4(g[0x4]); + t5[0x4] = B64_5(g[0x4]); + t6[0x4] = B64_6(g[0x4]); + t7[0x4] = B64_7(g[0x4]); + t0[0x5] = B64_0(g[0x5]); + t1[0x5] = B64_1(g[0x5]); + t2[0x5] = B64_2(g[0x5]); + t3[0x5] = B64_3(g[0x5]); + t4[0x5] = B64_4(g[0x5]); + t5[0x5] = B64_5(g[0x5]); + t6[0x5] = B64_6(g[0x5]); + t7[0x5] = B64_7(g[0x5]); + t0[0x7] = B64_0(g[0x7]); + t1[0x7] = B64_1(g[0x7]); + t2[0x7] = B64_2(g[0x7]); + t3[0x7] = B64_3(g[0x7]); + t4[0x7] = B64_4(g[0x7]); + t5[0x7] = B64_5(g[0x7]); + t6[0x7] = B64_6(g[0x7]); + t7[0x7] = B64_7(g[0x7]); + t0[0x8] = B64_0(g[0x8]); + t1[0x8] = B64_1(g[0x8]); + t2[0x8] = B64_2(g[0x8]); + t3[0x8] = B64_3(g[0x8]); + t4[0x8] = B64_4(g[0x8]); + t5[0x8] = B64_5(g[0x8]); + t6[0x8] = 
B64_6(g[0x8]); + t7[0x8] = B64_7(g[0x8]); + t0[0x9] = B64_0(g[0x9]); + t1[0x9] = B64_1(g[0x9]); + t2[0x9] = B64_2(g[0x9]); + t3[0x9] = B64_3(g[0x9]); + t4[0x9] = B64_4(g[0x9]); + t5[0x9] = B64_5(g[0x9]); + t6[0x9] = B64_6(g[0x9]); + t7[0x9] = B64_7(g[0x9]); + t0[0xA] = B64_0(g[0xA]); + t1[0xA] = B64_1(g[0xA]); + t2[0xA] = B64_2(g[0xA]); + t3[0xA] = B64_3(g[0xA]); + t4[0xA] = B64_4(g[0xA]); + t5[0xA] = B64_5(g[0xA]); + t6[0xA] = B64_6(g[0xA]); + t7[0xA] = B64_7(g[0xA]); + if (flag == 0) { + RBTT(g[0x8], 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); + RBTT(g[0x9], 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); + RBTT(g[0xA], 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); + } else { + RBTT(g[0x0], 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); + RBTT(g[0x1], 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); + RBTT(g[0x6], 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); + } + RBTT(g[0xC], 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); + RBTT(g[0xD], 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); + RBTT(g[0xE], 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); + RBTT(g[0xF], 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); + } + RBTT(g[0xB], 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); + + if (flag == 2) goto end; + if (flag++ == 1) { + r = 13; + goto round; + } + + r = 12; +#pragma unroll + for (u = 0; u < 8; u++) m[u] ^= g[u + 8]; + m[7] ^= H15; + m[8] = 0x80; + m[9] = 0; + m[10] = 0; + goto perm; + +end: + if ((g[3 + 8] ^ m[3]) <= target) output[output[0xFF]++] = as_uint(as_uchar4(gid).wzyx); } -#endif // GROESTLCOIN_CL +#endif // GROESTLCOIN_CL \ No newline at end of file diff --git a/kernel/inkcoin.cl b/kernel/inkcoin.cl index 8311453a..fa787edb 100644 --- a/kernel/inkcoin.cl +++ b/kernel/inkcoin.cl @@ -4,7 +4,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -12,10 
+12,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -29,8 +29,8 @@ * @author phm */ -#ifndef DARKCOIN_CL -#define DARKCOIN_CL +#ifndef INKCOIN_CL +#define INKCOIN_CL #if __ENDIAN_LITTLE__ #define SPH_LITTLE_ENDIAN 1 @@ -54,34 +54,15 @@ typedef long sph_s64; #define SPH_64_TRUE 1 #define SPH_C32(x) ((sph_u32)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_C64(x) ((sph_u64)(x ## UL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) -#define SPH_ECHO_64 1 -#define SPH_KECCAK_64 1 -#define SPH_JH_64 1 -#define SPH_SIMD_NOCOPY 0 -#define SPH_KECCAK_NOCOPY 0 -#define SPH_SMALL_FOOTPRINT_GROESTL 0 -#define SPH_GROESTL_BIG_ENDIAN 0 -#define SPH_CUBEHASH_UNROLL 0 - -#ifndef SPH_COMPACT_BLAKE_64 - #define SPH_COMPACT_BLAKE_64 0 -#endif -#ifndef SPH_LUFFA_PARALLEL - #define SPH_LUFFA_PARALLEL 0 -#endif -#ifndef SPH_KECCAK_UNROLL - #define SPH_KECCAK_UNROLL 0 -#endif - #include "shavite.cl" #define SWAP4(x) as_uint(as_uchar4(x).wzyx) @@ -97,6 +78,14 @@ typedef long sph_s64; #define DEC32LE(x) (*(const __global sph_u32 *) (x)); #endif 
+#define SHL(x, n) ((x) << (n)) +#define SHR(x, n) ((x) >> (n)) + +#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ + q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ + q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ + q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + // __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) { @@ -244,4 +233,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp output[output[0xFF]++] = SWAP4(gid); } -#endif // DARKCOIN_CL +#endif // INKCOIN_CL \ No newline at end of file diff --git a/kernel/marucoin-mod.cl b/kernel/marucoin-mod.cl index b0bf0dd3..0cd844ef 100644 --- a/kernel/marucoin-mod.cl +++ b/kernel/marucoin-mod.cl @@ -460,69 +460,92 @@ __kernel void search2(__global hash_t* hashes) uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; +#else + __local sph_u64 T0_C[256], T4_C[256]; +#endif int init = get_local_id(0); int step = get_local_size(0); for (int i = init; i < 256; i += step) { - T0_L[i] = T0[i]; - T4_L[i] = T4[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif } - - barrier(CLK_LOCAL_MEM_FENCE); - - #define T0 T0_L - #define T1 T1_L - #define T2 T2_L - #define T3 T3_L - #define T4 T4_L - #define T5 T5_L - 
#define T6 T6_L - #define T7 T7_L - - // groestl - sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C + + + sph_u64 H[16]; +//#pragma unroll 15 + for (unsigned int u = 0; u < 15; u ++) + H[u] = 0; +#if USE_LE + H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); +#else + H[15] = (sph_u64)512; +#endif sph_u64 g[16], m[16]; - g[0] = m[0] = DEC64E(hash->h8[0]); - g[1] = m[1] = DEC64E(hash->h8[1]); - g[2] = m[2] = DEC64E(hash->h8[2]); - g[3] = m[3] = DEC64E(hash->h8[3]); - g[4] = m[4] = DEC64E(hash->h8[4]); - g[5] = m[5] = DEC64E(hash->h8[5]); - g[6] = m[6] = DEC64E(hash->h8[6]); - g[7] = m[7] = DEC64E(hash->h8[7]); - g[8] = m[8] = 0x80; - g[9] = m[9] = 0; - g[10] = m[10] = 0; - g[11] = m[11] = 0; - g[12] = m[12] = 0; - g[13] = m[13] = 0; - g[14] = m[14] = 0; - g[15] = 0x102000000000000; - m[15] = 0x100000000000000; - + m[0] = DEC64E(hash->h8[0]); + m[1] = DEC64E(hash->h8[1]); + m[2] = DEC64E(hash->h8[2]); + m[3] = DEC64E(hash->h8[3]); + m[4] = DEC64E(hash->h8[4]); + m[5] = DEC64E(hash->h8[5]); + m[6] = DEC64E(hash->h8[6]); + m[7] = DEC64E(hash->h8[7]); + +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + g[u] = m[u] ^ H[u]; + m[8] = 0x80; g[8] = m[8] ^ H[8]; + m[9] = 0; g[9] = m[9] ^ H[9]; + m[10] = 0; g[10] = m[10] ^ H[10]; + m[11] = 0; g[11] = m[11] ^ H[11]; + m[12] = 0; g[12] = m[12] ^ H[12]; + m[13] = 0; g[13] = m[13] ^ H[13]; + m[14] = 0; g[14] = m[14] ^ H[14]; + m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; PERM_BIG_P(g); PERM_BIG_Q(m); - sph_u64 xH[16]; +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u] ^= g[u] ^ m[u]; + H[u] ^= g[u] ^ m[u]; + sph_u64 xH[16]; +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u]; PERM_BIG_P(xH); - for (unsigned int u = 8; u < 16; u ++) - 
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + H[u] ^= xH[u]; + +//#pragma unroll 8 + for (unsigned int u = 0; u < 8; u ++) + hash->h8[u] = DEC64E(H[u + 8]); + barrier(CLK_GLOBAL_MEM_FENCE); - barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -843,7 +866,7 @@ __kernel void search8(__global hash_t* hashes) sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; rk00 = hash->h4[0]; rk01 = hash->h4[1]; @@ -1266,4 +1289,4 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo barrier(CLK_GLOBAL_MEM_FENCE); } -#endif // X13MOD_CL +#endif // X13MOD_CL \ No newline at end of file diff --git a/kernel/marucoin.cl b/kernel/marucoin.cl index 07f63b48..ee81d803 100644 --- a/kernel/marucoin.cl +++ b/kernel/marucoin.cl @@ -4,7 +4,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -12,10 +12,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
@@ -862,4 +862,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp output[output[0xFF]++] = SWAP4(gid); } -#endif // MARUCOIN_CL +#endif // MARUCOIN_CL \ No newline at end of file diff --git a/kernel/myriadcoin-groestl.cl b/kernel/myriadcoin-groestl.cl index c5233f67..1067ccb1 100644 --- a/kernel/myriadcoin-groestl.cl +++ b/kernel/myriadcoin-groestl.cl @@ -4,7 +4,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -12,10 +12,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
@@ -54,23 +54,19 @@ typedef long sph_s64; #define SPH_64_TRUE 1 #define SPH_C32(x) ((sph_u32)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_C64(x) ((sph_u64)(x ## UL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ECHO_64 1 #define SPH_SIMD_NOCOPY 0 #define SPH_CUBEHASH_UNROLL 0 -#ifndef SPH_LUFFA_PARALLEL - #define SPH_LUFFA_PARALLEL 0 -#endif - #include "groestl.cl" #define SWAP4(x) as_uint(as_uchar4(x).wzyx) @@ -84,6 +80,14 @@ typedef long sph_s64; #define DEC64E(x) (*(const __global sph_u64 *) (x)); #endif +#define SHL(x, n) ((x) << (n)) +#define SHR(x, n) ((x) >> (n)) + +#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ + q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ + q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ + q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + #define ROL32(x, n) rotate(x, (uint) n) #define SHR(x, n) ((x) >> n) #define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) @@ -138,34 +142,41 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp ulong h8[8]; } hash; - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; +#else + __local sph_u64 T0_C[256], T4_C[256]; +#endif int init = get_local_id(0); int step = 
get_local_size(0); + for (int i = init; i < 256; i += step) { - T0_L[i] = T0[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T4_L[i] = T4[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif } - barrier(CLK_LOCAL_MEM_FENCE); - -#define T0 T0_L -#define T1 T1_L -#define T2 T2_L -#define T3 T3_L -#define T4 T4_L -#define T5 T5_L -#define T6 T6_L -#define T7 T7_L + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C - // groestl sph_u64 H[16]; +//#pragma unroll 15 for (unsigned int u = 0; u < 15; u ++) H[u] = 0; #if USE_LE @@ -193,20 +204,33 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp m[13] = 0; m[14] = 0; m[15] = 0x100000000000000; + +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) g[u] = m[u] ^ H[u]; + PERM_BIG_P(g); PERM_BIG_Q(m); + +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) H[u] ^= g[u] ^ m[u]; sph_u64 xH[16]; + +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) xH[u] = H[u]; PERM_BIG_P(xH); + +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) H[u] ^= xH[u]; + +//#pragma unroll 8 for (unsigned int u = 0; u < 8; u ++) hash.h8[u] = ENC64E(H[u + 8]); + barrier(CLK_GLOBAL_MEM_FENCE); + uint temp1; uint W0 = SWAP32(hash.h4[0x0]); uint W1 = SWAP32(hash.h4[0x1]); @@ -396,4 +420,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp output[output[0xFF]++] = SWAP4(gid); } -#endif // MYRIADCOIN_GROESTL_CL +#endif // MYRIADCOIN_GROESTL_CL \ No newline at end of file diff --git a/kernel/neoscrypt.cl b/kernel/neoscrypt.cl index 7939d7ed..1531a3dd 100644 --- a/kernel/neoscrypt.cl +++ b/kernel/neoscrypt.cl @@ -1,15 +1,14 @@ /* 
NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */ /* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */ -// Stupid AMD compiler ignores the unroll pragma in these two -#define SALSA_SMALL_UNROLL 3 -#define CHACHA_SMALL_UNROLL 3 +#define rotl(x,y) rotate(x,y) +#define Ch(x,y,z) bitselect(z,y,x) +#define Maj(x,y,z) Ch((x^z),y,z) +#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) +#define ROTL32(a,b) rotate(a,as_uint(b)) -// If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted -// as the unroll factor; must divide cleanly into ten. -// Usually a bad idea. -//#define SMALL_BLAKE2S -//#define BLAKE2S_UNROLL 5 +__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 }; +#define EndianSwap(n) (rotate(n & ES[0], 24U)|rotate(n & ES[1], 8U)) #define BLOCK_SIZE 64U #define FASTKDF_BUFFER_SIZE 256U @@ -17,62 +16,236 @@ #define PASSWORD_LEN 80U #endif -#if !defined(cl_khr_byte_addressable_store) -#error "Device does not support unaligned stores" +#ifdef TEST +__constant uchar testsalt[]= { +135, 99, 188, 101, 252, 81, 54, 91, 243, 212, 78, 99, 46, 1, 113, 232, 9, 208, 203, 88, 25, 93, 218, 215, 53, 112, 105, 136, 238, 114, 242, 24, 194, 144, 239, 172, 37, 158, 113, 15, 116, 114, 47, 53, 51, 167, 178, 107, 192, 90, 92, 37, 59, 116, 234, 107, 80, 251, 2, 251, 145, 185, 119, 89, 115, 112, 94, 154, 117, 126, 233, 100, 15, 24, 246, 137, 220, 124, 244, 244, 129, 246, 244, 180, 78, 247, 146, 229, 69, 177, 143, 94, 2, 144, 63, 33, 89, 136, 234, 174, 38, 37, 183, 62, 176, 243, 136, 30, 249, 195, 129, 227, 146, 216, 38, 118, 185, 43, 175, 217, 246, 203, 251, 211, 222, 237, 21, 231, 133, 218, 206, 9, 148, 229, 20, 229, 101, 146, 183, 120, 155, 91, 16, 10, 86, 198, 185, 179, 1, 197, 69, 95, 44, 133, 49, 225, 2, 115, 182, 6, 82, 166, 35, 3, 19, 59, 193, 253, 14, 239, 65, 79, 105, 154, 70, 146, 169, 233, 227, 20, 66, 15, 52, 223, 228, 202, 158, 207, 6, 245, 204, 212, 220, 108, 204, 39, 136, 66, 215, 186, 247, 184, 92, 171, 56, 166, 162, 105, 126, 162, 127, 175, 181, 227, 
236, 233, 127, 219, 115, 30, 136, 108, 169, 14, 172, 71, 82, 250, 141, 209, 98, 216, 221, 165, 132, 146, 98, 76, 194, 239, 123, 90, 91, 193, 58, 121, 235, 161, 51, 144, 5, 41, 216, 160, 93, 173 +}; #endif -// Swaps 128 bytes at a time without using temp vars -void SwapBytes128(void *restrict A, void *restrict B, uint len) -{ - #pragma unroll 2 - for(int i = 0; i < (len >> 7); ++i) - { - ((ulong16 *)A)[i] ^= ((ulong16 *)B)[i]; - ((ulong16 *)B)[i] ^= ((ulong16 *)A)[i]; - ((ulong16 *)A)[i] ^= ((ulong16 *)B)[i]; - } +/* When changing the optimal type, make sure the loops unrolled + in _blkcopy, _blkswp and _blkxor are modified accordingly. */ +#define OPTIMAL_TYPE uint + +/* Fast 32-bit / 64-bit memcpy(); + * len must be a multiple of 32 bytes */ +void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { + OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp; + OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp; + uint i; + +#ifdef WITH_UNROLL +#pragma unroll(1<< max(0, (32- sizeof(OPTIMAL_TYPE))>> 2)) + for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); ++i) + dst[i] = src[i]; +#else + for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) { + dst[i] = src[i]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 2]; + dst[i + 3] = src[i + 3]; + } +#endif +} +void neoscrypt_gl_blkcpy(__global void *dstp, const void *srcp, uint len) { + __global OPTIMAL_TYPE *dst = (__global OPTIMAL_TYPE *) dstp; + OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) { + dst[i] = src[i]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 2]; + dst[i + 3] = src[i + 3]; + } } -void CopyBytes128(void *restrict dst, const void *restrict src, uint len) -{ - #pragma unroll 2 - for(int i = 0; i < len; ++i) - ((ulong16 *)dst)[i] = ((ulong16 *)src)[i]; +/* Fast 32-bit / 64-bit block swapper; + * len must be a multiple of 32 bytes */ +void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { + OPTIMAL_TYPE *blkA = (OPTIMAL_TYPE *) blkAp; + OPTIMAL_TYPE *blkB = 
(OPTIMAL_TYPE *) blkBp; + OPTIMAL_TYPE t0, t1, t2, t3; + uint i; + + for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) { + t0 = blkA[i]; + t1 = blkA[i + 1]; + t2 = blkA[i + 2]; + t3 = blkA[i + 3]; + blkA[i] = blkB[i]; + blkA[i + 1] = blkB[i + 1]; + blkA[i + 2] = blkB[i + 2]; + blkA[i + 3] = blkB[i + 3]; + blkB[i] = t0; + blkB[i + 1] = t1; + blkB[i + 2] = t2; + blkB[i + 3] = t3; + } } -void CopyBytes(void *restrict dst, const void *restrict src, uint len) -{ - for(int i = 0; i < len; ++i) - ((uchar *)dst)[i] = ((uchar *)src)[i]; +/* Fast 32-bit / 64-bit block XOR engine; + * len must be a multiple of 32 bytes */ +void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { + OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp; + OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) { + dst[i] ^= src[i]; + dst[i + 1] ^= src[i + 1]; + dst[i + 2] ^= src[i + 2]; + dst[i + 3] ^= src[i + 3]; + } } -void XORBytesInPlace(void *restrict dst, const void *restrict src, uint len) -{ - for(int i = 0; i < len; ++i) - ((uchar *)dst)[i] ^= ((uchar *)src)[i]; +void neoscrypt_gl_blkxor(void *dstp, __global void *srcp, uint len) { + OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp; + __global OPTIMAL_TYPE *src = (__global OPTIMAL_TYPE *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) { + dst[i] ^= src[i]; + dst[i + 1] ^= src[i + 1]; + dst[i + 2] ^= src[i + 2]; + dst[i + 3] ^= src[i + 3]; + } } -void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len) -{ - #pragma unroll 1 - for(int i = 0; i < len; ++i) - ((uchar *)dst)[i] = ((uchar *)src1)[i] ^ ((uchar *)src2)[i]; +/* 32-bit / 64-bit / 128-bit optimised memcpy() */ +void neoscrypt_copy(void *dstp, const void *srcp, uint len) { + OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp; + OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp; + uint i, tail; + const uint c_len= len/ sizeof(OPTIMAL_TYPE); + + for(i= 0; i< c_len; ++i) + dst[i] = src[i]; 
+ + tail= len- c_len* sizeof(OPTIMAL_TYPE); + if(tail) { +#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY) + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i= len- tail; i< len; i++) + dstb[i] = srcb[i]; +#else + uint *dsti = (uint *) dstp; + uint *srci = (uint *) srcp; + + for(i*= (sizeof(OPTIMAL_TYPE)/ sizeof(uint)); i< (len>> 2); ++i) + dsti[i] = srci[i]; +#endif + } } -// Blake2S +/* 32-bit / 64-bit / 128-bit optimised memcpy() */ +void neoscrypt_gl_copy(__global uchar *dstp, const void *srcp, uint len) { + __global OPTIMAL_TYPE *dst = (__global OPTIMAL_TYPE *) dstp; + OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i++) + dst[i] = src[i]; + + tail = len & (sizeof(OPTIMAL_TYPE) - 1); + if(tail) { + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstp[i] = srcb[i]; + } +} + +/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ +void neoscrypt_erase(void *dstp, uint len) { + const OPTIMAL_TYPE null = 0; + OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp; + uint i, tail; + + for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i++) + dst[i] = null; + + tail = len & (sizeof(OPTIMAL_TYPE) - 1); + if(tail) { +#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY) + uchar *dstb = (uchar *) dstp; + + for(i = len - tail; i < len; i++) + dstb[i] = 0u; +#else + uint *dsti = (uint *) dstp; + + for(i*= sizeof(OPTIMAL_TYPE)/ sizeof(uint); i< (len>> 2); ++i) + dsti[i] = 0u; +#endif + } +} + +/* 32-bit / 64-bit optimised XOR engine */ +void neoscrypt_xor(void *dstp, const void *srcp, uint len) { + OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp; + OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp; + uint i, tail; + const unsigned c_len= len/ sizeof(OPTIMAL_TYPE); + + for(i= 0; i< c_len; ++i) + dst[i]^= src[i]; + + //tail = len & (sizeof(OPTIMAL_TYPE) - 1); + tail= len- c_len* sizeof(OPTIMAL_TYPE); + if(tail) { +#if 
defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY) + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] ^= srcb[i]; +#else + uint *dsti = (uint *) dstp; + uint *srci = (uint *) srcp; + + for(i*= (sizeof(OPTIMAL_TYPE)/ sizeof(uint)); i < (len>> 2); ++i) + dsti[i]^= srci[i]; +#endif + } +} + +/* BLAKE2s */ #define BLAKE2S_BLOCK_SIZE 64U #define BLAKE2S_OUT_SIZE 32U #define BLAKE2S_KEY_SIZE 32U -static const __constant uint BLAKE2S_IV[8] = -{ +/* Parameter block of 32 bytes */ +typedef struct blake2s_param_t { + uchar digest_length; + uchar key_length; + uchar fanout; + uchar depth; + uint leaf_length; + uchar node_offset[6]; + uchar node_depth; + uchar inner_length; + uchar salt[8]; + uchar personal[8]; +} blake2s_param; + +/* State block of 180 bytes */ +typedef struct blake2s_state_t { + uint h[8]; + uint t[2]; + uint f[2]; + uchar buf[2 * BLAKE2S_BLOCK_SIZE]; + uint buflen; +} blake2s_state; + +__constant uint blake2s_IV[8] = { 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; -static const __constant uchar BLAKE2S_SIGMA[10][16] = -{ +__constant uchar blake2s_sigma[10][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , @@ -85,100 +258,140 @@ static const __constant uchar BLAKE2S_SIGMA[10][16] = { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , }; -#define BLAKE_G(idx0, idx1, a, b, c, d, key) do { \ - a += b + key[BLAKE2S_SIGMA[idx0][idx1]]; \ - d = rotate(d ^ a, 16U); \ - c += d; \ - b = rotate(b ^ c, 20U); \ - a += b + key[BLAKE2S_SIGMA[idx0][idx1 + 1]]; \ - d = rotate(d ^ a, 24U); \ - c += d; \ - b = rotate(b ^ c, 25U); \ -} while(0) - -void Blake2S(uint *restrict inout, const uint *restrict inkey) -{ - uint16 V; - uint8 tmpblock; - - // Load first block (IV into V.lo) and constants (IV into V.hi) - 
V.lo = V.hi = vload8(0U, BLAKE2S_IV); - - // XOR with initial constant - V.s0 ^= 0x01012020; - - // Copy input block for later - tmpblock = V.lo; - - // XOR length of message so far (including this block) - // There are two uints for this field, but high uint is zero - V.sc ^= BLAKE2S_BLOCK_SIZE; - - // Compress state, using the key as the key - #ifdef SMALL_BLAKE2S - #pragma unroll BLAKE2S_UNROLL - #else - #pragma unroll - #endif - for(int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); - BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey); - BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey); - BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey); - BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey); - BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey); - BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey); - BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey); - } - - // XOR low part of state with the high part, - // then with the original input block. - V.lo ^= V.hi ^ tmpblock; - - // Load constants (IV into V.hi) - V.hi = vload8(0U, BLAKE2S_IV); - - // Copy input block for later - tmpblock = V.lo; - - // XOR length of message into block again - V.sc ^= BLAKE2S_BLOCK_SIZE << 1; - - // Last block compression - XOR final constant into state - V.se ^= 0xFFFFFFFFU; - - // Compress block, using the input as the key - #ifdef SMALL_BLAKE2S - #pragma unroll BLAKE2S_UNROLL - #else - #pragma unroll - #endif - for(int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout); - BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout); - BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout); - BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout); - BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout); - BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout); - BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout); - BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout); - } +void blake2s_compress(blake2s_state *S, const uint *buf) { + uint i; + uint m[16]; + uint v[16]; + + neoscrypt_copy(m, buf, 
64); + neoscrypt_copy(v, S->h, 32); + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = ROTR32(d ^ a, 16); \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = ROTR32(d ^ a, 8); \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[ 2], v[ 6], v[10], v[14]); \ + G(r, 3, v[ 3], v[ 7], v[11], v[15]); \ + G(r, 4, v[ 0], v[ 5], v[10], v[15]); \ + G(r, 5, v[ 1], v[ 6], v[11], v[12]); \ + G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ + } while(0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + for(i = 0; i < 8; i++) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND +} - // XOR low part of state with high part, then with input block - V.lo ^= V.hi ^ tmpblock; +void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) { + uint left, fill; + + while(input_size > 0) { + left = S->buflen; + fill = 2 * BLAKE2S_BLOCK_SIZE - left; + if(input_size > fill) { + /* Buffer fill */ + neoscrypt_copy(&S->buf[left], input, fill); + S->buflen += fill; + /* Counter increment */ + S->t[0] += BLAKE2S_BLOCK_SIZE; + /* Compress */ + blake2s_compress(S, (uint *) S->buf); + /* Shift buffer left */ + neoscrypt_copy(S->buf, &S->buf[BLAKE2S_BLOCK_SIZE], BLAKE2S_BLOCK_SIZE); + S->buflen -= BLAKE2S_BLOCK_SIZE; + input += fill; + input_size -= fill; + } else { + neoscrypt_copy(&S->buf[left], input, input_size); + S->buflen += input_size; + /* Do not compress */ + //input += input_size; + input_size = 0; + } + } +} - // Store 
result in input/output buffer - vstore8(V.lo, 0, inout); +void blake2s(const void *input, const uint input_size, + const void *key, const uchar key_size, + void *output, const uchar output_size) { + uchar block[BLAKE2S_BLOCK_SIZE]; + blake2s_param P; + blake2s_state S; + + /* Initialise */ + neoscrypt_erase(&P, sizeof(blake2s_param)); + P.digest_length = output_size; + P.key_length = key_size; + P.fanout = 1; + P.depth = 1; + + neoscrypt_erase(&S, sizeof(blake2s_state)); + // Initialize the state + for(int i= 0; i< 8; ++i) + S.h[i]= blake2s_IV[i]; + // neoscrypt_xor(&S, &P, 32); + S.h[0]^= ((uint)output_size)| (((uint)key_size)<< 8)| (1U<< 16)| (1U<< 24); + // All other values of P are unset yet. + + neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); + neoscrypt_copy(block, key, key_size); + blake2s_update(&S, block, BLAKE2S_BLOCK_SIZE); + /* Update */ + blake2s_update(&S, (uchar *) input, input_size); + + /* Finish */ + if(S.buflen > BLAKE2S_BLOCK_SIZE) { + S.t[0] += BLAKE2S_BLOCK_SIZE; + blake2s_compress(&S, (uint *) S.buf); + S.buflen -= BLAKE2S_BLOCK_SIZE; + neoscrypt_copy(S.buf, &S.buf[BLAKE2S_BLOCK_SIZE], S.buflen); + } + S.t[0] += S.buflen; + S.f[0] = ~0U; + neoscrypt_erase(&S.buf[S.buflen], 2 * BLAKE2S_BLOCK_SIZE - S.buflen); + blake2s_compress(&S, (uint *) S.buf); + /* Write back */ + neoscrypt_copy(output, S.h, output_size); } /* FastKDF, a fast buffered key derivation function: * FASTKDF_BUFFER_SIZE must be a power of 2; * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; * prf_output_size must be <= prf_key_size; */ -void fastkdf(const uchar *restrict password, const uchar *restrict salt, const uint salt_len, uchar *restrict output, uint output_len) -{ +void fastkdf(const uchar *password, const uchar *salt, const uint salt_len, + uint N, uchar *output, uint output_len) { /* WARNING! * This algorithm uses byte-wise addressing for memory blocks. 
@@ -194,234 +407,145 @@ void fastkdf(const uchar *restrict password, const uchar *restrict salt, const u // BLAKE2S_BLOCK_SIZE 64U // BLAKE2S_KEY_SIZE 32U // BLAKE2S_OUT_SIZE 32U - uchar bufidx = 0; - uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) }; - uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer; - - // Initialize the password buffer - #pragma unroll 1 - for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10]; - - ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0]; - - // Initialize the salt buffer - if(salt_len == FASTKDF_BUFFER_SIZE) - { - ((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0]; - ((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1]; - } - else - { - // salt_len is 80 bytes here - #pragma unroll 1 - for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)salt)[i % 10]; - - // Initialized the rest to zero earlier - #pragma unroll 1 - for(int i = 0; i < 10; ++i) ((ulong *)(B + FASTKDF_BUFFER_SIZE))[i] = ((ulong *)salt)[i]; - } - - // The primary iteration - #pragma unroll 1 - for(int i = 0; i < 32; ++i) - { - // Make the key buffer twice the size of the key so it fits a Blake2S block - // This way, we don't need a temp buffer in the Blake2S function. 
- uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 }; - - // Copy input and key to their buffers - CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); - CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); - - // PRF - Blake2S((uint *)input, (uint *)key); - - // Calculate the next buffer pointer - bufidx = 0; - - for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x) - bufidx += input[x]; - - // bufidx a uchar now - always mod 255 - //bufidx &= (FASTKDF_BUFFER_SIZE - 1); + uchar A[FASTKDF_BUFFER_SIZE + BLAKE2S_BLOCK_SIZE]; + uchar B[FASTKDF_BUFFER_SIZE + BLAKE2S_KEY_SIZE]; + uchar prf_output[BLAKE2S_OUT_SIZE], prf_input[BLAKE2S_BLOCK_SIZE], + prf_key[BLAKE2S_KEY_SIZE]; + uint bufidx, a, b, i, j; + + /* Initialise the password buffer */ + a = FASTKDF_BUFFER_SIZE / PASSWORD_LEN; + for(i = 0, j= 0; i < a; ++i, j+= PASSWORD_LEN) + neoscrypt_copy(&A[j], (uchar *)password, PASSWORD_LEN); + b= FASTKDF_BUFFER_SIZE- j; + if(b) + neoscrypt_copy(&A[j], (uchar *)password, b); +#if (PASSWORD_LEN< BLAKE2S_BLOCK_SIZE) + /* Initialise the password buffer */ + a = BLAKE2S_BLOCK_SIZE / PASSWORD_LEN; + for(i = 0, j= 0; i < a; ++i, j+= PASSWORD_LEN) + neoscrypt_copy(&A[j], (uchar *)password, PASSWORD_LEN); + b= BLAKE2S_BLOCK_SIZE- j; + if(b) + neoscrypt_copy(&A[j], (uchar *)password, b); + //neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE], (uchar *)password, PASSWORD_LEN); + //// Erase the remainder of the blake-block, when the password length is smaller + //neoscrypt_erase(&A[FASTKDF_BUFFER_SIZE+ PASSWORD_LEN], BLAKE2S_BLOCK_SIZE- PASSWORD_LEN); +#else + neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE], (uchar *)password, BLAKE2S_BLOCK_SIZE); +#endif - // Modify the salt buffer - XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE); + /* Initialise the salt buffer */ + a = FASTKDF_BUFFER_SIZE/ salt_len; + for(i = 0, j= 0; i< a; ++i, j+= salt_len) + neoscrypt_copy(&B[j], salt, salt_len); + b= FASTKDF_BUFFER_SIZE- j; + if(b) + neoscrypt_copy(&B[j], (uchar *)salt, b); + if(salt_len< BLAKE2S_BLOCK_SIZE) { + 
neoscrypt_copy(&B[FASTKDF_BUFFER_SIZE], (uchar *)salt, salt_len); + // Erase the remainder of the blake-block, when the password length is smaller + neoscrypt_erase(&B[FASTKDF_BUFFER_SIZE+ salt_len], BLAKE2S_BLOCK_SIZE- salt_len); + } else + neoscrypt_copy(&B[FASTKDF_BUFFER_SIZE], salt, BLAKE2S_KEY_SIZE); + + /* The primary iteration */ + for(i = 0, bufidx = 0; i < N; ++i) { + /* Copy the PRF input buffer byte by byte to make sure prf_input + starts at a well aligned address. Missing to do so may slow down + computation. */ + for(j= 0, a= bufidx; j< BLAKE2S_BLOCK_SIZE; ++j, ++a) + prf_input[j]= A[a]; + + /* Copy the PRF key buffer */ + for(j= 0, a= bufidx; j< BLAKE2S_KEY_SIZE; ++j, ++a) + prf_key[j]= B[a]; + + /* PRF */ + blake2s(prf_input, BLAKE2S_BLOCK_SIZE, + prf_key, BLAKE2S_KEY_SIZE, + prf_output, BLAKE2S_OUT_SIZE); + + /* Calculate the next buffer pointer */ + for(j = 0, bufidx = 0; j < BLAKE2S_OUT_SIZE; j++) + bufidx += prf_output[j]; + bufidx &= (FASTKDF_BUFFER_SIZE - 1); + + /* Modify the salt buffer */ + //neoscrypt_xor(&B[bufidx], &prf_output[0], BLAKE2S_OUT_SIZE); + for(j= 0, a= bufidx; j< BLAKE2S_OUT_SIZE; ++j, ++a) + B[a]^= prf_output[j]; + + /* Head modified, tail updated */ + if(bufidx < BLAKE2S_KEY_SIZE) + //neoscrypt_copy(&B[FASTKDF_BUFFER_SIZE + bufidx], &B[bufidx], + // min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); + for(j= 0, a= FASTKDF_BUFFER_SIZE + bufidx, b= bufidx; + j< min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE- bufidx); ++j, ++a, ++b) + B[a]= B[b]; + + /* Tail modified, head updated */ + if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) + neoscrypt_copy(B, &B[FASTKDF_BUFFER_SIZE], + BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); - if(bufidx < BLAKE2S_KEY_SIZE) - { - // Head modified, tail updated - // this was made off the original code... 
wtf - //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); - CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx); - } - else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) - { - // Tail modified, head updated - CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); - } } - // Modify and copy into the output buffer - - // Damned compiler crashes - // Fuck you, AMD - - //for(uint i = 0; i < output_len; ++i, ++bufidx) - // output[i] = B[bufidx] ^ A[i]; - - uint left = FASTKDF_BUFFER_SIZE - bufidx; - //uint left = (~bufidx) + 1 - - if(left < output_len) - { - XORBytes(output, B + bufidx, A, left); - XORBytes(output + left, B, A + left, output_len - left); - } - else - { - XORBytes(output, B + bufidx, A, output_len); - } + /* Modify and copy into the output buffer */ + if(output_len > FASTKDF_BUFFER_SIZE) + output_len = FASTKDF_BUFFER_SIZE; + + a = FASTKDF_BUFFER_SIZE - bufidx; + if(a >= output_len) { + for(j= 0, i= bufidx; j< output_len; ++j, ++i) + output[j]= B[i]^ A[j]; + } else { + for(j= 0, i= bufidx; j< a; ++j, ++i) + output[j]= B[i]^ A[j]; + for(j= a, i= 0; i< output_len- a; ++j, ++i) + output[j]= B[i]^ A[j]; + } } -#define SALSA_CORE(state) do { \ - state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \ - state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \ - state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \ - state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); 
state.sf ^= rotate(state.sb + state.s7, 18U); \ - state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \ - state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \ - state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \ - state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \ -} while(0) - -uint16 salsa_small_scalar_rnd(uint16 X) +uint16 neoscrypt_salsa(uint16 X) { - uint16 st = X; - - #if SALSA_SMALL_UNROLL == 1 + uint16 tmp = X; for(int i = 0; i < 10; ++i) { - SALSA_CORE(st); - } - - #elif SALSA_SMALL_UNROLL == 2 - - for(int i = 0; i < 5; ++i) - { - SALSA_CORE(st); - SALSA_CORE(st); - } - - #elif SALSA_SMALL_UNROLL == 3 - - for(int i = 0; i < 4; ++i) - { - SALSA_CORE(st); - if(i == 3) break; - SALSA_CORE(st); - SALSA_CORE(st); + tmp.s4 ^= rotate(tmp.s0 + tmp.sc, 7U); tmp.s8 ^= rotate(tmp.s4 + tmp.s0, 9U); tmp.sc ^= rotate(tmp.s8 + tmp.s4, 13U); tmp.s0 ^= rotate(tmp.sc + tmp.s8, 18U); + tmp.s9 ^= rotate(tmp.s5 + tmp.s1, 7U); tmp.sd ^= rotate(tmp.s9 + tmp.s5, 9U); tmp.s1 ^= rotate(tmp.sd + tmp.s9, 13U); tmp.s5 ^= rotate(tmp.s1 + tmp.sd, 18U); + tmp.se ^= rotate(tmp.sa + tmp.s6, 7U); tmp.s2 ^= rotate(tmp.se + tmp.sa, 9U); tmp.s6 ^= rotate(tmp.s2 + tmp.se, 13U); tmp.sa ^= rotate(tmp.s6 + tmp.s2, 18U); + tmp.s3 ^= rotate(tmp.sf + tmp.sb, 7U); tmp.s7 ^= rotate(tmp.s3 + tmp.sf, 9U); tmp.sb ^= rotate(tmp.s7 + tmp.s3, 13U); tmp.sf ^= rotate(tmp.sb + tmp.s7, 18U); + tmp.s1 ^= rotate(tmp.s0 + tmp.s3, 7U); tmp.s2 ^= rotate(tmp.s1 + tmp.s0, 9U); tmp.s3 ^= rotate(tmp.s2 + 
tmp.s1, 13U); tmp.s0 ^= rotate(tmp.s3 + tmp.s2, 18U); + tmp.s6 ^= rotate(tmp.s5 + tmp.s4, 7U); tmp.s7 ^= rotate(tmp.s6 + tmp.s5, 9U); tmp.s4 ^= rotate(tmp.s7 + tmp.s6, 13U); tmp.s5 ^= rotate(tmp.s4 + tmp.s7, 18U); + tmp.sb ^= rotate(tmp.sa + tmp.s9, 7U); tmp.s8 ^= rotate(tmp.sb + tmp.sa, 9U); tmp.s9 ^= rotate(tmp.s8 + tmp.sb, 13U); tmp.sa ^= rotate(tmp.s9 + tmp.s8, 18U); + tmp.sc ^= rotate(tmp.sf + tmp.se, 7U); tmp.sd ^= rotate(tmp.sc + tmp.sf, 9U); tmp.se ^= rotate(tmp.sd + tmp.sc, 13U); tmp.sf ^= rotate(tmp.se + tmp.sd, 18U); } - #elif SALSA_SMALL_UNROLL == 4 - - for(int i = 0; i < 3; ++i) - { - SALSA_CORE(st); - SALSA_CORE(st); - if(i == 2) break; - SALSA_CORE(st); - SALSA_CORE(st); - } - - #else - - for(int i = 0; i < 2; ++i) - { - SALSA_CORE(st); - SALSA_CORE(st); - SALSA_CORE(st); - SALSA_CORE(st); - SALSA_CORE(st); - } - - #endif - - return(X + st); + return(X + tmp); } -#define CHACHA_CORE_PARALLEL(state) do { \ - state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ - state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \ - state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ - state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \ - \ - state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ - state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \ - state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ - state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \ -} while(0) - -uint16 chacha_small_parallel_rnd(uint16 X) +uint16 neoscrypt_chacha(uint16 X) { - uint4 t, st[4]; - - ((uint16 *)st)[0] = X; - - #if CHACHA_SMALL_UNROLL == 1 + uint16 tmp = X; for(int i = 0; i < 10; ++i) { - 
CHACHA_CORE_PARALLEL(st); - } - - #elif CHACHA_SMALL_UNROLL == 2 - - for(int i = 0; i < 5; ++i) - { - CHACHA_CORE_PARALLEL(st); - CHACHA_CORE_PARALLEL(st); - } - - #elif CHACHA_SMALL_UNROLL == 3 - - for(int i = 0; i < 4; ++i) - { - CHACHA_CORE_PARALLEL(st); - if(i == 3) break; - CHACHA_CORE_PARALLEL(st); - CHACHA_CORE_PARALLEL(st); + tmp.s0 += tmp.s4; tmp.sc = rotate(tmp.sc ^ tmp.s0, 16U); tmp.s8 += tmp.sc; tmp.s4 = rotate(tmp.s4 ^ tmp.s8, 12U); tmp.s0 += tmp.s4; tmp.sc = rotate(tmp.sc ^ tmp.s0, 8U); tmp.s8 += tmp.sc; tmp.s4 = rotate(tmp.s4 ^ tmp.s8, 7U); + tmp.s1 += tmp.s5; tmp.sd = rotate(tmp.sd ^ tmp.s1, 16U); tmp.s9 += tmp.sd; tmp.s5 = rotate(tmp.s5 ^ tmp.s9, 12U); tmp.s1 += tmp.s5; tmp.sd = rotate(tmp.sd ^ tmp.s1, 8U); tmp.s9 += tmp.sd; tmp.s5 = rotate(tmp.s5 ^ tmp.s9, 7U); + tmp.s2 += tmp.s6; tmp.se = rotate(tmp.se ^ tmp.s2, 16U); tmp.sa += tmp.se; tmp.s6 = rotate(tmp.s6 ^ tmp.sa, 12U); tmp.s2 += tmp.s6; tmp.se = rotate(tmp.se ^ tmp.s2, 8U); tmp.sa += tmp.se; tmp.s6 = rotate(tmp.s6 ^ tmp.sa, 7U); + tmp.s3 += tmp.s7; tmp.sf = rotate(tmp.sf ^ tmp.s3, 16U); tmp.sb += tmp.sf; tmp.s7 = rotate(tmp.s7 ^ tmp.sb, 12U); tmp.s3 += tmp.s7; tmp.sf = rotate(tmp.sf ^ tmp.s3, 8U); tmp.sb += tmp.sf; tmp.s7 = rotate(tmp.s7 ^ tmp.sb, 7U); + tmp.s0 += tmp.s5; tmp.sf = rotate(tmp.sf ^ tmp.s0, 16U); tmp.sa += tmp.sf; tmp.s5 = rotate(tmp.s5 ^ tmp.sa, 12U); tmp.s0 += tmp.s5; tmp.sf = rotate(tmp.sf ^ tmp.s0, 8U); tmp.sa += tmp.sf; tmp.s5 = rotate(tmp.s5 ^ tmp.sa, 7U); + tmp.s1 += tmp.s6; tmp.sc = rotate(tmp.sc ^ tmp.s1, 16U); tmp.sb += tmp.sc; tmp.s6 = rotate(tmp.s6 ^ tmp.sb, 12U); tmp.s1 += tmp.s6; tmp.sc = rotate(tmp.sc ^ tmp.s1, 8U); tmp.sb += tmp.sc; tmp.s6 = rotate(tmp.s6 ^ tmp.sb, 7U); + tmp.s2 += tmp.s7; tmp.sd = rotate(tmp.sd ^ tmp.s2, 16U); tmp.s8 += tmp.sd; tmp.s7 = rotate(tmp.s7 ^ tmp.s8, 12U); tmp.s2 += tmp.s7; tmp.sd = rotate(tmp.sd ^ tmp.s2, 8U); tmp.s8 += tmp.sd; tmp.s7 = rotate(tmp.s7 ^ tmp.s8, 7U); + tmp.s3 += tmp.s4; tmp.se = rotate(tmp.se ^ tmp.s3, 16U); tmp.s9 += 
tmp.se; tmp.s4 = rotate(tmp.s4 ^ tmp.s9, 12U); tmp.s3 += tmp.s4; tmp.se = rotate(tmp.se ^ tmp.s3, 8U); tmp.s9 += tmp.se; tmp.s4 = rotate(tmp.s4 ^ tmp.s9, 7U); } - #elif CHACHA_SMALL_UNROLL == 4 - - for(int i = 0; i < 3; ++i) - { - CHACHA_CORE_PARALLEL(st); - CHACHA_CORE_PARALLEL(st); - if(i == 2) break; - CHACHA_CORE_PARALLEL(st); - CHACHA_CORE_PARALLEL(st); - } - - #else - - for(int i = 0; i < 2; ++i) - { - CHACHA_CORE_PARALLEL(st); - CHACHA_CORE_PARALLEL(st); - CHACHA_CORE_PARALLEL(st); - CHACHA_CORE_PARALLEL(st); - CHACHA_CORE_PARALLEL(st); - } - - #endif - - return(X + ((uint16 *)st)[0]); + return(X + tmp); } -void neoscrypt_blkmix(uint16 *XV, bool alg) +void neoscrypt_blkmix(uint16 *XV, uint mixmode) { /* NeoScrypt flow: Scrypt flow: @@ -434,92 +558,152 @@ void neoscrypt_blkmix(uint16 *XV, bool alg) XV[0] ^= XV[3]; - if(!alg) - { - XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; - XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; - XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; - XV[3] = salsa_small_scalar_rnd(XV[3]); - } - else - { - XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; - XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; - XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; - XV[3] = chacha_small_parallel_rnd(XV[3]); - } + if(!mixmode) XV[0] = neoscrypt_salsa(XV[0]); + else XV[0] = neoscrypt_chacha(XV[0]); + + XV[1] ^= XV[0]; + + if(!mixmode) XV[1] = neoscrypt_salsa(XV[1]); + else XV[1] = neoscrypt_chacha(XV[1]); - XV[1] ^= XV[2]; XV[2] ^= XV[1]; - XV[1] ^= XV[2]; -} -void ScratchpadStore(__global void *V, void *X, uchar idx) -{ - ((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0]; - ((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1]; -} + if(!mixmode) XV[2] = neoscrypt_salsa(XV[2]); + else XV[2] = neoscrypt_chacha(XV[2]); -void ScratchpadMix(void *X, const __global void *V, uchar idx) -{ - ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1]; - ((ulong16 *)X)[1] ^= ((__global ulong16 
*)V)[(idx << 1) + 1]; -} + XV[3] ^= XV[2]; -void SMix(uint16 *X, __global uint16 *V, bool flag) -{ - #pragma unroll 1 - for(int i = 0; i < 128; ++i) - { - ScratchpadStore(V, X, i); - neoscrypt_blkmix(X, flag); - } + if(!mixmode) XV[3] = neoscrypt_salsa(XV[3]); + else XV[3] = neoscrypt_chacha(XV[3]); - #pragma unroll 1 - for(int i = 0; i < 128; ++i) - { - const uint idx = convert_uchar(((uint *)X)[48] & 0x7F); - ScratchpadMix(X, V, idx); - neoscrypt_blkmix(X, flag); - } + neoscrypt_blkswp(&XV[1], &XV[2], BLOCK_SIZE); } +/* NeoScrypt core engine: + * p = 1, salt = password; + * Basic customisation (required): + * profile bit 0: + * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; + * 1 = Scrypt(1024, 1, 1) with Salsa20/8; + * profile bits 4 to 1: + * 0000 = FastKDF-BLAKE2s; + * 0001 = PBKDF2-HMAC-SHA256; + * 0010 = PBKDF2-HMAC-BLAKE256; + * Extended customisation (optional): + * profile bit 31: + * 0 = extended customisation absent; + * 1 = extended customisation present; + * profile bits 7 to 5 (rfactor): + * 000 = r of 1; + * 001 = r of 2; + * 010 = r of 4; + * ... + * 111 = r of 128; + * profile bits 12 to 8 (Nfactor): + * 00000 = N of 2; + * 00001 = N of 4; + * 00010 = N of 8; + * ..... + * 00110 = N of 128; + * ..... + * 01001 = N of 1024; + * ..... + * 11110 = N of 2147483648; + * profile bits 30 to 13 are reserved */ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target) +__kernel void search(__global const uchar* restrict input, +#ifdef TEST + __global uchar* restrict output, +#else + volatile __global uint* restrict output, +#endif + __global uchar* padcache, + const uint target) { #define CONSTANT_N 128 #define CONSTANT_r 2 - // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha - uint16 X[4], Z[4]; + /* Ensure stack alignment by putting those first. 
*/ + /* X = CONSTANT_r * 2 * BLOCK_SIZE(64) */ + uchar X[FASTKDF_BUFFER_SIZE]; + /* Z is a copy of X for ChaCha */ + uchar Z[FASTKDF_BUFFER_SIZE]; /* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */ - __global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global uchar *V= &padcache[CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE* + (get_global_id(0)% MAX_GLOBAL_THREADS)]; +#ifndef TEST uchar outbuf[32]; uchar data[PASSWORD_LEN]; + uint i, j; + for(i= 0; i< PASSWORD_LEN- 4; ++i) + data[i]= input[i]; + ((uint *)data)[(PASSWORD_LEN- 4)/ sizeof(uint)]= get_global_id(0); +#else + uchar outbuf[OUTPUT_LEN]; + uchar data[PASSWORD_LEN]; + uint i, j; + for(i= 0; i< PASSWORD_LEN; ++i) + data[i]= input[i]; +#endif + const uint mixmode = 0x14; + +#ifdef TEST +#ifdef BLAKE2S_TEST + blake2s(data, 64, data, 32, outbuf, OUTPUT_LEN); + for(i= 0; i< OUTPUT_LEN; ++i) + output[i]= outbuf[i]; + return; +#elif defined(FASTKDF_TEST) + for(i= 0; i< FASTKDF_BUFFER_SIZE; ++i) + X[i]= testsalt[i]; + fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32); + for(i= 0; i< OUTPUT_LEN; ++i) + output[i]= outbuf[i]; + return; +#endif +#endif - ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; - ((ulong *)data)[8] = ((__global const ulong *)input)[8]; - ((uint *)data)[18] = ((__global const uint *)input)[18]; - ((uint *)data)[19] = get_global_id(0); - - // X = KDF(password, salt) - fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); + /* X = KDF(password, salt) */ + fastkdf(data, data, PASSWORD_LEN, 32, X, CONSTANT_r * 2 * BLOCK_SIZE); - // Process ChaCha 1st, Salsa 2nd and XOR them - run that through PBKDF2 - CopyBytes128(Z, X, 2); + /* Process ChaCha 1st, Salsa 2nd and XOR them into PBKDF2 */ + neoscrypt_blkcpy(Z, X, CONSTANT_r * 2 * BLOCK_SIZE); - // X = SMix(X); X & Z are swapped, repeat. 
- for(bool flag = false;; ++flag) + for(int y = 0; y < 2; ++y) { - SMix(X, V, flag); - if(flag) break; - SwapBytes128(X, Z, 256); + for(i = 0; i < 128; ++i) + { + neoscrypt_gl_blkcpy(&V[i << 8], &X[0], 256); + neoscrypt_blkmix((uint16 *)X, y); + } + + for(i = 0; i < 128; ++i) + { + neoscrypt_gl_blkxor(&X[0], &V[(((uint *)X)[48] & 127) << 8], 256); + neoscrypt_blkmix((uint16 *)X, y); + } + if(!y) neoscrypt_blkswp(&X[0], &Z[0], 256); } - // blkxor(X, Z) - ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0]; - ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1]; + /* blkxor(X, Z) */ + neoscrypt_blkxor(&X[0], &Z[0], CONSTANT_r * 2 * BLOCK_SIZE); + +#ifdef TEST + fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32); + //((uint *)outbuf)[8]= target; + for(i= 0; i< OUTPUT_LEN; ++i) + output[i]= outbuf[i]; +#else + /* output = KDF(password, X) */ + fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32); + +#define SCRYPT_FOUND (0xFF) +#ifdef cl_khr_global_int32_base_atomics + #define SETFOUND(Xnonce) output[atomic_add(&output[SCRYPT_FOUND], 1)]= Xnonce +#else + #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce +#endif - // output = KDF(password, X) - fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); - if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0); + if (((uint *)outbuf)[7]<= target) + SETFOUND(get_global_id(0)); +#endif } \ No newline at end of file diff --git a/kernel/quarkcoin.cl b/kernel/quarkcoin.cl index ec258b26..7d405b75 100644 --- a/kernel/quarkcoin.cl +++ b/kernel/quarkcoin.cl @@ -4,7 +4,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -12,10 +12,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the 
Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -54,13 +54,13 @@ typedef long sph_s64; #define SPH_64_TRUE 1 #define SPH_C32(x) ((sph_u32)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_C64(x) ((sph_u64)(x ## UL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ECHO_64 1 @@ -75,11 +75,8 @@ typedef long sph_s64; #ifndef SPH_COMPACT_BLAKE_64 #define SPH_COMPACT_BLAKE_64 0 #endif -#ifndef SPH_LUFFA_PARALLEL - #define SPH_LUFFA_PARALLEL 0 -#endif #ifndef SPH_KECCAK_UNROLL - #define SPH_KECCAK_UNROLL 0 + #define SPH_KECCAK_UNROLL 0 #endif #include "blake.cl" @@ -92,6 +89,14 @@ typedef long sph_s64; #define SWAP4(x) as_uint(as_uchar4(x).wzyx) #define SWAP8(x) as_ulong(as_uchar8(x).s76543210) +#define SHL(x, n) ((x) << (n)) +#define SHR(x, n) ((x) >> (n)) + +#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ + q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ + q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ + q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + #if SPH_BIG_ENDIAN #define DEC64E(x) (x) 
#define DEC64BE(x) (*(const __global sph_u64 *) (x)); @@ -111,112 +116,337 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp } hash; // blake +{ + sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); + sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); + sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); + sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179); + sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; + sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; + + if ((T0 = SPH_T64(T0 + 1024)) < 1024) { - sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); - sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); - sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); - sph_u64 H6 = SPH_C64(0x1F83D9ABFB41BD6B), H7 = SPH_C64(0x5BE0CD19137E2179); - sph_u64 S0 = 0, S1 = 0, S2 = 0, S3 = 0; - sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; + T1 = SPH_T64(T1 + 1); + } + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; + M0 = DEC64BE(block + 0); + M1 = DEC64BE(block + 8); + M2 = DEC64BE(block + 16); + M3 = DEC64BE(block + 24); + M4 = DEC64BE(block + 32); + M5 = DEC64BE(block + 40); + M6 = DEC64BE(block + 48); + M7 = DEC64BE(block + 56); + M8 = DEC64BE(block + 64); + M9 = DEC64BE(block + 72); + M9 &= 0xFFFFFFFF00000000; + M9 ^= SWAP4(gid); + MA = 0x8000000000000000; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 0x280; - if ((T0 = SPH_T64(T0 + 1024)) < 1024) - { - T1 = SPH_T64(T1 + 1); - } - sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; - sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; - sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; - sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; - M0 = DEC64BE(block + 0); - M1 = DEC64BE(block + 8); - M2 = 
DEC64BE(block + 16); - M3 = DEC64BE(block + 24); - M4 = DEC64BE(block + 32); - M5 = DEC64BE(block + 40); - M6 = DEC64BE(block + 48); - M7 = DEC64BE(block + 56); - M8 = DEC64BE(block + 64); - M9 = DEC64BE(block + 72); - M9 &= 0xFFFFFFFF00000000; - M9 ^= SWAP4(gid); - MA = 0x8000000000000000; - MB = 0; - MC = 0; - MD = 1; - ME = 0; - MF = 0x280; + COMPRESS64; - COMPRESS64; + hash.h8[0] = H0; + hash.h8[1] = H1; + hash.h8[2] = H2; + hash.h8[3] = H3; + hash.h8[4] = H4; + hash.h8[5] = H5; + hash.h8[6] = H6; + hash.h8[7] = H7; +} + // bmw - hash.h8[0] = H0; - hash.h8[1] = H1; - hash.h8[2] = H2; - hash.h8[3] = H3; - hash.h8[4] = H4; - hash.h8[5] = H5; - hash.h8[6] = H6; - hash.h8[7] = H7; + + sph_u64 BMW_H[16]; +#pragma unroll 16 + for(unsigned u = 0; u < 16; u++) + BMW_H[u] = BMW_IV512[u]; + + sph_u64 mv[16],q[32]; + sph_u64 tmp; + + mv[ 0] = SWAP8(hash.h8[0]); + mv[ 1] = SWAP8(hash.h8[1]); + mv[ 2] = SWAP8(hash.h8[2]); + mv[ 3] = SWAP8(hash.h8[3]); + mv[ 4] = SWAP8(hash.h8[4]); + mv[ 5] = SWAP8(hash.h8[5]); + mv[ 6] = SWAP8(hash.h8[6]); + mv[ 7] = SWAP8(hash.h8[7]); + mv[ 8] = 0x80; + mv[ 9] = 0; + mv[10] = 0; + mv[11] = 0; + mv[12] = 0; + mv[13] = 0; + mv[14] = 0; + mv[15] = SPH_C64(512); + + tmp = (mv[ 5] ^ BMW_H[ 5]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[ 6] ^ BMW_H[ 6]) - (mv[ 8] ^ BMW_H[ 8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[ 0] ^ BMW_H[ 0]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 1] ^ BMW_H[ 1]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ 
SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 2] ^ BMW_H[ 2]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 2] ^ BMW_H[ 2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[ 4] ^ BMW_H[ 4]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[ 6] ^ BMW_H[ 6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) + (mv[ 6] ^ BMW_H[ 6]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 9] ^ BMW_H[ 9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 3] ^ BMW_H[ 3]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[ 2] ^ BMW_H[ 2]) + (mv[ 4] ^ BMW_H[ 4]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ 
BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( 
((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); } - // bmw +#pragma unroll 4 + for(int i=2;i<6;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } +#pragma unroll 3 + for(int i=6;i<9;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } +#pragma unroll 4 + for(int i=9;i<13;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } +#pragma unroll 3 + for(int i=13;i<16;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; +sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[ 0]) + ( XL64 ^ q[24] ^ q[ 0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[ 1]) + ( XL64 ^ q[25] ^ q[ 1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[ 2]) + ( XL64 ^ q[26] ^ q[ 2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[ 3]) + ( XL64 ^ q[27] ^ q[ 3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[ 4]) + ( XL64 ^ q[28] ^ q[ 4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[ 5]) + ( XL64 ^ q[29] ^ q[ 5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[ 6]) + ( XL64 ^ q[30] ^ q[ 6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[ 7]) + ( XL64 ^ q[31] ^ q[ 7]); + + BMW_H[ 8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]); + BMW_H[ 9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) 
+ ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; + } + + tmp = (mv[ 5] ^ BMW_H[ 5]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[ 6] ^ BMW_H[ 6]) - (mv[ 8] ^ BMW_H[ 8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[ 0] ^ BMW_H[ 0]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 1] ^ BMW_H[ 1]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 2] ^ BMW_H[ 2]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 2] ^ BMW_H[ 2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[ 4] ^ BMW_H[ 4]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) - (mv[11] ^ 
BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[ 6] ^ BMW_H[ 6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) + (mv[ 6] ^ BMW_H[ 6]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 9] ^ BMW_H[ 9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 3] ^ BMW_H[ 3]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[ 2] ^ BMW_H[ 2]) + (mv[ 4] ^ BMW_H[ 4]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + + +#pragma unroll 2 + 
for(int i=0;i<2;i++) { - sph_u64 BMW_H[16]; - for(unsigned u = 0; u < 16; u++) - BMW_H[u] = BMW_IV512[u]; - - sph_u64 BMW_h1[16], BMW_h2[16]; - sph_u64 mv[16]; - - mv[ 0] = SWAP8(hash.h8[0]); - mv[ 1] = SWAP8(hash.h8[1]); - mv[ 2] = SWAP8(hash.h8[2]); - mv[ 3] = SWAP8(hash.h8[3]); - mv[ 4] = SWAP8(hash.h8[4]); - mv[ 5] = SWAP8(hash.h8[5]); - mv[ 6] = SWAP8(hash.h8[6]); - mv[ 7] = SWAP8(hash.h8[7]); - mv[ 8] = 0x80; - mv[ 9] = 0; - mv[10] = 0; - mv[11] = 0; - mv[12] = 0; - mv[13] = 0; - mv[14] = 0; - mv[15] = 0x200; -#define M(x) (mv[x]) -#define H(x) (BMW_H[x]) -#define dH(x) (BMW_h2[x]) - - FOLDb; - -#undef M -#undef H -#undef dH - -#define M(x) (BMW_h2[x]) -#define H(x) (final_b[x]) -#define dH(x) (BMW_h1[x]) - - FOLDb; - -#undef M -#undef H -#undef dH - - hash.h8[0] = SWAP8(BMW_h1[8]); - hash.h8[1] = SWAP8(BMW_h1[9]); - hash.h8[2] = SWAP8(BMW_h1[10]); - hash.h8[3] = SWAP8(BMW_h1[11]); - hash.h8[4] = SWAP8(BMW_h1[12]); - hash.h8[5] = SWAP8(BMW_h1[13]); - hash.h8[6] = SWAP8(BMW_h1[14]); - hash.h8[7] = SWAP8(BMW_h1[15]); + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ 
SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } +#pragma unroll 3 + for(int i=6;i<9;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); } +#pragma unroll 4 + for(int i=9;i<13;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } +#pragma unroll 3 + for(int i=13;i<16;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; +XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[ 0]) + ( XL64 ^ q[24] ^ q[ 0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[ 1]) + ( XL64 ^ q[25] ^ q[ 1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[ 2]) + ( XL64 ^ q[26] ^ q[ 2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[ 3]) + ( XL64 ^ q[27] ^ q[ 3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[ 4]) + ( XL64 ^ q[28] ^ q[ 4]); + 
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[ 5]) + ( XL64 ^ q[29] ^ q[ 5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[ 6]) + ( XL64 ^ q[30] ^ q[ 6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[ 7]) + ( XL64 ^ q[31] ^ q[ 7]); + + BMW_H[ 8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]); + BMW_H[ 9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); - if ((hash.h1[7] & 0x8) != 0) + hash.h8[0] = SWAP8(BMW_H[8]); + hash.h8[1] = SWAP8(BMW_H[9]); + hash.h8[2] = SWAP8(BMW_H[10]); + hash.h8[3] = SWAP8(BMW_H[11]); + hash.h8[4] = SWAP8(BMW_H[12]); + hash.h8[5] = SWAP8(BMW_H[13]); + hash.h8[6] = SWAP8(BMW_H[14]); + hash.h8[7] = SWAP8(BMW_H[15]); + + bool dec = ((hash.h1[7] & 0x8) != 0); { + // groestl - sph_u64 H[16]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; +#else + __local sph_u64 T0_C[256], T4_C[256]; +#endif + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif + } + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 
T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C + sph_u64 H[16]; for (unsigned int u = 0; u < 15; u ++) H[u] = 0; #if USE_LE @@ -255,11 +485,13 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp for (unsigned int u = 0; u < 16; u ++) H[u] ^= xH[u]; for (unsigned int u = 0; u < 8; u ++) - hash.h8[u] = DEC64E(H[u + 8]); + hash.h8[u] = (dec ? DEC64E(H[u + 8]) : hash.h8[u]); + } - else { + // skein + sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; sph_u64 bcount = 0; @@ -276,108 +508,138 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp bcount = 0; m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; UBI_BIG(510, 8); - hash.h8[0] = SWAP8(h0); - hash.h8[1] = SWAP8(h1); - hash.h8[2] = SWAP8(h2); - hash.h8[3] = SWAP8(h3); - hash.h8[4] = SWAP8(h4); - hash.h8[5] = SWAP8(h5); - hash.h8[6] = SWAP8(h6); - hash.h8[7] = SWAP8(h7); + hash.h8[0] = (!dec ? SWAP8(h0) : hash.h8[0]); + hash.h8[1] = (!dec ? SWAP8(h1) : hash.h8[1]); + hash.h8[2] = (!dec ? SWAP8(h2) : hash.h8[2]); + hash.h8[3] = (!dec ? SWAP8(h3) : hash.h8[3]); + hash.h8[4] = (!dec ? SWAP8(h4) : hash.h8[4]); + hash.h8[5] = (!dec ? SWAP8(h5) : hash.h8[5]); + hash.h8[6] = (!dec ? SWAP8(h6) : hash.h8[6]); + hash.h8[7] = (!dec ? 
SWAP8(h7) : hash.h8[7]); } // groestl - { - sph_u64 H[16]; - for (unsigned int u = 0; u < 15; u ++) - H[u] = 0; - #if USE_LE - H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); - #else - H[15] = (sph_u64)512; - #endif +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; +#else + __local sph_u64 T0_C[256], T4_C[256]; +#endif + int init = get_local_id(0); + int step = get_local_size(0); - sph_u64 g[16], m[16]; - m[0] = DEC64E(hash.h8[0]); - m[1] = DEC64E(hash.h8[1]); - m[2] = DEC64E(hash.h8[2]); - m[3] = DEC64E(hash.h8[3]); - m[4] = DEC64E(hash.h8[4]); - m[5] = DEC64E(hash.h8[5]); - m[6] = DEC64E(hash.h8[6]); - m[7] = DEC64E(hash.h8[7]); - for (unsigned int u = 0; u < 16; u ++) - g[u] = m[u] ^ H[u]; - m[8] = 0x80; g[8] = m[8] ^ H[8]; - m[9] = 0; g[9] = m[9] ^ H[9]; - m[10] = 0; g[10] = m[10] ^ H[10]; - m[11] = 0; g[11] = m[11] ^ H[11]; - m[12] = 0; g[12] = m[12] ^ H[12]; - m[13] = 0; g[13] = m[13] ^ H[13]; - m[14] = 0; g[14] = m[14] ^ H[14]; - m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; - PERM_BIG_P(g); - PERM_BIG_Q(m); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= g[u] ^ m[u]; - sph_u64 xH[16]; - for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u]; - PERM_BIG_P(xH); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= xH[u]; - for (unsigned int u = 0; u < 8; u ++) - hash.h8[u] = DEC64E(H[u + 8]); + for (int i = init; i < 256; i += step) + { + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif } + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C + sph_u64 H[16]; + for (unsigned int u = 0; u < 15; u ++) + H[u] = 0; +#if USE_LE + H[15] = ((sph_u64)(512 & 0xFF) << 56) | 
((sph_u64)(512 & 0xFF00) << 40); +#else + H[15] = (sph_u64)512; +#endif + + sph_u64 g[16], m[16]; + m[0] = DEC64E(hash.h8[0]); + m[1] = DEC64E(hash.h8[1]); + m[2] = DEC64E(hash.h8[2]); + m[3] = DEC64E(hash.h8[3]); + m[4] = DEC64E(hash.h8[4]); + m[5] = DEC64E(hash.h8[5]); + m[6] = DEC64E(hash.h8[6]); + m[7] = DEC64E(hash.h8[7]); + for (unsigned int u = 0; u < 16; u ++) + g[u] = m[u] ^ H[u]; + m[8] = 0x80; g[8] = m[8] ^ H[8]; + m[9] = 0; g[9] = m[9] ^ H[9]; + m[10] = 0; g[10] = m[10] ^ H[10]; + m[11] = 0; g[11] = m[11] ^ H[11]; + m[12] = 0; g[12] = m[12] ^ H[12]; + m[13] = 0; g[13] = m[13] ^ H[13]; + m[14] = 0; g[14] = m[14] ^ H[14]; + m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; + PERM_BIG_P(g); + PERM_BIG_Q(m); + for (unsigned int u = 0; u < 16; u ++) + H[u] ^= g[u] ^ m[u]; + sph_u64 xH[16]; + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u]; + PERM_BIG_P(xH); + for (unsigned int u = 0; u < 16; u ++) + H[u] ^= xH[u]; + for (unsigned int u = 0; u < 8; u ++) + hash.h8[u] = DEC64E(H[u + 8]); // jh - { - sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); - sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); - sph_u64 tmp; - for(int i = 0; i < 2; i++) - { - if (i == 0) { - h0h ^= DEC64E(hash.h8[0]); - h0l ^= DEC64E(hash.h8[1]); - h1h ^= DEC64E(hash.h8[2]); - h1l ^= DEC64E(hash.h8[3]); - h2h ^= DEC64E(hash.h8[4]); - h2l ^= DEC64E(hash.h8[5]); - h3h ^= DEC64E(hash.h8[6]); - h3l ^= DEC64E(hash.h8[7]); - } else if(i == 1) { - h4h ^= DEC64E(hash.h8[0]); - h4l ^= DEC64E(hash.h8[1]); - h5h ^= DEC64E(hash.h8[2]); - h5l ^= DEC64E(hash.h8[3]); - h6h ^= 
DEC64E(hash.h8[4]); - h6l ^= DEC64E(hash.h8[5]); - h7h ^= DEC64E(hash.h8[6]); - h7l ^= DEC64E(hash.h8[7]); + sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); + sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); +// tmp; - h0h ^= 0x80; - h3l ^= 0x2000000000000; - } - E8; - } - h4h ^= 0x80; - h7l ^= 0x2000000000000; + for(int i = 0; i < 2; i++) + { + if (i == 0) { + h0h ^= DEC64E(hash.h8[0]); + h0l ^= DEC64E(hash.h8[1]); + h1h ^= DEC64E(hash.h8[2]); + h1l ^= DEC64E(hash.h8[3]); + h2h ^= DEC64E(hash.h8[4]); + h2l ^= DEC64E(hash.h8[5]); + h3h ^= DEC64E(hash.h8[6]); + h3l ^= DEC64E(hash.h8[7]); + } else if(i == 1) { + h4h ^= DEC64E(hash.h8[0]); + h4l ^= DEC64E(hash.h8[1]); + h5h ^= DEC64E(hash.h8[2]); + h5l ^= DEC64E(hash.h8[3]); + h6h ^= DEC64E(hash.h8[4]); + h6l ^= DEC64E(hash.h8[5]); + h7h ^= DEC64E(hash.h8[6]); + h7l ^= DEC64E(hash.h8[7]); - hash.h8[0] = DEC64E(h4h); - hash.h8[1] = DEC64E(h4l); - hash.h8[2] = DEC64E(h5h); - hash.h8[3] = DEC64E(h5l); - hash.h8[4] = DEC64E(h6h); - hash.h8[5] = DEC64E(h6l); - hash.h8[6] = DEC64E(h7h); - hash.h8[7] = DEC64E(h7l); + h0h ^= 0x80; + h3l ^= 0x2000000000000; + } + E8; } + h4h ^= 0x80; + h7l ^= 0x2000000000000; - if ((hash.h1[7] & 0x8) != 0) + hash.h8[0] = DEC64E(h4h); + hash.h8[1] = DEC64E(h4l); + hash.h8[2] = DEC64E(h5h); + hash.h8[3] = DEC64E(h5l); + hash.h8[4] = DEC64E(h6h); + hash.h8[5] = DEC64E(h6l); + hash.h8[6] = DEC64E(h7h); + hash.h8[7] = DEC64E(h7l); + + dec = ((hash.h1[7] & 0x8) != 0); { + // blake + sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); sph_u64 H2 = 
SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); @@ -412,141 +674,338 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp COMPRESS64; - hash.h8[0] = H0; - hash.h8[1] = H1; - hash.h8[2] = H2; - hash.h8[3] = H3; - hash.h8[4] = H4; - hash.h8[5] = H5; - hash.h8[6] = H6; - hash.h8[7] = H7; + hash.h8[0] = (dec ? H0 : hash.h8[0]); + hash.h8[1] = (dec ? H1 : hash.h8[1]); + hash.h8[2] = (dec ? H2 : hash.h8[2]); + hash.h8[3] = (dec ? H3 : hash.h8[3]); + hash.h8[4] = (dec ? H4 : hash.h8[4]); + hash.h8[5] = (dec ? H5 : hash.h8[5]); + hash.h8[6] = (dec ? H6 : hash.h8[6]); + hash.h8[7] = (dec ? H7 : hash.h8[7]); + } - else { + // bmw - sph_u64 BMW_H[16]; - for(unsigned u = 0; u < 16; u++) - BMW_H[u] = BMW_IV512[u]; - - sph_u64 BMW_h1[16], BMW_h2[16]; - sph_u64 mv[16]; - - mv[ 0] = SWAP8(hash.h8[0]); - mv[ 1] = SWAP8(hash.h8[1]); - mv[ 2] = SWAP8(hash.h8[2]); - mv[ 3] = SWAP8(hash.h8[3]); - mv[ 4] = SWAP8(hash.h8[4]); - mv[ 5] = SWAP8(hash.h8[5]); - mv[ 6] = SWAP8(hash.h8[6]); - mv[ 7] = SWAP8(hash.h8[7]); - mv[ 8] = 0x80; - mv[ 9] = 0; - mv[10] = 0; - mv[11] = 0; - mv[12] = 0; - mv[13] = 0; - mv[14] = 0; - mv[15] = 0x200; - #define M(x) (mv[x]) - #define H(x) (BMW_H[x]) - #define dH(x) (BMW_h2[x]) - - FOLDb; - - #undef M - #undef H - #undef dH - - #define M(x) (BMW_h2[x]) - #define H(x) (final_b[x]) - #define dH(x) (BMW_h1[x]) - - FOLDb; - - #undef M - #undef H - #undef dH - - hash.h8[0] = SWAP8(BMW_h1[8]); - hash.h8[1] = SWAP8(BMW_h1[9]); - hash.h8[2] = SWAP8(BMW_h1[10]); - hash.h8[3] = SWAP8(BMW_h1[11]); - hash.h8[4] = SWAP8(BMW_h1[12]); - hash.h8[5] = SWAP8(BMW_h1[13]); - hash.h8[6] = SWAP8(BMW_h1[14]); - hash.h8[7] = SWAP8(BMW_h1[15]); - } + sph_u64 BMW_H[16]; +#pragma unroll 16 + for(unsigned u = 0; u < 16; u++) + BMW_H[u] = BMW_IV512[u]; - // keccak + sph_u64 mv[16],q[32]; +// sph_u64 tmp; + + mv[ 0] = SWAP8(hash.h8[0]); + mv[ 1] = SWAP8(hash.h8[1]); + 
mv[ 2] = SWAP8(hash.h8[2]); + mv[ 3] = SWAP8(hash.h8[3]); + mv[ 4] = SWAP8(hash.h8[4]); + mv[ 5] = SWAP8(hash.h8[5]); + mv[ 6] = SWAP8(hash.h8[6]); + mv[ 7] = SWAP8(hash.h8[7]); + mv[ 8] = 0x80; + mv[ 9] = 0; + mv[10] = 0; + mv[11] = 0; + mv[12] = 0; + mv[13] = 0; + mv[14] = 0; + mv[15] = SPH_C64(512); + + tmp = (mv[ 5] ^ BMW_H[ 5]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[ 6] ^ BMW_H[ 6]) - (mv[ 8] ^ BMW_H[ 8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[ 0] ^ BMW_H[ 0]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 1] ^ BMW_H[ 1]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 2] ^ BMW_H[ 2]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 2] ^ BMW_H[ 2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[ 4] ^ BMW_H[ 4]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[ 2] ^ BMW_H[ 
2]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[ 6] ^ BMW_H[ 6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) + (mv[ 6] ^ BMW_H[ 6]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 9] ^ BMW_H[ 9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 3] ^ BMW_H[ 3]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[ 2] ^ BMW_H[ 2]) + (mv[ 4] ^ BMW_H[ 4]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) { - sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; - sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; - sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; - sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; - sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ 
SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } - a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); - a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); +#pragma unroll 4 + for(int i=2;i<6;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } +#pragma unroll 3 + for(int i=6;i<9;i++) { + q[i+16] = 
CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } +#pragma unroll 4 + for(int i=9;i<13;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } +#pragma unroll 3 + for(int i=13;i<16;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } - a00 ^= SWAP8(hash.h8[0]); - a10 ^= SWAP8(hash.h8[1]); - a20 ^= SWAP8(hash.h8[2]); - a30 ^= SWAP8(hash.h8[3]); - a40 ^= SWAP8(hash.h8[4]); - a01 ^= SWAP8(hash.h8[5]); - a11 ^= SWAP8(hash.h8[6]); - a21 ^= SWAP8(hash.h8[7]); - a31 ^= 0x8000000000000001; - KECCAK_F_1600; - // Finalize the "lane complement" - a10 = ~a10; - a20 = ~a20; +sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; +sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[ 0]) + ( XL64 ^ q[24] ^ q[ 0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[ 1]) + ( XL64 ^ q[25] ^ q[ 1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[ 2]) + ( XL64 ^ q[26] ^ q[ 2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[ 3]) + ( XL64 ^ q[27] ^ q[ 3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[ 4]) + ( XL64 ^ q[28] ^ q[ 4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[ 5]) + ( XL64 ^ q[29] ^ q[ 5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[ 6]) + ( XL64 ^ q[30] ^ q[ 6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[ 7]) + ( XL64 ^ q[31] ^ q[ 7]); - hash.h8[0] = SWAP8(a00); - hash.h8[1] = SWAP8(a10); - hash.h8[2] = SWAP8(a20); - hash.h8[3] = SWAP8(a30); - hash.h8[4] = SWAP8(a40); - hash.h8[5] = SWAP8(a01); - hash.h8[6] = SWAP8(a11); - hash.h8[7] = SWAP8(a21); + BMW_H[ 8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]); 
+ BMW_H[ 9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; } - // skein + tmp = (mv[ 5] ^ BMW_H[ 5]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[ 6] ^ BMW_H[ 6]) - (mv[ 8] ^ BMW_H[ 8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[ 0] ^ BMW_H[ 0]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 1] ^ BMW_H[ 1]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 2] ^ BMW_H[ 2]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 2] ^ BMW_H[ 2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ 
SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[ 4] ^ BMW_H[ 4]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[ 6] ^ BMW_H[ 6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) + (mv[ 6] ^ BMW_H[ 6]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 9] ^ BMW_H[ 9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 3] ^ BMW_H[ 3]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[ 2] ^ BMW_H[ 2]) + (mv[ 4] ^ BMW_H[ 4]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + 
(mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + + +#pragma unroll 2 + for(int i=0;i<2;i++) { - sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); - sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; - sph_u64 bcount = 0; + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], 
i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } +#pragma unroll 3 + for(int i=6;i<9;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } +#pragma unroll 4 + for(int i=9;i<13;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } +#pragma unroll 3 + for(int i=13;i<16;i++) { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; +XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[ 0]) + ( XL64 ^ q[24] ^ q[ 0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[ 1]) + ( XL64 ^ q[25] ^ q[ 1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[ 2]) + ( XL64 ^ q[26] ^ q[ 2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[ 3]) + ( XL64 ^ q[27] ^ q[ 3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[ 4]) + ( XL64 ^ q[28] ^ q[ 4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[ 5]) + ( XL64 ^ q[29] ^ q[ 5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[ 6]) + ( XL64 ^ q[30] ^ q[ 6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[ 7]) + ( XL64 ^ q[31] ^ q[ 7]); + + BMW_H[ 8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]); + BMW_H[ 9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = 
SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + + hash.h8[0] = (!dec ? SWAP8(BMW_H[8]) : hash.h8[0]); + hash.h8[1] = (!dec ? SWAP8(BMW_H[9]) : hash.h8[1]); + hash.h8[2] = (!dec ? SWAP8(BMW_H[10]) : hash.h8[2]); + hash.h8[3] = (!dec ? SWAP8(BMW_H[11]) : hash.h8[3]); + hash.h8[4] = (!dec ? SWAP8(BMW_H[12]) : hash.h8[4]); + hash.h8[5] = (!dec ? SWAP8(BMW_H[13]) : hash.h8[5]); + hash.h8[6] = (!dec ? SWAP8(BMW_H[14]) : hash.h8[6]); + hash.h8[7] = (!dec ? SWAP8(BMW_H[15]) : hash.h8[7]); - m0 = SWAP8(hash.h8[0]); - m1 = SWAP8(hash.h8[1]); - m2 = SWAP8(hash.h8[2]); - m3 = SWAP8(hash.h8[3]); - m4 = SWAP8(hash.h8[4]); - m5 = SWAP8(hash.h8[5]); - m6 = SWAP8(hash.h8[6]); - m7 = SWAP8(hash.h8[7]); - UBI_BIG(480, 64); - bcount = 0; - m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; - UBI_BIG(510, 8); - hash.h8[0] = SWAP8(h0); - hash.h8[1] = SWAP8(h1); - hash.h8[2] = SWAP8(h2); - hash.h8[3] = SWAP8(h3); - hash.h8[4] = SWAP8(h4); - hash.h8[5] = SWAP8(h5); - hash.h8[6] = SWAP8(h6); - hash.h8[7] = SWAP8(h7); } - if ((hash.h1[7] & 0x8) != 0) + // keccak + + sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; + sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; + sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; + sph_u64 a30 = 0, a31 = 0, a32 = 0, a33 = 0, a34 = 0; + sph_u64 a40 = 0, a41 = 0, a42 = 0, a43 = 0, a44 = 0; + + a10 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a20 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a31 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a22 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a23 = SPH_C64(0xFFFFFFFFFFFFFFFF); + a04 = SPH_C64(0xFFFFFFFFFFFFFFFF); + + a00 ^= 
SWAP8(hash.h8[0]); + a10 ^= SWAP8(hash.h8[1]); + a20 ^= SWAP8(hash.h8[2]); + a30 ^= SWAP8(hash.h8[3]); + a40 ^= SWAP8(hash.h8[4]); + a01 ^= SWAP8(hash.h8[5]); + a11 ^= SWAP8(hash.h8[6]); + a21 ^= SWAP8(hash.h8[7]); + a31 ^= 0x8000000000000001; + KECCAK_F_1600; + // Finalize the "lane complement" + a10 = ~a10; + a20 = ~a20; + + hash.h8[0] = SWAP8(a00); + hash.h8[1] = SWAP8(a10); + hash.h8[2] = SWAP8(a20); + hash.h8[3] = SWAP8(a30); + hash.h8[4] = SWAP8(a40); + hash.h8[5] = SWAP8(a01); + hash.h8[6] = SWAP8(a11); + hash.h8[7] = SWAP8(a21); + + // skein + + sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); + sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u64 bcount = 0; + + m0 = SWAP8(hash.h8[0]); + m1 = SWAP8(hash.h8[1]); + m2 = SWAP8(hash.h8[2]); + m3 = SWAP8(hash.h8[3]); + m4 = SWAP8(hash.h8[4]); + m5 = SWAP8(hash.h8[5]); + m6 = SWAP8(hash.h8[6]); + m7 = SWAP8(hash.h8[7]); + UBI_BIG(480, 64); + bcount = 0; + m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; + UBI_BIG(510, 8); + hash.h8[0] = SWAP8(h0); + hash.h8[1] = SWAP8(h1); + hash.h8[2] = SWAP8(h2); + hash.h8[3] = SWAP8(h3); + hash.h8[4] = SWAP8(h4); + hash.h8[5] = SWAP8(h5); + hash.h8[6] = SWAP8(h6); + hash.h8[7] = SWAP8(h7); + + dec = ((hash.h1[7] & 0x8) != 0); { + // keccak + sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; @@ -574,18 +1033,20 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp a10 = ~a10; a20 = ~a20; - hash.h8[0] = SWAP8(a00); - hash.h8[1] = SWAP8(a10); - hash.h8[2] = SWAP8(a20); - hash.h8[3] = SWAP8(a30); - hash.h8[4] = SWAP8(a40); - hash.h8[5] = SWAP8(a01); - hash.h8[6] = SWAP8(a11); - hash.h8[7] = SWAP8(a21); + hash.h8[0] = (dec ? 
SWAP8(a00) : hash.h8[0]); + hash.h8[1] = (dec ? SWAP8(a10) : hash.h8[1]); + hash.h8[2] = (dec ? SWAP8(a20) : hash.h8[2]); + hash.h8[3] = (dec ? SWAP8(a30) : hash.h8[3]); + hash.h8[4] = (dec ? SWAP8(a40) : hash.h8[4]); + hash.h8[5] = (dec ? SWAP8(a01) : hash.h8[5]); + hash.h8[6] = (dec ? SWAP8(a11) : hash.h8[6]); + hash.h8[7] = (dec ? SWAP8(a21) : hash.h8[7]); + } - else { + // jh + sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); sph_u64 tmp; @@ -619,14 +1080,15 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp h4h ^= 0x80; h7l ^= 0x2000000000000; - hash.h8[0] = DEC64E(h4h); - hash.h8[1] = DEC64E(h4l); - hash.h8[2] = DEC64E(h5h); - hash.h8[3] = DEC64E(h5l); - hash.h8[4] = DEC64E(h6h); - hash.h8[5] = DEC64E(h6l); - hash.h8[6] = DEC64E(h7h); - hash.h8[7] = DEC64E(h7l); + hash.h8[0] = (!dec ? DEC64E(h4h) : hash.h8[0]); + hash.h8[1] = (!dec ? DEC64E(h4l) : hash.h8[1]); + hash.h8[2] = (!dec ? DEC64E(h5h) : hash.h8[2]); + hash.h8[3] = (!dec ? DEC64E(h5l) : hash.h8[3]); + hash.h8[4] = (!dec ? DEC64E(h6h) : hash.h8[4]); + hash.h8[5] = (!dec ? DEC64E(h6l) : hash.h8[5]); + hash.h8[6] = (!dec ? DEC64E(h7h) : hash.h8[6]); + hash.h8[7] = (!dec ? 
DEC64E(h7l) : hash.h8[7]); + } bool result = (SWAP8(hash.h8[3]) <= target); @@ -634,4 +1096,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp output[output[0xFF]++] = SWAP4(gid); } -#endif // QUARKCOIN_CL +#endif // QUARKCOIN_CL \ No newline at end of file diff --git a/kernel/qubitcoin.cl b/kernel/qubitcoin.cl index cf3efb4b..8b65b7f7 100644 --- a/kernel/qubitcoin.cl +++ b/kernel/qubitcoin.cl @@ -4,7 +4,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including @@ -12,10 +12,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
@@ -54,13 +54,13 @@ typedef long sph_s64; #define SPH_64_TRUE 1 #define SPH_C32(x) ((sph_u32)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_C64(x) ((sph_u64)(x ## UL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ECHO_64 1 @@ -88,6 +88,14 @@ typedef long sph_s64; #define DEC32BE(x) SWAP4(*(const __global sph_u32 *) (x)); #endif +#define SHL(x, n) ((x) << (n)) +#define SHR(x, n) ((x) >> (n)) + +#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ + q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ + q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ + q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) { @@ -465,4 +473,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp } } -#endif // QUBITCOIN_CL +#endif // QUBITCOIN_CL \ No newline at end of file diff --git a/kernel/sifcoin.cl b/kernel/sifcoin.cl index 05efffac..8596f0a3 100644 --- a/kernel/sifcoin.cl +++ b/kernel/sifcoin.cl @@ -4,7 +4,7 @@ * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 phm - * + * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without 
restriction, including @@ -12,10 +12,10 @@ * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: - * + * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. @@ -54,13 +54,13 @@ typedef long sph_s64; #define SPH_64_TRUE 1 #define SPH_C32(x) ((sph_u32)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_C64(x) ((sph_u64)(x ## UL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ECHO_64 1 @@ -100,6 +100,14 @@ typedef long sph_s64; #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); #endif +#define SHL(x, n) ((x) << (n)) +#define SHR(x, n) ((x) >> (n)) + +#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ + q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ + q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ + q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) { @@ -164,8 +172,8 @@ __kernel void search(__global 
unsigned char* block, volatile __global uint* outp for(unsigned u = 0; u < 16; u++) BMW_H[u] = BMW_IV512[u]; - sph_u64 BMW_h1[16], BMW_h2[16]; - sph_u64 mv[16]; + sph_u64 mv[16],q[32]; + sph_u64 tmp; mv[ 0] = SWAP8(hash.h8[0]); mv[ 1] = SWAP8(hash.h8[1]); @@ -183,78 +191,330 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp mv[13] = 0; mv[14] = 0; mv[15] = 0x200; -#define M(x) (mv[x]) -#define H(x) (BMW_H[x]) -#define dH(x) (BMW_h2[x]) - - FOLDb; - -#undef M -#undef H -#undef dH - -#define M(x) (BMW_h2[x]) -#define H(x) (final_b[x]) -#define dH(x) (BMW_h1[x]) - - FOLDb; - -#undef M -#undef H -#undef dH - - hash.h8[0] = SWAP8(BMW_h1[8]); - hash.h8[1] = SWAP8(BMW_h1[9]); - hash.h8[2] = SWAP8(BMW_h1[10]); - hash.h8[3] = SWAP8(BMW_h1[11]); - hash.h8[4] = SWAP8(BMW_h1[12]); - hash.h8[5] = SWAP8(BMW_h1[13]); - hash.h8[6] = SWAP8(BMW_h1[14]); - hash.h8[7] = SWAP8(BMW_h1[15]); + + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ 
BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); + q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + 
(mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + 
q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ 
mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + +#pragma unroll 16 + for(int i=0;i<16;i++) + { + mv[i] = BMW_H[i]; + BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i; } + tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]); + q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1]; + tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]); + q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2]; + tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]); + q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]); + q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]); + q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6]; + tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]); + q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7]; + tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]); + q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8]; + tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]); 
+ q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9]; + tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]); + q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]); + q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11]; + tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]); + q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12]; + tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]); + q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13]; + tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]); + q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14]; + tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]); + q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15]; + tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); + q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; + +#pragma unroll 2 + for(int i=0;i<2;i++) + { + q[i+16] = + (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) + + (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) + + (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) + + (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) + + (SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) + + (SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 
19) ^ SPH_ROTL64(q[i+5], 53)) + + (SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) + + (SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) + + (SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) + + (SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) + + (SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) + + (SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) + + (SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) + + (SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) + + (SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) + + (SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=2;i<6;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); + } + +#pragma unroll 3 + for(int i=6;i<9;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); + } + +#pragma unroll 4 + for(int i=9;i<13;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + +#pragma unroll 3 + for(int i=13;i<16;i++) + { + q[i+16] = CONST_EXP2 + + (( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) + + SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); + } + + XL64 = 
q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; + + BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); + BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); + BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); + BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]); + BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]); + BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]); + BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]); + BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]); + + BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]); + BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]); + BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); + BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); + BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); + BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); + BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); + BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); + + hash.h8[0] = SWAP8(BMW_H[8]); + hash.h8[1] = SWAP8(BMW_H[9]); + hash.h8[2] = SWAP8(BMW_H[10]); + hash.h8[3] = SWAP8(BMW_H[11]); + hash.h8[4] = SWAP8(BMW_H[12]); + hash.h8[5] = SWAP8(BMW_H[13]); + hash.h8[6] = SWAP8(BMW_H[14]); + hash.h8[7] = SWAP8(BMW_H[15]); + } // groestl { - sph_u64 H[16]; - for (unsigned int u = 0; u < 15; u ++) - H[u] = 0; +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], 
T6_C[256], T7_C[256]; +#else + __local sph_u64 T0_C[256], T4_C[256]; +#endif + int init = get_local_id(0); + int step = get_local_size(0); + + for (int i = init; i < 256; i += step) + { + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif + } + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C + + + sph_u64 H[16]; +//#pragma unroll 15 + for (unsigned int u = 0; u < 15; u ++) + H[u] = 0; #if USE_LE - H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); + H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); #else - H[15] = (sph_u64)512; + H[15] = (sph_u64)512; #endif - sph_u64 g[16], m[16]; - m[0] = DEC64E(hash.h8[0]); - m[1] = DEC64E(hash.h8[1]); - m[2] = DEC64E(hash.h8[2]); - m[3] = DEC64E(hash.h8[3]); - m[4] = DEC64E(hash.h8[4]); - m[5] = DEC64E(hash.h8[5]); - m[6] = DEC64E(hash.h8[6]); - m[7] = DEC64E(hash.h8[7]); - for (unsigned int u = 0; u < 16; u ++) - g[u] = m[u] ^ H[u]; - m[8] = 0x80; g[8] = m[8] ^ H[8]; - m[9] = 0; g[9] = m[9] ^ H[9]; - m[10] = 0; g[10] = m[10] ^ H[10]; - m[11] = 0; g[11] = m[11] ^ H[11]; - m[12] = 0; g[12] = m[12] ^ H[12]; - m[13] = 0; g[13] = m[13] ^ H[13]; - m[14] = 0; g[14] = m[14] ^ H[14]; - m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; - PERM_BIG_P(g); - PERM_BIG_Q(m); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= g[u] ^ m[u]; - sph_u64 xH[16]; - for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u]; - PERM_BIG_P(xH); - for (unsigned int u = 0; u < 16; u ++) - H[u] ^= xH[u]; - for (unsigned int u = 0; u < 8; u ++) - hash.h8[u] = DEC64E(H[u + 8]); + sph_u64 g[16], m[16]; + m[0] = DEC64E(hash.h8[0]); + m[1] = DEC64E(hash.h8[1]); + m[2] = DEC64E(hash.h8[2]); + m[3] = DEC64E(hash.h8[3]); + m[4] = DEC64E(hash.h8[4]); + m[5] = 
DEC64E(hash.h8[5]); + m[6] = DEC64E(hash.h8[6]); + m[7] = DEC64E(hash.h8[7]); + +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + g[u] = m[u] ^ H[u]; + m[8] = 0x80; g[8] = m[8] ^ H[8]; + m[9] = 0; g[9] = m[9] ^ H[9]; + m[10] = 0; g[10] = m[10] ^ H[10]; + m[11] = 0; g[11] = m[11] ^ H[11]; + m[12] = 0; g[12] = m[12] ^ H[12]; + m[13] = 0; g[13] = m[13] ^ H[13]; + m[14] = 0; g[14] = m[14] ^ H[14]; + m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; + PERM_BIG_P(g); + PERM_BIG_Q(m); + +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + H[u] ^= g[u] ^ m[u]; + sph_u64 xH[16]; + +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u]; + PERM_BIG_P(xH); + +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + H[u] ^= xH[u]; + +//#pragma unroll 8 + for (unsigned int u = 0; u < 8; u ++) + hash.h8[u] = DEC64E(H[u + 8]); + barrier(CLK_GLOBAL_MEM_FENCE); } // jh @@ -374,4 +634,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp output[output[0xFF]++] = SWAP4(gid); } -#endif // SIFCOIN_CL +#endif // SIFCOIN_CL \ No newline at end of file diff --git a/kernel/talkcoin-mod.cl b/kernel/talkcoin-mod.cl index 05c18c13..990fcb55 100644 --- a/kernel/talkcoin-mod.cl +++ b/kernel/talkcoin-mod.cl @@ -68,10 +68,7 @@ typedef int sph_s32; #define SPH_JH_64 1 #define SPH_KECCAK_64 1 #define SPH_KECCAK_NOCOPY 0 - -#ifndef SPH_COMPACT_BLAKE_64 - #define SPH_COMPACT_BLAKE_64 0 -#endif +#define SPH_COMPACT_BLAKE_64 0 #ifndef SPH_KECCAK_UNROLL #define SPH_KECCAK_UNROLL 0 #endif @@ -162,44 +159,48 @@ __kernel void search1(__global hash_t* hashes) uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; +#else + __local 
sph_u64 T0_C[256], T4_C[256]; +#endif int init = get_local_id(0); int step = get_local_size(0); for (int i = init; i < 256; i += step) { - T0_L[i] = T0[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T4_L[i] = T4[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif } + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C - barrier(CLK_LOCAL_MEM_FENCE); - - #define T0 T0_L - #define T1 T1_L - #define T2 T2_L - #define T3 T3_L - #define T4 T4_L - #define T5 T5_L - #define T6 T6_L - #define T7 T7_L sph_u64 H[16]; - +//#pragma unroll 15 for (unsigned int u = 0; u < 15; u ++) - H[u] = 0; - - #if USE_LE + H[u] = 0; +#if USE_LE H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); - #else +#else H[15] = (sph_u64)512; - #endif +#endif sph_u64 g[16], m[16]; m[0] = DEC64E(hash->h8[0]); @@ -211,9 +212,9 @@ __kernel void search1(__global hash_t* hashes) m[6] = DEC64E(hash->h8[6]); m[7] = DEC64E(hash->h8[7]); +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) - g[u] = m[u] ^ H[u]; - + g[u] = m[u] ^ H[u]; m[8] = 0x80; g[8] = m[8] ^ H[8]; m[9] = 0; g[9] = m[9] ^ H[9]; m[10] = 0; g[10] = m[10] ^ H[10]; @@ -222,27 +223,28 @@ __kernel void search1(__global hash_t* hashes) m[13] = 0; g[13] = m[13] ^ H[13]; m[14] = 0; g[14] = m[14] ^ H[14]; m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; - PERM_BIG_P(g); PERM_BIG_Q(m); +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) - H[u] ^= g[u] ^ m[u]; - + H[u] ^= g[u] ^ m[u]; sph_u64 xH[16]; +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u]; - + xH[u] = H[u]; PERM_BIG_P(xH); +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) - H[u] ^= 
xH[u]; + H[u] ^= xH[u]; +//#pragma unroll 8 for (unsigned int u = 0; u < 8; u ++) - hash->h8[u] = DEC64E(H[u + 8]); + hash->h8[u] = DEC64E(H[u + 8]); + barrier(CLK_GLOBAL_MEM_FENCE); - barrier(CLK_GLOBAL_MEM_FENCE); } // jh @@ -398,4 +400,4 @@ __kernel void search4(__global hash_t* hashes, __global uint* output, const ulon output[atomic_inc(output+0xFF)] = SWAP4(gid); } -#endif // TALKCOIN_MOD_CL +#endif // TALKCOIN_MOD_CL \ No newline at end of file diff --git a/kernel/twecoin.cl b/kernel/twecoin.cl index d9c87107..60c6a605 100644 --- a/kernel/twecoin.cl +++ b/kernel/twecoin.cl @@ -20,13 +20,13 @@ typedef long sph_s64; #define SPH_64_TRUE 1 #define SPH_C32(x) ((sph_u32)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_C64(x) ((sph_u64)(x ## UL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_HAMSI_EXPAND_SMALL 1 @@ -47,6 +47,14 @@ typedef long sph_s64; #define sph_bswap32(x) SWAP4(x) +#define SHL(x, n) ((x) << (n)) /* plain left shift of x by n bits */ +#define SHR(x, n) ((x) >> (n)) /* plain right shift of x by n bits */ + +#define CONST_EXP2 (q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \ + q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \ + q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \ + q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])) /* sum of sixteen terms over q[i]..q[i+15]; needs an array q and an index i in scope where it is used; wrapped in parentheses so the expansion is always a single operand */ + static void sph_enc32be(void *dst, sph_u32 val) { #if defined SPH_UPTR @@ -425,4 +433,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp if (result) output[output[0xFF]++] = SWAP4(gid); } 
-} +} \ No newline at end of file diff --git a/kernel/whirlcoin.cl b/kernel/whirlcoin.cl new file mode 100644 index 00000000..275b5437 --- /dev/null +++ b/kernel/whirlcoin.cl @@ -0,0 +1,1358 @@ +/* + * whirlcoin kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 phm + * Copyright (c) 2014 djm34 + * Copyright (c) 2014 uray + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author djm34, uray + */ +#ifndef W_CL /* include guard for this kernel source */ +#define W_CL + +#if __ENDIAN_LITTLE__ /* pick the SPH endianness flag from the compiler-provided __ENDIAN_LITTLE__ macro */ +#define SPH_LITTLE_ENDIAN 1 +#else +#define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ /* branch used when __OPENCL_VERSION__ is not defined */ +typedef unsigned long long sph_u64 __attribute__ ((aligned (128))); +typedef long long sph_s64; +#else /* branch used when __OPENCL_VERSION__ is defined */ +typedef unsigned long sph_u64; +typedef long sph_s64; +#endif + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) /* pastes a U suffix onto the literal x and casts it to sph_u32 */ +#define SPH_C64(x) ((sph_u64)(x ## UL)) /* pastes a UL suffix onto the literal x and casts it to sph_u64 */ + +__constant static const sph_u64 old1_T0[256] __attribute__ ((aligned (128))) = { /* 256 constants of type sph_u64, declared __constant with a 128-byte alignment attribute; the old1_T1 and old1_T2 tables later in this file begin with these same values rotated left by 8 and 16 bits (checked on the leading entries only). NOTE(review): presumably Whirlpool round tables, going by the file name - confirm against the reference tables before editing any entry. */ + SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323), + SPH_C64(0xF9B87EF9C657C6C6), SPH_C64(0x6FFB136FE825E8E8), + SPH_C64(0xA1CB4CA187948787), SPH_C64(0x6211A962B8D5B8B8), + SPH_C64(0x0509080501030101), SPH_C64(0x6E0D426E4FD14F4F), + SPH_C64(0xEE9BADEE365A3636), SPH_C64(0x04FF5904A6F7A6A6), + SPH_C64(0xBD0CDEBDD26BD2D2), SPH_C64(0x060EFB06F502F5F5), + SPH_C64(0x8096EF80798B7979), SPH_C64(0xCE305FCE6FB16F6F), + SPH_C64(0xEF6DFCEF91AE9191), SPH_C64(0x07F8AA0752F65252), + SPH_C64(0xFD4727FD60A06060), SPH_C64(0x76358976BCD9BCBC), + SPH_C64(0xCD37ACCD9BB09B9B), SPH_C64(0x8C8A048C8E8F8E8E), + SPH_C64(0x15D27115A3F8A3A3), SPH_C64(0x3C6C603C0C140C0C), + SPH_C64(0x8A84FF8A7B8D7B7B), SPH_C64(0xE180B5E1355F3535), + SPH_C64(0x69F5E8691D271D1D), SPH_C64(0x47B35347E03DE0E0), + SPH_C64(0xAC21F6ACD764D7D7), SPH_C64(0xED9C5EEDC25BC2C2), + SPH_C64(0x96436D962E722E2E), SPH_C64(0x7A29627A4BDD4B4B), + SPH_C64(0x215DA321FE1FFEFE), SPH_C64(0x16D5821657F95757), + SPH_C64(0x41BDA841153F1515), SPH_C64(0xB6E89FB677997777), + SPH_C64(0xEB92A5EB37593737), SPH_C64(0x569E7B56E532E5E5), + SPH_C64(0xD9138CD99FBC9F9F), SPH_C64(0x1723D317F00DF0F0), + SPH_C64(0x7F206A7F4ADE4A4A), SPH_C64(0x95449E95DA73DADA), + SPH_C64(0x25A2FA2558E85858), SPH_C64(0xCACF06CAC946C9C9), + 
SPH_C64(0x8D7C558D297B2929), SPH_C64(0x225A50220A1E0A0A), + SPH_C64(0x4F50E14FB1CEB1B1), SPH_C64(0x1AC9691AA0FDA0A0), + SPH_C64(0xDA147FDA6BBD6B6B), SPH_C64(0xABD95CAB85928585), + SPH_C64(0x733C8173BDDABDBD), SPH_C64(0x348FD2345DE75D5D), + SPH_C64(0x5090805010301010), SPH_C64(0x0307F303F401F4F4), + SPH_C64(0xC0DD16C0CB40CBCB), SPH_C64(0xC6D3EDC63E423E3E), + SPH_C64(0x112D2811050F0505), SPH_C64(0xE6781FE667A96767), + SPH_C64(0x53977353E431E4E4), SPH_C64(0xBB0225BB27692727), + SPH_C64(0x5873325841C34141), SPH_C64(0x9DA72C9D8B808B8B), + SPH_C64(0x01F65101A7F4A7A7), SPH_C64(0x94B2CF947D877D7D), + SPH_C64(0xFB49DCFB95A29595), SPH_C64(0x9F568E9FD875D8D8), + SPH_C64(0x30708B30FB10FBFB), SPH_C64(0x71CD2371EE2FEEEE), + SPH_C64(0x91BBC7917C847C7C), SPH_C64(0xE37117E366AA6666), + SPH_C64(0x8E7BA68EDD7ADDDD), SPH_C64(0x4BAFB84B17391717), + SPH_C64(0x4645024647C94747), SPH_C64(0xDC1A84DC9EBF9E9E), + SPH_C64(0xC5D41EC5CA43CACA), SPH_C64(0x995875992D772D2D), + SPH_C64(0x792E9179BFDCBFBF), SPH_C64(0x1B3F381B07090707), + SPH_C64(0x23AC0123ADEAADAD), SPH_C64(0x2FB0EA2F5AEE5A5A), + SPH_C64(0xB5EF6CB583988383), SPH_C64(0xFFB685FF33553333), + SPH_C64(0xF25C3FF263A56363), SPH_C64(0x0A12100A02060202), + SPH_C64(0x38933938AAE3AAAA), SPH_C64(0xA8DEAFA871937171), + SPH_C64(0xCFC60ECFC845C8C8), SPH_C64(0x7DD1C87D192B1919), + SPH_C64(0x703B727049DB4949), SPH_C64(0x9A5F869AD976D9D9), + SPH_C64(0x1D31C31DF20BF2F2), SPH_C64(0x48A84B48E338E3E3), + SPH_C64(0x2AB9E22A5BED5B5B), SPH_C64(0x92BC349288858888), + SPH_C64(0xC83EA4C89AB39A9A), SPH_C64(0xBE0B2DBE266A2626), + SPH_C64(0xFABF8DFA32563232), SPH_C64(0x4A59E94AB0CDB0B0), + SPH_C64(0x6AF21B6AE926E9E9), SPH_C64(0x337778330F110F0F), + SPH_C64(0xA633E6A6D562D5D5), SPH_C64(0xBAF474BA809D8080), + SPH_C64(0x7C27997CBEDFBEBE), SPH_C64(0xDEEB26DECD4ACDCD), + SPH_C64(0xE489BDE4345C3434), SPH_C64(0x75327A7548D84848), + SPH_C64(0x2454AB24FF1CFFFF), SPH_C64(0x8F8DF78F7A8E7A7A), + SPH_C64(0xEA64F4EA90AD9090), SPH_C64(0x3E9DC23E5FE15F5F), + 
SPH_C64(0xA03D1DA020602020), SPH_C64(0xD50F67D568B86868), + SPH_C64(0x72CAD0721A2E1A1A), SPH_C64(0x2CB7192CAEEFAEAE), + SPH_C64(0x5E7DC95EB4C1B4B4), SPH_C64(0x19CE9A1954FC5454), + SPH_C64(0xE57FECE593A89393), SPH_C64(0xAA2F0DAA22662222), + SPH_C64(0xE96307E964AC6464), SPH_C64(0x122ADB12F10EF1F1), + SPH_C64(0xA2CCBFA273957373), SPH_C64(0x5A82905A12361212), + SPH_C64(0x5D7A3A5D40C04040), SPH_C64(0x2848402808180808), + SPH_C64(0xE89556E8C358C3C3), SPH_C64(0x7BDF337BEC29ECEC), + SPH_C64(0x904D9690DB70DBDB), SPH_C64(0x1FC0611FA1FEA1A1), + SPH_C64(0x83911C838D8A8D8D), SPH_C64(0xC9C8F5C93D473D3D), + SPH_C64(0xF15BCCF197A49797), SPH_C64(0x0000000000000000), + SPH_C64(0xD4F936D4CF4CCFCF), SPH_C64(0x876E45872B7D2B2B), + SPH_C64(0xB3E197B3769A7676), SPH_C64(0xB0E664B0829B8282), + SPH_C64(0xA928FEA9D667D6D6), SPH_C64(0x77C3D8771B2D1B1B), + SPH_C64(0x5B74C15BB5C2B5B5), SPH_C64(0x29BE1129AFECAFAF), + SPH_C64(0xDF1D77DF6ABE6A6A), SPH_C64(0x0DEABA0D50F05050), + SPH_C64(0x4C57124C45CF4545), SPH_C64(0x1838CB18F308F3F3), + SPH_C64(0xF0AD9DF030503030), SPH_C64(0x74C42B74EF2CEFEF), + SPH_C64(0xC3DAE5C33F413F3F), SPH_C64(0x1CC7921C55FF5555), + SPH_C64(0x10DB7910A2FBA2A2), SPH_C64(0x65E90365EA23EAEA), + SPH_C64(0xEC6A0FEC65AF6565), SPH_C64(0x6803B968BAD3BABA), + SPH_C64(0x934A65932F712F2F), SPH_C64(0xE78E4EE7C05DC0C0), + SPH_C64(0x8160BE81DE7FDEDE), SPH_C64(0x6CFCE06C1C241C1C), + SPH_C64(0x2E46BB2EFD1AFDFD), SPH_C64(0x641F52644DD74D4D), + SPH_C64(0xE076E4E092AB9292), SPH_C64(0xBCFA8FBC759F7575), + SPH_C64(0x1E36301E060A0606), SPH_C64(0x98AE24988A838A8A), + SPH_C64(0x404BF940B2CBB2B2), SPH_C64(0x59856359E637E6E6), + SPH_C64(0x367E70360E120E0E), SPH_C64(0x63E7F8631F211F1F), + SPH_C64(0xF75537F762A66262), SPH_C64(0xA33AEEA3D461D4D4), + SPH_C64(0x32812932A8E5A8A8), SPH_C64(0xF452C4F496A79696), + SPH_C64(0x3A629B3AF916F9F9), SPH_C64(0xF6A366F6C552C5C5), + SPH_C64(0xB11035B1256F2525), SPH_C64(0x20ABF22059EB5959), + SPH_C64(0xAED054AE84918484), SPH_C64(0xA7C5B7A772967272), + 
SPH_C64(0xDDECD5DD394B3939), SPH_C64(0x61165A614CD44C4C), + SPH_C64(0x3B94CA3B5EE25E5E), SPH_C64(0x859FE78578887878), + SPH_C64(0xD8E5DDD838483838), SPH_C64(0x869814868C898C8C), + SPH_C64(0xB217C6B2D16ED1D1), SPH_C64(0x0BE4410BA5F2A5A5), + SPH_C64(0x4DA1434DE23BE2E2), SPH_C64(0xF84E2FF861A36161), + SPH_C64(0x4542F145B3C8B3B3), SPH_C64(0xA53415A521632121), + SPH_C64(0xD60894D69CB99C9C), SPH_C64(0x66EEF0661E221E1E), + SPH_C64(0x5261225243C54343), SPH_C64(0xFCB176FCC754C7C7), + SPH_C64(0x2B4FB32BFC19FCFC), SPH_C64(0x14242014040C0404), + SPH_C64(0x08E3B20851F35151), SPH_C64(0xC725BCC799B69999), + SPH_C64(0xC4224FC46DB76D6D), SPH_C64(0x396568390D170D0D), + SPH_C64(0x35798335FA13FAFA), SPH_C64(0x8469B684DF7CDFDF), + SPH_C64(0x9BA9D79B7E827E7E), SPH_C64(0xB4193DB4246C2424), + SPH_C64(0xD7FEC5D73B4D3B3B), SPH_C64(0x3D9A313DABE0ABAB), + SPH_C64(0xD1F03ED1CE4FCECE), SPH_C64(0x5599885511331111), + SPH_C64(0x89830C898F8C8F8F), SPH_C64(0x6B044A6B4ED24E4E), + SPH_C64(0x5166D151B7C4B7B7), SPH_C64(0x60E00B60EB20EBEB), + SPH_C64(0xCCC1FDCC3C443C3C), SPH_C64(0xBFFD7CBF819E8181), + SPH_C64(0xFE40D4FE94A19494), SPH_C64(0x0C1CEB0CF704F7F7), + SPH_C64(0x6718A167B9D6B9B9), SPH_C64(0x5F8B985F13351313), + SPH_C64(0x9C517D9C2C742C2C), SPH_C64(0xB805D6B8D368D3D3), + SPH_C64(0x5C8C6B5CE734E7E7), SPH_C64(0xCB3957CB6EB26E6E), + SPH_C64(0xF3AA6EF3C451C4C4), SPH_C64(0x0F1B180F03050303), + SPH_C64(0x13DC8A1356FA5656), SPH_C64(0x495E1A4944CC4444), + SPH_C64(0x9EA0DF9E7F817F7F), SPH_C64(0x37882137A9E6A9A9), + SPH_C64(0x82674D822A7E2A2A), SPH_C64(0x6D0AB16DBBD0BBBB), + SPH_C64(0xE28746E2C15EC1C1), SPH_C64(0x02F1A20253F55353), + SPH_C64(0x8B72AE8BDC79DCDC), SPH_C64(0x275358270B1D0B0B), + SPH_C64(0xD3019CD39DBA9D9D), SPH_C64(0xC12B47C16CB46C6C), + SPH_C64(0xF5A495F531533131), SPH_C64(0xB9F387B9749C7474), + SPH_C64(0x0915E309F607F6F6), SPH_C64(0x434C0A4346CA4646), + SPH_C64(0x26A50926ACE9ACAC), SPH_C64(0x97B53C9789868989), + SPH_C64(0x44B4A044143C1414), SPH_C64(0x42BA5B42E13EE1E1), + 
SPH_C64(0x4EA6B04E163A1616), SPH_C64(0xD2F7CDD23A4E3A3A), + SPH_C64(0xD0066FD069BB6969), SPH_C64(0x2D41482D091B0909), + SPH_C64(0xADD7A7AD70907070), SPH_C64(0x546FD954B6C7B6B6), + SPH_C64(0xB71ECEB7D06DD0D0), SPH_C64(0x7ED63B7EED2AEDED), + SPH_C64(0xDBE22EDBCC49CCCC), SPH_C64(0x57682A5742C64242), + SPH_C64(0xC22CB4C298B59898), SPH_C64(0x0EED490EA4F1A4A4), + SPH_C64(0x88755D8828782828), SPH_C64(0x3186DA315CE45C5C), + SPH_C64(0x3F6B933FF815F8F8), SPH_C64(0xA4C244A486978686) +}; + +__constant static const sph_u64 old1_T1[256] __attribute__ ((aligned (128))) = { + SPH_C64(0xD8C0781828181878), SPH_C64(0x2605AF23652323AF), + SPH_C64(0xB87EF9C657C6C6F9), SPH_C64(0xFB136FE825E8E86F), + SPH_C64(0xCB4CA187948787A1), SPH_C64(0x11A962B8D5B8B862), + SPH_C64(0x0908050103010105), SPH_C64(0x0D426E4FD14F4F6E), + SPH_C64(0x9BADEE365A3636EE), SPH_C64(0xFF5904A6F7A6A604), + SPH_C64(0x0CDEBDD26BD2D2BD), SPH_C64(0x0EFB06F502F5F506), + SPH_C64(0x96EF80798B797980), SPH_C64(0x305FCE6FB16F6FCE), + SPH_C64(0x6DFCEF91AE9191EF), SPH_C64(0xF8AA0752F6525207), + SPH_C64(0x4727FD60A06060FD), SPH_C64(0x358976BCD9BCBC76), + SPH_C64(0x37ACCD9BB09B9BCD), SPH_C64(0x8A048C8E8F8E8E8C), + SPH_C64(0xD27115A3F8A3A315), SPH_C64(0x6C603C0C140C0C3C), + SPH_C64(0x84FF8A7B8D7B7B8A), SPH_C64(0x80B5E1355F3535E1), + SPH_C64(0xF5E8691D271D1D69), SPH_C64(0xB35347E03DE0E047), + SPH_C64(0x21F6ACD764D7D7AC), SPH_C64(0x9C5EEDC25BC2C2ED), + SPH_C64(0x436D962E722E2E96), SPH_C64(0x29627A4BDD4B4B7A), + SPH_C64(0x5DA321FE1FFEFE21), SPH_C64(0xD5821657F9575716), + SPH_C64(0xBDA841153F151541), SPH_C64(0xE89FB677997777B6), + SPH_C64(0x92A5EB37593737EB), SPH_C64(0x9E7B56E532E5E556), + SPH_C64(0x138CD99FBC9F9FD9), SPH_C64(0x23D317F00DF0F017), + SPH_C64(0x206A7F4ADE4A4A7F), SPH_C64(0x449E95DA73DADA95), + SPH_C64(0xA2FA2558E8585825), SPH_C64(0xCF06CAC946C9C9CA), + SPH_C64(0x7C558D297B29298D), SPH_C64(0x5A50220A1E0A0A22), + SPH_C64(0x50E14FB1CEB1B14F), SPH_C64(0xC9691AA0FDA0A01A), + SPH_C64(0x147FDA6BBD6B6BDA), 
SPH_C64(0xD95CAB85928585AB), + SPH_C64(0x3C8173BDDABDBD73), SPH_C64(0x8FD2345DE75D5D34), + SPH_C64(0x9080501030101050), SPH_C64(0x07F303F401F4F403), + SPH_C64(0xDD16C0CB40CBCBC0), SPH_C64(0xD3EDC63E423E3EC6), + SPH_C64(0x2D2811050F050511), SPH_C64(0x781FE667A96767E6), + SPH_C64(0x977353E431E4E453), SPH_C64(0x0225BB27692727BB), + SPH_C64(0x73325841C3414158), SPH_C64(0xA72C9D8B808B8B9D), + SPH_C64(0xF65101A7F4A7A701), SPH_C64(0xB2CF947D877D7D94), + SPH_C64(0x49DCFB95A29595FB), SPH_C64(0x568E9FD875D8D89F), + SPH_C64(0x708B30FB10FBFB30), SPH_C64(0xCD2371EE2FEEEE71), + SPH_C64(0xBBC7917C847C7C91), SPH_C64(0x7117E366AA6666E3), + SPH_C64(0x7BA68EDD7ADDDD8E), SPH_C64(0xAFB84B173917174B), + SPH_C64(0x45024647C9474746), SPH_C64(0x1A84DC9EBF9E9EDC), + SPH_C64(0xD41EC5CA43CACAC5), SPH_C64(0x5875992D772D2D99), + SPH_C64(0x2E9179BFDCBFBF79), SPH_C64(0x3F381B070907071B), + SPH_C64(0xAC0123ADEAADAD23), SPH_C64(0xB0EA2F5AEE5A5A2F), + SPH_C64(0xEF6CB583988383B5), SPH_C64(0xB685FF33553333FF), + SPH_C64(0x5C3FF263A56363F2), SPH_C64(0x12100A020602020A), + SPH_C64(0x933938AAE3AAAA38), SPH_C64(0xDEAFA871937171A8), + SPH_C64(0xC60ECFC845C8C8CF), SPH_C64(0xD1C87D192B19197D), + SPH_C64(0x3B727049DB494970), SPH_C64(0x5F869AD976D9D99A), + SPH_C64(0x31C31DF20BF2F21D), SPH_C64(0xA84B48E338E3E348), + SPH_C64(0xB9E22A5BED5B5B2A), SPH_C64(0xBC34928885888892), + SPH_C64(0x3EA4C89AB39A9AC8), SPH_C64(0x0B2DBE266A2626BE), + SPH_C64(0xBF8DFA32563232FA), SPH_C64(0x59E94AB0CDB0B04A), + SPH_C64(0xF21B6AE926E9E96A), SPH_C64(0x7778330F110F0F33), + SPH_C64(0x33E6A6D562D5D5A6), SPH_C64(0xF474BA809D8080BA), + SPH_C64(0x27997CBEDFBEBE7C), SPH_C64(0xEB26DECD4ACDCDDE), + SPH_C64(0x89BDE4345C3434E4), SPH_C64(0x327A7548D8484875), + SPH_C64(0x54AB24FF1CFFFF24), SPH_C64(0x8DF78F7A8E7A7A8F), + SPH_C64(0x64F4EA90AD9090EA), SPH_C64(0x9DC23E5FE15F5F3E), + SPH_C64(0x3D1DA020602020A0), SPH_C64(0x0F67D568B86868D5), + SPH_C64(0xCAD0721A2E1A1A72), SPH_C64(0xB7192CAEEFAEAE2C), + SPH_C64(0x7DC95EB4C1B4B45E), 
SPH_C64(0xCE9A1954FC545419), + SPH_C64(0x7FECE593A89393E5), SPH_C64(0x2F0DAA22662222AA), + SPH_C64(0x6307E964AC6464E9), SPH_C64(0x2ADB12F10EF1F112), + SPH_C64(0xCCBFA273957373A2), SPH_C64(0x82905A123612125A), + SPH_C64(0x7A3A5D40C040405D), SPH_C64(0x4840280818080828), + SPH_C64(0x9556E8C358C3C3E8), SPH_C64(0xDF337BEC29ECEC7B), + SPH_C64(0x4D9690DB70DBDB90), SPH_C64(0xC0611FA1FEA1A11F), + SPH_C64(0x911C838D8A8D8D83), SPH_C64(0xC8F5C93D473D3DC9), + SPH_C64(0x5BCCF197A49797F1), SPH_C64(0x0000000000000000), + SPH_C64(0xF936D4CF4CCFCFD4), SPH_C64(0x6E45872B7D2B2B87), + SPH_C64(0xE197B3769A7676B3), SPH_C64(0xE664B0829B8282B0), + SPH_C64(0x28FEA9D667D6D6A9), SPH_C64(0xC3D8771B2D1B1B77), + SPH_C64(0x74C15BB5C2B5B55B), SPH_C64(0xBE1129AFECAFAF29), + SPH_C64(0x1D77DF6ABE6A6ADF), SPH_C64(0xEABA0D50F050500D), + SPH_C64(0x57124C45CF45454C), SPH_C64(0x38CB18F308F3F318), + SPH_C64(0xAD9DF030503030F0), SPH_C64(0xC42B74EF2CEFEF74), + SPH_C64(0xDAE5C33F413F3FC3), SPH_C64(0xC7921C55FF55551C), + SPH_C64(0xDB7910A2FBA2A210), SPH_C64(0xE90365EA23EAEA65), + SPH_C64(0x6A0FEC65AF6565EC), SPH_C64(0x03B968BAD3BABA68), + SPH_C64(0x4A65932F712F2F93), SPH_C64(0x8E4EE7C05DC0C0E7), + SPH_C64(0x60BE81DE7FDEDE81), SPH_C64(0xFCE06C1C241C1C6C), + SPH_C64(0x46BB2EFD1AFDFD2E), SPH_C64(0x1F52644DD74D4D64), + SPH_C64(0x76E4E092AB9292E0), SPH_C64(0xFA8FBC759F7575BC), + SPH_C64(0x36301E060A06061E), SPH_C64(0xAE24988A838A8A98), + SPH_C64(0x4BF940B2CBB2B240), SPH_C64(0x856359E637E6E659), + SPH_C64(0x7E70360E120E0E36), SPH_C64(0xE7F8631F211F1F63), + SPH_C64(0x5537F762A66262F7), SPH_C64(0x3AEEA3D461D4D4A3), + SPH_C64(0x812932A8E5A8A832), SPH_C64(0x52C4F496A79696F4), + SPH_C64(0x629B3AF916F9F93A), SPH_C64(0xA366F6C552C5C5F6), + SPH_C64(0x1035B1256F2525B1), SPH_C64(0xABF22059EB595920), + SPH_C64(0xD054AE84918484AE), SPH_C64(0xC5B7A772967272A7), + SPH_C64(0xECD5DD394B3939DD), SPH_C64(0x165A614CD44C4C61), + SPH_C64(0x94CA3B5EE25E5E3B), SPH_C64(0x9FE7857888787885), + SPH_C64(0xE5DDD838483838D8), 
SPH_C64(0x9814868C898C8C86), + SPH_C64(0x17C6B2D16ED1D1B2), SPH_C64(0xE4410BA5F2A5A50B), + SPH_C64(0xA1434DE23BE2E24D), SPH_C64(0x4E2FF861A36161F8), + SPH_C64(0x42F145B3C8B3B345), SPH_C64(0x3415A521632121A5), + SPH_C64(0x0894D69CB99C9CD6), SPH_C64(0xEEF0661E221E1E66), + SPH_C64(0x61225243C5434352), SPH_C64(0xB176FCC754C7C7FC), + SPH_C64(0x4FB32BFC19FCFC2B), SPH_C64(0x242014040C040414), + SPH_C64(0xE3B20851F3515108), SPH_C64(0x25BCC799B69999C7), + SPH_C64(0x224FC46DB76D6DC4), SPH_C64(0x6568390D170D0D39), + SPH_C64(0x798335FA13FAFA35), SPH_C64(0x69B684DF7CDFDF84), + SPH_C64(0xA9D79B7E827E7E9B), SPH_C64(0x193DB4246C2424B4), + SPH_C64(0xFEC5D73B4D3B3BD7), SPH_C64(0x9A313DABE0ABAB3D), + SPH_C64(0xF03ED1CE4FCECED1), SPH_C64(0x9988551133111155), + SPH_C64(0x830C898F8C8F8F89), SPH_C64(0x044A6B4ED24E4E6B), + SPH_C64(0x66D151B7C4B7B751), SPH_C64(0xE00B60EB20EBEB60), + SPH_C64(0xC1FDCC3C443C3CCC), SPH_C64(0xFD7CBF819E8181BF), + SPH_C64(0x40D4FE94A19494FE), SPH_C64(0x1CEB0CF704F7F70C), + SPH_C64(0x18A167B9D6B9B967), SPH_C64(0x8B985F133513135F), + SPH_C64(0x517D9C2C742C2C9C), SPH_C64(0x05D6B8D368D3D3B8), + SPH_C64(0x8C6B5CE734E7E75C), SPH_C64(0x3957CB6EB26E6ECB), + SPH_C64(0xAA6EF3C451C4C4F3), SPH_C64(0x1B180F030503030F), + SPH_C64(0xDC8A1356FA565613), SPH_C64(0x5E1A4944CC444449), + SPH_C64(0xA0DF9E7F817F7F9E), SPH_C64(0x882137A9E6A9A937), + SPH_C64(0x674D822A7E2A2A82), SPH_C64(0x0AB16DBBD0BBBB6D), + SPH_C64(0x8746E2C15EC1C1E2), SPH_C64(0xF1A20253F5535302), + SPH_C64(0x72AE8BDC79DCDC8B), SPH_C64(0x5358270B1D0B0B27), + SPH_C64(0x019CD39DBA9D9DD3), SPH_C64(0x2B47C16CB46C6CC1), + SPH_C64(0xA495F531533131F5), SPH_C64(0xF387B9749C7474B9), + SPH_C64(0x15E309F607F6F609), SPH_C64(0x4C0A4346CA464643), + SPH_C64(0xA50926ACE9ACAC26), SPH_C64(0xB53C978986898997), + SPH_C64(0xB4A044143C141444), SPH_C64(0xBA5B42E13EE1E142), + SPH_C64(0xA6B04E163A16164E), SPH_C64(0xF7CDD23A4E3A3AD2), + SPH_C64(0x066FD069BB6969D0), SPH_C64(0x41482D091B09092D), + SPH_C64(0xD7A7AD70907070AD), 
SPH_C64(0x6FD954B6C7B6B654), + SPH_C64(0x1ECEB7D06DD0D0B7), SPH_C64(0xD63B7EED2AEDED7E), + SPH_C64(0xE22EDBCC49CCCCDB), SPH_C64(0x682A5742C6424257), + SPH_C64(0x2CB4C298B59898C2), SPH_C64(0xED490EA4F1A4A40E), + SPH_C64(0x755D882878282888), SPH_C64(0x86DA315CE45C5C31), + SPH_C64(0x6B933FF815F8F83F), SPH_C64(0xC244A486978686A4) +}; + +__constant static const sph_u64 old1_T2[256] __attribute__ ((aligned (128))) = { + SPH_C64(0xC0781828181878D8), SPH_C64(0x05AF23652323AF26), + SPH_C64(0x7EF9C657C6C6F9B8), SPH_C64(0x136FE825E8E86FFB), + SPH_C64(0x4CA187948787A1CB), SPH_C64(0xA962B8D5B8B86211), + SPH_C64(0x0805010301010509), SPH_C64(0x426E4FD14F4F6E0D), + SPH_C64(0xADEE365A3636EE9B), SPH_C64(0x5904A6F7A6A604FF), + SPH_C64(0xDEBDD26BD2D2BD0C), SPH_C64(0xFB06F502F5F5060E), + SPH_C64(0xEF80798B79798096), SPH_C64(0x5FCE6FB16F6FCE30), + SPH_C64(0xFCEF91AE9191EF6D), SPH_C64(0xAA0752F6525207F8), + SPH_C64(0x27FD60A06060FD47), SPH_C64(0x8976BCD9BCBC7635), + SPH_C64(0xACCD9BB09B9BCD37), SPH_C64(0x048C8E8F8E8E8C8A), + SPH_C64(0x7115A3F8A3A315D2), SPH_C64(0x603C0C140C0C3C6C), + SPH_C64(0xFF8A7B8D7B7B8A84), SPH_C64(0xB5E1355F3535E180), + SPH_C64(0xE8691D271D1D69F5), SPH_C64(0x5347E03DE0E047B3), + SPH_C64(0xF6ACD764D7D7AC21), SPH_C64(0x5EEDC25BC2C2ED9C), + SPH_C64(0x6D962E722E2E9643), SPH_C64(0x627A4BDD4B4B7A29), + SPH_C64(0xA321FE1FFEFE215D), SPH_C64(0x821657F9575716D5), + SPH_C64(0xA841153F151541BD), SPH_C64(0x9FB677997777B6E8), + SPH_C64(0xA5EB37593737EB92), SPH_C64(0x7B56E532E5E5569E), + SPH_C64(0x8CD99FBC9F9FD913), SPH_C64(0xD317F00DF0F01723), + SPH_C64(0x6A7F4ADE4A4A7F20), SPH_C64(0x9E95DA73DADA9544), + SPH_C64(0xFA2558E8585825A2), SPH_C64(0x06CAC946C9C9CACF), + SPH_C64(0x558D297B29298D7C), SPH_C64(0x50220A1E0A0A225A), + SPH_C64(0xE14FB1CEB1B14F50), SPH_C64(0x691AA0FDA0A01AC9), + SPH_C64(0x7FDA6BBD6B6BDA14), SPH_C64(0x5CAB85928585ABD9), + SPH_C64(0x8173BDDABDBD733C), SPH_C64(0xD2345DE75D5D348F), + SPH_C64(0x8050103010105090), SPH_C64(0xF303F401F4F40307), + 
SPH_C64(0x16C0CB40CBCBC0DD), SPH_C64(0xEDC63E423E3EC6D3), + SPH_C64(0x2811050F0505112D), SPH_C64(0x1FE667A96767E678), + SPH_C64(0x7353E431E4E45397), SPH_C64(0x25BB27692727BB02), + SPH_C64(0x325841C341415873), SPH_C64(0x2C9D8B808B8B9DA7), + SPH_C64(0x5101A7F4A7A701F6), SPH_C64(0xCF947D877D7D94B2), + SPH_C64(0xDCFB95A29595FB49), SPH_C64(0x8E9FD875D8D89F56), + SPH_C64(0x8B30FB10FBFB3070), SPH_C64(0x2371EE2FEEEE71CD), + SPH_C64(0xC7917C847C7C91BB), SPH_C64(0x17E366AA6666E371), + SPH_C64(0xA68EDD7ADDDD8E7B), SPH_C64(0xB84B173917174BAF), + SPH_C64(0x024647C947474645), SPH_C64(0x84DC9EBF9E9EDC1A), + SPH_C64(0x1EC5CA43CACAC5D4), SPH_C64(0x75992D772D2D9958), + SPH_C64(0x9179BFDCBFBF792E), SPH_C64(0x381B070907071B3F), + SPH_C64(0x0123ADEAADAD23AC), SPH_C64(0xEA2F5AEE5A5A2FB0), + SPH_C64(0x6CB583988383B5EF), SPH_C64(0x85FF33553333FFB6), + SPH_C64(0x3FF263A56363F25C), SPH_C64(0x100A020602020A12), + SPH_C64(0x3938AAE3AAAA3893), SPH_C64(0xAFA871937171A8DE), + SPH_C64(0x0ECFC845C8C8CFC6), SPH_C64(0xC87D192B19197DD1), + SPH_C64(0x727049DB4949703B), SPH_C64(0x869AD976D9D99A5F), + SPH_C64(0xC31DF20BF2F21D31), SPH_C64(0x4B48E338E3E348A8), + SPH_C64(0xE22A5BED5B5B2AB9), SPH_C64(0x34928885888892BC), + SPH_C64(0xA4C89AB39A9AC83E), SPH_C64(0x2DBE266A2626BE0B), + SPH_C64(0x8DFA32563232FABF), SPH_C64(0xE94AB0CDB0B04A59), + SPH_C64(0x1B6AE926E9E96AF2), SPH_C64(0x78330F110F0F3377), + SPH_C64(0xE6A6D562D5D5A633), SPH_C64(0x74BA809D8080BAF4), + SPH_C64(0x997CBEDFBEBE7C27), SPH_C64(0x26DECD4ACDCDDEEB), + SPH_C64(0xBDE4345C3434E489), SPH_C64(0x7A7548D848487532), + SPH_C64(0xAB24FF1CFFFF2454), SPH_C64(0xF78F7A8E7A7A8F8D), + SPH_C64(0xF4EA90AD9090EA64), SPH_C64(0xC23E5FE15F5F3E9D), + SPH_C64(0x1DA020602020A03D), SPH_C64(0x67D568B86868D50F), + SPH_C64(0xD0721A2E1A1A72CA), SPH_C64(0x192CAEEFAEAE2CB7), + SPH_C64(0xC95EB4C1B4B45E7D), SPH_C64(0x9A1954FC545419CE), + SPH_C64(0xECE593A89393E57F), SPH_C64(0x0DAA22662222AA2F), + SPH_C64(0x07E964AC6464E963), SPH_C64(0xDB12F10EF1F1122A), + 
SPH_C64(0xBFA273957373A2CC), SPH_C64(0x905A123612125A82), + SPH_C64(0x3A5D40C040405D7A), SPH_C64(0x4028081808082848), + SPH_C64(0x56E8C358C3C3E895), SPH_C64(0x337BEC29ECEC7BDF), + SPH_C64(0x9690DB70DBDB904D), SPH_C64(0x611FA1FEA1A11FC0), + SPH_C64(0x1C838D8A8D8D8391), SPH_C64(0xF5C93D473D3DC9C8), + SPH_C64(0xCCF197A49797F15B), SPH_C64(0x0000000000000000), + SPH_C64(0x36D4CF4CCFCFD4F9), SPH_C64(0x45872B7D2B2B876E), + SPH_C64(0x97B3769A7676B3E1), SPH_C64(0x64B0829B8282B0E6), + SPH_C64(0xFEA9D667D6D6A928), SPH_C64(0xD8771B2D1B1B77C3), + SPH_C64(0xC15BB5C2B5B55B74), SPH_C64(0x1129AFECAFAF29BE), + SPH_C64(0x77DF6ABE6A6ADF1D), SPH_C64(0xBA0D50F050500DEA), + SPH_C64(0x124C45CF45454C57), SPH_C64(0xCB18F308F3F31838), + SPH_C64(0x9DF030503030F0AD), SPH_C64(0x2B74EF2CEFEF74C4), + SPH_C64(0xE5C33F413F3FC3DA), SPH_C64(0x921C55FF55551CC7), + SPH_C64(0x7910A2FBA2A210DB), SPH_C64(0x0365EA23EAEA65E9), + SPH_C64(0x0FEC65AF6565EC6A), SPH_C64(0xB968BAD3BABA6803), + SPH_C64(0x65932F712F2F934A), SPH_C64(0x4EE7C05DC0C0E78E), + SPH_C64(0xBE81DE7FDEDE8160), SPH_C64(0xE06C1C241C1C6CFC), + SPH_C64(0xBB2EFD1AFDFD2E46), SPH_C64(0x52644DD74D4D641F), + SPH_C64(0xE4E092AB9292E076), SPH_C64(0x8FBC759F7575BCFA), + SPH_C64(0x301E060A06061E36), SPH_C64(0x24988A838A8A98AE), + SPH_C64(0xF940B2CBB2B2404B), SPH_C64(0x6359E637E6E65985), + SPH_C64(0x70360E120E0E367E), SPH_C64(0xF8631F211F1F63E7), + SPH_C64(0x37F762A66262F755), SPH_C64(0xEEA3D461D4D4A33A), + SPH_C64(0x2932A8E5A8A83281), SPH_C64(0xC4F496A79696F452), + SPH_C64(0x9B3AF916F9F93A62), SPH_C64(0x66F6C552C5C5F6A3), + SPH_C64(0x35B1256F2525B110), SPH_C64(0xF22059EB595920AB), + SPH_C64(0x54AE84918484AED0), SPH_C64(0xB7A772967272A7C5), + SPH_C64(0xD5DD394B3939DDEC), SPH_C64(0x5A614CD44C4C6116), + SPH_C64(0xCA3B5EE25E5E3B94), SPH_C64(0xE78578887878859F), + SPH_C64(0xDDD838483838D8E5), SPH_C64(0x14868C898C8C8698), + SPH_C64(0xC6B2D16ED1D1B217), SPH_C64(0x410BA5F2A5A50BE4), + SPH_C64(0x434DE23BE2E24DA1), SPH_C64(0x2FF861A36161F84E), + 
SPH_C64(0xF145B3C8B3B34542), SPH_C64(0x15A521632121A534), + SPH_C64(0x94D69CB99C9CD608), SPH_C64(0xF0661E221E1E66EE), + SPH_C64(0x225243C543435261), SPH_C64(0x76FCC754C7C7FCB1), + SPH_C64(0xB32BFC19FCFC2B4F), SPH_C64(0x2014040C04041424), + SPH_C64(0xB20851F3515108E3), SPH_C64(0xBCC799B69999C725), + SPH_C64(0x4FC46DB76D6DC422), SPH_C64(0x68390D170D0D3965), + SPH_C64(0x8335FA13FAFA3579), SPH_C64(0xB684DF7CDFDF8469), + SPH_C64(0xD79B7E827E7E9BA9), SPH_C64(0x3DB4246C2424B419), + SPH_C64(0xC5D73B4D3B3BD7FE), SPH_C64(0x313DABE0ABAB3D9A), + SPH_C64(0x3ED1CE4FCECED1F0), SPH_C64(0x8855113311115599), + SPH_C64(0x0C898F8C8F8F8983), SPH_C64(0x4A6B4ED24E4E6B04), + SPH_C64(0xD151B7C4B7B75166), SPH_C64(0x0B60EB20EBEB60E0), + SPH_C64(0xFDCC3C443C3CCCC1), SPH_C64(0x7CBF819E8181BFFD), + SPH_C64(0xD4FE94A19494FE40), SPH_C64(0xEB0CF704F7F70C1C), + SPH_C64(0xA167B9D6B9B96718), SPH_C64(0x985F133513135F8B), + SPH_C64(0x7D9C2C742C2C9C51), SPH_C64(0xD6B8D368D3D3B805), + SPH_C64(0x6B5CE734E7E75C8C), SPH_C64(0x57CB6EB26E6ECB39), + SPH_C64(0x6EF3C451C4C4F3AA), SPH_C64(0x180F030503030F1B), + SPH_C64(0x8A1356FA565613DC), SPH_C64(0x1A4944CC4444495E), + SPH_C64(0xDF9E7F817F7F9EA0), SPH_C64(0x2137A9E6A9A93788), + SPH_C64(0x4D822A7E2A2A8267), SPH_C64(0xB16DBBD0BBBB6D0A), + SPH_C64(0x46E2C15EC1C1E287), SPH_C64(0xA20253F5535302F1), + SPH_C64(0xAE8BDC79DCDC8B72), SPH_C64(0x58270B1D0B0B2753), + SPH_C64(0x9CD39DBA9D9DD301), SPH_C64(0x47C16CB46C6CC12B), + SPH_C64(0x95F531533131F5A4), SPH_C64(0x87B9749C7474B9F3), + SPH_C64(0xE309F607F6F60915), SPH_C64(0x0A4346CA4646434C), + SPH_C64(0x0926ACE9ACAC26A5), SPH_C64(0x3C978986898997B5), + SPH_C64(0xA044143C141444B4), SPH_C64(0x5B42E13EE1E142BA), + SPH_C64(0xB04E163A16164EA6), SPH_C64(0xCDD23A4E3A3AD2F7), + SPH_C64(0x6FD069BB6969D006), SPH_C64(0x482D091B09092D41), + SPH_C64(0xA7AD70907070ADD7), SPH_C64(0xD954B6C7B6B6546F), + SPH_C64(0xCEB7D06DD0D0B71E), SPH_C64(0x3B7EED2AEDED7ED6), + SPH_C64(0x2EDBCC49CCCCDBE2), SPH_C64(0x2A5742C642425768), + 
SPH_C64(0xB4C298B59898C22C), SPH_C64(0x490EA4F1A4A40EED), + SPH_C64(0x5D88287828288875), SPH_C64(0xDA315CE45C5C3186), + SPH_C64(0x933FF815F8F83F6B), SPH_C64(0x44A486978686A4C2) +}; + +__constant static const sph_u64 old1_T3[256] __attribute__ ((aligned (128))) = { + SPH_C64(0x781828181878D8C0), SPH_C64(0xAF23652323AF2605), + SPH_C64(0xF9C657C6C6F9B87E), SPH_C64(0x6FE825E8E86FFB13), + SPH_C64(0xA187948787A1CB4C), SPH_C64(0x62B8D5B8B86211A9), + SPH_C64(0x0501030101050908), SPH_C64(0x6E4FD14F4F6E0D42), + SPH_C64(0xEE365A3636EE9BAD), SPH_C64(0x04A6F7A6A604FF59), + SPH_C64(0xBDD26BD2D2BD0CDE), SPH_C64(0x06F502F5F5060EFB), + SPH_C64(0x80798B79798096EF), SPH_C64(0xCE6FB16F6FCE305F), + SPH_C64(0xEF91AE9191EF6DFC), SPH_C64(0x0752F6525207F8AA), + SPH_C64(0xFD60A06060FD4727), SPH_C64(0x76BCD9BCBC763589), + SPH_C64(0xCD9BB09B9BCD37AC), SPH_C64(0x8C8E8F8E8E8C8A04), + SPH_C64(0x15A3F8A3A315D271), SPH_C64(0x3C0C140C0C3C6C60), + SPH_C64(0x8A7B8D7B7B8A84FF), SPH_C64(0xE1355F3535E180B5), + SPH_C64(0x691D271D1D69F5E8), SPH_C64(0x47E03DE0E047B353), + SPH_C64(0xACD764D7D7AC21F6), SPH_C64(0xEDC25BC2C2ED9C5E), + SPH_C64(0x962E722E2E96436D), SPH_C64(0x7A4BDD4B4B7A2962), + SPH_C64(0x21FE1FFEFE215DA3), SPH_C64(0x1657F9575716D582), + SPH_C64(0x41153F151541BDA8), SPH_C64(0xB677997777B6E89F), + SPH_C64(0xEB37593737EB92A5), SPH_C64(0x56E532E5E5569E7B), + SPH_C64(0xD99FBC9F9FD9138C), SPH_C64(0x17F00DF0F01723D3), + SPH_C64(0x7F4ADE4A4A7F206A), SPH_C64(0x95DA73DADA95449E), + SPH_C64(0x2558E8585825A2FA), SPH_C64(0xCAC946C9C9CACF06), + SPH_C64(0x8D297B29298D7C55), SPH_C64(0x220A1E0A0A225A50), + SPH_C64(0x4FB1CEB1B14F50E1), SPH_C64(0x1AA0FDA0A01AC969), + SPH_C64(0xDA6BBD6B6BDA147F), SPH_C64(0xAB85928585ABD95C), + SPH_C64(0x73BDDABDBD733C81), SPH_C64(0x345DE75D5D348FD2), + SPH_C64(0x5010301010509080), SPH_C64(0x03F401F4F40307F3), + SPH_C64(0xC0CB40CBCBC0DD16), SPH_C64(0xC63E423E3EC6D3ED), + SPH_C64(0x11050F0505112D28), SPH_C64(0xE667A96767E6781F), + SPH_C64(0x53E431E4E4539773), 
SPH_C64(0xBB27692727BB0225), + SPH_C64(0x5841C34141587332), SPH_C64(0x9D8B808B8B9DA72C), + SPH_C64(0x01A7F4A7A701F651), SPH_C64(0x947D877D7D94B2CF), + SPH_C64(0xFB95A29595FB49DC), SPH_C64(0x9FD875D8D89F568E), + SPH_C64(0x30FB10FBFB30708B), SPH_C64(0x71EE2FEEEE71CD23), + SPH_C64(0x917C847C7C91BBC7), SPH_C64(0xE366AA6666E37117), + SPH_C64(0x8EDD7ADDDD8E7BA6), SPH_C64(0x4B173917174BAFB8), + SPH_C64(0x4647C94747464502), SPH_C64(0xDC9EBF9E9EDC1A84), + SPH_C64(0xC5CA43CACAC5D41E), SPH_C64(0x992D772D2D995875), + SPH_C64(0x79BFDCBFBF792E91), SPH_C64(0x1B070907071B3F38), + SPH_C64(0x23ADEAADAD23AC01), SPH_C64(0x2F5AEE5A5A2FB0EA), + SPH_C64(0xB583988383B5EF6C), SPH_C64(0xFF33553333FFB685), + SPH_C64(0xF263A56363F25C3F), SPH_C64(0x0A020602020A1210), + SPH_C64(0x38AAE3AAAA389339), SPH_C64(0xA871937171A8DEAF), + SPH_C64(0xCFC845C8C8CFC60E), SPH_C64(0x7D192B19197DD1C8), + SPH_C64(0x7049DB4949703B72), SPH_C64(0x9AD976D9D99A5F86), + SPH_C64(0x1DF20BF2F21D31C3), SPH_C64(0x48E338E3E348A84B), + SPH_C64(0x2A5BED5B5B2AB9E2), SPH_C64(0x928885888892BC34), + SPH_C64(0xC89AB39A9AC83EA4), SPH_C64(0xBE266A2626BE0B2D), + SPH_C64(0xFA32563232FABF8D), SPH_C64(0x4AB0CDB0B04A59E9), + SPH_C64(0x6AE926E9E96AF21B), SPH_C64(0x330F110F0F337778), + SPH_C64(0xA6D562D5D5A633E6), SPH_C64(0xBA809D8080BAF474), + SPH_C64(0x7CBEDFBEBE7C2799), SPH_C64(0xDECD4ACDCDDEEB26), + SPH_C64(0xE4345C3434E489BD), SPH_C64(0x7548D8484875327A), + SPH_C64(0x24FF1CFFFF2454AB), SPH_C64(0x8F7A8E7A7A8F8DF7), + SPH_C64(0xEA90AD9090EA64F4), SPH_C64(0x3E5FE15F5F3E9DC2), + SPH_C64(0xA020602020A03D1D), SPH_C64(0xD568B86868D50F67), + SPH_C64(0x721A2E1A1A72CAD0), SPH_C64(0x2CAEEFAEAE2CB719), + SPH_C64(0x5EB4C1B4B45E7DC9), SPH_C64(0x1954FC545419CE9A), + SPH_C64(0xE593A89393E57FEC), SPH_C64(0xAA22662222AA2F0D), + SPH_C64(0xE964AC6464E96307), SPH_C64(0x12F10EF1F1122ADB), + SPH_C64(0xA273957373A2CCBF), SPH_C64(0x5A123612125A8290), + SPH_C64(0x5D40C040405D7A3A), SPH_C64(0x2808180808284840), + SPH_C64(0xE8C358C3C3E89556), 
SPH_C64(0x7BEC29ECEC7BDF33), + SPH_C64(0x90DB70DBDB904D96), SPH_C64(0x1FA1FEA1A11FC061), + SPH_C64(0x838D8A8D8D83911C), SPH_C64(0xC93D473D3DC9C8F5), + SPH_C64(0xF197A49797F15BCC), SPH_C64(0x0000000000000000), + SPH_C64(0xD4CF4CCFCFD4F936), SPH_C64(0x872B7D2B2B876E45), + SPH_C64(0xB3769A7676B3E197), SPH_C64(0xB0829B8282B0E664), + SPH_C64(0xA9D667D6D6A928FE), SPH_C64(0x771B2D1B1B77C3D8), + SPH_C64(0x5BB5C2B5B55B74C1), SPH_C64(0x29AFECAFAF29BE11), + SPH_C64(0xDF6ABE6A6ADF1D77), SPH_C64(0x0D50F050500DEABA), + SPH_C64(0x4C45CF45454C5712), SPH_C64(0x18F308F3F31838CB), + SPH_C64(0xF030503030F0AD9D), SPH_C64(0x74EF2CEFEF74C42B), + SPH_C64(0xC33F413F3FC3DAE5), SPH_C64(0x1C55FF55551CC792), + SPH_C64(0x10A2FBA2A210DB79), SPH_C64(0x65EA23EAEA65E903), + SPH_C64(0xEC65AF6565EC6A0F), SPH_C64(0x68BAD3BABA6803B9), + SPH_C64(0x932F712F2F934A65), SPH_C64(0xE7C05DC0C0E78E4E), + SPH_C64(0x81DE7FDEDE8160BE), SPH_C64(0x6C1C241C1C6CFCE0), + SPH_C64(0x2EFD1AFDFD2E46BB), SPH_C64(0x644DD74D4D641F52), + SPH_C64(0xE092AB9292E076E4), SPH_C64(0xBC759F7575BCFA8F), + SPH_C64(0x1E060A06061E3630), SPH_C64(0x988A838A8A98AE24), + SPH_C64(0x40B2CBB2B2404BF9), SPH_C64(0x59E637E6E6598563), + SPH_C64(0x360E120E0E367E70), SPH_C64(0x631F211F1F63E7F8), + SPH_C64(0xF762A66262F75537), SPH_C64(0xA3D461D4D4A33AEE), + SPH_C64(0x32A8E5A8A8328129), SPH_C64(0xF496A79696F452C4), + SPH_C64(0x3AF916F9F93A629B), SPH_C64(0xF6C552C5C5F6A366), + SPH_C64(0xB1256F2525B11035), SPH_C64(0x2059EB595920ABF2), + SPH_C64(0xAE84918484AED054), SPH_C64(0xA772967272A7C5B7), + SPH_C64(0xDD394B3939DDECD5), SPH_C64(0x614CD44C4C61165A), + SPH_C64(0x3B5EE25E5E3B94CA), SPH_C64(0x8578887878859FE7), + SPH_C64(0xD838483838D8E5DD), SPH_C64(0x868C898C8C869814), + SPH_C64(0xB2D16ED1D1B217C6), SPH_C64(0x0BA5F2A5A50BE441), + SPH_C64(0x4DE23BE2E24DA143), SPH_C64(0xF861A36161F84E2F), + SPH_C64(0x45B3C8B3B34542F1), SPH_C64(0xA521632121A53415), + SPH_C64(0xD69CB99C9CD60894), SPH_C64(0x661E221E1E66EEF0), + SPH_C64(0x5243C54343526122), 
SPH_C64(0xFCC754C7C7FCB176), + SPH_C64(0x2BFC19FCFC2B4FB3), SPH_C64(0x14040C0404142420), + SPH_C64(0x0851F3515108E3B2), SPH_C64(0xC799B69999C725BC), + SPH_C64(0xC46DB76D6DC4224F), SPH_C64(0x390D170D0D396568), + SPH_C64(0x35FA13FAFA357983), SPH_C64(0x84DF7CDFDF8469B6), + SPH_C64(0x9B7E827E7E9BA9D7), SPH_C64(0xB4246C2424B4193D), + SPH_C64(0xD73B4D3B3BD7FEC5), SPH_C64(0x3DABE0ABAB3D9A31), + SPH_C64(0xD1CE4FCECED1F03E), SPH_C64(0x5511331111559988), + SPH_C64(0x898F8C8F8F89830C), SPH_C64(0x6B4ED24E4E6B044A), + SPH_C64(0x51B7C4B7B75166D1), SPH_C64(0x60EB20EBEB60E00B), + SPH_C64(0xCC3C443C3CCCC1FD), SPH_C64(0xBF819E8181BFFD7C), + SPH_C64(0xFE94A19494FE40D4), SPH_C64(0x0CF704F7F70C1CEB), + SPH_C64(0x67B9D6B9B96718A1), SPH_C64(0x5F133513135F8B98), + SPH_C64(0x9C2C742C2C9C517D), SPH_C64(0xB8D368D3D3B805D6), + SPH_C64(0x5CE734E7E75C8C6B), SPH_C64(0xCB6EB26E6ECB3957), + SPH_C64(0xF3C451C4C4F3AA6E), SPH_C64(0x0F030503030F1B18), + SPH_C64(0x1356FA565613DC8A), SPH_C64(0x4944CC4444495E1A), + SPH_C64(0x9E7F817F7F9EA0DF), SPH_C64(0x37A9E6A9A9378821), + SPH_C64(0x822A7E2A2A82674D), SPH_C64(0x6DBBD0BBBB6D0AB1), + SPH_C64(0xE2C15EC1C1E28746), SPH_C64(0x0253F5535302F1A2), + SPH_C64(0x8BDC79DCDC8B72AE), SPH_C64(0x270B1D0B0B275358), + SPH_C64(0xD39DBA9D9DD3019C), SPH_C64(0xC16CB46C6CC12B47), + SPH_C64(0xF531533131F5A495), SPH_C64(0xB9749C7474B9F387), + SPH_C64(0x09F607F6F60915E3), SPH_C64(0x4346CA4646434C0A), + SPH_C64(0x26ACE9ACAC26A509), SPH_C64(0x978986898997B53C), + SPH_C64(0x44143C141444B4A0), SPH_C64(0x42E13EE1E142BA5B), + SPH_C64(0x4E163A16164EA6B0), SPH_C64(0xD23A4E3A3AD2F7CD), + SPH_C64(0xD069BB6969D0066F), SPH_C64(0x2D091B09092D4148), + SPH_C64(0xAD70907070ADD7A7), SPH_C64(0x54B6C7B6B6546FD9), + SPH_C64(0xB7D06DD0D0B71ECE), SPH_C64(0x7EED2AEDED7ED63B), + SPH_C64(0xDBCC49CCCCDBE22E), SPH_C64(0x5742C6424257682A), + SPH_C64(0xC298B59898C22CB4), SPH_C64(0x0EA4F1A4A40EED49), + SPH_C64(0x882878282888755D), SPH_C64(0x315CE45C5C3186DA), + SPH_C64(0x3FF815F8F83F6B93), 
SPH_C64(0xA486978686A4C244) +}; + +__constant static const sph_u64 old1_T4[256] __attribute__ ((aligned (128))) = { + SPH_C64(0x1828181878D8C078), SPH_C64(0x23652323AF2605AF), + SPH_C64(0xC657C6C6F9B87EF9), SPH_C64(0xE825E8E86FFB136F), + SPH_C64(0x87948787A1CB4CA1), SPH_C64(0xB8D5B8B86211A962), + SPH_C64(0x0103010105090805), SPH_C64(0x4FD14F4F6E0D426E), + SPH_C64(0x365A3636EE9BADEE), SPH_C64(0xA6F7A6A604FF5904), + SPH_C64(0xD26BD2D2BD0CDEBD), SPH_C64(0xF502F5F5060EFB06), + SPH_C64(0x798B79798096EF80), SPH_C64(0x6FB16F6FCE305FCE), + SPH_C64(0x91AE9191EF6DFCEF), SPH_C64(0x52F6525207F8AA07), + SPH_C64(0x60A06060FD4727FD), SPH_C64(0xBCD9BCBC76358976), + SPH_C64(0x9BB09B9BCD37ACCD), SPH_C64(0x8E8F8E8E8C8A048C), + SPH_C64(0xA3F8A3A315D27115), SPH_C64(0x0C140C0C3C6C603C), + SPH_C64(0x7B8D7B7B8A84FF8A), SPH_C64(0x355F3535E180B5E1), + SPH_C64(0x1D271D1D69F5E869), SPH_C64(0xE03DE0E047B35347), + SPH_C64(0xD764D7D7AC21F6AC), SPH_C64(0xC25BC2C2ED9C5EED), + SPH_C64(0x2E722E2E96436D96), SPH_C64(0x4BDD4B4B7A29627A), + SPH_C64(0xFE1FFEFE215DA321), SPH_C64(0x57F9575716D58216), + SPH_C64(0x153F151541BDA841), SPH_C64(0x77997777B6E89FB6), + SPH_C64(0x37593737EB92A5EB), SPH_C64(0xE532E5E5569E7B56), + SPH_C64(0x9FBC9F9FD9138CD9), SPH_C64(0xF00DF0F01723D317), + SPH_C64(0x4ADE4A4A7F206A7F), SPH_C64(0xDA73DADA95449E95), + SPH_C64(0x58E8585825A2FA25), SPH_C64(0xC946C9C9CACF06CA), + SPH_C64(0x297B29298D7C558D), SPH_C64(0x0A1E0A0A225A5022), + SPH_C64(0xB1CEB1B14F50E14F), SPH_C64(0xA0FDA0A01AC9691A), + SPH_C64(0x6BBD6B6BDA147FDA), SPH_C64(0x85928585ABD95CAB), + SPH_C64(0xBDDABDBD733C8173), SPH_C64(0x5DE75D5D348FD234), + SPH_C64(0x1030101050908050), SPH_C64(0xF401F4F40307F303), + SPH_C64(0xCB40CBCBC0DD16C0), SPH_C64(0x3E423E3EC6D3EDC6), + SPH_C64(0x050F0505112D2811), SPH_C64(0x67A96767E6781FE6), + SPH_C64(0xE431E4E453977353), SPH_C64(0x27692727BB0225BB), + SPH_C64(0x41C3414158733258), SPH_C64(0x8B808B8B9DA72C9D), + SPH_C64(0xA7F4A7A701F65101), SPH_C64(0x7D877D7D94B2CF94), + 
SPH_C64(0x95A29595FB49DCFB), SPH_C64(0xD875D8D89F568E9F), + SPH_C64(0xFB10FBFB30708B30), SPH_C64(0xEE2FEEEE71CD2371), + SPH_C64(0x7C847C7C91BBC791), SPH_C64(0x66AA6666E37117E3), + SPH_C64(0xDD7ADDDD8E7BA68E), SPH_C64(0x173917174BAFB84B), + SPH_C64(0x47C9474746450246), SPH_C64(0x9EBF9E9EDC1A84DC), + SPH_C64(0xCA43CACAC5D41EC5), SPH_C64(0x2D772D2D99587599), + SPH_C64(0xBFDCBFBF792E9179), SPH_C64(0x070907071B3F381B), + SPH_C64(0xADEAADAD23AC0123), SPH_C64(0x5AEE5A5A2FB0EA2F), + SPH_C64(0x83988383B5EF6CB5), SPH_C64(0x33553333FFB685FF), + SPH_C64(0x63A56363F25C3FF2), SPH_C64(0x020602020A12100A), + SPH_C64(0xAAE3AAAA38933938), SPH_C64(0x71937171A8DEAFA8), + SPH_C64(0xC845C8C8CFC60ECF), SPH_C64(0x192B19197DD1C87D), + SPH_C64(0x49DB4949703B7270), SPH_C64(0xD976D9D99A5F869A), + SPH_C64(0xF20BF2F21D31C31D), SPH_C64(0xE338E3E348A84B48), + SPH_C64(0x5BED5B5B2AB9E22A), SPH_C64(0x8885888892BC3492), + SPH_C64(0x9AB39A9AC83EA4C8), SPH_C64(0x266A2626BE0B2DBE), + SPH_C64(0x32563232FABF8DFA), SPH_C64(0xB0CDB0B04A59E94A), + SPH_C64(0xE926E9E96AF21B6A), SPH_C64(0x0F110F0F33777833), + SPH_C64(0xD562D5D5A633E6A6), SPH_C64(0x809D8080BAF474BA), + SPH_C64(0xBEDFBEBE7C27997C), SPH_C64(0xCD4ACDCDDEEB26DE), + SPH_C64(0x345C3434E489BDE4), SPH_C64(0x48D8484875327A75), + SPH_C64(0xFF1CFFFF2454AB24), SPH_C64(0x7A8E7A7A8F8DF78F), + SPH_C64(0x90AD9090EA64F4EA), SPH_C64(0x5FE15F5F3E9DC23E), + SPH_C64(0x20602020A03D1DA0), SPH_C64(0x68B86868D50F67D5), + SPH_C64(0x1A2E1A1A72CAD072), SPH_C64(0xAEEFAEAE2CB7192C), + SPH_C64(0xB4C1B4B45E7DC95E), SPH_C64(0x54FC545419CE9A19), + SPH_C64(0x93A89393E57FECE5), SPH_C64(0x22662222AA2F0DAA), + SPH_C64(0x64AC6464E96307E9), SPH_C64(0xF10EF1F1122ADB12), + SPH_C64(0x73957373A2CCBFA2), SPH_C64(0x123612125A82905A), + SPH_C64(0x40C040405D7A3A5D), SPH_C64(0x0818080828484028), + SPH_C64(0xC358C3C3E89556E8), SPH_C64(0xEC29ECEC7BDF337B), + SPH_C64(0xDB70DBDB904D9690), SPH_C64(0xA1FEA1A11FC0611F), + SPH_C64(0x8D8A8D8D83911C83), SPH_C64(0x3D473D3DC9C8F5C9), + 
SPH_C64(0x97A49797F15BCCF1), SPH_C64(0x0000000000000000), + SPH_C64(0xCF4CCFCFD4F936D4), SPH_C64(0x2B7D2B2B876E4587), + SPH_C64(0x769A7676B3E197B3), SPH_C64(0x829B8282B0E664B0), + SPH_C64(0xD667D6D6A928FEA9), SPH_C64(0x1B2D1B1B77C3D877), + SPH_C64(0xB5C2B5B55B74C15B), SPH_C64(0xAFECAFAF29BE1129), + SPH_C64(0x6ABE6A6ADF1D77DF), SPH_C64(0x50F050500DEABA0D), + SPH_C64(0x45CF45454C57124C), SPH_C64(0xF308F3F31838CB18), + SPH_C64(0x30503030F0AD9DF0), SPH_C64(0xEF2CEFEF74C42B74), + SPH_C64(0x3F413F3FC3DAE5C3), SPH_C64(0x55FF55551CC7921C), + SPH_C64(0xA2FBA2A210DB7910), SPH_C64(0xEA23EAEA65E90365), + SPH_C64(0x65AF6565EC6A0FEC), SPH_C64(0xBAD3BABA6803B968), + SPH_C64(0x2F712F2F934A6593), SPH_C64(0xC05DC0C0E78E4EE7), + SPH_C64(0xDE7FDEDE8160BE81), SPH_C64(0x1C241C1C6CFCE06C), + SPH_C64(0xFD1AFDFD2E46BB2E), SPH_C64(0x4DD74D4D641F5264), + SPH_C64(0x92AB9292E076E4E0), SPH_C64(0x759F7575BCFA8FBC), + SPH_C64(0x060A06061E36301E), SPH_C64(0x8A838A8A98AE2498), + SPH_C64(0xB2CBB2B2404BF940), SPH_C64(0xE637E6E659856359), + SPH_C64(0x0E120E0E367E7036), SPH_C64(0x1F211F1F63E7F863), + SPH_C64(0x62A66262F75537F7), SPH_C64(0xD461D4D4A33AEEA3), + SPH_C64(0xA8E5A8A832812932), SPH_C64(0x96A79696F452C4F4), + SPH_C64(0xF916F9F93A629B3A), SPH_C64(0xC552C5C5F6A366F6), + SPH_C64(0x256F2525B11035B1), SPH_C64(0x59EB595920ABF220), + SPH_C64(0x84918484AED054AE), SPH_C64(0x72967272A7C5B7A7), + SPH_C64(0x394B3939DDECD5DD), SPH_C64(0x4CD44C4C61165A61), + SPH_C64(0x5EE25E5E3B94CA3B), SPH_C64(0x78887878859FE785), + SPH_C64(0x38483838D8E5DDD8), SPH_C64(0x8C898C8C86981486), + SPH_C64(0xD16ED1D1B217C6B2), SPH_C64(0xA5F2A5A50BE4410B), + SPH_C64(0xE23BE2E24DA1434D), SPH_C64(0x61A36161F84E2FF8), + SPH_C64(0xB3C8B3B34542F145), SPH_C64(0x21632121A53415A5), + SPH_C64(0x9CB99C9CD60894D6), SPH_C64(0x1E221E1E66EEF066), + SPH_C64(0x43C5434352612252), SPH_C64(0xC754C7C7FCB176FC), + SPH_C64(0xFC19FCFC2B4FB32B), SPH_C64(0x040C040414242014), + SPH_C64(0x51F3515108E3B208), SPH_C64(0x99B69999C725BCC7), + 
SPH_C64(0x6DB76D6DC4224FC4), SPH_C64(0x0D170D0D39656839), + SPH_C64(0xFA13FAFA35798335), SPH_C64(0xDF7CDFDF8469B684), + SPH_C64(0x7E827E7E9BA9D79B), SPH_C64(0x246C2424B4193DB4), + SPH_C64(0x3B4D3B3BD7FEC5D7), SPH_C64(0xABE0ABAB3D9A313D), + SPH_C64(0xCE4FCECED1F03ED1), SPH_C64(0x1133111155998855), + SPH_C64(0x8F8C8F8F89830C89), SPH_C64(0x4ED24E4E6B044A6B), + SPH_C64(0xB7C4B7B75166D151), SPH_C64(0xEB20EBEB60E00B60), + SPH_C64(0x3C443C3CCCC1FDCC), SPH_C64(0x819E8181BFFD7CBF), + SPH_C64(0x94A19494FE40D4FE), SPH_C64(0xF704F7F70C1CEB0C), + SPH_C64(0xB9D6B9B96718A167), SPH_C64(0x133513135F8B985F), + SPH_C64(0x2C742C2C9C517D9C), SPH_C64(0xD368D3D3B805D6B8), + SPH_C64(0xE734E7E75C8C6B5C), SPH_C64(0x6EB26E6ECB3957CB), + SPH_C64(0xC451C4C4F3AA6EF3), SPH_C64(0x030503030F1B180F), + SPH_C64(0x56FA565613DC8A13), SPH_C64(0x44CC4444495E1A49), + SPH_C64(0x7F817F7F9EA0DF9E), SPH_C64(0xA9E6A9A937882137), + SPH_C64(0x2A7E2A2A82674D82), SPH_C64(0xBBD0BBBB6D0AB16D), + SPH_C64(0xC15EC1C1E28746E2), SPH_C64(0x53F5535302F1A202), + SPH_C64(0xDC79DCDC8B72AE8B), SPH_C64(0x0B1D0B0B27535827), + SPH_C64(0x9DBA9D9DD3019CD3), SPH_C64(0x6CB46C6CC12B47C1), + SPH_C64(0x31533131F5A495F5), SPH_C64(0x749C7474B9F387B9), + SPH_C64(0xF607F6F60915E309), SPH_C64(0x46CA4646434C0A43), + SPH_C64(0xACE9ACAC26A50926), SPH_C64(0x8986898997B53C97), + SPH_C64(0x143C141444B4A044), SPH_C64(0xE13EE1E142BA5B42), + SPH_C64(0x163A16164EA6B04E), SPH_C64(0x3A4E3A3AD2F7CDD2), + SPH_C64(0x69BB6969D0066FD0), SPH_C64(0x091B09092D41482D), + SPH_C64(0x70907070ADD7A7AD), SPH_C64(0xB6C7B6B6546FD954), + SPH_C64(0xD06DD0D0B71ECEB7), SPH_C64(0xED2AEDED7ED63B7E), + SPH_C64(0xCC49CCCCDBE22EDB), SPH_C64(0x42C6424257682A57), + SPH_C64(0x98B59898C22CB4C2), SPH_C64(0xA4F1A4A40EED490E), + SPH_C64(0x2878282888755D88), SPH_C64(0x5CE45C5C3186DA31), + SPH_C64(0xF815F8F83F6B933F), SPH_C64(0x86978686A4C244A4) +}; + +__constant static const sph_u64 old1_T5[256] __attribute__ ((aligned (128))) = { + SPH_C64(0x28181878D8C07818), 
SPH_C64(0x652323AF2605AF23), + SPH_C64(0x57C6C6F9B87EF9C6), SPH_C64(0x25E8E86FFB136FE8), + SPH_C64(0x948787A1CB4CA187), SPH_C64(0xD5B8B86211A962B8), + SPH_C64(0x0301010509080501), SPH_C64(0xD14F4F6E0D426E4F), + SPH_C64(0x5A3636EE9BADEE36), SPH_C64(0xF7A6A604FF5904A6), + SPH_C64(0x6BD2D2BD0CDEBDD2), SPH_C64(0x02F5F5060EFB06F5), + SPH_C64(0x8B79798096EF8079), SPH_C64(0xB16F6FCE305FCE6F), + SPH_C64(0xAE9191EF6DFCEF91), SPH_C64(0xF6525207F8AA0752), + SPH_C64(0xA06060FD4727FD60), SPH_C64(0xD9BCBC76358976BC), + SPH_C64(0xB09B9BCD37ACCD9B), SPH_C64(0x8F8E8E8C8A048C8E), + SPH_C64(0xF8A3A315D27115A3), SPH_C64(0x140C0C3C6C603C0C), + SPH_C64(0x8D7B7B8A84FF8A7B), SPH_C64(0x5F3535E180B5E135), + SPH_C64(0x271D1D69F5E8691D), SPH_C64(0x3DE0E047B35347E0), + SPH_C64(0x64D7D7AC21F6ACD7), SPH_C64(0x5BC2C2ED9C5EEDC2), + SPH_C64(0x722E2E96436D962E), SPH_C64(0xDD4B4B7A29627A4B), + SPH_C64(0x1FFEFE215DA321FE), SPH_C64(0xF9575716D5821657), + SPH_C64(0x3F151541BDA84115), SPH_C64(0x997777B6E89FB677), + SPH_C64(0x593737EB92A5EB37), SPH_C64(0x32E5E5569E7B56E5), + SPH_C64(0xBC9F9FD9138CD99F), SPH_C64(0x0DF0F01723D317F0), + SPH_C64(0xDE4A4A7F206A7F4A), SPH_C64(0x73DADA95449E95DA), + SPH_C64(0xE8585825A2FA2558), SPH_C64(0x46C9C9CACF06CAC9), + SPH_C64(0x7B29298D7C558D29), SPH_C64(0x1E0A0A225A50220A), + SPH_C64(0xCEB1B14F50E14FB1), SPH_C64(0xFDA0A01AC9691AA0), + SPH_C64(0xBD6B6BDA147FDA6B), SPH_C64(0x928585ABD95CAB85), + SPH_C64(0xDABDBD733C8173BD), SPH_C64(0xE75D5D348FD2345D), + SPH_C64(0x3010105090805010), SPH_C64(0x01F4F40307F303F4), + SPH_C64(0x40CBCBC0DD16C0CB), SPH_C64(0x423E3EC6D3EDC63E), + SPH_C64(0x0F0505112D281105), SPH_C64(0xA96767E6781FE667), + SPH_C64(0x31E4E453977353E4), SPH_C64(0x692727BB0225BB27), + SPH_C64(0xC341415873325841), SPH_C64(0x808B8B9DA72C9D8B), + SPH_C64(0xF4A7A701F65101A7), SPH_C64(0x877D7D94B2CF947D), + SPH_C64(0xA29595FB49DCFB95), SPH_C64(0x75D8D89F568E9FD8), + SPH_C64(0x10FBFB30708B30FB), SPH_C64(0x2FEEEE71CD2371EE), + SPH_C64(0x847C7C91BBC7917C), 
SPH_C64(0xAA6666E37117E366), + SPH_C64(0x7ADDDD8E7BA68EDD), SPH_C64(0x3917174BAFB84B17), + SPH_C64(0xC947474645024647), SPH_C64(0xBF9E9EDC1A84DC9E), + SPH_C64(0x43CACAC5D41EC5CA), SPH_C64(0x772D2D995875992D), + SPH_C64(0xDCBFBF792E9179BF), SPH_C64(0x0907071B3F381B07), + SPH_C64(0xEAADAD23AC0123AD), SPH_C64(0xEE5A5A2FB0EA2F5A), + SPH_C64(0x988383B5EF6CB583), SPH_C64(0x553333FFB685FF33), + SPH_C64(0xA56363F25C3FF263), SPH_C64(0x0602020A12100A02), + SPH_C64(0xE3AAAA38933938AA), SPH_C64(0x937171A8DEAFA871), + SPH_C64(0x45C8C8CFC60ECFC8), SPH_C64(0x2B19197DD1C87D19), + SPH_C64(0xDB4949703B727049), SPH_C64(0x76D9D99A5F869AD9), + SPH_C64(0x0BF2F21D31C31DF2), SPH_C64(0x38E3E348A84B48E3), + SPH_C64(0xED5B5B2AB9E22A5B), SPH_C64(0x85888892BC349288), + SPH_C64(0xB39A9AC83EA4C89A), SPH_C64(0x6A2626BE0B2DBE26), + SPH_C64(0x563232FABF8DFA32), SPH_C64(0xCDB0B04A59E94AB0), + SPH_C64(0x26E9E96AF21B6AE9), SPH_C64(0x110F0F337778330F), + SPH_C64(0x62D5D5A633E6A6D5), SPH_C64(0x9D8080BAF474BA80), + SPH_C64(0xDFBEBE7C27997CBE), SPH_C64(0x4ACDCDDEEB26DECD), + SPH_C64(0x5C3434E489BDE434), SPH_C64(0xD8484875327A7548), + SPH_C64(0x1CFFFF2454AB24FF), SPH_C64(0x8E7A7A8F8DF78F7A), + SPH_C64(0xAD9090EA64F4EA90), SPH_C64(0xE15F5F3E9DC23E5F), + SPH_C64(0x602020A03D1DA020), SPH_C64(0xB86868D50F67D568), + SPH_C64(0x2E1A1A72CAD0721A), SPH_C64(0xEFAEAE2CB7192CAE), + SPH_C64(0xC1B4B45E7DC95EB4), SPH_C64(0xFC545419CE9A1954), + SPH_C64(0xA89393E57FECE593), SPH_C64(0x662222AA2F0DAA22), + SPH_C64(0xAC6464E96307E964), SPH_C64(0x0EF1F1122ADB12F1), + SPH_C64(0x957373A2CCBFA273), SPH_C64(0x3612125A82905A12), + SPH_C64(0xC040405D7A3A5D40), SPH_C64(0x1808082848402808), + SPH_C64(0x58C3C3E89556E8C3), SPH_C64(0x29ECEC7BDF337BEC), + SPH_C64(0x70DBDB904D9690DB), SPH_C64(0xFEA1A11FC0611FA1), + SPH_C64(0x8A8D8D83911C838D), SPH_C64(0x473D3DC9C8F5C93D), + SPH_C64(0xA49797F15BCCF197), SPH_C64(0x0000000000000000), + SPH_C64(0x4CCFCFD4F936D4CF), SPH_C64(0x7D2B2B876E45872B), + SPH_C64(0x9A7676B3E197B376), 
SPH_C64(0x9B8282B0E664B082), + SPH_C64(0x67D6D6A928FEA9D6), SPH_C64(0x2D1B1B77C3D8771B), + SPH_C64(0xC2B5B55B74C15BB5), SPH_C64(0xECAFAF29BE1129AF), + SPH_C64(0xBE6A6ADF1D77DF6A), SPH_C64(0xF050500DEABA0D50), + SPH_C64(0xCF45454C57124C45), SPH_C64(0x08F3F31838CB18F3), + SPH_C64(0x503030F0AD9DF030), SPH_C64(0x2CEFEF74C42B74EF), + SPH_C64(0x413F3FC3DAE5C33F), SPH_C64(0xFF55551CC7921C55), + SPH_C64(0xFBA2A210DB7910A2), SPH_C64(0x23EAEA65E90365EA), + SPH_C64(0xAF6565EC6A0FEC65), SPH_C64(0xD3BABA6803B968BA), + SPH_C64(0x712F2F934A65932F), SPH_C64(0x5DC0C0E78E4EE7C0), + SPH_C64(0x7FDEDE8160BE81DE), SPH_C64(0x241C1C6CFCE06C1C), + SPH_C64(0x1AFDFD2E46BB2EFD), SPH_C64(0xD74D4D641F52644D), + SPH_C64(0xAB9292E076E4E092), SPH_C64(0x9F7575BCFA8FBC75), + SPH_C64(0x0A06061E36301E06), SPH_C64(0x838A8A98AE24988A), + SPH_C64(0xCBB2B2404BF940B2), SPH_C64(0x37E6E659856359E6), + SPH_C64(0x120E0E367E70360E), SPH_C64(0x211F1F63E7F8631F), + SPH_C64(0xA66262F75537F762), SPH_C64(0x61D4D4A33AEEA3D4), + SPH_C64(0xE5A8A832812932A8), SPH_C64(0xA79696F452C4F496), + SPH_C64(0x16F9F93A629B3AF9), SPH_C64(0x52C5C5F6A366F6C5), + SPH_C64(0x6F2525B11035B125), SPH_C64(0xEB595920ABF22059), + SPH_C64(0x918484AED054AE84), SPH_C64(0x967272A7C5B7A772), + SPH_C64(0x4B3939DDECD5DD39), SPH_C64(0xD44C4C61165A614C), + SPH_C64(0xE25E5E3B94CA3B5E), SPH_C64(0x887878859FE78578), + SPH_C64(0x483838D8E5DDD838), SPH_C64(0x898C8C869814868C), + SPH_C64(0x6ED1D1B217C6B2D1), SPH_C64(0xF2A5A50BE4410BA5), + SPH_C64(0x3BE2E24DA1434DE2), SPH_C64(0xA36161F84E2FF861), + SPH_C64(0xC8B3B34542F145B3), SPH_C64(0x632121A53415A521), + SPH_C64(0xB99C9CD60894D69C), SPH_C64(0x221E1E66EEF0661E), + SPH_C64(0xC543435261225243), SPH_C64(0x54C7C7FCB176FCC7), + SPH_C64(0x19FCFC2B4FB32BFC), SPH_C64(0x0C04041424201404), + SPH_C64(0xF3515108E3B20851), SPH_C64(0xB69999C725BCC799), + SPH_C64(0xB76D6DC4224FC46D), SPH_C64(0x170D0D396568390D), + SPH_C64(0x13FAFA35798335FA), SPH_C64(0x7CDFDF8469B684DF), + SPH_C64(0x827E7E9BA9D79B7E), 
SPH_C64(0x6C2424B4193DB424), + SPH_C64(0x4D3B3BD7FEC5D73B), SPH_C64(0xE0ABAB3D9A313DAB), + SPH_C64(0x4FCECED1F03ED1CE), SPH_C64(0x3311115599885511), + SPH_C64(0x8C8F8F89830C898F), SPH_C64(0xD24E4E6B044A6B4E), + SPH_C64(0xC4B7B75166D151B7), SPH_C64(0x20EBEB60E00B60EB), + SPH_C64(0x443C3CCCC1FDCC3C), SPH_C64(0x9E8181BFFD7CBF81), + SPH_C64(0xA19494FE40D4FE94), SPH_C64(0x04F7F70C1CEB0CF7), + SPH_C64(0xD6B9B96718A167B9), SPH_C64(0x3513135F8B985F13), + SPH_C64(0x742C2C9C517D9C2C), SPH_C64(0x68D3D3B805D6B8D3), + SPH_C64(0x34E7E75C8C6B5CE7), SPH_C64(0xB26E6ECB3957CB6E), + SPH_C64(0x51C4C4F3AA6EF3C4), SPH_C64(0x0503030F1B180F03), + SPH_C64(0xFA565613DC8A1356), SPH_C64(0xCC4444495E1A4944), + SPH_C64(0x817F7F9EA0DF9E7F), SPH_C64(0xE6A9A937882137A9), + SPH_C64(0x7E2A2A82674D822A), SPH_C64(0xD0BBBB6D0AB16DBB), + SPH_C64(0x5EC1C1E28746E2C1), SPH_C64(0xF5535302F1A20253), + SPH_C64(0x79DCDC8B72AE8BDC), SPH_C64(0x1D0B0B275358270B), + SPH_C64(0xBA9D9DD3019CD39D), SPH_C64(0xB46C6CC12B47C16C), + SPH_C64(0x533131F5A495F531), SPH_C64(0x9C7474B9F387B974), + SPH_C64(0x07F6F60915E309F6), SPH_C64(0xCA4646434C0A4346), + SPH_C64(0xE9ACAC26A50926AC), SPH_C64(0x86898997B53C9789), + SPH_C64(0x3C141444B4A04414), SPH_C64(0x3EE1E142BA5B42E1), + SPH_C64(0x3A16164EA6B04E16), SPH_C64(0x4E3A3AD2F7CDD23A), + SPH_C64(0xBB6969D0066FD069), SPH_C64(0x1B09092D41482D09), + SPH_C64(0x907070ADD7A7AD70), SPH_C64(0xC7B6B6546FD954B6), + SPH_C64(0x6DD0D0B71ECEB7D0), SPH_C64(0x2AEDED7ED63B7EED), + SPH_C64(0x49CCCCDBE22EDBCC), SPH_C64(0xC6424257682A5742), + SPH_C64(0xB59898C22CB4C298), SPH_C64(0xF1A4A40EED490EA4), + SPH_C64(0x78282888755D8828), SPH_C64(0xE45C5C3186DA315C), + SPH_C64(0x15F8F83F6B933FF8), SPH_C64(0x978686A4C244A486) +}; + +__constant static const sph_u64 old1_T6[256] __attribute__ ((aligned (128))) = { + SPH_C64(0x181878D8C0781828), SPH_C64(0x2323AF2605AF2365), + SPH_C64(0xC6C6F9B87EF9C657), SPH_C64(0xE8E86FFB136FE825), + SPH_C64(0x8787A1CB4CA18794), SPH_C64(0xB8B86211A962B8D5), + 
SPH_C64(0x0101050908050103), SPH_C64(0x4F4F6E0D426E4FD1), + SPH_C64(0x3636EE9BADEE365A), SPH_C64(0xA6A604FF5904A6F7), + SPH_C64(0xD2D2BD0CDEBDD26B), SPH_C64(0xF5F5060EFB06F502), + SPH_C64(0x79798096EF80798B), SPH_C64(0x6F6FCE305FCE6FB1), + SPH_C64(0x9191EF6DFCEF91AE), SPH_C64(0x525207F8AA0752F6), + SPH_C64(0x6060FD4727FD60A0), SPH_C64(0xBCBC76358976BCD9), + SPH_C64(0x9B9BCD37ACCD9BB0), SPH_C64(0x8E8E8C8A048C8E8F), + SPH_C64(0xA3A315D27115A3F8), SPH_C64(0x0C0C3C6C603C0C14), + SPH_C64(0x7B7B8A84FF8A7B8D), SPH_C64(0x3535E180B5E1355F), + SPH_C64(0x1D1D69F5E8691D27), SPH_C64(0xE0E047B35347E03D), + SPH_C64(0xD7D7AC21F6ACD764), SPH_C64(0xC2C2ED9C5EEDC25B), + SPH_C64(0x2E2E96436D962E72), SPH_C64(0x4B4B7A29627A4BDD), + SPH_C64(0xFEFE215DA321FE1F), SPH_C64(0x575716D5821657F9), + SPH_C64(0x151541BDA841153F), SPH_C64(0x7777B6E89FB67799), + SPH_C64(0x3737EB92A5EB3759), SPH_C64(0xE5E5569E7B56E532), + SPH_C64(0x9F9FD9138CD99FBC), SPH_C64(0xF0F01723D317F00D), + SPH_C64(0x4A4A7F206A7F4ADE), SPH_C64(0xDADA95449E95DA73), + SPH_C64(0x585825A2FA2558E8), SPH_C64(0xC9C9CACF06CAC946), + SPH_C64(0x29298D7C558D297B), SPH_C64(0x0A0A225A50220A1E), + SPH_C64(0xB1B14F50E14FB1CE), SPH_C64(0xA0A01AC9691AA0FD), + SPH_C64(0x6B6BDA147FDA6BBD), SPH_C64(0x8585ABD95CAB8592), + SPH_C64(0xBDBD733C8173BDDA), SPH_C64(0x5D5D348FD2345DE7), + SPH_C64(0x1010509080501030), SPH_C64(0xF4F40307F303F401), + SPH_C64(0xCBCBC0DD16C0CB40), SPH_C64(0x3E3EC6D3EDC63E42), + SPH_C64(0x0505112D2811050F), SPH_C64(0x6767E6781FE667A9), + SPH_C64(0xE4E453977353E431), SPH_C64(0x2727BB0225BB2769), + SPH_C64(0x41415873325841C3), SPH_C64(0x8B8B9DA72C9D8B80), + SPH_C64(0xA7A701F65101A7F4), SPH_C64(0x7D7D94B2CF947D87), + SPH_C64(0x9595FB49DCFB95A2), SPH_C64(0xD8D89F568E9FD875), + SPH_C64(0xFBFB30708B30FB10), SPH_C64(0xEEEE71CD2371EE2F), + SPH_C64(0x7C7C91BBC7917C84), SPH_C64(0x6666E37117E366AA), + SPH_C64(0xDDDD8E7BA68EDD7A), SPH_C64(0x17174BAFB84B1739), + SPH_C64(0x47474645024647C9), SPH_C64(0x9E9EDC1A84DC9EBF), + 
SPH_C64(0xCACAC5D41EC5CA43), SPH_C64(0x2D2D995875992D77), + SPH_C64(0xBFBF792E9179BFDC), SPH_C64(0x07071B3F381B0709), + SPH_C64(0xADAD23AC0123ADEA), SPH_C64(0x5A5A2FB0EA2F5AEE), + SPH_C64(0x8383B5EF6CB58398), SPH_C64(0x3333FFB685FF3355), + SPH_C64(0x6363F25C3FF263A5), SPH_C64(0x02020A12100A0206), + SPH_C64(0xAAAA38933938AAE3), SPH_C64(0x7171A8DEAFA87193), + SPH_C64(0xC8C8CFC60ECFC845), SPH_C64(0x19197DD1C87D192B), + SPH_C64(0x4949703B727049DB), SPH_C64(0xD9D99A5F869AD976), + SPH_C64(0xF2F21D31C31DF20B), SPH_C64(0xE3E348A84B48E338), + SPH_C64(0x5B5B2AB9E22A5BED), SPH_C64(0x888892BC34928885), + SPH_C64(0x9A9AC83EA4C89AB3), SPH_C64(0x2626BE0B2DBE266A), + SPH_C64(0x3232FABF8DFA3256), SPH_C64(0xB0B04A59E94AB0CD), + SPH_C64(0xE9E96AF21B6AE926), SPH_C64(0x0F0F337778330F11), + SPH_C64(0xD5D5A633E6A6D562), SPH_C64(0x8080BAF474BA809D), + SPH_C64(0xBEBE7C27997CBEDF), SPH_C64(0xCDCDDEEB26DECD4A), + SPH_C64(0x3434E489BDE4345C), SPH_C64(0x484875327A7548D8), + SPH_C64(0xFFFF2454AB24FF1C), SPH_C64(0x7A7A8F8DF78F7A8E), + SPH_C64(0x9090EA64F4EA90AD), SPH_C64(0x5F5F3E9DC23E5FE1), + SPH_C64(0x2020A03D1DA02060), SPH_C64(0x6868D50F67D568B8), + SPH_C64(0x1A1A72CAD0721A2E), SPH_C64(0xAEAE2CB7192CAEEF), + SPH_C64(0xB4B45E7DC95EB4C1), SPH_C64(0x545419CE9A1954FC), + SPH_C64(0x9393E57FECE593A8), SPH_C64(0x2222AA2F0DAA2266), + SPH_C64(0x6464E96307E964AC), SPH_C64(0xF1F1122ADB12F10E), + SPH_C64(0x7373A2CCBFA27395), SPH_C64(0x12125A82905A1236), + SPH_C64(0x40405D7A3A5D40C0), SPH_C64(0x0808284840280818), + SPH_C64(0xC3C3E89556E8C358), SPH_C64(0xECEC7BDF337BEC29), + SPH_C64(0xDBDB904D9690DB70), SPH_C64(0xA1A11FC0611FA1FE), + SPH_C64(0x8D8D83911C838D8A), SPH_C64(0x3D3DC9C8F5C93D47), + SPH_C64(0x9797F15BCCF197A4), SPH_C64(0x0000000000000000), + SPH_C64(0xCFCFD4F936D4CF4C), SPH_C64(0x2B2B876E45872B7D), + SPH_C64(0x7676B3E197B3769A), SPH_C64(0x8282B0E664B0829B), + SPH_C64(0xD6D6A928FEA9D667), SPH_C64(0x1B1B77C3D8771B2D), + SPH_C64(0xB5B55B74C15BB5C2), SPH_C64(0xAFAF29BE1129AFEC), + 
SPH_C64(0x6A6ADF1D77DF6ABE), SPH_C64(0x50500DEABA0D50F0), + SPH_C64(0x45454C57124C45CF), SPH_C64(0xF3F31838CB18F308), + SPH_C64(0x3030F0AD9DF03050), SPH_C64(0xEFEF74C42B74EF2C), + SPH_C64(0x3F3FC3DAE5C33F41), SPH_C64(0x55551CC7921C55FF), + SPH_C64(0xA2A210DB7910A2FB), SPH_C64(0xEAEA65E90365EA23), + SPH_C64(0x6565EC6A0FEC65AF), SPH_C64(0xBABA6803B968BAD3), + SPH_C64(0x2F2F934A65932F71), SPH_C64(0xC0C0E78E4EE7C05D), + SPH_C64(0xDEDE8160BE81DE7F), SPH_C64(0x1C1C6CFCE06C1C24), + SPH_C64(0xFDFD2E46BB2EFD1A), SPH_C64(0x4D4D641F52644DD7), + SPH_C64(0x9292E076E4E092AB), SPH_C64(0x7575BCFA8FBC759F), + SPH_C64(0x06061E36301E060A), SPH_C64(0x8A8A98AE24988A83), + SPH_C64(0xB2B2404BF940B2CB), SPH_C64(0xE6E659856359E637), + SPH_C64(0x0E0E367E70360E12), SPH_C64(0x1F1F63E7F8631F21), + SPH_C64(0x6262F75537F762A6), SPH_C64(0xD4D4A33AEEA3D461), + SPH_C64(0xA8A832812932A8E5), SPH_C64(0x9696F452C4F496A7), + SPH_C64(0xF9F93A629B3AF916), SPH_C64(0xC5C5F6A366F6C552), + SPH_C64(0x2525B11035B1256F), SPH_C64(0x595920ABF22059EB), + SPH_C64(0x8484AED054AE8491), SPH_C64(0x7272A7C5B7A77296), + SPH_C64(0x3939DDECD5DD394B), SPH_C64(0x4C4C61165A614CD4), + SPH_C64(0x5E5E3B94CA3B5EE2), SPH_C64(0x7878859FE7857888), + SPH_C64(0x3838D8E5DDD83848), SPH_C64(0x8C8C869814868C89), + SPH_C64(0xD1D1B217C6B2D16E), SPH_C64(0xA5A50BE4410BA5F2), + SPH_C64(0xE2E24DA1434DE23B), SPH_C64(0x6161F84E2FF861A3), + SPH_C64(0xB3B34542F145B3C8), SPH_C64(0x2121A53415A52163), + SPH_C64(0x9C9CD60894D69CB9), SPH_C64(0x1E1E66EEF0661E22), + SPH_C64(0x43435261225243C5), SPH_C64(0xC7C7FCB176FCC754), + SPH_C64(0xFCFC2B4FB32BFC19), SPH_C64(0x040414242014040C), + SPH_C64(0x515108E3B20851F3), SPH_C64(0x9999C725BCC799B6), + SPH_C64(0x6D6DC4224FC46DB7), SPH_C64(0x0D0D396568390D17), + SPH_C64(0xFAFA35798335FA13), SPH_C64(0xDFDF8469B684DF7C), + SPH_C64(0x7E7E9BA9D79B7E82), SPH_C64(0x2424B4193DB4246C), + SPH_C64(0x3B3BD7FEC5D73B4D), SPH_C64(0xABAB3D9A313DABE0), + SPH_C64(0xCECED1F03ED1CE4F), SPH_C64(0x1111559988551133), + 
SPH_C64(0x8F8F89830C898F8C), SPH_C64(0x4E4E6B044A6B4ED2), + SPH_C64(0xB7B75166D151B7C4), SPH_C64(0xEBEB60E00B60EB20), + SPH_C64(0x3C3CCCC1FDCC3C44), SPH_C64(0x8181BFFD7CBF819E), + SPH_C64(0x9494FE40D4FE94A1), SPH_C64(0xF7F70C1CEB0CF704), + SPH_C64(0xB9B96718A167B9D6), SPH_C64(0x13135F8B985F1335), + SPH_C64(0x2C2C9C517D9C2C74), SPH_C64(0xD3D3B805D6B8D368), + SPH_C64(0xE7E75C8C6B5CE734), SPH_C64(0x6E6ECB3957CB6EB2), + SPH_C64(0xC4C4F3AA6EF3C451), SPH_C64(0x03030F1B180F0305), + SPH_C64(0x565613DC8A1356FA), SPH_C64(0x4444495E1A4944CC), + SPH_C64(0x7F7F9EA0DF9E7F81), SPH_C64(0xA9A937882137A9E6), + SPH_C64(0x2A2A82674D822A7E), SPH_C64(0xBBBB6D0AB16DBBD0), + SPH_C64(0xC1C1E28746E2C15E), SPH_C64(0x535302F1A20253F5), + SPH_C64(0xDCDC8B72AE8BDC79), SPH_C64(0x0B0B275358270B1D), + SPH_C64(0x9D9DD3019CD39DBA), SPH_C64(0x6C6CC12B47C16CB4), + SPH_C64(0x3131F5A495F53153), SPH_C64(0x7474B9F387B9749C), + SPH_C64(0xF6F60915E309F607), SPH_C64(0x4646434C0A4346CA), + SPH_C64(0xACAC26A50926ACE9), SPH_C64(0x898997B53C978986), + SPH_C64(0x141444B4A044143C), SPH_C64(0xE1E142BA5B42E13E), + SPH_C64(0x16164EA6B04E163A), SPH_C64(0x3A3AD2F7CDD23A4E), + SPH_C64(0x6969D0066FD069BB), SPH_C64(0x09092D41482D091B), + SPH_C64(0x7070ADD7A7AD7090), SPH_C64(0xB6B6546FD954B6C7), + SPH_C64(0xD0D0B71ECEB7D06D), SPH_C64(0xEDED7ED63B7EED2A), + SPH_C64(0xCCCCDBE22EDBCC49), SPH_C64(0x424257682A5742C6), + SPH_C64(0x9898C22CB4C298B5), SPH_C64(0xA4A40EED490EA4F1), + SPH_C64(0x282888755D882878), SPH_C64(0x5C5C3186DA315CE4), + SPH_C64(0xF8F83F6B933FF815), SPH_C64(0x8686A4C244A48697) +}; + +__constant static const sph_u64 old1_T7[256] __attribute__ ((aligned (128))) = { + SPH_C64(0x1878D8C078182818), SPH_C64(0x23AF2605AF236523), + SPH_C64(0xC6F9B87EF9C657C6), SPH_C64(0xE86FFB136FE825E8), + SPH_C64(0x87A1CB4CA1879487), SPH_C64(0xB86211A962B8D5B8), + SPH_C64(0x0105090805010301), SPH_C64(0x4F6E0D426E4FD14F), + SPH_C64(0x36EE9BADEE365A36), SPH_C64(0xA604FF5904A6F7A6), + SPH_C64(0xD2BD0CDEBDD26BD2), 
SPH_C64(0xF5060EFB06F502F5), + SPH_C64(0x798096EF80798B79), SPH_C64(0x6FCE305FCE6FB16F), + SPH_C64(0x91EF6DFCEF91AE91), SPH_C64(0x5207F8AA0752F652), + SPH_C64(0x60FD4727FD60A060), SPH_C64(0xBC76358976BCD9BC), + SPH_C64(0x9BCD37ACCD9BB09B), SPH_C64(0x8E8C8A048C8E8F8E), + SPH_C64(0xA315D27115A3F8A3), SPH_C64(0x0C3C6C603C0C140C), + SPH_C64(0x7B8A84FF8A7B8D7B), SPH_C64(0x35E180B5E1355F35), + SPH_C64(0x1D69F5E8691D271D), SPH_C64(0xE047B35347E03DE0), + SPH_C64(0xD7AC21F6ACD764D7), SPH_C64(0xC2ED9C5EEDC25BC2), + SPH_C64(0x2E96436D962E722E), SPH_C64(0x4B7A29627A4BDD4B), + SPH_C64(0xFE215DA321FE1FFE), SPH_C64(0x5716D5821657F957), + SPH_C64(0x1541BDA841153F15), SPH_C64(0x77B6E89FB6779977), + SPH_C64(0x37EB92A5EB375937), SPH_C64(0xE5569E7B56E532E5), + SPH_C64(0x9FD9138CD99FBC9F), SPH_C64(0xF01723D317F00DF0), + SPH_C64(0x4A7F206A7F4ADE4A), SPH_C64(0xDA95449E95DA73DA), + SPH_C64(0x5825A2FA2558E858), SPH_C64(0xC9CACF06CAC946C9), + SPH_C64(0x298D7C558D297B29), SPH_C64(0x0A225A50220A1E0A), + SPH_C64(0xB14F50E14FB1CEB1), SPH_C64(0xA01AC9691AA0FDA0), + SPH_C64(0x6BDA147FDA6BBD6B), SPH_C64(0x85ABD95CAB859285), + SPH_C64(0xBD733C8173BDDABD), SPH_C64(0x5D348FD2345DE75D), + SPH_C64(0x1050908050103010), SPH_C64(0xF40307F303F401F4), + SPH_C64(0xCBC0DD16C0CB40CB), SPH_C64(0x3EC6D3EDC63E423E), + SPH_C64(0x05112D2811050F05), SPH_C64(0x67E6781FE667A967), + SPH_C64(0xE453977353E431E4), SPH_C64(0x27BB0225BB276927), + SPH_C64(0x415873325841C341), SPH_C64(0x8B9DA72C9D8B808B), + SPH_C64(0xA701F65101A7F4A7), SPH_C64(0x7D94B2CF947D877D), + SPH_C64(0x95FB49DCFB95A295), SPH_C64(0xD89F568E9FD875D8), + SPH_C64(0xFB30708B30FB10FB), SPH_C64(0xEE71CD2371EE2FEE), + SPH_C64(0x7C91BBC7917C847C), SPH_C64(0x66E37117E366AA66), + SPH_C64(0xDD8E7BA68EDD7ADD), SPH_C64(0x174BAFB84B173917), + SPH_C64(0x474645024647C947), SPH_C64(0x9EDC1A84DC9EBF9E), + SPH_C64(0xCAC5D41EC5CA43CA), SPH_C64(0x2D995875992D772D), + SPH_C64(0xBF792E9179BFDCBF), SPH_C64(0x071B3F381B070907), + SPH_C64(0xAD23AC0123ADEAAD), 
SPH_C64(0x5A2FB0EA2F5AEE5A), + SPH_C64(0x83B5EF6CB5839883), SPH_C64(0x33FFB685FF335533), + SPH_C64(0x63F25C3FF263A563), SPH_C64(0x020A12100A020602), + SPH_C64(0xAA38933938AAE3AA), SPH_C64(0x71A8DEAFA8719371), + SPH_C64(0xC8CFC60ECFC845C8), SPH_C64(0x197DD1C87D192B19), + SPH_C64(0x49703B727049DB49), SPH_C64(0xD99A5F869AD976D9), + SPH_C64(0xF21D31C31DF20BF2), SPH_C64(0xE348A84B48E338E3), + SPH_C64(0x5B2AB9E22A5BED5B), SPH_C64(0x8892BC3492888588), + SPH_C64(0x9AC83EA4C89AB39A), SPH_C64(0x26BE0B2DBE266A26), + SPH_C64(0x32FABF8DFA325632), SPH_C64(0xB04A59E94AB0CDB0), + SPH_C64(0xE96AF21B6AE926E9), SPH_C64(0x0F337778330F110F), + SPH_C64(0xD5A633E6A6D562D5), SPH_C64(0x80BAF474BA809D80), + SPH_C64(0xBE7C27997CBEDFBE), SPH_C64(0xCDDEEB26DECD4ACD), + SPH_C64(0x34E489BDE4345C34), SPH_C64(0x4875327A7548D848), + SPH_C64(0xFF2454AB24FF1CFF), SPH_C64(0x7A8F8DF78F7A8E7A), + SPH_C64(0x90EA64F4EA90AD90), SPH_C64(0x5F3E9DC23E5FE15F), + SPH_C64(0x20A03D1DA0206020), SPH_C64(0x68D50F67D568B868), + SPH_C64(0x1A72CAD0721A2E1A), SPH_C64(0xAE2CB7192CAEEFAE), + SPH_C64(0xB45E7DC95EB4C1B4), SPH_C64(0x5419CE9A1954FC54), + SPH_C64(0x93E57FECE593A893), SPH_C64(0x22AA2F0DAA226622), + SPH_C64(0x64E96307E964AC64), SPH_C64(0xF1122ADB12F10EF1), + SPH_C64(0x73A2CCBFA2739573), SPH_C64(0x125A82905A123612), + SPH_C64(0x405D7A3A5D40C040), SPH_C64(0x0828484028081808), + SPH_C64(0xC3E89556E8C358C3), SPH_C64(0xEC7BDF337BEC29EC), + SPH_C64(0xDB904D9690DB70DB), SPH_C64(0xA11FC0611FA1FEA1), + SPH_C64(0x8D83911C838D8A8D), SPH_C64(0x3DC9C8F5C93D473D), + SPH_C64(0x97F15BCCF197A497), SPH_C64(0x0000000000000000), + SPH_C64(0xCFD4F936D4CF4CCF), SPH_C64(0x2B876E45872B7D2B), + SPH_C64(0x76B3E197B3769A76), SPH_C64(0x82B0E664B0829B82), + SPH_C64(0xD6A928FEA9D667D6), SPH_C64(0x1B77C3D8771B2D1B), + SPH_C64(0xB55B74C15BB5C2B5), SPH_C64(0xAF29BE1129AFECAF), + SPH_C64(0x6ADF1D77DF6ABE6A), SPH_C64(0x500DEABA0D50F050), + SPH_C64(0x454C57124C45CF45), SPH_C64(0xF31838CB18F308F3), + SPH_C64(0x30F0AD9DF0305030), 
SPH_C64(0xEF74C42B74EF2CEF), + SPH_C64(0x3FC3DAE5C33F413F), SPH_C64(0x551CC7921C55FF55), + SPH_C64(0xA210DB7910A2FBA2), SPH_C64(0xEA65E90365EA23EA), + SPH_C64(0x65EC6A0FEC65AF65), SPH_C64(0xBA6803B968BAD3BA), + SPH_C64(0x2F934A65932F712F), SPH_C64(0xC0E78E4EE7C05DC0), + SPH_C64(0xDE8160BE81DE7FDE), SPH_C64(0x1C6CFCE06C1C241C), + SPH_C64(0xFD2E46BB2EFD1AFD), SPH_C64(0x4D641F52644DD74D), + SPH_C64(0x92E076E4E092AB92), SPH_C64(0x75BCFA8FBC759F75), + SPH_C64(0x061E36301E060A06), SPH_C64(0x8A98AE24988A838A), + SPH_C64(0xB2404BF940B2CBB2), SPH_C64(0xE659856359E637E6), + SPH_C64(0x0E367E70360E120E), SPH_C64(0x1F63E7F8631F211F), + SPH_C64(0x62F75537F762A662), SPH_C64(0xD4A33AEEA3D461D4), + SPH_C64(0xA832812932A8E5A8), SPH_C64(0x96F452C4F496A796), + SPH_C64(0xF93A629B3AF916F9), SPH_C64(0xC5F6A366F6C552C5), + SPH_C64(0x25B11035B1256F25), SPH_C64(0x5920ABF22059EB59), + SPH_C64(0x84AED054AE849184), SPH_C64(0x72A7C5B7A7729672), + SPH_C64(0x39DDECD5DD394B39), SPH_C64(0x4C61165A614CD44C), + SPH_C64(0x5E3B94CA3B5EE25E), SPH_C64(0x78859FE785788878), + SPH_C64(0x38D8E5DDD8384838), SPH_C64(0x8C869814868C898C), + SPH_C64(0xD1B217C6B2D16ED1), SPH_C64(0xA50BE4410BA5F2A5), + SPH_C64(0xE24DA1434DE23BE2), SPH_C64(0x61F84E2FF861A361), + SPH_C64(0xB34542F145B3C8B3), SPH_C64(0x21A53415A5216321), + SPH_C64(0x9CD60894D69CB99C), SPH_C64(0x1E66EEF0661E221E), + SPH_C64(0x435261225243C543), SPH_C64(0xC7FCB176FCC754C7), + SPH_C64(0xFC2B4FB32BFC19FC), SPH_C64(0x0414242014040C04), + SPH_C64(0x5108E3B20851F351), SPH_C64(0x99C725BCC799B699), + SPH_C64(0x6DC4224FC46DB76D), SPH_C64(0x0D396568390D170D), + SPH_C64(0xFA35798335FA13FA), SPH_C64(0xDF8469B684DF7CDF), + SPH_C64(0x7E9BA9D79B7E827E), SPH_C64(0x24B4193DB4246C24), + SPH_C64(0x3BD7FEC5D73B4D3B), SPH_C64(0xAB3D9A313DABE0AB), + SPH_C64(0xCED1F03ED1CE4FCE), SPH_C64(0x1155998855113311), + SPH_C64(0x8F89830C898F8C8F), SPH_C64(0x4E6B044A6B4ED24E), + SPH_C64(0xB75166D151B7C4B7), SPH_C64(0xEB60E00B60EB20EB), + SPH_C64(0x3CCCC1FDCC3C443C), 
SPH_C64(0x81BFFD7CBF819E81), + SPH_C64(0x94FE40D4FE94A194), SPH_C64(0xF70C1CEB0CF704F7), + SPH_C64(0xB96718A167B9D6B9), SPH_C64(0x135F8B985F133513), + SPH_C64(0x2C9C517D9C2C742C), SPH_C64(0xD3B805D6B8D368D3), + SPH_C64(0xE75C8C6B5CE734E7), SPH_C64(0x6ECB3957CB6EB26E), + SPH_C64(0xC4F3AA6EF3C451C4), SPH_C64(0x030F1B180F030503), + SPH_C64(0x5613DC8A1356FA56), SPH_C64(0x44495E1A4944CC44), + SPH_C64(0x7F9EA0DF9E7F817F), SPH_C64(0xA937882137A9E6A9), + SPH_C64(0x2A82674D822A7E2A), SPH_C64(0xBB6D0AB16DBBD0BB), + SPH_C64(0xC1E28746E2C15EC1), SPH_C64(0x5302F1A20253F553), + SPH_C64(0xDC8B72AE8BDC79DC), SPH_C64(0x0B275358270B1D0B), + SPH_C64(0x9DD3019CD39DBA9D), SPH_C64(0x6CC12B47C16CB46C), + SPH_C64(0x31F5A495F5315331), SPH_C64(0x74B9F387B9749C74), + SPH_C64(0xF60915E309F607F6), SPH_C64(0x46434C0A4346CA46), + SPH_C64(0xAC26A50926ACE9AC), SPH_C64(0x8997B53C97898689), + SPH_C64(0x1444B4A044143C14), SPH_C64(0xE142BA5B42E13EE1), + SPH_C64(0x164EA6B04E163A16), SPH_C64(0x3AD2F7CDD23A4E3A), + SPH_C64(0x69D0066FD069BB69), SPH_C64(0x092D41482D091B09), + SPH_C64(0x70ADD7A7AD709070), SPH_C64(0xB6546FD954B6C7B6), + SPH_C64(0xD0B71ECEB7D06DD0), SPH_C64(0xED7ED63B7EED2AED), + SPH_C64(0xCCDBE22EDBCC49CC), SPH_C64(0x4257682A5742C642), + SPH_C64(0x98C22CB4C298B598), SPH_C64(0xA40EED490EA4F1A4), + SPH_C64(0x2888755D88287828), SPH_C64(0x5C3186DA315CE45C), + SPH_C64(0xF83F6B933FF815F8), SPH_C64(0x86A4C244A4869786) +}; + +__constant static const sph_u64 rc[10] __attribute__ ((aligned (128))) = { + SPH_C64(0x4F01B887E8C62318), + SPH_C64(0x52916F79F5D2A636), + SPH_C64(0x357B0CA38E9BBC60), + SPH_C64(0x57FE4B2EC2D7E01D), + SPH_C64(0xDA4AF09FE5377715), + SPH_C64(0x856BA0B10A29C958), + SPH_C64(0x67053ECBF4105DBD), + SPH_C64(0xD8957DA78B4127E4), + SPH_C64(0x9E4717DD667CEEFB), + SPH_C64(0x33835AAD07BF2DCA) +}; + + +/* ====================================================================== */ + +#define BYTE(x, n) ((unsigned)((x) >> (8 * (n))) & 0xFF) + + +#define ROUND_ELT(in, i0, i1, i2, i3, i4, i5, i6, 
i7) \ + ( old1_T0[BYTE(in[i0], 0)] \ + ^ old1_T1[BYTE(in[i1], 1)] \ + ^ old1_T2[BYTE(in[i2], 2)] \ + ^ old1_T3[BYTE(in[i3], 3)] \ + ^ old1_T4[BYTE(in[i4], 4)] \ + ^ old1_T5[BYTE(in[i5], 5)] \ + ^ old1_T6[BYTE(in[i6], 6)] \ + ^ old1_T7[BYTE(in[i7], 7)]) + +#define SPH_T32(x) (as_uint(x)) +#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n)) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) +#define SPH_T64(x) (as_ulong(x)) +#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +#define SWAP4(x) (SPH_ROTL32(as_uint(x) & 0x00FF00FF, 24U)|SPH_ROTL32(as_uint(x) & 0xFF00FF00, 8U)) /* reverses the byte order of a 32-bit value */ +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) /* reverses the byte order of a 64-bit value */ + +#if SPH_BIG_ENDIAN /* DEC64BE, DEC32LE and DEC64LE take a __global pointer and expand to a plain expression with no trailing semicolon, so they can be nested inside larger expressions */ + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)) + #define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x)) + #define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)) +#else + #define DEC64E(x) SWAP8(x) + #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)) + #define DEC32LE(x) (*(const __global sph_u32 *) (x)) + #define DEC64LE(x) (*(const __global sph_u64 *) (x)) +#endif + +typedef union { + unsigned char h1[64]; + uint h4[16]; + ulong h8[8]; +} hash_t; /* one 64-byte hash slot, addressable as bytes, 32-bit words or 64-bit words */ + +void whirlpool_round(sph_u64* n, sph_u64* h){ /* runs 10 rounds in place on two 8-word arrays: each round first replaces h[] by ROUND_ELT(h) with rc[r] XORed into word 0 only, then replaces n[] by ROUND_ELT(n) XOR the new h[]; both arrays are overwritten */ + sph_u64 t0, t1, t2, t3, t4, t5, t6, t7; + for (unsigned r = 0; r < 10; r ++) { + t0 = (ROUND_ELT(h, 0, 7, 6, 5, 4, 3, 2, 1) ^ rc[r]); + t1 = (ROUND_ELT(h, 1, 0, 7, 6, 5, 4, 3, 2) ^ 0 ); + t2 = (ROUND_ELT(h, 2, 1, 0, 7, 6, 5, 4, 3) ^ 0 ); + t3 = (ROUND_ELT(h, 3, 2, 1, 0, 7, 6, 5, 4) ^ 0 ); + t4 = (ROUND_ELT(h, 4, 3, 2, 1, 0, 7, 6, 5) ^ 0 ); + t5 = (ROUND_ELT(h, 5, 4, 3, 2, 1, 0, 7, 6) ^ 0 ); + t6 = (ROUND_ELT(h, 6, 5, 4, 3, 2, 1, 0, 7) ^ 0 ); + t7 = (ROUND_ELT(h, 7, 6, 5, 4, 3, 2, 1, 0) ^ 0 ); + + h[0] = t0; + h[1] = t1; + h[2] = t2; + h[3] = t3; + h[4] = t4; + h[5] = t5; + h[6] = t6; + h[7] = t7; + + t0 = ROUND_ELT(n, 0, 7, 6, 5, 4, 3, 2, 1) ^ h[0]; + t1 = ROUND_ELT(n, 1, 0, 7, 6, 5, 4, 3, 2) ^ h[1]; +
t2 = ROUND_ELT(n, 2, 1, 0, 7, 6, 5, 4, 3) ^ h[2]; + t3 = ROUND_ELT(n, 3, 2, 1, 0, 7, 6, 5, 4) ^ h[3]; + t4 = ROUND_ELT(n, 4, 3, 2, 1, 0, 7, 6, 5) ^ h[4]; + t5 = ROUND_ELT(n, 5, 4, 3, 2, 1, 0, 7, 6) ^ h[5]; + t6 = ROUND_ELT(n, 6, 5, 4, 3, 2, 1, 0, 7) ^ h[6]; + t7 = ROUND_ELT(n, 7, 6, 5, 4, 3, 2, 1, 0) ^ h[7]; + + n[0] = t0; + n[1] = t1; + n[2] = t2; + n[3] = t3; + n[4] = t4; + n[5] = t5; + n[6] = t6; + n[7] = t7; + } +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global unsigned char* block, __global hash_t* hashes) +{ /* processes the 80 bytes at block as two 64-byte blocks (bytes 0-63, then bytes 64-79 plus constant words) and stores the 8-word result in this work-item's hashes[] slot */ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + + sph_u64 n[8]; + sph_u64 h[8]; + sph_u64 state[8]; + + h[0] = h[1] = h[2] = h[3] = h[4] = h[5] = h[6] = h[7] = 0; + + n[0] = h[0] ^ DEC64LE(block + 0); + n[1] = h[1] ^ DEC64LE(block + 8); + n[2] = h[2] ^ DEC64LE(block + 16); + n[3] = h[3] ^ DEC64LE(block + 24); + n[4] = h[4] ^ DEC64LE(block + 32); + n[5] = h[5] ^ DEC64LE(block + 40); + n[6] = h[6] ^ DEC64LE(block + 48); + n[7] = h[7] ^ DEC64LE(block + 56); + + whirlpool_round(n, h); + + h[0] = state[0] = n[0] ^ DEC64LE(block + 0); /* chaining value after block 1: round output XOR the block-1 input word (the initial h was zero) */ + h[1] = state[1] = n[1] ^ DEC64LE(block + 8); + h[2] = state[2] = n[2] ^ DEC64LE(block + 16); + h[3] = state[3] = n[3] ^ DEC64LE(block + 24); + h[4] = state[4] = n[4] ^ DEC64LE(block + 32); + h[5] = state[5] = n[5] ^ DEC64LE(block + 40); + h[6] = state[6] = n[6] ^ DEC64LE(block + 48); + h[7] = state[7] = n[7] ^ DEC64LE(block + 56); + + + n[0] = DEC64LE(block + 64); + n[1] = DEC64LE(block + 72); + n[1] &= 0x00000000FFFFFFFF; /* keep the low 32 bits; the high 32 bits are filled with gid on the next line */ + n[1] ^= ((sph_u64) gid) << 32; + n[3] = n[4] = n[5] = n[6] = 0; + n[2] = 0x0000000000000080; /* NOTE(review): presumably the 0x80 padding byte that follows the 80 data bytes - confirm */ + n[7] = 0x8002000000000000; /* NOTE(review): presumably the message bit length (640 = 0x0280) stored big-endian in the last two bytes - confirm */ + sph_u64 temp0,temp1,temp2,temp7; + temp0 = n[0]; /* copies of the non-zero block-2 input words, needed again for the final XOR */ + temp1 = n[1]; + temp2 = n[2]; + temp7 = n[7]; + + n[0] ^= h[0]; + n[1] ^= h[1]; + n[2] ^= h[2]; + n[3] ^= h[3]; + n[4] ^= h[4]; + n[5] ^= h[5]; + n[6] ^= h[6]; + n[7] ^= h[7]; + + whirlpool_round(n, h); + + hash->h8[0] = state[0] ^ n[0] ^ temp0; /* result word: previous chaining value XOR round output XOR block-2 input word (input words 3..6 are zero, so they are omitted below) */ +
hash->h8[1] = state[1] ^ n[1] ^ temp1; + hash->h8[2] = state[2] ^ n[2] ^ temp2; + hash->h8[3] = state[3] ^ n[3]; + hash->h8[4] = state[4] ^ n[4]; + hash->h8[5] = state[5] ^ n[5]; + hash->h8[6] = state[6] ^ n[6]; + hash->h8[7] = state[7] ^ n[7] ^ temp7; +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global hash_t* hashes) +{ /* re-hashes this work-item's 64-byte hashes[] slot three times in place; each pass processes the slot contents as block 1 and a constant block 2 */ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + sph_u64 n[8]; + sph_u64 h[8]; + sph_u64 state[8]; + + for (int loop=0;loop<3; loop++) { + n[0] = (hash->h8[0]); + n[1] = (hash->h8[1]); + n[2] = (hash->h8[2]); + n[3] = (hash->h8[3]); + n[4] = (hash->h8[4]); + n[5] = (hash->h8[5]); + n[6] = (hash->h8[6]); + n[7] = (hash->h8[7]); + + h[0] = h[1] = h[2] = h[3] = h[4] = h[5] = h[6] = h[7] = 0; + + n[0] ^= h[0]; + n[1] ^= h[1]; + n[2] ^= h[2]; + n[3] ^= h[3]; + n[4] ^= h[4]; + n[5] ^= h[5]; + n[6] ^= h[6]; + n[7] ^= h[7]; + + whirlpool_round(n, h); + + n[0] = h[0] = state[0] = n[0] ^ (hash->h8[0]); /* chaining value after block 1, also loaded into n[] and h[] for block 2 */ + n[1] = h[1] = state[1] = n[1] ^ (hash->h8[1]); + n[2] = h[2] = state[2] = n[2] ^ (hash->h8[2]); + n[3] = h[3] = state[3] = n[3] ^ (hash->h8[3]); + n[4] = h[4] = state[4] = n[4] ^ (hash->h8[4]); + n[5] = h[5] = state[5] = n[5] ^ (hash->h8[5]); + n[6] = h[6] = state[6] = n[6] ^ (hash->h8[6]); + n[7] = h[7] = state[7] = n[7] ^ (hash->h8[7]); + + n[0] ^= 0x80 ; /* block 2 is constant: 0x80 in word 0 and 0x2000000000000 in word 7, all other words zero. NOTE(review): presumably padding byte plus the 512-bit message length - confirm */ + n[1] ^= 0 ; + n[2] ^= 0 ; + n[3] ^= 0 ; + n[4] ^= 0 ; + n[5] ^= 0 ; + n[6] ^= 0 ; + n[7] ^= 0x2000000000000 ; + + whirlpool_round(n, h); + + hash->h8[0] = state[0] ^ n[0] ^ 0x80; + hash->h8[1] = state[1] ^ n[1]; + hash->h8[2] = state[2] ^ n[2]; + hash->h8[3] = state[3] ^ n[3]; + hash->h8[4] = state[4] ^ n[4]; + hash->h8[5] = state[5] ^ n[5]; + hash->h8[6] = state[6] ^ n[6]; + hash->h8[7] = state[7] ^ n[7] ^ 0x2000000000000; + } + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global hash_t* hashes) +{ /* empty kernel body: performs no work */ + + +} +
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global hash_t* hashes, __global uint* output, const ulong target) +{ /* reports this work-item if 64-bit word 3 of its hashes[] slot is less than or equal to target */ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + bool result = (hash->h8[3] <= target); + if (result) + output[atomic_inc(output+0xFF)] = SWAP4(gid); /* output[0xFF] is atomically incremented and its previous value is the index at which the byte-swapped gid is stored */ +} + +#endif // W_CL \ No newline at end of file diff --git a/kernel/x14.cl b/kernel/x14.cl index ff335182..5a8d47bf 100644 --- a/kernel/x14.cl +++ b/kernel/x14.cl @@ -463,69 +463,92 @@ __kernel void search2(__global hash_t* hashes) uint gid = get_global_id(0); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; - +#if !SPH_SMALL_FOOTPRINT_GROESTL + __local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256]; + __local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256]; +#else + __local sph_u64 T0_C[256], T4_C[256]; +#endif int init = get_local_id(0); int step = get_local_size(0); for (int i = init; i < 256; i += step) { - T0_L[i] = T0[i]; - T4_L[i] = T4[i]; - T1_L[i] = T1[i]; - T2_L[i] = T2[i]; - T3_L[i] = T3[i]; - T5_L[i] = T5[i]; - T6_L[i] = T6[i]; - T7_L[i] = T7[i]; + T0_C[i] = T0[i]; + T4_C[i] = T4[i]; +#if !SPH_SMALL_FOOTPRINT_GROESTL + T1_C[i] = T1[i]; + T2_C[i] = T2[i]; + T3_C[i] = T3[i]; + T5_C[i] = T5[i]; + T6_C[i] = T6[i]; + T7_C[i] = T7[i]; +#endif } - - barrier(CLK_LOCAL_MEM_FENCE); - - #define T0 T0_L - #define T1 T1_L - #define T2 T2_L - #define T3 T3_L - #define T4 T4_L - #define T5 T5_L - #define T6 T6_L - #define T7 T7_L - - // groestl - sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; + barrier(CLK_LOCAL_MEM_FENCE); // groestl +#define T0 T0_C +#define T1 T1_C +#define T2 T2_C +#define T3 T3_C +#define T4 T4_C +#define T5 T5_C +#define T6 T6_C +#define T7 T7_C + + + sph_u64 H[16]; +//#pragma unroll 15 + for (unsigned int u = 0; u < 15; u ++) + H[u] = 0; +#if
USE_LE + H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); +#else + H[15] = (sph_u64)512; +#endif sph_u64 g[16], m[16]; - g[0] = m[0] = DEC64E(hash->h8[0]); - g[1] = m[1] = DEC64E(hash->h8[1]); - g[2] = m[2] = DEC64E(hash->h8[2]); - g[3] = m[3] = DEC64E(hash->h8[3]); - g[4] = m[4] = DEC64E(hash->h8[4]); - g[5] = m[5] = DEC64E(hash->h8[5]); - g[6] = m[6] = DEC64E(hash->h8[6]); - g[7] = m[7] = DEC64E(hash->h8[7]); - g[8] = m[8] = 0x80; - g[9] = m[9] = 0; - g[10] = m[10] = 0; - g[11] = m[11] = 0; - g[12] = m[12] = 0; - g[13] = m[13] = 0; - g[14] = m[14] = 0; - g[15] = 0x102000000000000; - m[15] = 0x100000000000000; - + m[0] = DEC64E(hash->h8[0]); + m[1] = DEC64E(hash->h8[1]); + m[2] = DEC64E(hash->h8[2]); + m[3] = DEC64E(hash->h8[3]); + m[4] = DEC64E(hash->h8[4]); + m[5] = DEC64E(hash->h8[5]); + m[6] = DEC64E(hash->h8[6]); + m[7] = DEC64E(hash->h8[7]); + +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + g[u] = m[u] ^ H[u]; + m[8] = 0x80; g[8] = m[8] ^ H[8]; + m[9] = 0; g[9] = m[9] ^ H[9]; + m[10] = 0; g[10] = m[10] ^ H[10]; + m[11] = 0; g[11] = m[11] ^ H[11]; + m[12] = 0; g[12] = m[12] ^ H[12]; + m[13] = 0; g[13] = m[13] ^ H[13]; + m[14] = 0; g[14] = m[14] ^ H[14]; + m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; PERM_BIG_P(g); PERM_BIG_Q(m); - sph_u64 xH[16]; +//#pragma unroll 16 for (unsigned int u = 0; u < 16; u ++) - xH[u] = H[u] ^= g[u] ^ m[u]; + H[u] ^= g[u] ^ m[u]; + sph_u64 xH[16]; +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + xH[u] = H[u]; PERM_BIG_P(xH); - for (unsigned int u = 8; u < 16; u ++) - hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); +//#pragma unroll 16 + for (unsigned int u = 0; u < 16; u ++) + H[u] ^= xH[u]; + +//#pragma unroll 8 + for (unsigned int u = 0; u < 8; u ++) + hash->h8[u] = DEC64E(H[u + 8]); + barrier(CLK_GLOBAL_MEM_FENCE); - barrier(CLK_GLOBAL_MEM_FENCE); } __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) @@ -846,7 +869,7 @@ __kernel void search8(__global hash_t* hashes) sph_u32 
rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; - sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; + sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; rk00 = hash->h4[0]; rk01 = hash->h4[1]; diff --git a/logging.c b/logging.c index 81bd73ce..7f8b5d65 100644 --- a/logging.c +++ b/logging.c @@ -59,12 +59,20 @@ void applogsiz(int prio, int size, const char* fmt, ...) /* high-level logging function, based on global opt_log_level */ void vapplogsiz(int prio, int size, const char* fmt, va_list args) { - if (opt_debug || prio != LOG_DEBUG) { + if ((opt_debug || prio != LOG_DEBUG)) { char *tmp42 = (char *)calloc(size + 1, 1); vsnprintf(tmp42, size, fmt, args); _applog(prio, tmp42, false); free(tmp42); } +#ifdef DEV_DEBUG_MODE + else if(prio == LOG_DEBUG) { + char *tmp42 = (char *)calloc(size + 1, 1); + vsnprintf(tmp42, size, fmt, args); + __debug("", tmp42); + free(tmp42); + } +#endif } /* @@ -80,6 +88,13 @@ void _applog(int prio, const char *str, bool force) if (0) {} #endif else { + +#ifdef DEV_DEBUG_MODE + if(prio == LOG_DEBUG) { + __debug("", str); + } +#endif + bool write_console = opt_debug_console || (opt_verbose && prio != LOG_DEBUG) || prio <= opt_log_level; bool write_stderr = !isatty(fileno((FILE *)stderr)); if (!(write_console || write_stderr)) diff --git a/miner.h b/miner.h index 84434641..57032077 100644 --- a/miner.h +++ b/miner.h @@ -266,6 +266,11 @@ DRIVER_PARSE_COMMANDS(DRIVER_PROTOTYPE) #define strtobool(str) ((str && (!strcasecmp(str, "true") || !strcasecmp(str, "yes") || !strcasecmp(str, "1")))?true:false) #endif +extern int opt_remoteconf_retry; +extern int opt_remoteconf_wait; +extern bool opt_remoteconf_usecache; + + enum alive { LIFE_WELL, LIFE_SICK, @@ -1025,6 +1030,7 @@ extern char *sgminer_path; extern int opt_shares; extern bool opt_fail_only; extern int opt_fail_switch_delay; +extern int opt_watchpool_refresh; extern bool opt_autofan; 
extern bool opt_autoengine; extern bool use_curses; @@ -1099,8 +1105,8 @@ extern pthread_cond_t restart_cond; extern void clear_stratum_shares(struct pool *pool); extern void clear_pool_work(struct pool *pool); -extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2); -extern void set_target_neoscrypt(unsigned char *target, double diff); +extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2, const int thr_id); +extern void set_target_neoscrypt(unsigned char *target, double diff, const int thr_id); extern void kill_work(void); @@ -1274,6 +1280,7 @@ struct pool { bool remove_at_start; bool removed; bool lp_started; + bool backup; char *hdr_path; char *lp_url; @@ -1481,7 +1488,13 @@ extern void _wlogprint(const char *str); extern int curses_int(const char *query); extern char *curses_input(const char *query); extern void kill_work(void); -extern void switch_pools(struct pool *selected); + +//helper macro to preserve existing code +#ifndef switch_pools + #define switch_pools(p) __switch_pools(p, TRUE) +#endif +extern void __switch_pools(struct pool *selected, bool saveprio); + extern void discard_work(struct work *work); extern void remove_pool(struct pool *pool); //extern void write_config(FILE *fcfg); diff --git a/ocl.c b/ocl.c index 8be317a5..14ad7fcd 100644 --- a/ocl.c +++ b/ocl.c @@ -299,8 +299,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg char filename[255]; char strbuf[32]; - sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? 
cgpu->algorithm.kernelfile : cgpu->algorithm.name)); + snprintf(strbuf, sizeof(strbuf), "%s.cl", (!empty_string(cgpu->algorithm.kernelfile)?cgpu->algorithm.kernelfile:cgpu->algorithm.name)); /* bounded write: strbuf is only 32 bytes and the kernel file name comes from configuration; an over-long name is truncated instead of overflowing the stack */ strcpy(filename, strbuf); + applog(LOG_DEBUG, "Using source file %s", filename); /* For some reason 2 vectors is still better even if the card says @@ -347,54 +348,111 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg // neoscrypt calculates TC differently if (!safe_cmp(cgpu->algorithm.name, "neoscrypt")) { - int max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity); - size_t glob_thread_count = 1UL << max_int; - - // if TC is entered by user, use that value... otherwise use default - cgpu->thread_concurrency = ((cgpu->opt_tc) ? cgpu->opt_tc : ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count)); + size_t glob_thread_count; + long max_int; + unsigned char type = 0; /* intensity kind in use: 0 = intensity, 1 = xintensity, 2 = rawintensity or thread concurrency */ - // if TC * scratchbuf size is too big for memory... reduce to max - if (((uint64_t)cgpu->thread_concurrency * NEOSCRYPT_SCRATCHBUF_SIZE) >(uint64_t)cgpu->max_alloc) { - /* Selected intensity will not run on this GPU. Not enough memory. - * Adapt the memory setting. */ - glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE; + // determine which intensity type to use + // raw intensity is the same as TC so use either or setting... + if (cgpu->rawintensity > 0 || cgpu->opt_tc) { - /* Find highest significant bit in glob_thread_count, which gives - * the intensity. */ - while (max_int && ((1U << max_int) & glob_thread_count) == 0) { - --max_int; + if (cgpu->opt_tc) { + glob_thread_count = cgpu->rawintensity = cgpu->opt_tc; } - - /* Check if max_intensity is >0.
*/ - if (max_int < MIN_INTENSITY) { - applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu); - max_int = MIN_INTENSITY; + else { + glob_thread_count = cgpu->rawintensity; } - cgpu->intensity = max_int; - cgpu->thread_concurrency = 1U << max_int; + max_int = glob_thread_count; + type = 2; + } + else if (cgpu->xintensity > 0) { + glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift)?(1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)):cgpu->xintensity); + max_int = cgpu->xintensity; + type = 1; + } + else { + glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity); + max_int = ((cgpu->dynamic)?MAX_INTENSITY:cgpu->intensity); + } + + glob_thread_count = ((glob_thread_count < cgpu->work_size)?cgpu->work_size:glob_thread_count); + + // if TC * scratchbuf size is too big for memory... reduce to max + if ((glob_thread_count * NEOSCRYPT_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) { + + /* Selected intensity will not run on this GPU. Not enough memory. + * Adapt the memory setting. */ + // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc + switch (type) { + //raw intensity + case 2: + while ((glob_thread_count * NEOSCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) { + --glob_thread_count; + } + + max_int = glob_thread_count; + cgpu->rawintensity = glob_thread_count; + break; + + //x intensity + case 1: + glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE; + max_int = glob_thread_count / clState->compute_shaders; + + while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) { + --max_int; + } + + /* Check if max_intensity is >0. 
*/ + if (max_int < MIN_XINTENSITY) { + applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu); + max_int = MIN_XINTENSITY; + } + + cgpu->xintensity = max_int; + glob_thread_count = clState->compute_shaders * (1UL << max_int); + break; + + default: + glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE; + while (max_int && ((1UL << max_int) & glob_thread_count) == 0) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_INTENSITY) { + applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu); + max_int = MIN_INTENSITY; + } + + cgpu->intensity = max_int; + glob_thread_count = 1UL << max_int; + break; + } } + // TC is glob thread count + cgpu->thread_concurrency = glob_thread_count; + applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); - } - else if (!cgpu->opt_tc) { + } else if (!cgpu->opt_tc) { unsigned int sixtyfours; sixtyfours = cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1; cgpu->thread_concurrency = sixtyfours * 64; if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; + if (cgpu->thread_concurrency > cgpu->shaders * 5) { cgpu->thread_concurrency = cgpu->shaders * 5; } } applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency)); + } else { + cgpu->thread_concurrency = cgpu->opt_tc; } - else { - cgpu->thread_concurrency = cgpu->opt_tc; - } - cl_uint slot, cpnd; @@ -420,7 +478,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg build_data->opencl_version = get_opencl_version(devices[gpu]); build_data->patch_bfi = needs_bfi_patch(build_data); - strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile) ? 
cgpu->algorithm.kernelfile : cgpu->algorithm.name)); + strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile)?cgpu->algorithm.kernelfile:cgpu->algorithm.name)); strcat(build_data->binary_filename, name); if (clState->goffset) strcat(build_data->binary_filename, "g"); @@ -491,20 +549,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg bufsize = NEOSCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency; /* This is the input buffer. For neoscrypt this is guaranteed to be - * 80 bytes only. */ + * 80 bytes only. */ readbufsize = 80; applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); - // scrypt/n-scrypt - } - else { + // scrypt/n-scrypt + } else { size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0)); bufsize = 128 * ipt * cgpu->thread_concurrency; applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); } - } - else { - bufsize = (size_t)algorithm->rw_buffer_size; + } else { + bufsize = (size_t) algorithm->rw_buffer_size; applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); } @@ -536,10 +592,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status); return NULL; } - + applog(LOG_DEBUG, "Using output buffer sized %lu", BUFFERSIZE); clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); - if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status); return NULL; diff --git a/pool.c b/pool.c index eb619953..d2d9dbea 100644 --- a/pool.c +++ b/pool.c @@ -33,18 +33,21 @@ #include char* get_pool_name(struct pool *pool) { - if (opt_incognito) - return ""; + if (opt_incognito) { + return ""; + } - if (strcmp(pool->name, "") == 0) - return pool->sockaddr_url; + 
if (empty_string(pool->name)) { + return pool->sockaddr_url; + } - return pool->name; + return pool->name; } char* get_pool_user(struct pool *pool) { - if (opt_incognito) - return ""; + if (opt_incognito) { + return ""; + } - return pool->rpc_user; + return pool->rpc_user; } diff --git a/sgminer.c b/sgminer.c index 60347904..f3e40933 100644 --- a/sgminer.c +++ b/sgminer.c @@ -77,6 +77,7 @@ static char packagename[256]; static bool startup = true; //sgminer is starting up static bool gpu_initialized = false; //gpu initialized static int init_pool; //pool used to initialize gpus +static bool on_backup_pool = false; //for simple connect strategy... flag if we're on a backup pool bool opt_work_update; bool opt_protocol; @@ -89,6 +90,11 @@ bool opt_loginput; bool opt_compact; bool opt_incognito; +// remote config options... +int opt_remoteconf_retry = 3; // number of retries +int opt_remoteconf_wait = 10; // wait in secs between retries +bool opt_remoteconf_usecache = false; // use last downloaded copy of the config file when download fails + const int opt_cutofftemp = 95; int opt_log_interval = 5; int opt_queue = 1; @@ -134,6 +140,7 @@ static bool opt_submit_stale = true; int opt_shares; bool opt_fail_only; int opt_fail_switch_delay = 60; +int opt_watchpool_refresh = 30; static bool opt_fix_protocol; static bool opt_lowmem; static bool opt_morenotices; @@ -218,7 +225,11 @@ pthread_mutex_t stats_lock; static void *restart_mining_threads_thread(void *userdata); static void apply_initial_gpu_settings(struct pool *pool); +static unsigned long compare_pool_settings(struct pool *oldpool, struct pool *newpool); +static void apply_switcher_options(unsigned long options, struct pool *pool); static void restart_mining_threads(unsigned int new_n_threads); +static void probe_pools(void); +static bool test_pool(struct pool *pool); int hw_errors; int total_accepted, total_rejected; @@ -530,7 +541,7 @@ struct pool *add_pool(void) char buf[32]; buf[0] = '\0'; pool->name = strdup(buf); 
- pool->profile = strdup(buf); //profile blank by default + pool->profile = strdup(buf); //profile blank by default pool->algorithm.name[0] = '\0'; //blank algorithm name /* intensities default to blank */ @@ -796,6 +807,13 @@ static char *set_pool_algorithm(const char *arg) return NULL; } +static char *set_pool_backup(const char *arg) +{ + struct pool *pool = get_current_pool(); + pool->backup = TRUE; + return NULL; +} + static char *set_pool_devices(const char *arg) { struct pool *pool = get_current_pool(); @@ -806,10 +824,10 @@ static char *set_pool_devices(const char *arg) static char *set_pool_kernelfile(const char *arg) { struct pool *pool = get_current_pool(); - + applog(LOG_DEBUG, "Setting pool %i algorithm kernel file to %s", pool->pool_no, arg); pool->algorithm.kernelfile = arg; - + return NULL; } @@ -1459,8 +1477,8 @@ struct opt_table opt_config_table[] = { set_int_0_to_9999, opt_show_intval, &opt_keccak_unroll, "Set SPH_KECCAK_UNROLL for Xn derived algorithms (Default: 0)"), OPT_WITH_ARG("--kernelfile", - set_default_kernelfile, NULL, NULL, - "Set the algorithm kernel source file (without file extension)."), + set_default_kernelfile, NULL, NULL, + "Set the algorithm kernel source file (without file extension)."), OPT_WITH_ARG("--lookup-gap", set_default_lookup_gap, NULL, NULL, "Set GPU lookup gap for scrypt mining, comma separated"), @@ -1556,6 +1574,9 @@ struct opt_table opt_config_table[] = { OPT_WITH_ARG("--pool-algorithm|--pool-kernel", set_pool_algorithm, NULL, NULL, "Set algorithm for pool"), + OPT_WITHOUT_ARG("--pool-backup", + set_pool_backup, NULL, + "Mark this pool as a backup for simple connect strategy"), OPT_WITH_ARG("--pool-device", set_pool_devices, NULL, NULL, "Select devices to use with pool, one value, range and/or comma separated (e.g. 
0-2,4) default: all"), @@ -1785,6 +1806,9 @@ struct opt_table opt_config_table[] = { OPT_WITHOUT_ARG("--verbose|-v", opt_set_bool, &opt_verbose, "Log verbose output to stderr as well as status output"), + OPT_WITH_ARG("--watchpool-refresh", + set_int_1_to_65535, opt_show_intval, &opt_watchpool_refresh, + "Interval in seconds to refresh pool status"), OPT_WITH_ARG("--worksize|-w", set_default_worksize, NULL, NULL, "Override detected optimal worksize - one value or comma separated list"), @@ -1837,6 +1861,15 @@ static struct opt_table opt_cmdline_table[] = { set_default_config, NULL, NULL, "Specify the filename of the default config file\n" "Loaded at start and used when saving without a name."), + OPT_WITH_ARG("--remote-config-retry", + set_int_0_to_9999, opt_show_intval, &opt_remoteconf_retry, + "Number of times to retry downloading remote config file. Default: 3"), + OPT_WITH_ARG("--remote-config-wait", + set_int_0_to_9999, opt_show_intval, &opt_remoteconf_wait, + "Time in seconds to wait between download retries of remote config files. Default: 10secs"), + OPT_WITHOUT_ARG("--remote-config-usecache", + opt_set_bool, &opt_remoteconf_usecache, + "Use cached copy of the remote config file when download fails. 
Default: No"), OPT_WITHOUT_ARG("--help|-h", opt_verusage_and_exit, NULL, "Print this message"), @@ -1860,7 +1893,8 @@ static bool jobj_binary(const json_t *obj, const char *key, tmp = json_object_get(obj, key); if (unlikely(!tmp)) { if (unlikely(required)) - applog(LOG_ERR, "JSON key '%s' not found", key); + if (opt_morenotices) + applog(LOG_ERR, "JSON key '%s' not found", key); return false; } hexstr = json_string_value(tmp); @@ -2230,7 +2264,8 @@ static bool gbt_decode(struct pool *pool, json_t *res_val) static bool getwork_decode(json_t *res_val, struct work *work) { if (unlikely(!jobj_binary(res_val, "data", work->data, sizeof(work->data), true))) { - applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), "")); + if (opt_morenotices) + applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), "")); return false; } @@ -2246,7 +2281,8 @@ static bool getwork_decode(json_t *res_val, struct work *work) } if (unlikely(!jobj_binary(res_val, "target", work->target, sizeof(work->target), true))) { - applog(LOG_ERR, "%s: JSON inval target", isnull(get_pool_name(work->pool), "")); + if (opt_morenotices) + applog(LOG_ERR, "%s: JSON inval target", isnull(get_pool_name(work->pool), "")); return false; } return true; @@ -2698,9 +2734,8 @@ static void check_winsizes(void) static void disable_curses_windows(void); static void enable_curses_windows(void); -static void adjust_mostdevs(void); -void switch_logsize(bool __maybe_unused newdevs) +static void switch_logsize(bool __maybe_unused newdevs) { if (curses_active_locked()) { #ifdef WIN32 @@ -2710,7 +2745,6 @@ void switch_logsize(bool __maybe_unused newdevs) if (opt_compact) { logstart = devcursor + 1; } else { - adjust_mostdevs(); logstart = devcursor + most_devices + 1; } logcursor = logstart + 1; @@ -2812,7 +2846,7 @@ share_result(json_t *val, json_t *res, json_t *err, const struct work *work, cgpu->last_share_diff = work->work_difficulty; pool->last_share_time = cgpu->last_share_pool_time; 
pool->last_share_diff = work->work_difficulty; - applog(LOG_DEBUG, "PROOF OF WORK RESULT: true (yay!!!)"); + applog(LOG_DEBUG, "[THR%d] PROOF OF WORK RESULT: true (yay!!!)", work->thr_id); if (!QUIET) { if (total_pools > 1) { applog(LOG_NOTICE, "Accepted %s %s %d at %s %s%s", @@ -2821,9 +2855,12 @@ share_result(json_t *val, json_t *res, json_t *err, const struct work *work, cgpu->device_id, get_pool_name(pool), resubmit ? "(resubmit)" : "", worktime); - } else + } else { + applog(LOG_DEBUG, "[THR%d] Accepted %s %s %d %s%s", + work->thr_id, hashshow, cgpu->drv->name, cgpu->device_id, resubmit ? "(resubmit)" : "", worktime); applog(LOG_NOTICE, "Accepted %s %s %d %s%s", hashshow, cgpu->drv->name, cgpu->device_id, resubmit ? "(resubmit)" : "", worktime); + } } sharelog("accept", work); if (opt_shares && total_diff_accepted >= opt_shares) { @@ -2857,7 +2894,7 @@ share_result(json_t *val, json_t *res, json_t *err, const struct work *work, pool->seq_rejects++; mutex_unlock(&stats_lock); - applog(LOG_DEBUG, "PROOF OF WORK RESULT: false (booooo)"); + applog(LOG_DEBUG, "[THR%d] PROOF OF WORK RESULT: false (booooo)", work->thr_id); if (!QUIET) { char disposition[36] = "reject"; char reason[32]; @@ -3360,7 +3397,7 @@ static void calc_diff(struct work *work, double known) dcut64 = (double)*((uint64_t *)(work->target + 22)); } else { - dcut64 = le256todouble(work->target); + dcut64 = le256todouble(work->target); } if (unlikely(!dcut64)) dcut64 = 1; @@ -3876,8 +3913,9 @@ static void pool_died(struct pool *pool) if (pool == current_pool()) { applog(LOG_WARNING, "%s not responding!", get_pool_name(pool)); switch_pools(NULL); - } else + } else { applog(LOG_INFO, "%s failed to return work", get_pool_name(pool)); + } } } @@ -4010,7 +4048,7 @@ static bool pool_unusable(struct pool *pool) return false; } -void switch_pools(struct pool *selected) +void __switch_pools(struct pool *selected, bool saveprio) { struct pool *pool, *last_pool; int i, pool_no, next_pool; @@ -4020,7 +4058,7 @@ 
void switch_pools(struct pool *selected) pool_no = currentpool->pool_no; /* If a specific pool was selected, prioritise it over others */ - if (selected) + if (selected && saveprio) { if (selected->prio != 0) { @@ -4083,28 +4121,28 @@ void switch_pools(struct pool *selected) currentpool = pools[pool_no]; pool = currentpool; + on_backup_pool = pool->backup; cg_wunlock(&control_lock); /* Set the lagging flag to avoid pool not providing work fast enough * messages in failover only mode since we have to get all fresh work * as in restart_threads */ - if (opt_fail_only) + if (opt_fail_only) { pool_tset(pool, &pool->lagging); + } - if (pool != last_pool && pool_strategy != POOL_LOADBALANCE && pool_strategy != POOL_BALANCE) - { + if (pool != last_pool && pool_strategy != POOL_LOADBALANCE && pool_strategy != POOL_BALANCE) { //if the gpus have been initialized or first pool during startup, it's ok to switch... - if(gpu_initialized || startup) - { + if(gpu_initialized || startup) { applog(LOG_WARNING, "Switching to %s", get_pool_name(pool)); - if (pool_localgen(pool) || opt_fail_only) + if (pool_localgen(pool) || opt_fail_only) { clear_pool_work(last_pool); + } } } //if startup, initialize gpus and start mining threads - if(startup) - { + if(startup) { startup = false; //remove startup flag so we don't enter this block again applog(LOG_NOTICE, "Startup GPU initialization... 
Using settings from pool %s.", get_pool_name(pool)); @@ -4121,6 +4159,7 @@ void switch_pools(struct pool *selected) pthread_cond_broadcast(&lp_cond); mutex_unlock(&lp_lock); } + void discard_work(struct work *work) { if (!work->clone && !work->rolls && !work->mined) { @@ -4130,9 +4169,9 @@ void discard_work(struct work *work) work->pool->works--; } total_discarded++; - applog(LOG_DEBUG, "Discarded work"); + applog(LOG_DEBUG, "[THR%d] Discarded work", work->thr_id); } else - applog(LOG_DEBUG, "Discarded cloned or rolled work"); + applog(LOG_DEBUG, "[THR%d] Discarded cloned or rolled work", work->thr_id); free_work(work); } @@ -4415,7 +4454,7 @@ static bool hash_push(struct work *work) static void stage_work(struct work *work) { - applog(LOG_DEBUG, "Pushing work from %s to hash queue", get_pool_name(work->pool)); + applog(LOG_DEBUG, "[THR%d] Pushing work from %s to hash queue", work->thr_id, get_pool_name(work->pool)); work->work_block = work_block; test_work_current(work); work->pool->works++; @@ -5501,8 +5540,9 @@ static void *stratum_sthread(void *userdata) struct work *work; bool submitted; - if (unlikely(pool->removed)) + if (unlikely(pool->removed)) { break; + } work = (struct work *)tq_pop(pool->stratum_q, NULL); if (unlikely(!work)) @@ -5901,7 +5941,7 @@ out_unlock: return work; } -void set_target(unsigned char *dest_target, double diff, double diff_multiplier2) +void set_target(unsigned char *dest_target, double diff, double diff_multiplier2, const int thr_id) { unsigned char target[32]; uint64_t *data64, h64; @@ -5909,7 +5949,7 @@ void set_target(unsigned char *dest_target, double diff, double diff_multiplier2 if (unlikely(diff == 0.0)) { /* This shouldn't happen but best we check to prevent a crash */ - applog(LOG_ERR, "Diff zero passed to set_target"); + applog(LOG_ERR, "[THR%d] Diff zero passed to set_target", thr_id); diff = 1.0; } @@ -5947,16 +5987,16 @@ void set_target(unsigned char *dest_target, double diff, double diff_multiplier2 if (opt_debug) { 
char *htarget = bin2hex(target, 32); - applog(LOG_DEBUG, "Generated target %s", htarget); + applog(LOG_DEBUG, "[THR%d] Generated target %s", thr_id, htarget); free(htarget); } memcpy(dest_target, target, 32); } /***************************************************** -* Special set_target() function for Neoscrypt -****************************************************/ -void set_target_neoscrypt(unsigned char *target, double diff) + * Special set_target() function for Neoscrypt + ****************************************************/ +void set_target_neoscrypt(unsigned char *target, double diff, const int thr_id) { uint64_t m; int k; @@ -5992,7 +6032,7 @@ void set_target_neoscrypt(unsigned char *target, double diff) swab256(swaped, target); char *htarget = bin2hex((unsigned char *)swaped, 32); - applog(LOG_DEBUG, "Generated neoscrypt target 0x%s", htarget); + applog(LOG_DEBUG, "[THR%d] Generated neoscrypt target 0x%s", thr_id, htarget); free(htarget); } } @@ -6028,7 +6068,7 @@ static void gen_stratum_work(struct pool *pool, struct work *work) memcpy(merkle_sha, merkle_root, 32); } - applog(LOG_DEBUG, "gen_stratum_work() - algorithm = %s", pool->algorithm.name); + applog(LOG_DEBUG, "[THR%d] gen_stratum_work() - algorithm = %s", work->thr_id, pool->algorithm.name); // Different for Neoscrypt because of Little Endian if (!safe_cmp(pool->algorithm.name, "neoscrypt")) { @@ -6084,21 +6124,20 @@ static void gen_stratum_work(struct pool *pool, struct work *work) header = bin2hex(work->data, 128); merkle_hash = bin2hex((const unsigned char *)merkle_root, 32); - applog(LOG_DEBUG, "Generated stratum merkle %s", merkle_hash); - applog(LOG_DEBUG, "Generated stratum header %s", header); - applog(LOG_DEBUG, "Work job_id %s nonce2 %"PRIu64" ntime %s", work->job_id, - work->nonce2, work->ntime); + applog(LOG_DEBUG, "[THR%d] Generated stratum merkle %s", work->thr_id, merkle_hash); + applog(LOG_DEBUG, "[THR%d] Generated stratum header %s", work->thr_id, header); + applog(LOG_DEBUG, 
"[THR%d] Work job_id %s nonce2 %"PRIu64" ntime %s", work->thr_id, work->job_id, + work->nonce2, work->ntime); free(header); free(merkle_hash); } // For Neoscrypt use set_target_neoscrypt() function if (!safe_cmp(pool->algorithm.name, "neoscrypt")) { - set_target_neoscrypt(work->target, work->sdiff); - } - else { + set_target_neoscrypt(work->target, work->sdiff, work->thr_id); + } else { calc_midstate(work); - set_target(work->target, work->sdiff, pool->algorithm.diff_multiplier2); + set_target(work->target, work->sdiff, pool->algorithm.diff_multiplier2, work->thr_id); } local_work++; @@ -6147,14 +6186,20 @@ static void apply_initial_gpu_settings(struct pool *pool) { int i; const char *opt; - unsigned char options; //gpu adl options to apply + unsigned long options; //gpu adl options to apply unsigned int needed_threads = 0; //number of mining threads needed after we change devices applog(LOG_NOTICE, "Startup Pool No = %d", pool->pool_no); + //get compare options + options = compare_pool_settings(NULL, pool); + //apply gpu settings rd_lock(&mining_thr_lock); + apply_switcher_options(options, pool); + +/* //reset devices opt_devs_enabled = 0; for (i = 0; i < MAX_DEVICES; i++) @@ -6170,32 +6215,45 @@ static void apply_initial_gpu_settings(struct pool *pool) set_lookup_gap((char *)opt); //set intensity - if(!empty_string((opt = get_pool_setting(pool->rawintensity, default_profile.rawintensity)))) - set_rawintensity((char *)opt); - else if(!empty_string((opt = get_pool_setting(pool->xintensity, default_profile.xintensity)))) + if(!empty_string((opt = get_pool_setting(pool->rawintensity, default_profile.rawintensity)))) { + set_rawintensity((char *)opt); + } + else if(!empty_string((opt = get_pool_setting(pool->xintensity, default_profile.xintensity)))) { set_xintensity((char *)opt); - else if(!empty_string((opt = get_pool_setting(pool->intensity, ((!empty_string(default_profile.intensity))?default_profile.intensity:"8"))))) + } + else if(!empty_string((opt = 
get_pool_setting(pool->intensity, ((!empty_string(default_profile.intensity))?default_profile.intensity:"8"))))) { set_intensity((char *)opt); + } //shaders if(!empty_string((opt = get_pool_setting(pool->shaders, default_profile.shaders)))) set_shaders((char *)opt); //thread-concurrency - if(!empty_string((opt = get_pool_setting(pool->thread_concurrency, default_profile.thread_concurrency)))) - set_thread_concurrency((char *)opt); + // neoscrypt - if not specified set TC to 0 so that TC will be calculated by intensity settings + if (!safe_cmp(pool->algorithm.name, "neoscrypt")) { + opt = ((empty_string(pool->thread_concurrency))?"0":get_pool_setting(pool->thread_concurrency, default_profile.thread_concurrency)); + } + // otherwise use pool/profile setting or default to default profile setting + else { + opt = get_pool_setting(pool->thread_concurrency, default_profile.thread_concurrency); + } + + if (!empty_string(opt)) { + set_thread_concurrency(opt); + } //worksize if(!empty_string((opt = get_pool_setting(pool->worksize, default_profile.worksize)))) set_worksize(opt); - - //apply algorithm +*/ + //manually apply algorithm for (i = 0; i < nDevs; i++) { applog(LOG_DEBUG, "Set GPU %d to %s", i, isnull(pool->algorithm.name, "")); gpus[i].algorithm = pool->algorithm; } - +/* #ifdef HAVE_ADL options = APPLY_ENGINE | APPLY_MEMCLOCK | APPLY_FANSPEED | APPLY_POWERTUNE | APPLY_VDDC; @@ -6243,7 +6301,7 @@ static void apply_initial_gpu_settings(struct pool *pool) if(opt_isset(options, APPLY_VDDC)) set_vddc(i, gpus[i].gpu_vddc); } - #endif + #endif*/ rd_unlock(&mining_thr_lock); @@ -6271,161 +6329,304 @@ static void apply_initial_gpu_settings(struct pool *pool) restart_mining_threads(needed_threads); } -static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool2) +static unsigned long compare_pool_settings(struct pool *oldpool, struct pool *newpool) { unsigned int options = 0; const char *opt1, *opt2; applog(LOG_DEBUG, "compare_pool_settings()"); - if 
(!pool1 || !pool2) + if (!newpool) { return 0; + } //compare pool devices - opt1 = get_pool_setting(pool1->devices, ((!empty_string(default_profile.devices)) ? default_profile.devices : "all")); - opt2 = get_pool_setting(pool2->devices, ((!empty_string(default_profile.devices)) ? default_profile.devices : "all")); + opt1 = ((oldpool)?get_pool_setting(oldpool->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all")):""); + opt2 = get_pool_setting(newpool->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all")); //changing devices means a hard reset of mining threads - if (strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) { options |= (SWITCHER_APPLY_DEVICE | SWITCHER_HARD_RESET); + } //compare gpu threads - opt1 = get_pool_setting(pool1->gpu_threads, default_profile.gpu_threads); - opt2 = get_pool_setting(pool2->gpu_threads, default_profile.gpu_threads); + opt1 = ((oldpool)?get_pool_setting(oldpool->gpu_threads, default_profile.gpu_threads):""); + opt2 = get_pool_setting(newpool->gpu_threads, default_profile.gpu_threads); //changing gpu threads means a hard reset of mining threads - if (strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) { options |= (SWITCHER_APPLY_GT | SWITCHER_HARD_RESET); + } //compare algorithm - if (!cmp_algorithm(&pool1->algorithm, &pool2->algorithm)) + if ((oldpool && !cmp_algorithm(&oldpool->algorithm, &newpool->algorithm)) || (!oldpool)) { options |= (SWITCHER_APPLY_ALGO | SWITCHER_SOFT_RESET); + } //lookup gap - opt1 = get_pool_setting(pool1->lookup_gap, default_profile.lookup_gap); - opt2 = get_pool_setting(pool2->lookup_gap, default_profile.lookup_gap); + opt1 = ((oldpool)?get_pool_setting(oldpool->lookup_gap, default_profile.lookup_gap):""); + opt2 = get_pool_setting(newpool->lookup_gap, default_profile.lookup_gap); //lookup gap means soft reset but only if hard reset isnt set - if (strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) { options |= 
(SWITCHER_APPLY_LG | SWITCHER_SOFT_RESET); - - //intensities - opt1 = get_pool_setting(pool1->rawintensity, default_profile.rawintensity); - opt2 = get_pool_setting(pool2->rawintensity, default_profile.rawintensity); - - if (strcasecmp(opt1, opt2) != 0) - { - //intensity is soft reset - if (!empty_string(opt2)) - options |= (SWITCHER_APPLY_RAWINT | SWITCHER_SOFT_RESET); } - //xintensity -- only if raw intensity not set - if (!opt_isset(options, SWITCHER_APPLY_RAWINT)) - { - opt1 = get_pool_setting(pool1->xintensity, default_profile.xintensity); - opt2 = get_pool_setting(pool2->xintensity, default_profile.xintensity); + // Intensity - First determine the intensity type that we are going to use with the new pool + unsigned int intoptions = 0; - //if different... - if (strcasecmp(opt1, opt2) != 0) - { - //intensity is soft reset - if (!empty_string(opt2)) - options |= (SWITCHER_APPLY_XINT | SWITCHER_SOFT_RESET); - } + if (!empty_string(newpool->rawintensity)) { + intoptions = SWITCHER_APPLY_RAWINT | SWITCHER_SOFT_RESET; + opt1 = ((oldpool && !empty_string(oldpool->rawintensity))?oldpool->rawintensity:""); + opt2 = get_pool_setting(newpool->rawintensity, default_profile.rawintensity); + } + else if (!empty_string(newpool->xintensity)) { + intoptions = SWITCHER_APPLY_XINT | SWITCHER_SOFT_RESET; + opt1 = ((oldpool && !empty_string(oldpool->xintensity))?oldpool->xintensity:""); + opt2 = get_pool_setting(newpool->xintensity, default_profile.xintensity); + } + else { + intoptions = SWITCHER_APPLY_INT | SWITCHER_SOFT_RESET; + opt1 = ((oldpool && !empty_string(oldpool->intensity))?oldpool->intensity:""); + opt2 = get_pool_setting(newpool->intensity, default_profile.intensity); } - //intensity -- only if raw intensity and xintensity not set - if (!opt_isset(options, SWITCHER_APPLY_RAWINT) && !opt_isset(options, SWITCHER_APPLY_XINT)) - { - opt1 = get_pool_setting(pool1->intensity, default_profile.intensity); - opt2 = get_pool_setting(pool2->intensity, 
default_profile.intensity); - - //if different... - if (strcasecmp(opt1, opt2) != 0) - { - //intensity is soft reset - if (!empty_string(opt2)) - options |= (SWITCHER_APPLY_INT | SWITCHER_SOFT_RESET); - //if blank, set default profile to intensity 8 and apply - else - options |= (SWITCHER_APPLY_INT8 | SWITCHER_SOFT_RESET); + // if old intensity and new intensity different, set flags to update + if(strcasecmp(opt1, opt2) != 0) { + // in case we compared 2 empty strings make sure new intensity is not empty + if (!empty_string(opt2)) { + options |= intoptions; } } //shaders - opt1 = get_pool_setting(pool1->shaders, default_profile.shaders); - opt2 = get_pool_setting(pool2->shaders, default_profile.shaders); + opt1 = ((oldpool)?get_pool_setting(oldpool->shaders, default_profile.shaders):""); + opt2 = get_pool_setting(newpool->shaders, default_profile.shaders); - if (strcasecmp(opt1, opt2) != 0) - { - //shaders is soft reset - if (!empty_string(opt2)) - options |= (SWITCHER_APPLY_SHADER | SWITCHER_SOFT_RESET); + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) { + options |= (SWITCHER_APPLY_SHADER | SWITCHER_SOFT_RESET); } //thread-concurrency - opt1 = get_pool_setting(pool1->thread_concurrency, default_profile.thread_concurrency); - opt2 = get_pool_setting(pool2->thread_concurrency, default_profile.thread_concurrency); + // neoscrypt - if not specified set TC to 0 so that TC will be calculated by intensity settings + if (!safe_cmp(newpool->algorithm.name, "neoscrypt")) { + opt2 = ((empty_string(newpool->thread_concurrency))?"0":get_pool_setting(newpool->thread_concurrency, default_profile.thread_concurrency)); + } + // otherwise use pool/profile setting or default to default profile setting + else { + opt2 = get_pool_setting(newpool->thread_concurrency, default_profile.thread_concurrency); + } + + opt1 = ((oldpool)?get_pool_setting(oldpool->thread_concurrency, default_profile.thread_concurrency):""); //thread-concurrency is soft reset - if (strcasecmp(opt1, 
opt2) != 0 && !empty_string(opt2)) + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) { options |= (SWITCHER_APPLY_TC | SWITCHER_SOFT_RESET); + } //worksize - opt1 = get_pool_setting(pool1->worksize, default_profile.worksize); - opt2 = get_pool_setting(pool2->worksize, default_profile.worksize); + opt1 = ((oldpool)?get_pool_setting(oldpool->worksize, default_profile.worksize):""); + opt2 = get_pool_setting(newpool->worksize, default_profile.worksize); //worksize is soft reset - if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= (SWITCHER_APPLY_WORKSIZE | SWITCHER_SOFT_RESET); + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) { + options |= (SWITCHER_APPLY_WORKSIZE | SWITCHER_SOFT_RESET); + } -#ifdef HAVE_ADL - //gpu-engine - opt1 = get_pool_setting(pool1->gpu_engine, default_profile.gpu_engine); - opt2 = get_pool_setting(pool2->gpu_engine, default_profile.gpu_engine); + #ifdef HAVE_ADL + //gpu-engine + opt1 = ((oldpool)?get_pool_setting(oldpool->gpu_engine, default_profile.gpu_engine):""); + opt2 = get_pool_setting(newpool->gpu_engine, default_profile.gpu_engine); - if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= SWITCHER_APPLY_GPU_ENGINE; + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) { + options |= SWITCHER_APPLY_GPU_ENGINE; + } - //gpu-memclock - opt1 = get_pool_setting(pool1->gpu_memclock, default_profile.gpu_memclock); - opt2 = get_pool_setting(pool2->gpu_memclock, default_profile.gpu_memclock); + //gpu-memclock + opt1 = ((oldpool)?get_pool_setting(oldpool->gpu_memclock, default_profile.gpu_memclock):""); + opt2 = get_pool_setting(newpool->gpu_memclock, default_profile.gpu_memclock); - if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= SWITCHER_APPLY_GPU_MEMCLOCK; + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) { + options |= SWITCHER_APPLY_GPU_MEMCLOCK; + } - //GPU fans - opt1 = get_pool_setting(pool1->gpu_fan, default_profile.gpu_fan); - opt2 = 
get_pool_setting(pool2->gpu_fan, default_profile.gpu_fan); + //GPU fans + opt1 = ((oldpool)?get_pool_setting(oldpool->gpu_fan, default_profile.gpu_fan):""); + opt2 = get_pool_setting(newpool->gpu_fan, default_profile.gpu_fan); - if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= SWITCHER_APPLY_GPU_FAN; + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) { + options |= SWITCHER_APPLY_GPU_FAN; + } - //GPU powertune - opt1 = get_pool_setting(pool1->gpu_powertune, default_profile.gpu_powertune); - opt2 = get_pool_setting(pool2->gpu_powertune, default_profile.gpu_powertune); + //GPU powertune + opt1 = ((oldpool)?get_pool_setting(oldpool->gpu_powertune, default_profile.gpu_powertune):""); + opt2 = get_pool_setting(newpool->gpu_powertune, default_profile.gpu_powertune); - if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= SWITCHER_APPLY_GPU_POWERTUNE; + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) { + options |= SWITCHER_APPLY_GPU_POWERTUNE; + } - //GPU vddc - opt1 = get_pool_setting(pool1->gpu_vddc, default_profile.gpu_vddc); - opt2 = get_pool_setting(pool2->gpu_vddc, default_profile.gpu_vddc); + //GPU vddc + opt1 = ((oldpool)?get_pool_setting(oldpool->gpu_vddc, default_profile.gpu_vddc):""); + opt2 = get_pool_setting(newpool->gpu_vddc, default_profile.gpu_vddc); - if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= SWITCHER_APPLY_GPU_VDDC; -#endif + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) { + options |= SWITCHER_APPLY_GPU_VDDC; + } + #endif // Remove soft reset if hard reset is set - if (opt_isset(options, SWITCHER_HARD_RESET) && - opt_isset(options, SWITCHER_SOFT_RESET)) { + if (opt_isset(options, SWITCHER_HARD_RESET) && opt_isset(options, SWITCHER_SOFT_RESET)) { options &= ~SWITCHER_SOFT_RESET; } return options; } +static void apply_switcher_options(unsigned long options, struct pool *pool) +{ + int i; + const char *opt; + + //nothing to change, abort + if (!options) { + return; + } 
+ + if(opt_isset(options, SWITCHER_APPLY_DEVICE)) + { + //reset devices flags + opt_devs_enabled = 0; + for (i = 0; i < MAX_DEVICES; i++) + devices_enabled[i] = false; + + //assign pool devices if any + if(!empty_string((opt = get_pool_setting(pool->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all"))))) { + set_devices((char *)opt); + } + } + + //lookup gap + if(opt_isset(options, SWITCHER_APPLY_LG)) + { + if(!empty_string((opt = get_pool_setting(pool->lookup_gap, default_profile.lookup_gap)))) + set_lookup_gap((char *)opt); + } + + //raw intensity from pool + if(opt_isset(options, SWITCHER_APPLY_RAWINT)) + { + applog(LOG_DEBUG, "Switching to rawintensity: pool = %s, default = %s", pool->rawintensity, default_profile.rawintensity); + opt = get_pool_setting(pool->rawintensity, default_profile.rawintensity); + applog(LOG_DEBUG, "rawintensity -> %s", opt); + set_rawintensity(opt); + } + //xintensity + else if(opt_isset(options, SWITCHER_APPLY_XINT)) + { + applog(LOG_DEBUG, "Switching to xintensity: pool = %s, default = %s", pool->xintensity, default_profile.xintensity); + opt = get_pool_setting(pool->xintensity, default_profile.xintensity); + applog(LOG_DEBUG, "xintensity -> %s", opt); + set_xintensity(opt); + } + //intensity + else if(opt_isset(options, SWITCHER_APPLY_INT)) + { + applog(LOG_DEBUG, "Switching to intensity: pool = %s, default = %s", pool->intensity, default_profile.intensity); + opt = get_pool_setting(pool->intensity, default_profile.intensity); + applog(LOG_DEBUG, "intensity -> %s", opt); + set_intensity(opt); + } + //default basic intensity + else if(opt_isset(options, SWITCHER_APPLY_INT8)) + { + default_profile.intensity = strdup("8"); + set_intensity(default_profile.intensity); + } + + //shaders + if(opt_isset(options, SWITCHER_APPLY_SHADER)) + { + if(!empty_string((opt = get_pool_setting(pool->shaders, default_profile.shaders)))) + set_shaders((char *)opt); + } + + //thread-concurrency + if(opt_isset(options, 
SWITCHER_APPLY_TC)) + { + // neoscrypt - if not specified set TC to 0 so that TC will be calculated by intensity settings + if (!safe_cmp(pool->algorithm.name, "neoscrypt")) { + opt = ((empty_string(pool->thread_concurrency))?"0":get_pool_setting(pool->thread_concurrency, default_profile.thread_concurrency)); + } + // otherwise use pool/profile setting or default to default profile setting + else { + opt = get_pool_setting(pool->thread_concurrency, default_profile.thread_concurrency); + } + + if(!empty_string(opt)) { + set_thread_concurrency((char *)opt); + } + } + + //worksize + if(opt_isset(options, SWITCHER_APPLY_WORKSIZE)) + { + if(!empty_string((opt = get_pool_setting(pool->worksize, default_profile.worksize)))) + set_worksize(opt); + } + + #ifdef HAVE_ADL + //GPU clock + if(opt_isset(options, SWITCHER_APPLY_GPU_ENGINE)) + { + if(!empty_string((opt = get_pool_setting(pool->gpu_engine, default_profile.gpu_engine)))) + set_gpu_engine((char *)opt); + } + + //GPU memory clock + if(opt_isset(options, SWITCHER_APPLY_GPU_MEMCLOCK)) + { + if(!empty_string((opt = get_pool_setting(pool->gpu_memclock, default_profile.gpu_memclock)))) + set_gpu_memclock((char *)opt); + } + + //GPU fans + if(opt_isset(options, SWITCHER_APPLY_GPU_FAN)) + { + if(!empty_string((opt = get_pool_setting(pool->gpu_fan, default_profile.gpu_fan)))) + set_gpu_fan((char *)opt); + } + + //GPU powertune + if(opt_isset(options, SWITCHER_APPLY_GPU_POWERTUNE)) + { + if(!empty_string((opt = get_pool_setting(pool->gpu_powertune, default_profile.gpu_powertune)))) + set_gpu_powertune((char *)opt); + } + + //GPU vddc + if(opt_isset(options, SWITCHER_APPLY_GPU_VDDC)) + { + if(!empty_string((opt = get_pool_setting(pool->gpu_vddc, default_profile.gpu_vddc)))) + set_gpu_vddc((char *)opt); + } + + //apply gpu settings + for (i = 0; i < nDevs; ++i) { + if(opt_isset(options, SWITCHER_APPLY_GPU_ENGINE)) + set_engineclock(i, gpus[i].min_engine); + if(opt_isset(options, SWITCHER_APPLY_GPU_MEMCLOCK)) + set_memoryclock(i, 
gpus[i].gpu_memclock); + if(opt_isset(options, SWITCHER_APPLY_GPU_FAN)) + set_fanspeed(i, gpus[i].min_fan); + if(opt_isset(options, SWITCHER_APPLY_GPU_POWERTUNE)) + set_powertune(i, gpus[i].gpu_powertune); + if(opt_isset(options, SWITCHER_APPLY_GPU_VDDC)) + set_vddc(i, gpus[i].gpu_vddc); + } + #endif +} + static void mutex_unlock_cleanup_handler(void *mutex) { mutex_unlock((pthread_mutex_t *) mutex); @@ -6435,7 +6636,7 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) { int i; - applog(LOG_DEBUG, "get_work_prepare_thread()"); + applog(LOG_DEBUG, "[THR%d] get_work_prepare_thread", mythr->id); //if switcher is disabled if(opt_switchmode == SWITCH_OFF) @@ -6527,8 +6728,11 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) // Reset stats (e.g. for working_diff to be set properly in hash_sole_work) zero_stats(); + //apply switcher options + apply_switcher_options(pool_switch_options, work->pool); + //devices - if(opt_isset(pool_switch_options, SWITCHER_APPLY_DEVICE)) + /*if(opt_isset(pool_switch_options, SWITCHER_APPLY_DEVICE)) { //reset devices flags opt_devs_enabled = 0; @@ -6551,20 +6755,26 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) //raw intensity from pool if(opt_isset(pool_switch_options, SWITCHER_APPLY_RAWINT)) { - if(!empty_string((opt = get_pool_setting(work->pool->rawintensity, default_profile.rawintensity)))) - set_rawintensity((char *)opt); + applog(LOG_DEBUG, "Switching to rawintensity: pool = %s, default = %s", work->pool->rawintensity, default_profile.rawintensity); + opt = get_pool_setting(work->pool->rawintensity, default_profile.rawintensity); + applog(LOG_DEBUG, "rawintensity -> %s", opt); + set_rawintensity(opt); } //xintensity else if(opt_isset(pool_switch_options, SWITCHER_APPLY_XINT)) { - if(!empty_string((opt = get_pool_setting(work->pool->xintensity, default_profile.xintensity)))) - set_xintensity((char *)opt); + applog(LOG_DEBUG, "Switching to 
xintensity: pool = %s, default = %s", work->pool->xintensity, default_profile.xintensity); + opt = get_pool_setting(work->pool->xintensity, default_profile.xintensity); + applog(LOG_DEBUG, "xintensity -> %s", opt); + set_xintensity(opt); } //intensity else if(opt_isset(pool_switch_options, SWITCHER_APPLY_INT)) { - if(!empty_string((opt = get_pool_setting(work->pool->intensity, default_profile.intensity)))) - set_intensity((char *)opt); + applog(LOG_DEBUG, "Switching to intensity: pool = %s, default = %s", work->pool->intensity, default_profile.intensity); + opt = get_pool_setting(work->pool->intensity, default_profile.intensity); + applog(LOG_DEBUG, "intensity -> %s", opt); + set_intensity(opt); } //default basic intensity else if(opt_isset(pool_switch_options, SWITCHER_APPLY_INT8)) @@ -6583,8 +6793,18 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) //thread-concurrency if(opt_isset(pool_switch_options, SWITCHER_APPLY_TC)) { - if(!empty_string((opt = get_pool_setting(work->pool->thread_concurrency, default_profile.thread_concurrency)))) + // neoscrypt - if not specified set TC to 0 so that TC will be calculated by intensity settings + if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) { + opt = ((empty_string(work->pool->thread_concurrency))?"0":get_pool_setting(work->pool->thread_concurrency, default_profile.thread_concurrency)); + } + // otherwise use pool/profile setting or default to default profile setting + else { + opt = get_pool_setting(work->pool->thread_concurrency, default_profile.thread_concurrency); + } + + if(!empty_string(opt)) { set_thread_concurrency((char *)opt); + } } //worksize @@ -6645,7 +6865,7 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) set_vddc(i, gpus[i].gpu_vddc); } #endif - + */ // Change algorithm for each thread (thread_prepare calls initCl) if(opt_isset(pool_switch_options, SWITCHER_SOFT_RESET)) applog(LOG_DEBUG, "Soft Reset... 
Restarting threads..."); @@ -6752,19 +6972,19 @@ struct work *get_work(struct thr_info *thr, const int thr_id) time_t diff_t; thread_reportout(thr); - applog(LOG_DEBUG, "Popping work from get queue to get work"); + applog(LOG_DEBUG, "[THR%d] Popping work from get queue to get work", thr_id); diff_t = time(NULL); while (!work) { work = hash_pop(true); if (stale_work(work, false)) { - applog(LOG_DEBUG, "Work is stale, discarding"); + applog(LOG_DEBUG, "[THR%d] Work is stale, discarding", thr_id); discard_work(work); work = NULL; wake_gws(); } } - applog(LOG_DEBUG, "preparing thread..."); + applog(LOG_DEBUG, "[THR%d] preparing thread...", thr_id); get_work_prepare_thread(thr, work); diff_t = time(NULL) - diff_t; @@ -6772,10 +6992,10 @@ struct work *get_work(struct thr_info *thr, const int thr_id) * the device's last valid work to not make outages appear to be * device failures. */ if (diff_t > 0) { - applog(LOG_DEBUG, "Get work blocked for %d seconds", (int)diff_t); + applog(LOG_DEBUG, "[THR%d] Get work blocked for %d seconds", thr_id, (int)diff_t); thr->cgpu->last_device_valid_work += diff_t; } - applog(LOG_DEBUG, "Got work from get queue to get work for thread %d", thr_id); + applog(LOG_DEBUG, "[THR%d] Got work from get queue", thr_id); work->thr_id = thr_id; thread_reportin(thr); @@ -6829,7 +7049,7 @@ static void submit_work_async(struct work *work) void inc_hw_errors(struct thr_info *thr) { - applog(LOG_INFO, "%s%d: invalid nonce - HW error", thr->cgpu->drv->name, + applog(LOG_INFO, "[THR%d] %s%d: invalid nonce - HW error", thr->id, thr->cgpu->drv->name, thr->cgpu->device_id); mutex_lock(&stats_lock); @@ -6843,7 +7063,7 @@ void inc_hw_errors(struct thr_info *thr) /* Fills in the work nonce and builds the output data in work->hash */ static void rebuild_nonce(struct work *work, uint32_t nonce) { - uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12); + uint32_t *work_nonce = (uint32_t *)(work->data + 76); *work_nonce = htole32(nonce); @@ -6858,8 +7078,6 @@ 
bool test_nonce(struct work *work, uint32_t nonce) rebuild_nonce(work, nonce); - applog(LOG_DEBUG, "test_nonce() algorithm = %s", work->pool->algorithm.name); - // for Neoscrypt, the diff1targ value is in work->target if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) { diff1targ = ((uint32_t *)work->target)[7]; @@ -6916,14 +7134,13 @@ bool submit_tested_work(struct thr_info *thr, struct work *work) /* Returns true if nonce for work was a valid share */ bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce) { - if (test_nonce(work, nonce)) + if (test_nonce(work, nonce)) { submit_tested_work(thr, work); - else { - inc_hw_errors(thr); - return false; + return true; } - return true; + inc_hw_errors(thr); + return false; } static inline bool abandon_work(struct work *work, struct timeval *wdiff, uint64_t hashes) @@ -7003,7 +7220,12 @@ static void hash_sole_work(struct thr_info *mythr) work->device_diff = MIN(drv->working_diff, work->work_difficulty); } else if (drv->working_diff > work->work_difficulty) drv->working_diff = work->work_difficulty; - set_target(work->device_target, work->device_diff, work->pool->algorithm.diff_multiplier2); + + if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) { + set_target_neoscrypt(work->device_target, work->device_diff, work->thr_id); + } else { + set_target(work->device_target, work->device_diff, work->pool->algorithm.diff_multiplier2, work->thr_id); + } do { cgtime(&tv_start); @@ -7422,28 +7644,40 @@ static void *watchpool_thread(void __maybe_unused *userdata) while (42) { struct timeval now; + int sleeptimeout/* = 1000*/; int i; - if (++intervals > 20) - intervals = 0; + // get current time cgtime(&now); - for (i = 0; i < total_pools; i++) { + // sleep timeout is 30 secs for most cases + sleeptimeout = opt_watchpool_refresh * 1000; + + //limit to 5 secs minimum... 
+ if (sleeptimeout < 5000) { + sleeptimeout = 5000; + } + + // check the status of each pool + for (i = 0; i < total_pools; ++i) { struct pool *pool = pools[i]; reap_curl(pool); /* Get a rolling utility per pool over 10 mins */ - if (intervals > 19) { + if (intervals >= 600) { int shares = pool->diff1 - pool->last_shares; pool->last_shares = pool->diff1; pool->utility = (pool->utility + (double)shares * 0.63) / 1.63; pool->shares = pool->utility; + intervals = 0; } - if (pool->state == POOL_DISABLED) + // if this pool is disabled, skip it + if (pool->state == POOL_DISABLED) { continue; + } /* Don't start testing any pools if the test threads * from startup are still doing their first attempt. */ @@ -7455,31 +7689,38 @@ static void *watchpool_thread(void __maybe_unused *userdata) /* Test pool is idle once every minute */ if (pool->idle && now.tv_sec - pool->tv_idle.tv_sec > 30) { cgtime(&pool->tv_idle); - if (pool_active(pool, true) && pool_tclear(pool, &pool->idle)) + if (pool_active(pool, true) && pool_tclear(pool, &pool->idle)) { pool_resus(pool); + } } - /* Only switch pools if the failback pool has been - * alive for more than fail_switch_delay seconds to - * prevent intermittently failing pools from being - * used. 
*/ - if (!pool->idle && pool_strategy == POOL_FAILOVER && pool->prio < cp_prio() && - now.tv_sec - pool->tv_idle.tv_sec > opt_fail_switch_delay) { - applog(LOG_WARNING, "%s stable for %d seconds", get_pool_name(pool), opt_fail_switch_delay); - switch_pools(NULL); + // if this pool is alive and the priority is greater (lower) than currently connected pool + if (!pool->idle && pool->prio < cp_prio()) { + // failover strategy - switch when failover delay is met + if (pool_strategy == POOL_FAILOVER && (now.tv_sec - pool->tv_idle.tv_sec > opt_fail_switch_delay)) { + applog(LOG_WARNING, "%s stable for %d seconds", get_pool_name(pool), opt_fail_switch_delay); + switch_pools(NULL); + } } - } - if (current_pool()->idle) - switch_pools(NULL); + } //end pool loop + // if the pool strategy is rotation and we have been over the rotate delay, switch pool if (pool_strategy == POOL_ROTATE && now.tv_sec - rotate_tv.tv_sec > 60 * opt_rotate_period) { cgtime(&rotate_tv); switch_pools(NULL); } - cgsleep_ms(30000); - } + // if the current pool is dead/idle switch pool + if (current_pool()->idle) { + switch_pools(NULL); + } + + cgsleep_ms(sleeptimeout); + intervals += (sleeptimeout / 1000); + + } //end main loop + return NULL; } @@ -7822,8 +8063,7 @@ static void *test_pool_thread(void *arg) { struct pool *pool = (struct pool *)arg; - if (pool_active(pool, false)) - { + if (pool_active(pool, false)) { pool_tset(pool, &pool->lagging); pool_tclear(pool, &pool->idle); bool first_pool = false; @@ -7842,8 +8082,9 @@ static void *test_pool_thread(void *arg) pool_resus(pool); switch_pools(NULL); - } else + } else { pool_died(pool); + } return NULL; } @@ -8235,7 +8476,7 @@ static void probe_pools(void) static void restart_mining_threads(unsigned int new_n_threads) { struct thr_info *thr; - int i, j, k; + unsigned int i, j, k; // Stop and free threads if (mining_thr) @@ -8388,11 +8629,11 @@ int main(int argc, char *argv[]) { #ifndef _MSC_VER struct sigaction handler; - char *s; #endif struct
thr_info *thr; struct block *block; int i; + char *s; /* This dangerous function tramples random dynamically allocated * variables so do it before anything at all */ @@ -8518,12 +8759,6 @@ int main(int argc, char *argv[]) //load default profile if specified in config load_default_profile(); - //apply default settings - apply_defaults(); - - //apply pool-specific config from profiles - apply_pool_profiles(); - #ifdef HAVE_CURSES if (opt_realquiet || opt_display_devs) use_curses = false; @@ -8563,6 +8798,7 @@ int main(int argc, char *argv[]) gwsched_thr_id = 0; + //Detect GPUs /* Use the DRIVER_PARSE_COMMANDS macro to fill all the device_drvs */ DRIVER_PARSE_COMMANDS(DRIVER_FILL_DEVICE_DRV) @@ -8581,8 +8817,13 @@ int main(int argc, char *argv[]) quit(0, "%d devices listed", total_devices); } - most_devices = 0; + //apply default settings to GPUs + apply_defaults(); + + //apply pool-specific config from profiles + apply_pool_profiles(); + most_devices = 0; mining_threads = 0; if (opt_devs_enabled) { for (i = 0; i < MAX_DEVICES; i++) { @@ -8843,8 +9084,8 @@ int main(int argc, char *argv[]) work = hash_pop(false); if (work) { applog(LOG_DEBUG, - "Staged work: total (%d) > max (%d), discarding", - ts, max_staged); + "[THR%d] Staged work: total (%d) > max (%d), discarding", + work->thr_id, ts, max_staged); discard_work(work); } continue; diff --git a/util.c b/util.c index 67f96c10..ae62be2d 100644 --- a/util.c +++ b/util.c @@ -1444,9 +1444,9 @@ char *recv_line(struct pool *pool) } buflen = strlen(pool->sockbuf); - tok = strtok(pool->sockbuf, "\n"); - if (!tok) { - applog(LOG_DEBUG, "Failed to parse a \\n terminated string in recv_line"); + + if ((tok = strtok(pool->sockbuf, "\n")) == NULL) { + applog(LOG_DEBUG, "Failed to parse a \\n terminated string in recv_line: buffer = %s", pool->sockbuf); goto out; } sret = strdup(tok); @@ -1675,12 +1675,16 @@ static bool parse_diff(struct pool *pool, json_t *val) if (old_diff != diff) { int idiff = diff; - if ((double)idiff == 
diff) + if ((double)idiff == diff) { applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %d", get_pool_name(pool), idiff); - else + } + else { applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %.3f", get_pool_name(pool), diff); - } else + } + } + else { applog(LOG_DEBUG, "%s difficulty set to %f", get_pool_name(pool), diff); + } return true; } @@ -1806,83 +1810,80 @@ bool parse_method(struct pool *pool, char *s) bool ret = false; char *buf; - if (!s) + if (!s) { return ret; + } - val = JSON_LOADS(s, &err); - if (!val) { + if (!(val = JSON_LOADS(s, &err))) { applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); return ret; } - method = json_object_get(val, "method"); - if (!method) { - json_decref(val); - return ret; + if (!(method = json_object_get(val, "method"))) { + goto done; } + err_val = json_object_get(val, "error"); params = json_object_get(val, "params"); if (err_val && !json_is_null(err_val)) { char *ss; - if (err_val) + if (err_val) { ss = json_dumps(err_val, JSON_INDENT(3)); - else + } + else { ss = strdup("(unknown reason)"); + } applog(LOG_INFO, "JSON-RPC method decode failed: %s", ss); - json_decref(val); free(ss); - - return ret; + goto done; } buf = (char *)json_string_value(method); if (!buf) { - json_decref(val); - return ret; + goto done; } if (!strncasecmp(buf, "mining.notify", 13)) { - if (parse_notify(pool, params)) + if (parse_notify(pool, params)) { pool->stratum_notify = ret = true; - else + } + else { pool->stratum_notify = ret = false; - json_decref(val); - return ret; + } + + goto done; } if (!strncasecmp(buf, "mining.set_difficulty", 21) && parse_diff(pool, params)) { ret = true; - json_decref(val); - return ret; + goto done; } if (!strncasecmp(buf, "mining.set_extranonce", 21) && parse_extranonce(pool, params)) { ret = true; - json_decref(val); - return ret; + goto done; } if (!strncasecmp(buf, "client.reconnect", 16) && parse_reconnect(pool, params)) { ret = 
true; - json_decref(val); - return ret; + goto done; } if (!strncasecmp(buf, "client.get_version", 18) && send_version(pool, val)) { ret = true; - json_decref(val); - return ret; + goto done; } if (!strncasecmp(buf, "client.show_message", 19) && show_message(pool, params)) { ret = true; - json_decref(val); - return ret; + goto done; } + +done: json_decref(val); return ret; } @@ -1894,11 +1895,11 @@ bool subscribe_extranonce(struct pool *pool) json_error_t err; bool ret = false; - sprintf(s, "{\"id\": %d, \"method\": \"mining.extranonce.subscribe\", \"params\": []}", - swork_id++); + sprintf(s, "{\"id\": %d, \"method\": \"mining.extranonce.subscribe\", \"params\": []}", swork_id++); - if (!stratum_send(pool, s, strlen(s))) + if (!stratum_send(pool, s, strlen(s))) { return ret; + } /* Parse all data in the queue and anything left should be the response */ while (42) { @@ -1910,12 +1911,15 @@ bool subscribe_extranonce(struct pool *pool) } sret = recv_line(pool); - if (!sret) + if (!sret) { return ret; - if (parse_method(pool, sret)) + } + else if (parse_method(pool, sret)) { free(sret); - else + } + else { break; + } } val = JSON_LOADS(sret, &err); @@ -1968,18 +1972,23 @@ bool auth_stratum(struct pool *pool) sprintf(s, "{\"id\": %d, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", swork_id++, pool->rpc_user, pool->rpc_pass); - if (!stratum_send(pool, s, strlen(s))) + if (!stratum_send(pool, s, strlen(s))) { return ret; + } /* Parse all data in the queue and anything left should be auth */ while (42) { sret = recv_line(pool); - if (!sret) + + if (!sret) { return ret; - if (parse_method(pool, sret)) + } + else if (parse_method(pool, sret)) { free(sret); - else + } + else { break; + } } val = JSON_LOADS(sret, &err); diff --git a/winbuild/sgminer.vcxproj b/winbuild/sgminer.vcxproj index abf80519..403a7258 100644 --- a/winbuild/sgminer.vcxproj +++ b/winbuild/sgminer.vcxproj @@ -1,4 +1,4 @@ - + @@ -267,6 +267,7 @@ + @@ -327,6 +328,7 @@ + diff --git 
a/winbuild/sgminer.vcxproj.filters b/winbuild/sgminer.vcxproj.filters index 3487079d..df9c732e 100644 --- a/winbuild/sgminer.vcxproj.filters +++ b/winbuild/sgminer.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -185,6 +185,9 @@ Source Files\algorithm + + Source Files\algorithm + Source Files\sph @@ -373,6 +376,9 @@ Header Files\algorithm + + Header Files\algorithm + Header Files\sph