Browse Source

Merged develop branch with master +updates

Moved the develop code to master. Moving forward all updates will be
done on master unless it's work on a major feature.

This update contains all previous develop code as well as a few new changes
that weren't pushed yet:

* Added neoscrypt compatibility for xintensity/rawintensity
* Neoscrypt now uses correct TC if not specified or set to 0
* Reworked the application of pool settings on algorithm switch which
should resolve TC/Intensity changes between algos such as X11 and
neoscrypt
djm34
ystarnaud 10 years ago
parent
commit
4ba8a68043
  1. 1
      Makefile.am
  2. 91
      algorithm.c
  3. 1
      algorithm.h
  4. 5
      algorithm/neoscrypt.c
  5. 1
      algorithm/neoscrypt.h
  6. 171
      algorithm/whirlcoin.c
  7. 9
      algorithm/whirlcoin.h
  8. 171
      config_parser.c
  9. 4
      configure.ac
  10. 55
      driver-opencl.c
  11. 2
      findnonce.c
  12. 534
      kernel/animecoin.cl
  13. 993
      kernel/arebyp.cl
  14. 115
      kernel/bitblock.cl
  15. 119
      kernel/darkcoin-mod.cl
  16. 13
      kernel/darkcoin.cl
  17. 1853
      kernel/diamond.cl
  18. 1992
      kernel/groestlcoin.cl
  19. 41
      kernel/inkcoin.cl
  20. 117
      kernel/marucoin-mod.cl
  21. 80
      kernel/myriadcoin-groestl.cl
  22. 974
      kernel/neoscrypt.cl
  23. 684
      kernel/quarkcoin.cl
  24. 16
      kernel/qubitcoin.cl
  25. 328
      kernel/sifcoin.cl
  26. 70
      kernel/talkcoin-mod.cl
  27. 16
      kernel/twecoin.cl
  28. 1358
      kernel/whirlcoin.cl
  29. 117
      kernel/x14.cl
  30. 17
      logging.c
  31. 19
      miner.h
  32. 99
      ocl.c
  33. 9
      pool.c
  34. 599
      sgminer.c
  35. 97
      util.c
  36. 4
      winbuild/sgminer.vcxproj
  37. 8
      winbuild/sgminer.vcxproj.filters

1
Makefile.am

@ -66,6 +66,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
sgminer_SOURCES += algorithm/x14.c algorithm/x14.h sgminer_SOURCES += algorithm/x14.c algorithm/x14.h
sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h
sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h
sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h
bin_SCRIPTS = $(top_srcdir)/kernel/*.cl bin_SCRIPTS = $(top_srcdir)/kernel/*.cl

91
algorithm.c

@ -29,6 +29,7 @@
#include "algorithm/bitblock.h" #include "algorithm/bitblock.h"
#include "algorithm/x14.h" #include "algorithm/x14.h"
#include "algorithm/fresh.h" #include "algorithm/fresh.h"
#include "algorithm/whirlcoin.h"
#include "algorithm/neoscrypt.h" #include "algorithm/neoscrypt.h"
#include "compat.h" #include "compat.h"
@ -49,7 +50,9 @@ const char *algorithm_type_str[] = {
"Twecoin", "Twecoin",
"Fugue256", "Fugue256",
"NIST", "NIST",
"Fresh" "Fresh",
"Whirlcoin",
"Neoscrypt"
}; };
void sha256(const unsigned char *message, unsigned int len, unsigned char *digest) void sha256(const unsigned char *message, unsigned int len, unsigned char *digest)
@ -96,11 +99,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm) static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
{ {
char buf[255]; char buf[255];
sprintf(buf, " -D MAX_GLOBAL_THREADS=%u", sprintf(buf, " %s-D MAX_GLOBAL_THREADS=%lu ",
(unsigned int)cgpu->thread_concurrency); ((cgpu->lookup_gap > 0)?" -D LOOKUP_GAP=2 ":""), (unsigned long)cgpu->thread_concurrency);
strcat(data->compiler_options, buf); strcat(data->compiler_options, buf);
sprintf(buf, "tc%u", (unsigned int)cgpu->thread_concurrency); sprintf(buf, "%stc%lu", ((cgpu->lookup_gap > 0)?"lg":""), (unsigned long)cgpu->thread_concurrency);
strcat(data->binary_filename, buf); strcat(data->binary_filename, buf);
} }
@ -162,11 +165,10 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb
/* This looks like a unnecessary double cast, but to make sure, that /* This looks like a unnecessary double cast, but to make sure, that
* the target's most significant entry is adressed as a 32-bit value * the target's most significant entry is adressed as a 32-bit value
* and not accidently by something else the double cast seems wise. * and not accidently by something else the double cast seems wise.
* The compiler will get rid of it anyway. * The compiler will get rid of it anyway. */
*/
le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]); le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]);
memcpy(clState->cldata, blk->work->data, 80); memcpy(clState->cldata, blk->work->data, 80);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
CL_SET_ARG(clState->CLbuffer0); CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->outputBuffer); CL_SET_ARG(clState->outputBuffer);
@ -600,6 +602,34 @@ static cl_int queue_fresh_kernel(struct __clState *clState, struct _dev_blk_ctx
return status; return status;
} }
/* Enqueue setup for the whirlcoin kernel chain on one work unit.
 * Copies the byte-flipped 80-byte block header into the device input
 * buffer, then binds the arguments of the first kernel plus the three
 * extra kernels (hash passes 2-4); the final pass also receives the
 * output buffer and the 64-bit share target.
 * Returns the status of the buffer write (CL_SUCCESS on success). */
static cl_int queue_whirlcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel;
cl_ulong le_target;
cl_int status = 0;
/* NOTE(review): reads the top 64 bits of the 32-byte target through a
 * cl_ulong pointer cast — assumes device_target+24 is suitably aligned;
 * confirm against how device_target is allocated. */
le_target = *(cl_ulong *)(blk->work->device_target + 24);
flip80(clState->cldata, blk->work->data);
/* blocking write (3rd arg true): cldata may be reused once this returns */
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
//clbuffer, hashes
kernel = &clState->kernel;
CL_SET_ARG_N(0,clState->CLbuffer0);
CL_SET_ARG_N(1,clState->padbuffer8);
/* passes 2 and 3 operate in-place on the intermediate hash buffer */
kernel = clState->extra_kernels;
CL_SET_ARG_N(0,clState->padbuffer8);
CL_NEXTKERNEL_SET_ARG_N(0,clState->padbuffer8);
//hashes, output, target
CL_NEXTKERNEL_SET_ARG_N(0,clState->padbuffer8);
CL_SET_ARG_N(1,clState->outputBuffer);
CL_SET_ARG_N(2,le_target);
return status;
}
typedef struct _algorithm_settings_t { typedef struct _algorithm_settings_t {
const char *name; /* Human-readable identifier */ const char *name; /* Human-readable identifier */
algorithm_type_t type; //common algorithm type algorithm_type_t type; //common algorithm type
@ -631,6 +661,7 @@ static algorithm_settings_t algos[] = {
A_SCRYPT( "bufius" ), A_SCRYPT( "bufius" ),
A_SCRYPT( "psw" ), A_SCRYPT( "psw" ),
A_SCRYPT( "zuikkis" ), A_SCRYPT( "zuikkis" ),
A_SCRYPT( "arebyp" ),
#undef A_SCRYPT #undef A_SCRYPT
#define A_NEOSCRYPT(a) \ #define A_NEOSCRYPT(a) \
@ -640,7 +671,7 @@ static algorithm_settings_t algos[] = {
// kernels starting from this will have difficulty calculated by using quarkcoin algorithm // kernels starting from this will have difficulty calculated by using quarkcoin algorithm
#define A_QUARK(a, b) \ #define A_QUARK(a, b) \
{ a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options}
A_QUARK( "quarkcoin", quarkcoin_regenhash), A_QUARK( "quarkcoin", quarkcoin_regenhash),
A_QUARK( "qubitcoin", qubitcoin_regenhash), A_QUARK( "qubitcoin", qubitcoin_regenhash),
A_QUARK( "animecoin", animecoin_regenhash), A_QUARK( "animecoin", animecoin_regenhash),
@ -649,40 +680,43 @@ static algorithm_settings_t algos[] = {
// kernels starting from this will have difficulty calculated by using bitcoin algorithm // kernels starting from this will have difficulty calculated by using bitcoin algorithm
#define A_DARK(a, b) \ #define A_DARK(a, b) \
{ a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options}
A_DARK( "darkcoin", darkcoin_regenhash), A_DARK( "darkcoin", darkcoin_regenhash),
A_DARK( "inkcoin", inkcoin_regenhash), A_DARK( "inkcoin", inkcoin_regenhash),
A_DARK( "myriadcoin-groestl", myriadcoin_groestl_regenhash), A_DARK( "myriadcoin-groestl", myriadcoin_groestl_regenhash),
#undef A_DARK #undef A_DARK
{ "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, queue_sph_kernel, sha256, NULL}, { "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, queue_sph_kernel, sha256, NULL},
{ "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL }, { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL},
{ "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options},
{ "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options }, { "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options},
{ "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options }, { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options},
{ "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options }, { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options},
{ "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options }, { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options},
{ "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options }, { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options},
{ "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options }, { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options},
{ "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options }, { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options},
{ "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options},
{ "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL }, { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL},
// kernels starting from this will have difficulty calculated by using fuguecoin algorithm // kernels starting from this will have difficulty calculated by using fuguecoin algorithm
#define A_FUGUE(a, b) \ #define A_FUGUE(a, b, c) \
{ a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL } { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, c, NULL}
A_FUGUE( "fuguecoin", fuguecoin_regenhash), A_FUGUE("fuguecoin", fuguecoin_regenhash, sha256),
A_FUGUE( "groestlcoin", groestlcoin_regenhash), A_FUGUE("groestlcoin", groestlcoin_regenhash, sha256),
#undef A_FUGUE A_FUGUE("diamond", groestlcoin_regenhash, gen_hash),
#undef A_FUGUE
{ "whirlcoin", ALGO_WHIRL, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 3, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, whirlcoin_regenhash, queue_whirlcoin_kernel, sha256, NULL},
// Terminator (do not remove) // Terminator (do not remove)
{ NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL } { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL}
}; };
void copy_algorithm_settings(algorithm_t* dest, const char* algo) void copy_algorithm_settings(algorithm_t* dest, const char* algo)
@ -751,6 +785,7 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
ALGO_ALIAS("x15old", "bitblockold"); ALGO_ALIAS("x15old", "bitblockold");
ALGO_ALIAS("nist5", "talkcoin-mod"); ALGO_ALIAS("nist5", "talkcoin-mod");
ALGO_ALIAS("keccak", "maxcoin"); ALGO_ALIAS("keccak", "maxcoin");
ALGO_ALIAS("whirlpool", "whirlcoin");
#undef ALGO_ALIAS #undef ALGO_ALIAS
#undef ALGO_ALIAS_NF #undef ALGO_ALIAS_NF
@ -760,7 +795,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
void set_algorithm(algorithm_t* algo, const char* newname_alias) void set_algorithm(algorithm_t* algo, const char* newname_alias)
{ {
const char* newname; const char *newname;
//load previous algorithm nfactor in case nfactor was applied before algorithm... or default to 10 //load previous algorithm nfactor in case nfactor was applied before algorithm... or default to 10
uint8_t old_nfactor = ((algo->nfactor)?algo->nfactor:0); uint8_t old_nfactor = ((algo->nfactor)?algo->nfactor:0);
//load previous kernel file name if was applied before algorithm... //load previous kernel file name if was applied before algorithm...
@ -811,5 +847,6 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor)
bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2) bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2)
{ {
// return (strcmp(algo1->name, algo2->name) == 0) && (algo1->nfactor == algo2->nfactor);
return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor)); return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor));
} }

1
algorithm.h

@ -24,6 +24,7 @@ typedef enum {
ALGO_FUGUE, ALGO_FUGUE,
ALGO_NIST, ALGO_NIST,
ALGO_FRESH, ALGO_FRESH,
ALGO_WHIRL,
ALGO_NEOSCRYPT ALGO_NEOSCRYPT
} algorithm_type_t; } algorithm_type_t;

5
algorithm/neoscrypt.c

@ -1177,9 +1177,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) {
r = (1 << ((profile >> 5) & 0x7)); r = (1 << ((profile >> 5) & 0x7));
} }
uchar *stack; uchar stack[(N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align];
stack =(uchar*)malloc((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align);
/* X = r * 2 * SCRYPT_BLOCK_SIZE */ /* X = r * 2 * SCRYPT_BLOCK_SIZE */
X = (uint *) &stack[stack_align & ~(stack_align - 1)]; X = (uint *) &stack[stack_align & ~(stack_align - 1)];
/* Z is a copy of X for ChaCha */ /* Z is a copy of X for ChaCha */
@ -1287,7 +1285,6 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) {
} }
free(stack);
} }
void neoscrypt_regenhash(struct work *work) void neoscrypt_regenhash(struct work *work)

1
algorithm/neoscrypt.h

@ -5,7 +5,6 @@
/* The neoscrypt scratch buffer needs 32kBytes memory. */ /* The neoscrypt scratch buffer needs 32kBytes memory. */
#define NEOSCRYPT_SCRATCHBUF_SIZE (32 * 1024) #define NEOSCRYPT_SCRATCHBUF_SIZE (32 * 1024)
/* These routines are always available. */ /* These routines are always available. */
extern void neoscrypt_regenhash(struct work *work); extern void neoscrypt_regenhash(struct work *work);
extern void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile); extern void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile);

171
algorithm/whirlcoin.c

@ -0,0 +1,171 @@
/*-
* Copyright 2009 Colin Percival, 2011 ArtForz
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include "config.h"
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "sph/sph_whirlpool.h"
/* Move init out of loop, so init once externally, and then use one single
 * memcpy with that bigger memory block: the four Whirlpool-1 contexts used
 * by the chained whirlcoin hash are kept in one struct so a pre-initialised
 * template can be copied in a single memcpy. */
typedef struct {
sph_whirlpool1_context whirlpool1;
sph_whirlpool1_context whirlpool2;
sph_whirlpool1_context whirlpool3;
sph_whirlpool1_context whirlpool4;
} Whash_context_holder;
/* Shared template of freshly initialised contexts, filled by
 * init_whirlcoin_hash_contexts().  NOTE(review): mutable file-scope
 * global — concurrent (re)initialisation from several threads would
 * race; confirm single-threaded init is guaranteed by callers. */
Whash_context_holder base_contexts;
void init_whirlcoin_hash_contexts()
{
sph_whirlpool1_init(&base_contexts.whirlpool1);
sph_whirlpool1_init(&base_contexts.whirlpool2);
sph_whirlpool1_init(&base_contexts.whirlpool3);
sph_whirlpool1_init(&base_contexts.whirlpool4);
}
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static inline void
be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
{
uint32_t i;
for (i = 0; i < len; i++)
dst[i] = htobe32(src[i]);
}
/* WhirlCoin proof-of-work hash: four chained Whirlpool-1 passes.
 * input: 80-byte block header (big-endian encoded words).
 * state: receives the final 32-byte hash (first 256 bits of pass 4).
 *
 * Fix: the original called init_whirlcoin_hash_contexts() on every
 * invocation and then memcpy'd the freshly initialised global template
 * into a local copy — defeating the "init once" intent and writing the
 * shared global from every hashing thread (a data race).  Initialising
 * the local contexts directly is equivalent and thread-safe.  The bare
 * `inline` was also dropped: in C99, `inline` without `static`/`extern`
 * does not emit an external definition and can break linking. */
void whirlcoin_hash(void *state, const void *input)
{
  Whash_context_holder ctx;
  uint32_t hashA[16], hashB[16];

  sph_whirlpool1_init(&ctx.whirlpool1);
  sph_whirlpool1_init(&ctx.whirlpool2);
  sph_whirlpool1_init(&ctx.whirlpool3);
  sph_whirlpool1_init(&ctx.whirlpool4);

  /* pass 1: 80-byte header -> 64-byte digest */
  sph_whirlpool1(&ctx.whirlpool1, input, 80);
  sph_whirlpool1_close(&ctx.whirlpool1, hashA);
  /* passes 2-4: re-hash the previous 64-byte digest */
  sph_whirlpool1(&ctx.whirlpool2, hashA, 64);
  sph_whirlpool1_close(&ctx.whirlpool2, hashB);
  sph_whirlpool1(&ctx.whirlpool3, hashB, 64);
  sph_whirlpool1_close(&ctx.whirlpool3, hashA);
  sph_whirlpool1(&ctx.whirlpool4, hashA, 64);
  sph_whirlpool1_close(&ctx.whirlpool4, hashB);

  /* only the first 256 bits form the final hash */
  memcpy(state, hashB, 32);
}
/* Difficulty-1 target value for the most significant hash word. */
static const uint32_t diff1targ = 0x0000ffff;
/* Used externally as confirmation of correct OCL code.
 * Recomputes the hash for (pdata, nonce) on the CPU and classifies it:
 * returns -1 if the hash is above the difficulty-1 target (indicates a
 * hardware/kernel error), 0 if above the pool target Htarg (not a
 * share), 1 if it is a valid share. */
int whirlcoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
{
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
uint32_t data[20], ohash[8];
/* header words are hashed big-endian; the nonce occupies word 19 */
be32enc_vect(data, (const uint32_t *)pdata, 19);
data[19] = htobe32(nonce);
whirlcoin_hash(ohash, data);
/* compare the most significant hash word against the targets */
tmp_hash7 = be32toh(ohash[7]);
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
(long unsigned int)Htarg,
(long unsigned int)diff1targ,
(long unsigned int)tmp_hash7);
if (tmp_hash7 > diff1targ)
return -1;
if (tmp_hash7 > Htarg)
return 0;
return 1;
}
/* Recompute the full whirlcoin hash for a completed work item, storing
 * the result in work->hash.  The 19 header words are byte-swapped to
 * big-endian first; the nonce lives at byte offset 76 of work->data. */
void whirlcoin_regenhash(struct work *work)
{
  uint32_t swab[20];
  const uint32_t *nonce = (const uint32_t *)(work->data + 76);

  be32enc_vect(swab, (const uint32_t *)work->data, 19);
  swab[19] = htobe32(*nonce);
  whirlcoin_hash((uint32_t *)(work->hash), swab);
}
/* CPU reference scanner for whirlcoin (the GPU path goes through
 * queue_whirlcoin_kernel).  Iterates nonces starting at n+1 until a
 * hash at or below the pool target is found, max_nonce is reached, or
 * a work restart is requested.
 * Returns true when a share was found; *last_nonce always receives the
 * final nonce tried. */
bool scanhash_whirlcoin(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
unsigned char *pdata, unsigned char __maybe_unused *phash1,
unsigned char __maybe_unused *phash, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
{
uint32_t *nonce = (uint32_t *)(pdata + 76);
uint32_t data[20];
uint32_t tmp_hash7;
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
bool ret = false;
/* big-endian encode the 19 fixed header words once; only the nonce
 * word (data[19]) changes per iteration */
be32enc_vect(data, (const uint32_t *)pdata, 19);
while(1) {
uint32_t ostate[8];
*nonce = ++n;
/* NOTE(review): whirlcoin_test() stores htobe32(nonce) here, but this
 * loop stores the raw host-order value — confirm which is intended. */
data[19] = (n);
whirlcoin_hash(ostate, data);
/* NOTE(review): whirlcoin_test() applies be32toh() to the hash word
 * before comparing; this path compares the raw word — verify. */
tmp_hash7 = (ostate[7]);
/* NOTE(review): LOG_INFO on every nonce is very noisy — LOG_DEBUG? */
applog(LOG_INFO, "data7 %08lx",
(long unsigned int)data[7]);
/* share found: write the big-endian nonce back into the header */
if (unlikely(tmp_hash7 <= Htarg)) {
((uint32_t *)pdata)[19] = htobe32(n);
*last_nonce = n;
ret = true;
break;
}
/* give up on range exhaustion or when new work arrives */
if (unlikely((n >= max_nonce) || thr->work_restart)) {
*last_nonce = n;
break;
}
}
return ret;
}

9
algorithm/whirlcoin.h

@ -0,0 +1,9 @@
#ifndef W_H
#define W_H
#include "miner.h"
extern int whirlcoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
extern void whirlcoin_regenhash(struct work *work);
#endif /* W_H */

171
config_parser.c

@ -598,8 +598,10 @@ static struct opt_table *opt_find(struct opt_table *tbl, char *optname)
//set url //set url
curl_easy_setopt(curl, CURLOPT_URL, url); curl_easy_setopt(curl, CURLOPT_URL, url);
//set write callback and fileinfo //set write callback and fileinfo
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fetch_remote_config_cb); curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); // fail on 404 or other 4xx http codes
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file); curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30); // timeout after 30 secs to prevent being stuck
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file); // stream to write data to
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fetch_remote_config_cb); // callback function to write to config file
if((res = curl_easy_perform(curl)) != CURLE_OK) if((res = curl_easy_perform(curl)) != CURLE_OK)
applog(LOG_ERR, "Fetch remote file failed: %s", curl_easy_strerror(res)); applog(LOG_ERR, "Fetch remote file failed: %s", curl_easy_strerror(res));
@ -715,17 +717,20 @@ char *parse_config(json_t *val, const char *key, const char *parentkey, bool fil
if((opt = opt_find(opt_config_table, optname)) != NULL) if((opt = opt_find(opt_config_table, optname)) != NULL)
{ {
//strings //strings
if ((opt->type & OPT_HASARG) && json_is_string(val)) if ((opt->type & OPT_HASARG) && json_is_string(val)) {
err = opt->cb_arg(json_string_value(val), opt->u.arg); err = opt->cb_arg(json_string_value(val), opt->u.arg);
}
//boolean values //boolean values
else if ((opt->type & OPT_NOARG) && json_is_true(val)) else if ((opt->type & OPT_NOARG) && json_is_true(val)) {
err = opt->cb(opt->u.arg); err = opt->cb(opt->u.arg);
else }
else {
err = "Invalid value"; err = "Invalid value";
} }
else }
else {
err = "Invalid option"; err = "Invalid option";
}
break; break;
} }
@ -756,37 +761,71 @@ char *load_config(const char *arg, const char *parentkey, void __maybe_unused *u
json_t *config; json_t *config;
#ifdef HAVE_LIBCURL #ifdef HAVE_LIBCURL
//if detected as url int retry = opt_remoteconf_retry;
if((strstr(arg, "http://") != NULL) || (strstr(arg, "https://") != NULL) || (strstr(arg, "ftp://") != NULL)) const char *url;
{
//download config file locally and reset arg to it so we can parse it // if detected as url
if((arg = fetch_remote_config(arg)) == NULL) if ((strstr(arg, "http://") != NULL) || (strstr(arg, "https://") != NULL) || (strstr(arg, "ftp://") != NULL)) {
return NULL; url = strdup(arg);
do {
// wait for next retry
if (retry < opt_remoteconf_retry) {
sleep(opt_remoteconf_wait);
}
// download config file locally and reset arg to it so we can parse it
if ((arg = fetch_remote_config(url)) != NULL) {
break;
}
--retry;
} while (retry);
// file not downloaded... abort
if (arg == NULL) {
// if we should use last downloaded copy...
if (opt_remoteconf_usecache) {
char *p;
// extract filename out of url
if ((p = (char *)strrchr(url, '/')) == NULL) {
quit(1, "%s: invalid URL.", url);
}
arg = p+1;
} else {
quit(1, "%s: unable to download config file.", url);
}
}
} }
#endif #endif
//most likely useless but leaving it here for now... // most likely useless but leaving it here for now...
if(!cnfbuf) if (!cnfbuf) {
cnfbuf = strdup(arg); cnfbuf = strdup(arg);
}
//no need to restrict the number of includes... if it causes problems, restore it later // no need to restrict the number of includes... if it causes problems, restore it later
/*if(++include_count > JSON_MAX_DEPTH) /*if(++include_count > JSON_MAX_DEPTH)
return JSON_MAX_DEPTH_ERR; return JSON_MAX_DEPTH_ERR;
*/ */
//check if the file exists // check if the file exists
if(access(arg, F_OK) == -1) if (access(arg, F_OK) == -1) {
quit(1, "%s: file not found.", arg); quit(1, "%s: file not found.", arg);
}
#if JANSSON_MAJOR_VERSION > 1 #if JANSSON_MAJOR_VERSION > 1
config = json_load_file(arg, 0, &err); config = json_load_file(arg, 0, &err);
#else #else
config = json_load_file(arg, &err); config = json_load_file(arg, &err);
#endif #endif
//if json root is not an object, error out // if json root is not an object, error out
if(!json_is_object(config)) if (!json_is_object(config)) {
return set_last_json_error("Error: JSON decode of file \"%s\" failed:\n %s", arg, err.text); return set_last_json_error("Error: JSON decode of file \"%s\" failed:\n %s", arg, err.text);
}
config_loaded = true; config_loaded = true;
@ -1009,8 +1048,7 @@ void apply_pool_profile(struct pool *pool)
pool->algorithm.kernelfile = profile->algorithm.kernelfile; pool->algorithm.kernelfile = profile->algorithm.kernelfile;
applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile); applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
// ...or default profile does, apply it to the pool // ...or default profile does, apply it to the pool
} } else if (!empty_string(default_profile.algorithm.kernelfile)) {
else if (!empty_string(default_profile.algorithm.kernelfile)) {
pool->algorithm.kernelfile = default_profile.algorithm.kernelfile; pool->algorithm.kernelfile = default_profile.algorithm.kernelfile;
applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile); applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
} }
@ -1034,39 +1072,76 @@ void apply_pool_profile(struct pool *pool)
} }
applog(LOG_DEBUG, "Pool %i lookup gap set to \"%s\"", pool->pool_no, pool->lookup_gap); applog(LOG_DEBUG, "Pool %i lookup gap set to \"%s\"", pool->pool_no, pool->lookup_gap);
if(pool_cmp(pool->intensity, default_profile.intensity)) int int_type = 0;
{
if(!empty_string(profile->intensity))
pool->intensity = profile->intensity;
else
pool->intensity = default_profile.intensity;
}
applog(LOG_DEBUG, "Pool %i Intensity set to \"%s\"", pool->pool_no, pool->intensity);
if(pool_cmp(pool->xintensity, default_profile.xintensity)) // FIXME: ifs from hell...
{ // First look for an existing intensity on pool
if(!empty_string(profile->xintensity)) if (!empty_string(pool->rawintensity)) {
int_type = 2;
}
else if (!empty_string(pool->xintensity)) {
int_type = 1;
}
else if (!empty_string(pool->intensity)) {
int_type = 0;
}
else {
//no intensity found on pool... check if the profile has one and use it...
if (!empty_string(profile->rawintensity)) {
int_type = 2;
pool->rawintensity = profile->rawintensity;
}
else if (!empty_string(profile->xintensity)) {
int_type = 1;
pool->xintensity = profile->xintensity; pool->xintensity = profile->xintensity;
else }
else if (!empty_string(profile->intensity)) {
int_type = 0;
pool->intensity = profile->intensity;
}
else {
//nothing in profile... check default profile/globals
if (!empty_string(default_profile.rawintensity)) {
int_type = 2;
pool->rawintensity = default_profile.rawintensity;
}
else if (!empty_string(default_profile.xintensity)) {
int_type = 1;
pool->xintensity = default_profile.xintensity; pool->xintensity = default_profile.xintensity;
} }
applog(LOG_DEBUG, "Pool %i XIntensity set to \"%s\"", pool->pool_no, pool->xintensity); else if (!empty_string(default_profile.intensity)) {
int_type = 0;
pool->intensity = default_profile.intensity;
}
else {
//nothing anywhere? default to sgminer default of 8
int_type = 0;
pool->intensity = strdup("8");
}
}
}
if(pool_cmp(pool->rawintensity, default_profile.rawintensity)) switch(int_type) {
{ case 2:
if(!empty_string(profile->rawintensity)) applog(LOG_DEBUG, "Pool %d Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity);
pool->rawintensity = profile->rawintensity; break;
else
pool->rawintensity = default_profile.rawintensity; case 1:
applog(LOG_DEBUG, "Pool %d XIntensity set to \"%s\"", pool->pool_no, pool->xintensity);
break;
default:
applog(LOG_DEBUG, "Pool %d Intensity set to \"%s\"", pool->pool_no, pool->intensity);
break;
} }
applog(LOG_DEBUG, "Pool %i Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity);
if(pool_cmp(pool->thread_concurrency, default_profile.thread_concurrency)) if(pool_cmp(pool->thread_concurrency, default_profile.thread_concurrency))
{ {
if(!empty_string(profile->thread_concurrency)) /* allow empty string TC
if(!empty_string(profile->thread_concurrency))*/
pool->thread_concurrency = profile->thread_concurrency; pool->thread_concurrency = profile->thread_concurrency;
else /* else
pool->thread_concurrency = default_profile.thread_concurrency; pool->thread_concurrency = default_profile.thread_concurrency;*/
} }
applog(LOG_DEBUG, "Pool %i Thread Concurrency set to \"%s\"", pool->pool_no, pool->thread_concurrency); applog(LOG_DEBUG, "Pool %i Thread Concurrency set to \"%s\"", pool->pool_no, pool->thread_concurrency);

4
configure.ac

@ -1,8 +1,8 @@
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
m4_define([v_maj], [5]) m4_define([v_maj], [5])
m4_define([v_min], [0]) m4_define([v_min], [1])
m4_define([v_mic], [1]) m4_define([v_mic], [0])
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
m4_define([v_ver], [v_maj.v_min.v_mic]) m4_define([v_ver], [v_maj.v_min.v_mic])
m4_define([lt_rev], m4_eval(v_maj + v_min)) m4_define([lt_rev], m4_eval(v_maj + v_min))

55
driver-opencl.c

@ -176,30 +176,42 @@ char *set_lookup_gap(char *arg)
return NULL; return NULL;
} }
char *set_thread_concurrency(const char *_arg) char *set_thread_concurrency(const char *arg)
{ {
int i, val = 0, device = 0; int i, device = 0;
size_t val = 0;
char *tmpstr = strdup(arg);
char *nextptr; char *nextptr;
char *arg = (char *)alloca(strlen(_arg) + 1);
strcpy(arg, _arg);
nextptr = strtok(arg, ",");
if (nextptr == NULL)
return "Invalid parameters for set thread concurrency";
val = atoi(nextptr);
// empty string - use 0 and let algo autodetect the TC
if (empty_string(tmpstr)) {
applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", device, val);
gpus[device++].opt_tc = val; gpus[device++].opt_tc = val;
}
// not empty string
else {
if ((nextptr = strtok(tmpstr, ",")) == NULL) {
free(tmpstr);
return "Invalid parameters for set_thread_concurrency";
}
while ((nextptr = strtok(NULL, ",")) != NULL) { do {
val = atoi(nextptr); val = (unsigned long)atol(nextptr);
applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", device, val);
gpus[device++].opt_tc = val; gpus[device++].opt_tc = val;
} while ((nextptr = strtok(NULL, ",")) != NULL);
} }
// if only 1 TC was passed, assign the same worksize for all remaining GPUs
if (device == 1) { if (device == 1) {
for (i = device; i < MAX_GPUDEVICES; i++) for (i = device; i < total_devices; ++i) {
gpus[i].opt_tc = gpus[0].opt_tc; gpus[i].opt_tc = gpus[0].opt_tc;
applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", i, gpus[i].opt_tc);
}
} }
free(tmpstr);
return NULL; return NULL;
} }
@ -1020,23 +1032,26 @@ static void set_threads_hashes(unsigned int vectors, unsigned int compute_shader
{ {
unsigned int threads = 0; unsigned int threads = 0;
while (threads < minthreads) { while (threads < minthreads) {
if (*rawintensity > 0) { if (*rawintensity > 0) {
threads = *rawintensity; threads = *rawintensity;
} else if (*xintensity > 0) { }
if (algorithm->xintensity_shift) else if (*xintensity > 0) {
threads = compute_shaders * (1 << (algorithm->xintensity_shift + *xintensity)); threads = compute_shaders * ((algorithm->xintensity_shift)?(1 << (algorithm->xintensity_shift + *xintensity)):*xintensity);
else }
threads = compute_shaders * *xintensity; else {
} else {
threads = 1 << (algorithm->intensity_shift + *intensity); threads = 1 << (algorithm->intensity_shift + *intensity);
} }
if (threads < minthreads) { if (threads < minthreads) {
if (likely(*intensity < MAX_INTENSITY)) if (likely(*intensity < MAX_INTENSITY)) {
(*intensity)++; (*intensity)++;
else }
else {
threads = minthreads; threads = minthreads;
} }
} }
}
*globalThreads = threads; *globalThreads = threads;
*hashes = threads * vectors; *hashes = threads * vectors;

2
findnonce.c

@ -202,7 +202,7 @@ static void *postcalc_hash(void *userdata)
if (found == 0x0F) if (found == 0x0F)
nonce = swab32(nonce); nonce = swab32(nonce);
applog(LOG_DEBUG, "OCL NONCE %u found in slot %d", nonce, entry); applog(LOG_DEBUG, "[THR%d] OCL NONCE %08x (%lu) found in slot %d (found = %d)", thr->id, nonce, nonce, entry, found);
submit_nonce(thr, pcd->work, nonce); submit_nonce(thr, pcd->work, nonce);
} }

534
kernel/animecoin.cl

@ -92,6 +92,14 @@ typedef long sph_s64;
#define SWAP4(x) as_uint(as_uchar4(x).wzyx) #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) #define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
#if SPH_BIG_ENDIAN #if SPH_BIG_ENDIAN
#define DEC64E(x) (x) #define DEC64E(x) (x)
#define DEC64BE(x) (*(const __global sph_u64 *) (x)); #define DEC64BE(x) (*(const __global sph_u64 *) (x));
@ -118,8 +126,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
for(unsigned u = 0; u < 16; u++) for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u]; BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16]; sph_u64 mv[16],q[32];
sph_u64 mv[16]; sph_u64 tmp;
mv[0] = DEC64LE(block + 0); mv[0] = DEC64LE(block + 0);
mv[1] = DEC64LE(block + 8); mv[1] = DEC64LE(block + 8);
@ -139,34 +147,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
mv[13] = 0; mv[13] = 0;
mv[14] = 0; mv[14] = 0;
mv[15] = 0x280; mv[15] = 0x280;
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
FOLDb; tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
#undef M tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
#undef H q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
#undef dH tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x]) #pragma unroll 4
#define H(x) (final_b[x]) for(int i=2;i<6;i++)
#define dH(x) (BMW_h1[x]) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
FOLDb; #pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#undef M #pragma unroll 4
#undef H for(int i=9;i<13;i++)
#undef dH {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
hash.h8[0] = SWAP8(BMW_h1[8]); XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
hash.h8[1] = SWAP8(BMW_h1[9]); XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
hash.h8[2] = SWAP8(BMW_h1[10]);
hash.h8[3] = SWAP8(BMW_h1[11]); BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
hash.h8[4] = SWAP8(BMW_h1[12]); BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
hash.h8[5] = SWAP8(BMW_h1[13]); BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
hash.h8[6] = SWAP8(BMW_h1[14]); BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
hash.h8[7] = SWAP8(BMW_h1[15]); BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = SWAP8(BMW_H[8]);
hash.h8[1] = SWAP8(BMW_H[9]);
hash.h8[2] = SWAP8(BMW_H[10]);
hash.h8[3] = SWAP8(BMW_H[11]);
hash.h8[4] = SWAP8(BMW_H[12]);
hash.h8[5] = SWAP8(BMW_H[13]);
hash.h8[6] = SWAP8(BMW_H[14]);
hash.h8[7] = SWAP8(BMW_H[15]);
} }
// blake // blake
@ -218,7 +434,6 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
bool dec = ((hash.h1[7] & 0x8) != 0); bool dec = ((hash.h1[7] & 0x8) != 0);
{ {
// groestl // groestl
sph_u64 H[16]; sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++) for (unsigned int u = 0; u < 15; u ++)
H[u] = 0; H[u] = 0;
@ -427,14 +642,13 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
hash.h8[7] = (dec ? H7 : hash.h8[7]); hash.h8[7] = (dec ? H7 : hash.h8[7]);
} }
{ {
// bmw // bmw
sph_u64 BMW_H[16]; sph_u64 BMW_H[16];
for(unsigned u = 0; u < 16; u++) for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u]; BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16]; sph_u64 mv[16],q[32];
sph_u64 mv[16]; sph_u64 tmp;
mv[ 0] = SWAP8(hash.h8[0]); mv[ 0] = SWAP8(hash.h8[0]);
mv[ 1] = SWAP8(hash.h8[1]); mv[ 1] = SWAP8(hash.h8[1]);
@ -452,34 +666,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
mv[13] = 0; mv[13] = 0;
mv[14] = 0; mv[14] = 0;
mv[15] = 0x200; mv[15] = 0x200;
#define M(x) (mv[x])
#define H(x) (BMW_H[x]) tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
#define dH(x) (BMW_h2[x]) q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
FOLDb; q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
#undef M q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
#undef H tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
#undef dH q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
#define M(x) (BMW_h2[x]) q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
#define H(x) (final_b[x]) tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
#define dH(x) (BMW_h1[x]) q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
FOLDb; q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
#undef M q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
#undef H tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
#undef dH q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
hash.h8[0] = (!dec ? SWAP8(BMW_h1[8]) : hash.h8[0]); q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
hash.h8[1] = (!dec ? SWAP8(BMW_h1[9]) : hash.h8[1]); tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
hash.h8[2] = (!dec ? SWAP8(BMW_h1[10]) : hash.h8[2]); q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
hash.h8[3] = (!dec ? SWAP8(BMW_h1[11]) : hash.h8[3]); tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
hash.h8[4] = (!dec ? SWAP8(BMW_h1[12]) : hash.h8[4]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
hash.h8[5] = (!dec ? SWAP8(BMW_h1[13]) : hash.h8[5]); tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
hash.h8[6] = (!dec ? SWAP8(BMW_h1[14]) : hash.h8[6]); q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
hash.h8[7] = (!dec ? SWAP8(BMW_h1[15]) : hash.h8[7]); tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = (!dec ? SWAP8(BMW_H[8]) : hash.h8[0]);
hash.h8[1] = (!dec ? SWAP8(BMW_H[9]) : hash.h8[1]);
hash.h8[2] = (!dec ? SWAP8(BMW_H[10]) : hash.h8[2]);
hash.h8[3] = (!dec ? SWAP8(BMW_H[11]) : hash.h8[3]);
hash.h8[4] = (!dec ? SWAP8(BMW_H[12]) : hash.h8[4]);
hash.h8[5] = (!dec ? SWAP8(BMW_H[13]) : hash.h8[5]);
hash.h8[6] = (!dec ? SWAP8(BMW_H[14]) : hash.h8[6]);
hash.h8[7] = (!dec ? SWAP8(BMW_H[15]) : hash.h8[7]);
} }

993
kernel/arebyp.cl

@ -0,0 +1,993 @@
/*-
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
* 2012-2013 Con Kolivas, 2013 Alexey Karimov.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
/* N (nfactor), CPU/Memory cost parameter */
/* Power-of-two lookup table: N[i] == 2^i.  Indexed by NFACTOR below to
 * obtain the scrypt CPU/memory cost parameter without shifting at runtime. */
__constant uint N[] = {
0x00000001U, /* never used, padding */
0x00000002U,
0x00000004U,
0x00000008U,
0x00000010U,
0x00000020U,
0x00000040U,
0x00000080U,
0x00000100U,
0x00000200U,
0x00000400U, /* 2^10 == 1024, Litecoin scrypt default */
0x00000800U,
0x00001000U,
0x00002000U,
0x00004000U,
0x00008000U,
0x00010000U,
0x00020000U,
0x00040000U,
0x00080000U,
0x00100000U
};
/* Backwards compatibility, if NFACTOR not defined, default to 10 for scrypt */
#ifndef NFACTOR
#define NFACTOR 10
#endif
/* Byte-swap masks used by EndianSwap(): isolate the two interleaved byte
 * pairs of a 32-bit word so two rotates produce a full endian swap. */
__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 };
/* Mixed constant table.  Indices 0..62 are SHA-256 round constants
 * (FIPS 180-4) in order; indices 63..81 are precomputed first-round values
 * and SHA-256 IV words (e.g. 0x6a09e667, 0xBB67AE85, ...); 82/83 are the
 * HMAC opad/ipad fill bytes (0x5C.., 0x36..); 84..88 are padding and
 * bit-length words for the fixed-size messages hashed by this kernel. */
__constant uint K[] = {
0x428a2f98U,
0x71374491U,
0xb5c0fbcfU,
0xe9b5dba5U,
0x3956c25bU,
0x59f111f1U,
0x923f82a4U,
0xab1c5ed5U,
0xd807aa98U,
0x12835b01U,
0x243185beU, // 10
0x550c7dc3U,
0x72be5d74U,
0x80deb1feU,
0x9bdc06a7U,
0xe49b69c1U,
0xefbe4786U,
0x0fc19dc6U,
0x240ca1ccU,
0x2de92c6fU,
0x4a7484aaU, // 20
0x5cb0a9dcU,
0x76f988daU,
0x983e5152U,
0xa831c66dU,
0xb00327c8U,
0xbf597fc7U,
0xc6e00bf3U,
0xd5a79147U,
0x06ca6351U,
0x14292967U, // 30
0x27b70a85U,
0x2e1b2138U,
0x4d2c6dfcU,
0x53380d13U,
0x650a7354U,
0x766a0abbU,
0x81c2c92eU,
0x92722c85U,
0xa2bfe8a1U,
0xa81a664bU, // 40
0xc24b8b70U,
0xc76c51a3U,
0xd192e819U,
0xd6990624U,
0xf40e3585U,
0x106aa070U,
0x19a4c116U,
0x1e376c08U,
0x2748774cU,
0x34b0bcb5U, // 50
0x391c0cb3U,
0x4ed8aa4aU,
0x5b9cca4fU,
0x682e6ff3U,
0x748f82eeU,
0x78a5636fU,
0x84c87814U,
0x8cc70208U,
0x90befffaU,
0xa4506cebU, // 60
0xbef9a3f7U,
0xc67178f2U,
0x98c7e2a2U, /* 63: start of precomputed first-round/IV values */
0xfc08884dU,
0xcd2a11aeU,
0x510e527fU,
0x9b05688cU,
0xC3910C8EU,
0xfb6feee7U,
0x2a01a605U, // 70
0x0c2e12e0U,
0x4498517BU,
0x6a09e667U, /* 73: SHA-256 IV word H0 */
0xa4ce148bU,
0x95F61999U,
0xc19bf174U,
0xBB67AE85U, /* 77: IV H1 */
0x3C6EF372U, /* 78: IV H2 */
0xA54FF53AU, /* 79: IV H3 */
0x1F83D9ABU, // 80  (IV H6)
0x5BE0CD19U, /* 81: IV H7 */
0x5C5C5C5CU, /* 82: HMAC opad fill */
0x36363636U, /* 83: HMAC ipad fill */
0x80000000U, /* 84: SHA-256 padding marker bit */
0x000003FFU, /* 85: nonce/index mask material */
0x00000280U, /* 86: bit length 640 (80-byte message) */
0x000004a0U, /* 87: bit length 1184 */
0x00000300U  /* 88: bit length 768 (96-byte message) */
};
/* Map generic rotate-left onto the OpenCL builtin. */
#define rotl(x,y) rotate(x,y)
/* SHA-256 choice function: bitwise (x ? y : z), via the bitselect builtin. */
#define Ch(x,y,z) bitselect(z,y,x)
/* SHA-256 majority function expressed through Ch. */
#define Maj(x,y,z) Ch((x^z),y,z)
/* Swap byte order of every lane of a uint/uint4 using the ES masks. */
#define EndianSwap(n) (rotl(n & ES[0], 24U)|rotl(n & ES[1], 8U))
/* SHA-256 big-sigma-0 (written with left-rotates: 30 == 32-2, etc.). */
#define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
/* SHA-256 big-sigma-1. */
#define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
/* SHA-256 small-sigma-0 (message schedule). */
#define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
/* SHA-256 small-sigma-1 (message schedule). */
#define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))
/* One SHA-256 round; callers rotate the register labels a..h instead of
 * shuffling values.  Multi-statement macro without do/while(0): safe here
 * only because it is always used as a stand-alone statement. */
#define RND(a, b, c, d, e, f, g, h, k) \
h += Tr1(e); \
h += Ch(e, f, g); \
h += k; \
d += h; \
h += Tr2(a); \
h += Maj(a, b, c);
/* One SHA-256 compression-function invocation on a 64-byte block.
 * state0/state1 carry the eight 32-bit chaining words (a..d / e..h) packed
 * into two uint4s and are updated in place (Davies-Meyer feed-forward at
 * the end).  block0..block3 are the 16 message words.  The 64 rounds are
 * fully unrolled with the register labels rotated by the RND macro; the
 * message schedule W is expanded in place every 16 rounds. */
void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
{
uint4 S0 = *state0;
uint4 S1 = *state1;
#define A S0.x
#define B S0.y
#define C S0.z
#define D S0.w
#define E S1.x
#define F S1.y
#define G S1.z
#define H S1.w
uint4 W[4];
/* Rounds 0..15: schedule is the raw message block. */
W[ 0].x = block0.x;
RND(A,B,C,D,E,F,G,H, W[0].x+ K[0]);
W[ 0].y = block0.y;
RND(H,A,B,C,D,E,F,G, W[0].y+ K[1]);
W[ 0].z = block0.z;
RND(G,H,A,B,C,D,E,F, W[0].z+ K[2]);
W[ 0].w = block0.w;
RND(F,G,H,A,B,C,D,E, W[0].w+ K[3]);
W[ 1].x = block1.x;
RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]);
W[ 1].y = block1.y;
RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]);
W[ 1].z = block1.z;
RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]);
W[ 1].w = block1.w;
RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]);
W[ 2].x = block2.x;
RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]);
W[ 2].y = block2.y;
RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]);
W[ 2].z = block2.z;
RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]);
W[ 2].w = block2.w;
RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]);
W[ 3].x = block3.x;
RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]);
W[ 3].y = block3.y;
RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]);
W[ 3].z = block3.z;
RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]);
W[ 3].w = block3.w;
RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]); /* K[76] == 0xc19bf174, round constant 15 */
/* Rounds 16..31: schedule expansion W[t] = s1(W[t-2])+W[t-7]+s0(W[t-15])+W[t-16]. */
W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]);
W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]);
W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]);
W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]);
W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]);
W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]);
W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]);
W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]);
W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]);
W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]);
W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]);
W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]);
W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]);
W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]);
W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]);
W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]);
/* Rounds 32..47. */
W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]);
W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]);
W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]);
W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]);
W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]);
W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]);
W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]);
W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]);
W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]);
W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]);
W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]);
W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]);
W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]);
W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]);
W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]);
W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]);
/* Rounds 48..63. */
W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]);
W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]);
W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]);
W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]);
W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]);
W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]);
W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]);
W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]);
W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]);
W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]);
W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]);
W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]);
W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]);
W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]);
W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]);
W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]);
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H
/* Davies-Meyer: add the saved input chaining value back in. */
*state0 += S0;
*state1 += S1;
}
/* SHA-256 compression of one 64-byte block starting from the standard IV,
 * with the first few rounds partially precomputed (constants K[63..75] fold
 * the IV into the round values).  Unlike SHA256(), the state is written
 * directly through the pointers during the rounds; the final addition adds
 * the IV words (K[73],K[77]..K[81], etc.) instead of a saved copy.  Used for
 * the fresh HMAC inner/outer states in the PBKDF2 phase. */
void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
{
#define A (*state0).x
#define B (*state0).y
#define C (*state0).z
#define D (*state0).w
#define E (*state1).x
#define F (*state1).y
#define G (*state1).z
#define H (*state1).w
uint4 W[4];
/* Rounds 0..3 collapsed: round outputs with IV inputs precomputed in K. */
W[0].x = block0.x;
D= K[63] +W[0].x;
H= K[64] +W[0].x;
W[0].y = block0.y;
C= K[65] +Tr1(D)+Ch(D, K[66], K[67])+W[0].y;
G= K[68] +C+Tr2(H)+Ch(H, K[69] ,K[70]);
W[0].z = block0.z;
B= K[71] +Tr1(C)+Ch(C,D,K[66])+W[0].z;
F= K[72] +B+Tr2(G)+Maj(G,H, K[73]);
W[0].w = block0.w;
A= K[74] +Tr1(B)+Ch(B,C,D)+W[0].w;
E= K[75] +A+Tr2(F)+Maj(F,G,H);
/* Rounds 4..15: standard rounds on the raw message words. */
W[1].x = block1.x;
RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]);
W[1].y = block1.y;
RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]);
W[1].z = block1.z;
RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]);
W[1].w = block1.w;
RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]);
W[2].x = block2.x;
RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]);
W[2].y = block2.y;
RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]);
W[2].z = block2.z;
RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]);
W[2].w = block2.w;
RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]);
W[3].x = block3.x;
RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]);
W[3].y = block3.y;
RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]);
W[3].z = block3.z;
RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]);
W[3].w = block3.w;
RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]); /* K[76] == round constant 15 */
/* Rounds 16..31: message-schedule expansion. */
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]);
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]);
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]);
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]);
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]);
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]);
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]);
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]);
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]);
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]);
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]);
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]);
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]);
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]);
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]);
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]);
/* Rounds 32..47. */
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]);
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]);
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]);
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]);
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]);
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]);
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]);
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]);
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]);
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]);
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]);
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]);
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]);
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]);
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]);
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]);
/* Rounds 48..63. */
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]);
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]);
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]);
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]);
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]);
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]);
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]);
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]);
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]);
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]);
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]);
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]);
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]);
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]);
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]);
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]);
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H
/* Feed-forward with the SHA-256 IV words stored in K. */
*state0 += (uint4)(K[73], K[77], K[78], K[79]);
*state1 += (uint4)(K[66], K[67], K[80], K[81]);
}
/* Precomputed per-round values (K[t] + W[t]) for the fixed final SHA-256
 * block used by SHA256_fixed(): the message there is always the same
 * padding/length block, so its expanded schedule is folded into the round
 * constants ahead of time.  Values therefore intentionally differ from K
 * (e.g. fixedW[0] = 0x428a2f99 = K[0] + W[0]). */
__constant uint fixedW[64] =
{
0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794,
0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f,
0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c,
0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa,
0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012,
0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4,
0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848,
};
/* SHA-256 compression of the fixed padding/length block: identical to
 * SHA256() but the round inputs come from the precomputed fixedW table
 * (message schedule already folded in), so no W expansion is needed. */
void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)
{
uint4 S0 = *state0;
uint4 S1 = *state1;
#define A S0.x
#define B S0.y
#define C S0.z
#define D S0.w
#define E S1.x
#define F S1.y
#define G S1.z
#define H S1.w
RND(A,B,C,D,E,F,G,H, fixedW[0]);
RND(H,A,B,C,D,E,F,G, fixedW[1]);
RND(G,H,A,B,C,D,E,F, fixedW[2]);
RND(F,G,H,A,B,C,D,E, fixedW[3]);
RND(E,F,G,H,A,B,C,D, fixedW[4]);
RND(D,E,F,G,H,A,B,C, fixedW[5]);
RND(C,D,E,F,G,H,A,B, fixedW[6]);
RND(B,C,D,E,F,G,H,A, fixedW[7]);
RND(A,B,C,D,E,F,G,H, fixedW[8]);
RND(H,A,B,C,D,E,F,G, fixedW[9]);
RND(G,H,A,B,C,D,E,F, fixedW[10]);
RND(F,G,H,A,B,C,D,E, fixedW[11]);
RND(E,F,G,H,A,B,C,D, fixedW[12]);
RND(D,E,F,G,H,A,B,C, fixedW[13]);
RND(C,D,E,F,G,H,A,B, fixedW[14]);
RND(B,C,D,E,F,G,H,A, fixedW[15]);
RND(A,B,C,D,E,F,G,H, fixedW[16]);
RND(H,A,B,C,D,E,F,G, fixedW[17]);
RND(G,H,A,B,C,D,E,F, fixedW[18]);
RND(F,G,H,A,B,C,D,E, fixedW[19]);
RND(E,F,G,H,A,B,C,D, fixedW[20]);
RND(D,E,F,G,H,A,B,C, fixedW[21]);
RND(C,D,E,F,G,H,A,B, fixedW[22]);
RND(B,C,D,E,F,G,H,A, fixedW[23]);
RND(A,B,C,D,E,F,G,H, fixedW[24]);
RND(H,A,B,C,D,E,F,G, fixedW[25]);
RND(G,H,A,B,C,D,E,F, fixedW[26]);
RND(F,G,H,A,B,C,D,E, fixedW[27]);
RND(E,F,G,H,A,B,C,D, fixedW[28]);
RND(D,E,F,G,H,A,B,C, fixedW[29]);
RND(C,D,E,F,G,H,A,B, fixedW[30]);
RND(B,C,D,E,F,G,H,A, fixedW[31]);
RND(A,B,C,D,E,F,G,H, fixedW[32]);
RND(H,A,B,C,D,E,F,G, fixedW[33]);
RND(G,H,A,B,C,D,E,F, fixedW[34]);
RND(F,G,H,A,B,C,D,E, fixedW[35]);
RND(E,F,G,H,A,B,C,D, fixedW[36]);
RND(D,E,F,G,H,A,B,C, fixedW[37]);
RND(C,D,E,F,G,H,A,B, fixedW[38]);
RND(B,C,D,E,F,G,H,A, fixedW[39]);
RND(A,B,C,D,E,F,G,H, fixedW[40]);
RND(H,A,B,C,D,E,F,G, fixedW[41]);
RND(G,H,A,B,C,D,E,F, fixedW[42]);
RND(F,G,H,A,B,C,D,E, fixedW[43]);
RND(E,F,G,H,A,B,C,D, fixedW[44]);
RND(D,E,F,G,H,A,B,C, fixedW[45]);
RND(C,D,E,F,G,H,A,B, fixedW[46]);
RND(B,C,D,E,F,G,H,A, fixedW[47]);
RND(A,B,C,D,E,F,G,H, fixedW[48]);
RND(H,A,B,C,D,E,F,G, fixedW[49]);
RND(G,H,A,B,C,D,E,F, fixedW[50]);
RND(F,G,H,A,B,C,D,E, fixedW[51]);
RND(E,F,G,H,A,B,C,D, fixedW[52]);
RND(D,E,F,G,H,A,B,C, fixedW[53]);
RND(C,D,E,F,G,H,A,B, fixedW[54]);
RND(B,C,D,E,F,G,H,A, fixedW[55]);
RND(A,B,C,D,E,F,G,H, fixedW[56]);
RND(H,A,B,C,D,E,F,G, fixedW[57]);
RND(G,H,A,B,C,D,E,F, fixedW[58]);
RND(F,G,H,A,B,C,D,E, fixedW[59]);
RND(E,F,G,H,A,B,C,D, fixedW[60]);
RND(D,E,F,G,H,A,B,C, fixedW[61]);
RND(C,D,E,F,G,H,A,B, fixedW[62]);
RND(B,C,D,E,F,G,H,A, fixedW[63]);
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H
/* Davies-Meyer feed-forward. */
*state0 += S0;
*state1 += S1;
}
/*
 * Convert the 128-byte scrypt state into salsa's diagonal working layout
 * and byte-swap every word.  For each 4-row half (B[0..3] and B[4..7]),
 * row r becomes (B[r+1].x, B[r+2].y, B[r+3].z, B[r].w) with indices taken
 * mod 4 within the half — identical to the original hand-written shuffle.
 */
void shittify(uint4 B[8])
{
    uint4 t[4];

    #pragma unroll
    for(uint r=0; r<4; ++r)
        t[r] = (uint4)(B[(r+1)&3].x, B[(r+2)&3].y, B[(r+3)&3].z, B[r].w);
    #pragma unroll
    for(uint r=0; r<4; ++r)
        B[r] = EndianSwap(t[r]);

    #pragma unroll
    for(uint r=0; r<4; ++r)
        t[r] = (uint4)(B[4+((r+1)&3)].x, B[4+((r+2)&3)].y, B[4+((r+3)&3)].z, B[4+r].w);
    #pragma unroll
    for(uint r=0; r<4; ++r)
        B[r+4] = EndianSwap(t[r]);
}
/*
 * Inverse of shittify(): undo the diagonal shuffle and byte-swap, restoring
 * the linear 128-byte state layout.  For each 4-row half, row r becomes
 * (B[r+3].x, B[r+2].y, B[r+1].z, B[r].w) with indices taken mod 4 within
 * the half — identical to the original hand-written shuffle.
 */
void unshittify(uint4 B[8])
{
    uint4 t[4];

    #pragma unroll
    for(uint r=0; r<4; ++r)
        t[r] = (uint4)(B[(r+3)&3].x, B[(r+2)&3].y, B[(r+1)&3].z, B[r].w);
    #pragma unroll
    for(uint r=0; r<4; ++r)
        B[r] = EndianSwap(t[r]);

    #pragma unroll
    for(uint r=0; r<4; ++r)
        t[r] = (uint4)(B[4+((r+3)&3)].x, B[4+((r+2)&3)].y, B[4+((r+1)&3)].z, B[4+r].w);
    #pragma unroll
    for(uint r=0; r<4; ++r)
        B[r+4] = EndianSwap(t[r]);
}
/* Two equivalent implementations of the scrypt BlockMix step (two
 * Salsa20/8 cores over the 128-byte state): an unrolled version (default,
 * selected by SALSAUNROLLED) and a compact loop version. */
#define SALSAUNROLLED
#ifdef SALSAUNROLLED
/* Salsa20/8 BlockMix, fully unrolled: XOR halves, run 8 rounds (4 double
 * rounds) on each half, add back.  Operates on the diagonal layout produced
 * by shittify(). */
void salsa(uint4 B[8])
{
uint4 w[4];
/* First half: X = B[0..3] ^= B[4..7], then 4 column/row double rounds. */
w[0] = (B[0]^=B[4]);
w[1] = (B[1]^=B[5]);
w[2] = (B[2]^=B[6]);
w[3] = (B[3]^=B[7]);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
/* Second half: B[0..3] += result, B[4..7] ^= B[0..3], 4 more double rounds. */
w[0] = (B[4]^=(B[0]+=w[0]));
w[1] = (B[5]^=(B[1]+=w[1]));
w[2] = (B[6]^=(B[2]+=w[2]));
w[3] = (B[7]^=(B[3]+=w[3]));
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
B[4] += w[0];
B[5] += w[1];
B[6] += w[2];
B[7] += w[3];
}
#else
/* Same algorithm in loop form; behaviourally identical to the unrolled
 * version above. */
void salsa(uint4 B[8])
{
uint4 w[4];
#pragma unroll
for(uint i=0; i<4; ++i)
w[i] = (B[i]^=B[i+4]);
#pragma unroll
for(uint i=0; i<4; ++i)
{
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
}
#pragma unroll
for(uint i=0; i<4; ++i)
w[i] = (B[i+4]^=(B[i]+=w[i]));
#pragma unroll
for(uint i=0; i<4; ++i)
{
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
}
#pragma unroll
for(uint i=0; i<4; ++i)
B[i+4] += w[i];
}
#endif
/* Deliberate compile error if the build was configured with an unsupported
 * lookup gap: scrypt_core() below hard-codes LOOKUP_GAP == 2. */
#if (LOOKUP_GAP != 2)
wrong lookup gap!
#endif
/* scrypt ROMix core with LOOKUP_GAP == 2: fill half of the V table (only
 * even-indexed states are stored, each y holding the state after 2*y salsa
 * calls), then run the second mixing loop, recomputing odd states on the
 * fly with an extra salsa.  Lookup rows are interleaved across
 * CONCURRENT_THREADS work-items for coalesced global-memory access.
 * The fixed iteration count (N + 512 + 42) with the conditional commit
 * keeps all work-items in lock-step regardless of how many extra salsa
 * recomputations each one needs. */
void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
{
const uint zSIZE = 8;               /* uint4 words per 128-byte state */
const uint ySIZE = N[NFACTOR] / 2;  /* stored rows (every 2nd state) */
const uint xSIZE = CONCURRENT_THREADS;
uint4 V[8];
uint x = get_global_id(0) % xSIZE;  /* this work-item's interleave slot */
uint z;
uint y;
uint i;
uint CO;
ushort progress; // Progress state
ushort state;
/* Phase 1: write even states V[0], V[2], ... into the lookup table. */
CO = 8 * x;
for (y = 0; y < ySIZE; ++y, CO += (xSIZE - 1) * (zSIZE))
{
#pragma unroll
for (z = 0; z < zSIZE; ++z, CO++)
lookup[CO] = X[z];
// Next salsa
salsa(X);
salsa(X);
}
//------------------------------------------------------------------------------------------------------------
/* Phase 2: 2*N effective mixing steps.  progress counts completed steps;
 * state encodes (last step was a recompute, requested index is odd).
 * NOTE(review): progress/cond compare against 2 * N[NFACTOR]; ushort can
 * only represent this for NFACTOR <= 14 — presumably fine for this algo's
 * fixed nfactor, but verify before raising NFACTOR. */
uint cotmp = x * zSIZE;
progress = 0;
for (i = 0; i < N[NFACTOR] + 512 + 42; i++)
{
//if (progress < 2 * N[NFACTOR])
{
y = X[7].x & (N[NFACTOR]-1);        /* j = Integerify(X) mod N */
CO = cotmp + (y / LOOKUP_GAP) * (xSIZE) * zSIZE;
state = ((progress & 1) << 1) | (y & 1);
/* Load the stored even state unless we are mid-recompute (state 3). */
if (state != 3)
{
#pragma unroll
for (z = 0; z < zSIZE; ++z, CO++)
V[z] = lookup[CO];
}
/* XOR X in, except when this pass only advances an odd recompute. */
if (state != 1)
{
#pragma unroll
for (z = 0; z < zSIZE; ++z)
V[z] ^= X[z];
}
salsa(V);
/* Commit the mixed state once the requested index has been reached
 * and we are still within the 2N real iterations. */
ushort cond = (state != 1) && (progress < 2 * N[NFACTOR]);
if (cond)
{
#pragma unroll
for (z = 0; z < zSIZE; ++z)
X[z] = V[z];
}
// S/y
// 00 +2
// 01 +1
// 11 +1
// 10 error
progress += (state == 0)? 2 : 1;
}
}
}
/* Index of the result counter inside the output buffer; found nonces are
 * appended before it.  NOTE(review): the post-increment on output[0xFF] is
 * not atomic — concurrent finds in one dispatch could race; presumably
 * acceptable here because shares are rare per batch. */
#define SCRYPT_FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
/* Top-level scrypt search kernel: one work-item per nonce.
 * input      - 80-byte block header as 5 uint4s (input[4].w replaced by the
 *              per-work-item nonce below);
 * output     - nonce list + counter at output[SCRYPT_FOUND];
 * padcache   - global scratchpad shared by CONCURRENT_THREADS work-items;
 * midstate0/midstate16 - SHA-256 state after the first 64 header bytes;
 * target     - share difficulty threshold compared against the top hash word.
 * Pipeline: PBKDF2(SHA-256) to derive X, ROMix (scrypt_core), PBKDF2 final,
 * then compare the big-endian top word of the HMAC output to target. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uint4 * restrict input,
volatile __global uint*restrict output, __global uint4*restrict padcache,
const uint4 midstate0, const uint4 midstate16, const uint target)
{
uint4 X[8];
uint4 tstate0, tstate1, ostate0, ostate1;
uint4 tmp0, tmp1;
/* Last 16 header bytes with this work-item's nonce in the final word. */
uint4 data = (uint4)(input[4].x,input[4].y,input[4].z, get_global_id(0));
uint4 pad0 = midstate0, pad1 = midstate16;
/* Hash the padded tail to finish SHA-256(header) = HMAC key material. */
SHA256(&pad0,&pad1, data, (uint4)(K[84],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[86]));
/* HMAC outer/inner states: key XOR opad (K[82]) / ipad (K[83]). */
SHA256_fresh(&ostate0,&ostate1, pad0^ K[82], pad1^ K[82], K[82], K[82]);
SHA256_fresh(&tstate0,&tstate1, pad0^ K[83], pad1^ K[83], K[83], K[83]);
tmp0 = tstate0;
tmp1 = tstate1;
SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]);
/* PBKDF2 first pass: derive the four 32-byte chunks of X (block index
 * i+1 appended to the salt each round). */
#pragma unroll
for (uint i=0; i<4; i++)
{
pad0 = tstate0;
pad1 = tstate1;
X[rotl(i,1U) ] = ostate0;
X[rotl(i,1U)+1] = ostate1;
SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
SHA256(X+rotl(i,1U),X+rotl(i,1U)+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
}
/* Memory-hard mixing over the diagonal layout. */
shittify(X);
scrypt_core(X,padcache);
unshittify(X);
/* PBKDF2 final: HMAC the mixed state and compare against target. */
SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
SHA256_fixed(&tmp0,&tmp1);
SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
bool result = (EndianSwap(ostate1.w) <= target);
if (result)
SETFOUND(get_global_id(0));
}

115
kernel/bitblock.cl

@ -464,69 +464,92 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; #if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_L[i] = T0[i]; T0_C[i] = T0[i];
T4_L[i] = T4[i]; T4_C[i] = T4[i];
T1_L[i] = T1[i]; #if !SPH_SMALL_FOOTPRINT_GROESTL
T2_L[i] = T2[i]; T1_C[i] = T1[i];
T3_L[i] = T3[i]; T2_C[i] = T2[i];
T5_L[i] = T5[i]; T3_C[i] = T3[i];
T6_L[i] = T6[i]; T5_C[i] = T5[i];
T7_L[i] = T7[i]; T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); // groestl
barrier(CLK_LOCAL_MEM_FENCE); #define T0 T0_C
#define T1 T1_C
#define T0 T0_L #define T2 T2_C
#define T1 T1_L #define T3 T3_C
#define T2 T2_L #define T4 T4_C
#define T3 T3_L #define T5 T5_C
#define T4 T4_L #define T6 T6_C
#define T5 T5_L #define T7 T7_C
#define T6 T6_L
#define T7 T7_L
sph_u64 H[16];
// groestl //#pragma unroll 15
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]); m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]); m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]); m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]); m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]); m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]); m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]); m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]); m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0; //#pragma unroll 16
g[10] = m[10] = 0; for (unsigned int u = 0; u < 16; u ++)
g[11] = m[11] = 0; g[u] = m[u] ^ H[u];
g[12] = m[12] = 0; m[8] = 0x80; g[8] = m[8] ^ H[8];
g[13] = m[13] = 0; m[9] = 0; g[9] = m[9] ^ H[9];
g[14] = m[14] = 0; m[10] = 0; g[10] = m[10] ^ H[10];
g[15] = 0x102000000000000; m[11] = 0; g[11] = m[11] ^ H[11];
m[15] = 0x100000000000000; m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
sph_u64 xH[16]; //#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u]; H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++) //#pragma unroll 16
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))

119
kernel/darkcoin-mod.cl

@ -457,74 +457,97 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; #if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_L[i] = T0[i]; T0_C[i] = T0[i];
T4_L[i] = T4[i]; T4_C[i] = T4[i];
T1_L[i] = T1[i]; #if !SPH_SMALL_FOOTPRINT_GROESTL
T2_L[i] = T2[i]; T1_C[i] = T1[i];
T3_L[i] = T3[i]; T2_C[i] = T2[i];
T5_L[i] = T5[i]; T3_C[i] = T3[i];
T6_L[i] = T6[i]; T5_C[i] = T5[i];
T7_L[i] = T7[i]; T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); // groestl
barrier(CLK_LOCAL_MEM_FENCE); #define T0 T0_C
#define T1 T1_C
#define T0 T0_L #define T2 T2_C
#define T1 T1_L #define T3 T3_C
#define T2 T2_L #define T4 T4_C
#define T3 T3_L #define T5 T5_C
#define T4 T4_L #define T6 T6_C
#define T5 T5_L #define T7 T7_C
#define T6 T6_L
#define T7 T7_L
sph_u64 H[16];
// groestl //#pragma unroll 15
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]); m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]); m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]); m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]); m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]); m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]); m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]); m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]); m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0; //#pragma unroll 16
g[10] = m[10] = 0; for (unsigned int u = 0; u < 16; u ++)
g[11] = m[11] = 0; g[u] = m[u] ^ H[u];
g[12] = m[12] = 0; m[8] = 0x80; g[8] = m[8] ^ H[8];
g[13] = m[13] = 0; m[9] = 0; g[9] = m[9] ^ H[9];
g[14] = m[14] = 0; m[10] = 0; g[10] = m[10] ^ H[10];
g[15] = 0x102000000000000; m[11] = 0; g[11] = m[11] ^ H[11];
m[15] = 0x100000000000000; m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
sph_u64 xH[16]; //#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u]; H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++) //#pragma unroll 16
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
}
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search3(__global hash_t* hashes) __kernel void search3(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@ -840,7 +863,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0]; rk00 = hash->h4[0];
rk01 = hash->h4[1]; rk01 = hash->h4[1];

13
kernel/darkcoin.cl

@ -71,16 +71,9 @@ typedef long sph_s64;
#define SPH_SMALL_FOOTPRINT_GROESTL 0 #define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0 #define SPH_CUBEHASH_UNROLL 0
#define SPH_COMPACT_BLAKE_64 0
#ifndef SPH_COMPACT_BLAKE_64 #define SPH_LUFFA_PARALLEL 0
#define SPH_COMPACT_BLAKE_64 0 #define SPH_KECCAK_UNROLL 0
#endif
#ifndef SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL 0
#endif
#ifndef SPH_KECCAK_UNROLL
#define SPH_KECCAK_UNROLL 0
#endif
#include "blake.cl" #include "blake.cl"
#include "bmw.cl" #include "bmw.cl"

1853
kernel/diamond.cl

File diff suppressed because it is too large Load Diff

1992
kernel/groestlcoin.cl

File diff suppressed because it is too large Load Diff

41
kernel/inkcoin.cl

@ -29,8 +29,8 @@
* @author phm <phm@inbox.com> * @author phm <phm@inbox.com>
*/ */
#ifndef DARKCOIN_CL #ifndef INKCOIN_CL
#define DARKCOIN_CL #define INKCOIN_CL
#if __ENDIAN_LITTLE__ #if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1 #define SPH_LITTLE_ENDIAN 1
@ -54,34 +54,15 @@ typedef long sph_s64;
#define SPH_64_TRUE 1 #define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U)) #define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL)) #define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) #define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1
#define SPH_KECCAK_64 1
#define SPH_JH_64 1
#define SPH_SIMD_NOCOPY 0
#define SPH_KECCAK_NOCOPY 0
#define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0
#ifndef SPH_COMPACT_BLAKE_64
#define SPH_COMPACT_BLAKE_64 0
#endif
#ifndef SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL 0
#endif
#ifndef SPH_KECCAK_UNROLL
#define SPH_KECCAK_UNROLL 0
#endif
#include "shavite.cl" #include "shavite.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx) #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
@ -97,6 +78,14 @@ typedef long sph_s64;
#define DEC32LE(x) (*(const __global sph_u32 *) (x)); #define DEC32LE(x) (*(const __global sph_u32 *) (x));
#endif #endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
// __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) // __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
{ {
@ -244,4 +233,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
output[output[0xFF]++] = SWAP4(gid); output[output[0xFF]++] = SWAP4(gid);
} }
#endif // DARKCOIN_CL #endif // INKCOIN_CL

117
kernel/marucoin-mod.cl

@ -460,69 +460,92 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; #if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_L[i] = T0[i]; T0_C[i] = T0[i];
T4_L[i] = T4[i]; T4_C[i] = T4[i];
T1_L[i] = T1[i]; #if !SPH_SMALL_FOOTPRINT_GROESTL
T2_L[i] = T2[i]; T1_C[i] = T1[i];
T3_L[i] = T3[i]; T2_C[i] = T2[i];
T5_L[i] = T5[i]; T3_C[i] = T3[i];
T6_L[i] = T6[i]; T5_C[i] = T5[i];
T7_L[i] = T7[i]; T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); // groestl
barrier(CLK_LOCAL_MEM_FENCE); #define T0 T0_C
#define T1 T1_C
#define T0 T0_L #define T2 T2_C
#define T1 T1_L #define T3 T3_C
#define T2 T2_L #define T4 T4_C
#define T3 T3_L #define T5 T5_C
#define T4 T4_L #define T6 T6_C
#define T5 T5_L #define T7 T7_C
#define T6 T6_L
#define T7 T7_L
sph_u64 H[16];
// groestl //#pragma unroll 15
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]); m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]); m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]); m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]); m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]); m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]); m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]); m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]); m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0; //#pragma unroll 16
g[10] = m[10] = 0; for (unsigned int u = 0; u < 16; u ++)
g[11] = m[11] = 0; g[u] = m[u] ^ H[u];
g[12] = m[12] = 0; m[8] = 0x80; g[8] = m[8] ^ H[8];
g[13] = m[13] = 0; m[9] = 0; g[9] = m[9] ^ H[9];
g[14] = m[14] = 0; m[10] = 0; g[10] = m[10] ^ H[10];
g[15] = 0x102000000000000; m[11] = 0; g[11] = m[11] ^ H[11];
m[15] = 0x100000000000000; m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
sph_u64 xH[16]; //#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u]; H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++) //#pragma unroll 16
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -843,7 +866,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0]; rk00 = hash->h4[0];
rk01 = hash->h4[1]; rk01 = hash->h4[1];

80
kernel/myriadcoin-groestl.cl

@ -54,23 +54,19 @@ typedef long sph_s64;
#define SPH_64_TRUE 1 #define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U)) #define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL)) #define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) #define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1 #define SPH_ECHO_64 1
#define SPH_SIMD_NOCOPY 0 #define SPH_SIMD_NOCOPY 0
#define SPH_CUBEHASH_UNROLL 0 #define SPH_CUBEHASH_UNROLL 0
#ifndef SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL 0
#endif
#include "groestl.cl" #include "groestl.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx) #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
@ -84,6 +80,14 @@ typedef long sph_s64;
#define DEC64E(x) (*(const __global sph_u64 *) (x)); #define DEC64E(x) (*(const __global sph_u64 *) (x));
#endif #endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
#define ROL32(x, n) rotate(x, (uint) n) #define ROL32(x, n) rotate(x, (uint) n)
#define SHR(x, n) ((x) >> n) #define SHR(x, n) ((x) >> n)
#define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) #define SWAP32(a) (as_uint(as_uchar4(a).wzyx))
@ -138,34 +142,41 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
ulong h8[8]; ulong h8[8];
} hash; } hash;
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; #if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_L[i] = T0[i]; T0_C[i] = T0[i];
T1_L[i] = T1[i]; T4_C[i] = T4[i];
T2_L[i] = T2[i]; #if !SPH_SMALL_FOOTPRINT_GROESTL
T3_L[i] = T3[i]; T1_C[i] = T1[i];
T4_L[i] = T4[i]; T2_C[i] = T2[i];
T5_L[i] = T5[i]; T3_C[i] = T3[i];
T6_L[i] = T6[i]; T5_C[i] = T5[i];
T7_L[i] = T7[i]; T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
// groestl
sph_u64 H[16]; sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++) for (unsigned int u = 0; u < 15; u ++)
H[u] = 0; H[u] = 0;
#if USE_LE #if USE_LE
@ -193,20 +204,33 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
m[13] = 0; m[13] = 0;
m[14] = 0; m[14] = 0;
m[15] = 0x100000000000000; m[15] = 0x100000000000000;
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u]; g[u] = m[u] ^ H[u];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u]; H[u] ^= g[u] ^ m[u];
sph_u64 xH[16]; sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u]; xH[u] = H[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u]; H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++) for (unsigned int u = 0; u < 8; u ++)
hash.h8[u] = ENC64E(H[u + 8]); hash.h8[u] = ENC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
uint temp1; uint temp1;
uint W0 = SWAP32(hash.h4[0x0]); uint W0 = SWAP32(hash.h4[0x0]);
uint W1 = SWAP32(hash.h4[0x1]); uint W1 = SWAP32(hash.h4[0x1]);

974
kernel/neoscrypt.cl

File diff suppressed because it is too large Load Diff

684
kernel/quarkcoin.cl

@ -54,13 +54,13 @@ typedef long sph_s64;
#define SPH_64_TRUE 1 #define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U)) #define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL)) #define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) #define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1 #define SPH_ECHO_64 1
@ -75,9 +75,6 @@ typedef long sph_s64;
#ifndef SPH_COMPACT_BLAKE_64 #ifndef SPH_COMPACT_BLAKE_64
#define SPH_COMPACT_BLAKE_64 0 #define SPH_COMPACT_BLAKE_64 0
#endif #endif
#ifndef SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL 0
#endif
#ifndef SPH_KECCAK_UNROLL #ifndef SPH_KECCAK_UNROLL
#define SPH_KECCAK_UNROLL 0 #define SPH_KECCAK_UNROLL 0
#endif #endif
@ -92,6 +89,14 @@ typedef long sph_s64;
#define SWAP4(x) as_uint(as_uchar4(x).wzyx) #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) #define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
#if SPH_BIG_ENDIAN #if SPH_BIG_ENDIAN
#define DEC64E(x) (x) #define DEC64E(x) (x)
#define DEC64BE(x) (*(const __global sph_u64 *) (x)); #define DEC64BE(x) (*(const __global sph_u64 *) (x));
@ -111,7 +116,7 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
} hash; } hash;
// blake // blake
{ {
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -156,16 +161,17 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
hash.h8[5] = H5; hash.h8[5] = H5;
hash.h8[6] = H6; hash.h8[6] = H6;
hash.h8[7] = H7; hash.h8[7] = H7;
} }
// bmw // bmw
{
sph_u64 BMW_H[16]; sph_u64 BMW_H[16];
#pragma unroll 16
for(unsigned u = 0; u < 16; u++) for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u]; BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16]; sph_u64 mv[16],q[32];
sph_u64 mv[16]; sph_u64 tmp;
mv[ 0] = SWAP8(hash.h8[0]); mv[ 0] = SWAP8(hash.h8[0]);
mv[ 1] = SWAP8(hash.h8[1]); mv[ 1] = SWAP8(hash.h8[1]);
@ -182,40 +188,264 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
mv[12] = 0; mv[12] = 0;
mv[13] = 0; mv[13] = 0;
mv[14] = 0; mv[14] = 0;
mv[15] = 0x200; mv[15] = SPH_C64(512);
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
FOLDb; tmp = (mv[ 5] ^ BMW_H[ 5]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[ 6] ^ BMW_H[ 6]) - (mv[ 8] ^ BMW_H[ 8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[ 0] ^ BMW_H[ 0]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 1] ^ BMW_H[ 1]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 2] ^ BMW_H[ 2]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 2] ^ BMW_H[ 2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[ 4] ^ BMW_H[ 4]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[ 6] ^ BMW_H[ 6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) + (mv[ 6] ^ BMW_H[ 6]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 9] ^ BMW_H[ 9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 3] ^ BMW_H[ 3]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[ 2] ^ BMW_H[ 2]) + (mv[ 4] ^ BMW_H[ 4]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#undef M #pragma unroll 2
#undef H for(int i=0;i<2;i++)
#undef dH {
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x]) #pragma unroll 4
#define H(x) (final_b[x]) for(int i=2;i<6;i++) {
#define dH(x) (BMW_h1[x]) q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[ 0]) + ( XL64 ^ q[24] ^ q[ 0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[ 1]) + ( XL64 ^ q[25] ^ q[ 1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[ 2]) + ( XL64 ^ q[26] ^ q[ 2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[ 3]) + ( XL64 ^ q[27] ^ q[ 3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[ 4]) + ( XL64 ^ q[28] ^ q[ 4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[ 5]) + ( XL64 ^ q[29] ^ q[ 5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[ 6]) + ( XL64 ^ q[30] ^ q[ 6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[ 7]) + ( XL64 ^ q[31] ^ q[ 7]);
BMW_H[ 8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]);
BMW_H[ 9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
FOLDb; #pragma unroll 16
for(int i=0;i<16;i++) {
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[ 5] ^ BMW_H[ 5]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[ 6] ^ BMW_H[ 6]) - (mv[ 8] ^ BMW_H[ 8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[ 0] ^ BMW_H[ 0]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 1] ^ BMW_H[ 1]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 2] ^ BMW_H[ 2]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 2] ^ BMW_H[ 2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[ 4] ^ BMW_H[ 4]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[ 6] ^ BMW_H[ 6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) + (mv[ 6] ^ BMW_H[ 6]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 9] ^ BMW_H[ 9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 3] ^ BMW_H[ 3]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[ 2] ^ BMW_H[ 2]) + (mv[ 4] ^ BMW_H[ 4]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#undef M
#undef H
#undef dH
hash.h8[0] = SWAP8(BMW_h1[8]); #pragma unroll 2
hash.h8[1] = SWAP8(BMW_h1[9]); for(int i=0;i<2;i++)
hash.h8[2] = SWAP8(BMW_h1[10]); {
hash.h8[3] = SWAP8(BMW_h1[11]); q[i+16] =
hash.h8[4] = SWAP8(BMW_h1[12]); (SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
hash.h8[5] = SWAP8(BMW_h1[13]); (SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
hash.h8[6] = SWAP8(BMW_h1[14]); (SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
hash.h8[7] = SWAP8(BMW_h1[15]); (SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
} }
if ((hash.h1[7] & 0x8) != 0) XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[ 0]) + ( XL64 ^ q[24] ^ q[ 0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[ 1]) + ( XL64 ^ q[25] ^ q[ 1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[ 2]) + ( XL64 ^ q[26] ^ q[ 2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[ 3]) + ( XL64 ^ q[27] ^ q[ 3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[ 4]) + ( XL64 ^ q[28] ^ q[ 4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[ 5]) + ( XL64 ^ q[29] ^ q[ 5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[ 6]) + ( XL64 ^ q[30] ^ q[ 6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[ 7]) + ( XL64 ^ q[31] ^ q[ 7]);
BMW_H[ 8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]);
BMW_H[ 9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = SWAP8(BMW_H[8]);
hash.h8[1] = SWAP8(BMW_H[9]);
hash.h8[2] = SWAP8(BMW_H[10]);
hash.h8[3] = SWAP8(BMW_H[11]);
hash.h8[4] = SWAP8(BMW_H[12]);
hash.h8[5] = SWAP8(BMW_H[13]);
hash.h8[6] = SWAP8(BMW_H[14]);
hash.h8[7] = SWAP8(BMW_H[15]);
bool dec = ((hash.h1[7] & 0x8) != 0);
{ {
// groestl // groestl
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16]; sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++) for (unsigned int u = 0; u < 15; u ++)
H[u] = 0; H[u] = 0;
@ -255,11 +485,13 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u]; H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++) for (unsigned int u = 0; u < 8; u ++)
hash.h8[u] = DEC64E(H[u + 8]); hash.h8[u] = (dec ? DEC64E(H[u + 8]) : hash.h8[u]);
} }
else
{ {
// skein // skein
sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33);
sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; sph_u64 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u64 bcount = 0; sph_u64 bcount = 0;
@ -276,26 +508,56 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
bcount = 0; bcount = 0;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
UBI_BIG(510, 8); UBI_BIG(510, 8);
hash.h8[0] = SWAP8(h0); hash.h8[0] = (!dec ? SWAP8(h0) : hash.h8[0]);
hash.h8[1] = SWAP8(h1); hash.h8[1] = (!dec ? SWAP8(h1) : hash.h8[1]);
hash.h8[2] = SWAP8(h2); hash.h8[2] = (!dec ? SWAP8(h2) : hash.h8[2]);
hash.h8[3] = SWAP8(h3); hash.h8[3] = (!dec ? SWAP8(h3) : hash.h8[3]);
hash.h8[4] = SWAP8(h4); hash.h8[4] = (!dec ? SWAP8(h4) : hash.h8[4]);
hash.h8[5] = SWAP8(h5); hash.h8[5] = (!dec ? SWAP8(h5) : hash.h8[5]);
hash.h8[6] = SWAP8(h6); hash.h8[6] = (!dec ? SWAP8(h6) : hash.h8[6]);
hash.h8[7] = SWAP8(h7); hash.h8[7] = (!dec ? SWAP8(h7) : hash.h8[7]);
} }
// groestl // groestl
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{ {
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16]; sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++) for (unsigned int u = 0; u < 15; u ++)
H[u] = 0; H[u] = 0;
#if USE_LE #if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else #else
H[15] = (sph_u64)512; H[15] = (sph_u64)512;
#endif #endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
m[0] = DEC64E(hash.h8[0]); m[0] = DEC64E(hash.h8[0]);
@ -328,13 +590,12 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
H[u] ^= xH[u]; H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++) for (unsigned int u = 0; u < 8; u ++)
hash.h8[u] = DEC64E(H[u + 8]); hash.h8[u] = DEC64E(H[u + 8]);
}
// jh // jh
{
sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7);
sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b);
sph_u64 tmp; // tmp;
for(int i = 0; i < 2; i++) for(int i = 0; i < 2; i++)
{ {
@ -373,11 +634,12 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
hash.h8[5] = DEC64E(h6l); hash.h8[5] = DEC64E(h6l);
hash.h8[6] = DEC64E(h7h); hash.h8[6] = DEC64E(h7h);
hash.h8[7] = DEC64E(h7l); hash.h8[7] = DEC64E(h7l);
}
if ((hash.h1[7] & 0x8) != 0) dec = ((hash.h1[7] & 0x8) != 0);
{ {
// blake // blake
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -412,24 +674,26 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
COMPRESS64; COMPRESS64;
hash.h8[0] = H0; hash.h8[0] = (dec ? H0 : hash.h8[0]);
hash.h8[1] = H1; hash.h8[1] = (dec ? H1 : hash.h8[1]);
hash.h8[2] = H2; hash.h8[2] = (dec ? H2 : hash.h8[2]);
hash.h8[3] = H3; hash.h8[3] = (dec ? H3 : hash.h8[3]);
hash.h8[4] = H4; hash.h8[4] = (dec ? H4 : hash.h8[4]);
hash.h8[5] = H5; hash.h8[5] = (dec ? H5 : hash.h8[5]);
hash.h8[6] = H6; hash.h8[6] = (dec ? H6 : hash.h8[6]);
hash.h8[7] = H7; hash.h8[7] = (dec ? H7 : hash.h8[7]);
} }
else
{ {
// bmw // bmw
sph_u64 BMW_H[16]; sph_u64 BMW_H[16];
#pragma unroll 16
for(unsigned u = 0; u < 16; u++) for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u]; BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16]; sph_u64 mv[16],q[32];
sph_u64 mv[16]; // sph_u64 tmp;
mv[ 0] = SWAP8(hash.h8[0]); mv[ 0] = SWAP8(hash.h8[0]);
mv[ 1] = SWAP8(hash.h8[1]); mv[ 1] = SWAP8(hash.h8[1]);
@ -446,39 +710,234 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
mv[12] = 0; mv[12] = 0;
mv[13] = 0; mv[13] = 0;
mv[14] = 0; mv[14] = 0;
mv[15] = 0x200; mv[15] = SPH_C64(512);
#define M(x) (mv[x])
#define H(x) (BMW_H[x]) tmp = (mv[ 5] ^ BMW_H[ 5]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
#define dH(x) (BMW_h2[x]) q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[ 6] ^ BMW_H[ 6]) - (mv[ 8] ^ BMW_H[ 8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[ 0] ^ BMW_H[ 0]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 1] ^ BMW_H[ 1]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 2] ^ BMW_H[ 2]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 2] ^ BMW_H[ 2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[ 4] ^ BMW_H[ 4]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[ 6] ^ BMW_H[ 6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) + (mv[ 6] ^ BMW_H[ 6]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 9] ^ BMW_H[ 9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 3] ^ BMW_H[ 3]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[ 2] ^ BMW_H[ 2]) + (mv[ 4] ^ BMW_H[ 4]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
FOLDb; #pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#undef M #pragma unroll 4
#undef H for(int i=2;i<6;i++) {
#undef dH q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#define M(x) (BMW_h2[x]) sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
#define H(x) (final_b[x]) sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
#define dH(x) (BMW_h1[x])
FOLDb; BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[ 0]) + ( XL64 ^ q[24] ^ q[ 0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[ 1]) + ( XL64 ^ q[25] ^ q[ 1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[ 2]) + ( XL64 ^ q[26] ^ q[ 2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[ 3]) + ( XL64 ^ q[27] ^ q[ 3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[ 4]) + ( XL64 ^ q[28] ^ q[ 4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[ 5]) + ( XL64 ^ q[29] ^ q[ 5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[ 6]) + ( XL64 ^ q[30] ^ q[ 6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[ 7]) + ( XL64 ^ q[31] ^ q[ 7]);
#undef M BMW_H[ 8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]);
#undef H BMW_H[ 9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]);
#undef dH BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = SWAP8(BMW_h1[8]); #pragma unroll 16
hash.h8[1] = SWAP8(BMW_h1[9]); for(int i=0;i<16;i++) {
hash.h8[2] = SWAP8(BMW_h1[10]); mv[i] = BMW_H[i];
hash.h8[3] = SWAP8(BMW_h1[11]); BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
hash.h8[4] = SWAP8(BMW_h1[12]);
hash.h8[5] = SWAP8(BMW_h1[13]);
hash.h8[6] = SWAP8(BMW_h1[14]);
hash.h8[7] = SWAP8(BMW_h1[15]);
} }
// keccak tmp = (mv[ 5] ^ BMW_H[ 5]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[ 6] ^ BMW_H[ 6]) - (mv[ 8] ^ BMW_H[ 8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[ 0] ^ BMW_H[ 0]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 1] ^ BMW_H[ 1]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 2] ^ BMW_H[ 2]) + (mv[ 9] ^ BMW_H[ 9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 2] ^ BMW_H[ 2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[ 4] ^ BMW_H[ 4]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) - (mv[ 6] ^ BMW_H[ 6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[ 0] ^ BMW_H[ 0]) - (mv[ 3] ^ BMW_H[ 3]) + (mv[ 6] ^ BMW_H[ 6]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 1] ^ BMW_H[ 1]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 7] ^ BMW_H[ 7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[ 8] ^ BMW_H[ 8]) - (mv[ 0] ^ BMW_H[ 0]) - (mv[ 2] ^ BMW_H[ 2]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 9] ^ BMW_H[ 9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[ 1] ^ BMW_H[ 1]) + (mv[ 3] ^ BMW_H[ 3]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[ 2] ^ BMW_H[ 2]) + (mv[ 4] ^ BMW_H[ 4]) + (mv[ 7] ^ BMW_H[ 7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[ 3] ^ BMW_H[ 3]) - (mv[ 5] ^ BMW_H[ 5]) + (mv[ 8] ^ BMW_H[ 8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[ 4] ^ BMW_H[ 4]) - (mv[ 6] ^ BMW_H[ 6]) - (mv[ 9] ^ BMW_H[ 9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{ {
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[ 0]) + ( XL64 ^ q[24] ^ q[ 0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[ 1]) + ( XL64 ^ q[25] ^ q[ 1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[ 2]) + ( XL64 ^ q[26] ^ q[ 2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[ 3]) + ( XL64 ^ q[27] ^ q[ 3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[ 4]) + ( XL64 ^ q[28] ^ q[ 4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[ 5]) + ( XL64 ^ q[29] ^ q[ 5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[ 6]) + ( XL64 ^ q[30] ^ q[ 6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[ 7]) + ( XL64 ^ q[31] ^ q[ 7]);
BMW_H[ 8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]);
BMW_H[ 9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = (!dec ? SWAP8(BMW_H[8]) : hash.h8[0]);
hash.h8[1] = (!dec ? SWAP8(BMW_H[9]) : hash.h8[1]);
hash.h8[2] = (!dec ? SWAP8(BMW_H[10]) : hash.h8[2]);
hash.h8[3] = (!dec ? SWAP8(BMW_H[11]) : hash.h8[3]);
hash.h8[4] = (!dec ? SWAP8(BMW_H[12]) : hash.h8[4]);
hash.h8[5] = (!dec ? SWAP8(BMW_H[13]) : hash.h8[5]);
hash.h8[6] = (!dec ? SWAP8(BMW_H[14]) : hash.h8[6]);
hash.h8[7] = (!dec ? SWAP8(BMW_H[15]) : hash.h8[7]);
}
// keccak
sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0;
sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0;
sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0;
@ -514,10 +973,9 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
hash.h8[5] = SWAP8(a01); hash.h8[5] = SWAP8(a01);
hash.h8[6] = SWAP8(a11); hash.h8[6] = SWAP8(a11);
hash.h8[7] = SWAP8(a21); hash.h8[7] = SWAP8(a21);
}
// skein // skein
{
sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33); sph_u64 h0 = SPH_C64(0x4903ADFF749C51CE), h1 = SPH_C64(0x0D95DE399746DF03), h2 = SPH_C64(0x8FD1934127C79BCE), h3 = SPH_C64(0x9A255629FF352CB1), h4 = SPH_C64(0x5DB62599DF6CA7B0), h5 = SPH_C64(0xEABE394CA9D5C3F4), h6 = SPH_C64(0x991112C71A75B523), h7 = SPH_C64(0xAE18A40B660FCC33);
sph_u64 m0, m1, m2, m3, m4, m5, m6, m7; sph_u64 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u64 bcount = 0; sph_u64 bcount = 0;
@ -542,11 +1000,12 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
hash.h8[5] = SWAP8(h5); hash.h8[5] = SWAP8(h5);
hash.h8[6] = SWAP8(h6); hash.h8[6] = SWAP8(h6);
hash.h8[7] = SWAP8(h7); hash.h8[7] = SWAP8(h7);
}
if ((hash.h1[7] & 0x8) != 0) dec = ((hash.h1[7] & 0x8) != 0);
{ {
// keccak // keccak
sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0; sph_u64 a00 = 0, a01 = 0, a02 = 0, a03 = 0, a04 = 0;
sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0; sph_u64 a10 = 0, a11 = 0, a12 = 0, a13 = 0, a14 = 0;
sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0; sph_u64 a20 = 0, a21 = 0, a22 = 0, a23 = 0, a24 = 0;
@ -574,18 +1033,20 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
a10 = ~a10; a10 = ~a10;
a20 = ~a20; a20 = ~a20;
hash.h8[0] = SWAP8(a00); hash.h8[0] = (dec ? SWAP8(a00) : hash.h8[0]);
hash.h8[1] = SWAP8(a10); hash.h8[1] = (dec ? SWAP8(a10) : hash.h8[1]);
hash.h8[2] = SWAP8(a20); hash.h8[2] = (dec ? SWAP8(a20) : hash.h8[2]);
hash.h8[3] = SWAP8(a30); hash.h8[3] = (dec ? SWAP8(a30) : hash.h8[3]);
hash.h8[4] = SWAP8(a40); hash.h8[4] = (dec ? SWAP8(a40) : hash.h8[4]);
hash.h8[5] = SWAP8(a01); hash.h8[5] = (dec ? SWAP8(a01) : hash.h8[5]);
hash.h8[6] = SWAP8(a11); hash.h8[6] = (dec ? SWAP8(a11) : hash.h8[6]);
hash.h8[7] = SWAP8(a21); hash.h8[7] = (dec ? SWAP8(a21) : hash.h8[7]);
} }
else
{ {
// jh // jh
sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7); sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7);
sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b); sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b);
sph_u64 tmp; sph_u64 tmp;
@ -619,14 +1080,15 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
h4h ^= 0x80; h4h ^= 0x80;
h7l ^= 0x2000000000000; h7l ^= 0x2000000000000;
hash.h8[0] = DEC64E(h4h); hash.h8[0] = (!dec ? DEC64E(h4h) : hash.h8[0]);
hash.h8[1] = DEC64E(h4l); hash.h8[1] = (!dec ? DEC64E(h4l) : hash.h8[1]);
hash.h8[2] = DEC64E(h5h); hash.h8[2] = (!dec ? DEC64E(h5h) : hash.h8[2]);
hash.h8[3] = DEC64E(h5l); hash.h8[3] = (!dec ? DEC64E(h5l) : hash.h8[3]);
hash.h8[4] = DEC64E(h6h); hash.h8[4] = (!dec ? DEC64E(h6h) : hash.h8[4]);
hash.h8[5] = DEC64E(h6l); hash.h8[5] = (!dec ? DEC64E(h6l) : hash.h8[5]);
hash.h8[6] = DEC64E(h7h); hash.h8[6] = (!dec ? DEC64E(h7h) : hash.h8[6]);
hash.h8[7] = DEC64E(h7l); hash.h8[7] = (!dec ? DEC64E(h7l) : hash.h8[7]);
} }
bool result = (SWAP8(hash.h8[3]) <= target); bool result = (SWAP8(hash.h8[3]) <= target);

16
kernel/qubitcoin.cl

@ -54,13 +54,13 @@ typedef long sph_s64;
#define SPH_64_TRUE 1 #define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U)) #define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL)) #define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) #define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1 #define SPH_ECHO_64 1
@ -88,6 +88,14 @@ typedef long sph_s64;
#define DEC32BE(x) SWAP4(*(const __global sph_u32 *) (x)); #define DEC32BE(x) SWAP4(*(const __global sph_u32 *) (x));
#endif #endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
{ {

328
kernel/sifcoin.cl

@ -54,13 +54,13 @@ typedef long sph_s64;
#define SPH_64_TRUE 1 #define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U)) #define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL)) #define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) #define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1 #define SPH_ECHO_64 1
@ -100,6 +100,14 @@ typedef long sph_s64;
#define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x));
#endif #endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target) __kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
{ {
@ -164,8 +172,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
for(unsigned u = 0; u < 16; u++) for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u]; BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16]; sph_u64 mv[16],q[32];
sph_u64 mv[16]; sph_u64 tmp;
mv[ 0] = SWAP8(hash.h8[0]); mv[ 0] = SWAP8(hash.h8[0]);
mv[ 1] = SWAP8(hash.h8[1]); mv[ 1] = SWAP8(hash.h8[1]);
@ -183,39 +191,280 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
mv[13] = 0; mv[13] = 0;
mv[14] = 0; mv[14] = 0;
mv[15] = 0x200; mv[15] = 0x200;
#define M(x) (mv[x])
#define H(x) (BMW_H[x]) tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
#define dH(x) (BMW_h2[x]) q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
FOLDb; q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
#undef M q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
#undef H tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
#undef dH q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
#define M(x) (BMW_h2[x]) q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
#define H(x) (final_b[x]) tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
#define dH(x) (BMW_h1[x]) q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
FOLDb; q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
#undef M q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
#undef H tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
#undef dH q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
hash.h8[0] = SWAP8(BMW_h1[8]); q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
hash.h8[1] = SWAP8(BMW_h1[9]); tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
hash.h8[2] = SWAP8(BMW_h1[10]); q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
hash.h8[3] = SWAP8(BMW_h1[11]); tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
hash.h8[4] = SWAP8(BMW_h1[12]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
hash.h8[5] = SWAP8(BMW_h1[13]); tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
hash.h8[6] = SWAP8(BMW_h1[14]); q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
hash.h8[7] = SWAP8(BMW_h1[15]); tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
} }
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = SWAP8(BMW_H[8]);
hash.h8[1] = SWAP8(BMW_H[9]);
hash.h8[2] = SWAP8(BMW_H[10]);
hash.h8[3] = SWAP8(BMW_H[11]);
hash.h8[4] = SWAP8(BMW_H[12]);
hash.h8[5] = SWAP8(BMW_H[13]);
hash.h8[6] = SWAP8(BMW_H[14]);
hash.h8[7] = SWAP8(BMW_H[15]);
}
// groestl // groestl
{ {
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16]; sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++) for (unsigned int u = 0; u < 15; u ++)
H[u] = 0; H[u] = 0;
#if USE_LE #if USE_LE
@ -233,6 +482,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
m[5] = DEC64E(hash.h8[5]); m[5] = DEC64E(hash.h8[5]);
m[6] = DEC64E(hash.h8[6]); m[6] = DEC64E(hash.h8[6]);
m[7] = DEC64E(hash.h8[7]); m[7] = DEC64E(hash.h8[7]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u]; g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8]; m[8] = 0x80; g[8] = m[8] ^ H[8];
@ -245,16 +496,25 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u]; H[u] ^= g[u] ^ m[u];
sph_u64 xH[16]; sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u]; xH[u] = H[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u]; H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++) for (unsigned int u = 0; u < 8; u ++)
hash.h8[u] = DEC64E(H[u + 8]); hash.h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
} }
// jh // jh

70
kernel/talkcoin-mod.cl

@ -68,10 +68,7 @@ typedef int sph_s32;
#define SPH_JH_64 1 #define SPH_JH_64 1
#define SPH_KECCAK_64 1 #define SPH_KECCAK_64 1
#define SPH_KECCAK_NOCOPY 0 #define SPH_KECCAK_NOCOPY 0
#define SPH_COMPACT_BLAKE_64 0
#ifndef SPH_COMPACT_BLAKE_64
#define SPH_COMPACT_BLAKE_64 0
#endif
#ifndef SPH_KECCAK_UNROLL #ifndef SPH_KECCAK_UNROLL
#define SPH_KECCAK_UNROLL 0 #define SPH_KECCAK_UNROLL 0
#endif #endif
@ -162,44 +159,48 @@ __kernel void search1(__global hash_t* hashes)
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; #if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_L[i] = T0[i]; T0_C[i] = T0[i];
T1_L[i] = T1[i]; T4_C[i] = T4[i];
T2_L[i] = T2[i]; #if !SPH_SMALL_FOOTPRINT_GROESTL
T3_L[i] = T3[i]; T1_C[i] = T1[i];
T4_L[i] = T4[i]; T2_C[i] = T2[i];
T5_L[i] = T5[i]; T3_C[i] = T3[i];
T6_L[i] = T6[i]; T5_C[i] = T5[i];
T7_L[i] = T7[i]; T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
sph_u64 H[16]; sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++) for (unsigned int u = 0; u < 15; u ++)
H[u] = 0; H[u] = 0;
#if USE_LE
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else #else
H[15] = (sph_u64)512; H[15] = (sph_u64)512;
#endif #endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]); m[0] = DEC64E(hash->h8[0]);
@ -211,9 +212,9 @@ __kernel void search1(__global hash_t* hashes)
m[6] = DEC64E(hash->h8[6]); m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]); m[7] = DEC64E(hash->h8[7]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u]; g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8]; m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9]; m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10]; m[10] = 0; g[10] = m[10] ^ H[10];
@ -222,27 +223,28 @@ __kernel void search1(__global hash_t* hashes)
m[13] = 0; g[13] = m[13] ^ H[13]; m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14]; m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15]; m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u]; H[u] ^= g[u] ^ m[u];
sph_u64 xH[16]; sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u]; xH[u] = H[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u]; H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++) for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]); hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
// jh // jh

16
kernel/twecoin.cl

@ -20,13 +20,13 @@ typedef long sph_s64;
#define SPH_64_TRUE 1 #define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U)) #define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) #define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL)) #define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) #define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_HAMSI_EXPAND_SMALL 1 #define SPH_HAMSI_EXPAND_SMALL 1
@ -47,6 +47,14 @@ typedef long sph_s64;
#define sph_bswap32(x) SWAP4(x) #define sph_bswap32(x) SWAP4(x)
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
static void sph_enc32be(void *dst, sph_u32 val) static void sph_enc32be(void *dst, sph_u32 val)
{ {
#if defined SPH_UPTR #if defined SPH_UPTR

1358
kernel/whirlcoin.cl

File diff suppressed because it is too large Load Diff

117
kernel/x14.cl

@ -463,69 +463,92 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256]; #if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_L[i] = T0[i]; T0_C[i] = T0[i];
T4_L[i] = T4[i]; T4_C[i] = T4[i];
T1_L[i] = T1[i]; #if !SPH_SMALL_FOOTPRINT_GROESTL
T2_L[i] = T2[i]; T1_C[i] = T1[i];
T3_L[i] = T3[i]; T2_C[i] = T2[i];
T5_L[i] = T5[i]; T3_C[i] = T3[i];
T6_L[i] = T6[i]; T5_C[i] = T5[i];
T7_L[i] = T7[i]; T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); // groestl
barrier(CLK_LOCAL_MEM_FENCE); #define T0 T0_C
#define T1 T1_C
#define T0 T0_L #define T2 T2_C
#define T1 T1_L #define T3 T3_C
#define T2 T2_L #define T4 T4_C
#define T3 T3_L #define T5 T5_C
#define T4 T4_L #define T6 T6_C
#define T5 T5_L #define T7 T7_C
#define T6 T6_L
#define T7 T7_L
sph_u64 H[16];
// groestl //#pragma unroll 15
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]); m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]); m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]); m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]); m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]); m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]); m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]); m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]); m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0; //#pragma unroll 16
g[10] = m[10] = 0; for (unsigned int u = 0; u < 16; u ++)
g[11] = m[11] = 0; g[u] = m[u] ^ H[u];
g[12] = m[12] = 0; m[8] = 0x80; g[8] = m[8] ^ H[8];
g[13] = m[13] = 0; m[9] = 0; g[9] = m[9] ^ H[9];
g[14] = m[14] = 0; m[10] = 0; g[10] = m[10] ^ H[10];
g[15] = 0x102000000000000; m[11] = 0; g[11] = m[11] ^ H[11];
m[15] = 0x100000000000000; m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
sph_u64 xH[16]; //#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u]; H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++) //#pragma unroll 16
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -846,7 +869,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0]; rk00 = hash->h4[0];
rk01 = hash->h4[1]; rk01 = hash->h4[1];

17
logging.c

@ -59,12 +59,20 @@ void applogsiz(int prio, int size, const char* fmt, ...)
/* high-level logging function, based on global opt_log_level */ /* high-level logging function, based on global opt_log_level */
void vapplogsiz(int prio, int size, const char* fmt, va_list args) void vapplogsiz(int prio, int size, const char* fmt, va_list args)
{ {
if (opt_debug || prio != LOG_DEBUG) { if ((opt_debug || prio != LOG_DEBUG)) {
char *tmp42 = (char *)calloc(size + 1, 1); char *tmp42 = (char *)calloc(size + 1, 1);
vsnprintf(tmp42, size, fmt, args); vsnprintf(tmp42, size, fmt, args);
_applog(prio, tmp42, false); _applog(prio, tmp42, false);
free(tmp42); free(tmp42);
} }
#ifdef DEV_DEBUG_MODE
else if(prio == LOG_DEBUG) {
char *tmp42 = (char *)calloc(size + 1, 1);
vsnprintf(tmp42, size, fmt, args);
__debug("", tmp42);
free(tmp42);
}
#endif
} }
/* /*
@ -80,6 +88,13 @@ void _applog(int prio, const char *str, bool force)
if (0) {} if (0) {}
#endif #endif
else { else {
#ifdef DEV_DEBUG_MODE
if(prio == LOG_DEBUG) {
__debug("", str);
}
#endif
bool write_console = opt_debug_console || (opt_verbose && prio != LOG_DEBUG) || prio <= opt_log_level; bool write_console = opt_debug_console || (opt_verbose && prio != LOG_DEBUG) || prio <= opt_log_level;
bool write_stderr = !isatty(fileno((FILE *)stderr)); bool write_stderr = !isatty(fileno((FILE *)stderr));
if (!(write_console || write_stderr)) if (!(write_console || write_stderr))

19
miner.h

@ -266,6 +266,11 @@ DRIVER_PARSE_COMMANDS(DRIVER_PROTOTYPE)
#define strtobool(str) ((str && (!strcasecmp(str, "true") || !strcasecmp(str, "yes") || !strcasecmp(str, "1")))?true:false) #define strtobool(str) ((str && (!strcasecmp(str, "true") || !strcasecmp(str, "yes") || !strcasecmp(str, "1")))?true:false)
#endif #endif
extern int opt_remoteconf_retry;
extern int opt_remoteconf_wait;
extern bool opt_remoteconf_usecache;
enum alive { enum alive {
LIFE_WELL, LIFE_WELL,
LIFE_SICK, LIFE_SICK,
@ -1025,6 +1030,7 @@ extern char *sgminer_path;
extern int opt_shares; extern int opt_shares;
extern bool opt_fail_only; extern bool opt_fail_only;
extern int opt_fail_switch_delay; extern int opt_fail_switch_delay;
extern int opt_watchpool_refresh;
extern bool opt_autofan; extern bool opt_autofan;
extern bool opt_autoengine; extern bool opt_autoengine;
extern bool use_curses; extern bool use_curses;
@ -1099,8 +1105,8 @@ extern pthread_cond_t restart_cond;
extern void clear_stratum_shares(struct pool *pool); extern void clear_stratum_shares(struct pool *pool);
extern void clear_pool_work(struct pool *pool); extern void clear_pool_work(struct pool *pool);
extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2); extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2, const int thr_id);
extern void set_target_neoscrypt(unsigned char *target, double diff); extern void set_target_neoscrypt(unsigned char *target, double diff, const int thr_id);
extern void kill_work(void); extern void kill_work(void);
@ -1274,6 +1280,7 @@ struct pool {
bool remove_at_start; bool remove_at_start;
bool removed; bool removed;
bool lp_started; bool lp_started;
bool backup;
char *hdr_path; char *hdr_path;
char *lp_url; char *lp_url;
@ -1481,7 +1488,13 @@ extern void _wlogprint(const char *str);
extern int curses_int(const char *query); extern int curses_int(const char *query);
extern char *curses_input(const char *query); extern char *curses_input(const char *query);
extern void kill_work(void); extern void kill_work(void);
extern void switch_pools(struct pool *selected);
//helper macro to preserve existing code
#ifndef switch_pools
#define switch_pools(p) __switch_pools(p, TRUE)
#endif
extern void __switch_pools(struct pool *selected, bool saveprio);
extern void discard_work(struct work *work); extern void discard_work(struct work *work);
extern void remove_pool(struct pool *pool); extern void remove_pool(struct pool *pool);
//extern void write_config(FILE *fcfg); //extern void write_config(FILE *fcfg);

99
ocl.c

@ -299,8 +299,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
char filename[255]; char filename[255];
char strbuf[32]; char strbuf[32];
sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name)); sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile)?cgpu->algorithm.kernelfile:cgpu->algorithm.name));
strcpy(filename, strbuf); strcpy(filename, strbuf);
applog(LOG_DEBUG, "Using source file %s", filename); applog(LOG_DEBUG, "Using source file %s", filename);
/* For some reason 2 vectors is still better even if the card says /* For some reason 2 vectors is still better even if the card says
@ -347,21 +348,75 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
// neoscrypt calculates TC differently // neoscrypt calculates TC differently
if (!safe_cmp(cgpu->algorithm.name, "neoscrypt")) { if (!safe_cmp(cgpu->algorithm.name, "neoscrypt")) {
int max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity); size_t glob_thread_count;
size_t glob_thread_count = 1UL << max_int; long max_int;
unsigned char type = 0;
// determine which intensity type to use
// raw intensity is the same as TC so use either or setting...
if (cgpu->rawintensity > 0 || cgpu->opt_tc) {
if (cgpu->opt_tc) {
glob_thread_count = cgpu->rawintensity = cgpu->opt_tc;
}
else {
glob_thread_count = cgpu->rawintensity;
}
max_int = glob_thread_count;
type = 2;
}
else if (cgpu->xintensity > 0) {
glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift)?(1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)):cgpu->xintensity);
max_int = cgpu->xintensity;
type = 1;
}
else {
glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
max_int = ((cgpu->dynamic)?MAX_INTENSITY:cgpu->intensity);
}
// if TC is entered by user, use that value... otherwise use default glob_thread_count = ((glob_thread_count < cgpu->work_size)?cgpu->work_size:glob_thread_count);
cgpu->thread_concurrency = ((cgpu->opt_tc) ? cgpu->opt_tc : ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count));
// if TC * scratchbuf size is too big for memory... reduce to max // if TC * scratchbuf size is too big for memory... reduce to max
if (((uint64_t)cgpu->thread_concurrency * NEOSCRYPT_SCRATCHBUF_SIZE) >(uint64_t)cgpu->max_alloc) { if ((glob_thread_count * NEOSCRYPT_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
/* Selected intensity will not run on this GPU. Not enough memory. /* Selected intensity will not run on this GPU. Not enough memory.
* Adapt the memory setting. */ * Adapt the memory setting. */
// depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
switch (type) {
//raw intensity
case 2:
while ((glob_thread_count * NEOSCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
--glob_thread_count;
}
max_int = glob_thread_count;
cgpu->rawintensity = glob_thread_count;
break;
//x intensity
case 1:
glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE; glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
max_int = glob_thread_count / clState->compute_shaders;
/* Find highest significant bit in glob_thread_count, which gives while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
* the intensity. */ --max_int;
while (max_int && ((1U << max_int) & glob_thread_count) == 0) { }
/* Check if max_intensity is >0. */
if (max_int < MIN_XINTENSITY) {
applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
max_int = MIN_XINTENSITY;
}
cgpu->xintensity = max_int;
glob_thread_count = clState->compute_shaders * (1UL << max_int);
break;
default:
glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
--max_int; --max_int;
} }
@ -372,30 +427,33 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
} }
cgpu->intensity = max_int; cgpu->intensity = max_int;
cgpu->thread_concurrency = 1U << max_int; glob_thread_count = 1UL << max_int;
break;
} }
}
// TC is glob thread count
cgpu->thread_concurrency = glob_thread_count;
applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
} } else if (!cgpu->opt_tc) {
else if (!cgpu->opt_tc) {
unsigned int sixtyfours; unsigned int sixtyfours;
sixtyfours = cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1; sixtyfours = cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
cgpu->thread_concurrency = sixtyfours * 64; cgpu->thread_concurrency = sixtyfours * 64;
if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
if (cgpu->thread_concurrency > cgpu->shaders * 5) { if (cgpu->thread_concurrency > cgpu->shaders * 5) {
cgpu->thread_concurrency = cgpu->shaders * 5; cgpu->thread_concurrency = cgpu->shaders * 5;
} }
} }
applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency)); applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
} } else {
else {
cgpu->thread_concurrency = cgpu->opt_tc; cgpu->thread_concurrency = cgpu->opt_tc;
} }
cl_uint slot, cpnd; cl_uint slot, cpnd;
slot = cpnd = 0; slot = cpnd = 0;
@ -420,7 +478,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
build_data->opencl_version = get_opencl_version(devices[gpu]); build_data->opencl_version = get_opencl_version(devices[gpu]);
build_data->patch_bfi = needs_bfi_patch(build_data); build_data->patch_bfi = needs_bfi_patch(build_data);
strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name)); strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile)?cgpu->algorithm.kernelfile:cgpu->algorithm.name));
strcat(build_data->binary_filename, name); strcat(build_data->binary_filename, name);
if (clState->goffset) if (clState->goffset)
strcat(build_data->binary_filename, "g"); strcat(build_data->binary_filename, "g");
@ -496,15 +554,13 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
// scrypt/n-scrypt // scrypt/n-scrypt
} } else {
else {
size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0)); size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0));
bufsize = 128 * ipt * cgpu->thread_concurrency; bufsize = 128 * ipt * cgpu->thread_concurrency;
applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
} }
} } else {
else { bufsize = (size_t) algorithm->rw_buffer_size;
bufsize = (size_t)algorithm->rw_buffer_size;
applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
} }
@ -539,7 +595,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
applog(LOG_DEBUG, "Using output buffer sized %lu", BUFFERSIZE); applog(LOG_DEBUG, "Using output buffer sized %lu", BUFFERSIZE);
clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
if (status != CL_SUCCESS) { if (status != CL_SUCCESS) {
applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status); applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
return NULL; return NULL;

9
pool.c

@ -33,18 +33,21 @@
#include <string.h> #include <string.h>
char* get_pool_name(struct pool *pool) { char* get_pool_name(struct pool *pool) {
if (opt_incognito) if (opt_incognito) {
return "<pool>"; return "<pool>";
}
if (strcmp(pool->name, "") == 0) if (empty_string(pool->name)) {
return pool->sockaddr_url; return pool->sockaddr_url;
}
return pool->name; return pool->name;
} }
char* get_pool_user(struct pool *pool) { char* get_pool_user(struct pool *pool) {
if (opt_incognito) if (opt_incognito) {
return "<user>"; return "<user>";
}
return pool->rpc_user; return pool->rpc_user;
} }

599
sgminer.c

File diff suppressed because it is too large Load Diff

97
util.c

@ -1444,9 +1444,9 @@ char *recv_line(struct pool *pool)
} }
buflen = strlen(pool->sockbuf); buflen = strlen(pool->sockbuf);
tok = strtok(pool->sockbuf, "\n");
if (!tok) { if ((tok = strtok(pool->sockbuf, "\n")) == NULL) {
applog(LOG_DEBUG, "Failed to parse a \\n terminated string in recv_line"); applog(LOG_DEBUG, "Failed to parse a \\n terminated string in recv_line: buffer = %s", pool->sockbuf);
goto out; goto out;
} }
sret = strdup(tok); sret = strdup(tok);
@ -1675,12 +1675,16 @@ static bool parse_diff(struct pool *pool, json_t *val)
if (old_diff != diff) { if (old_diff != diff) {
int idiff = diff; int idiff = diff;
if ((double)idiff == diff) if ((double)idiff == diff) {
applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %d", get_pool_name(pool), idiff); applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %d", get_pool_name(pool), idiff);
else }
else {
applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %.3f", get_pool_name(pool), diff); applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %.3f", get_pool_name(pool), diff);
} else }
}
else {
applog(LOG_DEBUG, "%s difficulty set to %f", get_pool_name(pool), diff); applog(LOG_DEBUG, "%s difficulty set to %f", get_pool_name(pool), diff);
}
return true; return true;
} }
@ -1806,83 +1810,80 @@ bool parse_method(struct pool *pool, char *s)
bool ret = false; bool ret = false;
char *buf; char *buf;
if (!s) if (!s) {
return ret; return ret;
}
val = JSON_LOADS(s, &err); if (!(val = JSON_LOADS(s, &err))) {
if (!val) {
applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
return ret; return ret;
} }
method = json_object_get(val, "method"); if (!(method = json_object_get(val, "method"))) {
if (!method) { goto done;
json_decref(val);
return ret;
} }
err_val = json_object_get(val, "error"); err_val = json_object_get(val, "error");
params = json_object_get(val, "params"); params = json_object_get(val, "params");
if (err_val && !json_is_null(err_val)) { if (err_val && !json_is_null(err_val)) {
char *ss; char *ss;
if (err_val) if (err_val) {
ss = json_dumps(err_val, JSON_INDENT(3)); ss = json_dumps(err_val, JSON_INDENT(3));
else }
else {
ss = strdup("(unknown reason)"); ss = strdup("(unknown reason)");
}
applog(LOG_INFO, "JSON-RPC method decode failed: %s", ss); applog(LOG_INFO, "JSON-RPC method decode failed: %s", ss);
json_decref(val);
free(ss); free(ss);
goto done;
return ret;
} }
buf = (char *)json_string_value(method); buf = (char *)json_string_value(method);
if (!buf) { if (!buf) {
json_decref(val); goto done;
return ret;
} }
if (!strncasecmp(buf, "mining.notify", 13)) { if (!strncasecmp(buf, "mining.notify", 13)) {
if (parse_notify(pool, params)) if (parse_notify(pool, params)) {
pool->stratum_notify = ret = true; pool->stratum_notify = ret = true;
else }
else {
pool->stratum_notify = ret = false; pool->stratum_notify = ret = false;
json_decref(val); }
return ret;
goto done;
} }
if (!strncasecmp(buf, "mining.set_difficulty", 21) && parse_diff(pool, params)) { if (!strncasecmp(buf, "mining.set_difficulty", 21) && parse_diff(pool, params)) {
ret = true; ret = true;
json_decref(val); goto done;
return ret;
} }
if (!strncasecmp(buf, "mining.set_extranonce", 21) && parse_extranonce(pool, params)) { if (!strncasecmp(buf, "mining.set_extranonce", 21) && parse_extranonce(pool, params)) {
ret = true; ret = true;
json_decref(val); goto done;
return ret;
} }
if (!strncasecmp(buf, "client.reconnect", 16) && parse_reconnect(pool, params)) { if (!strncasecmp(buf, "client.reconnect", 16) && parse_reconnect(pool, params)) {
ret = true; ret = true;
json_decref(val); goto done;
return ret;
} }
if (!strncasecmp(buf, "client.get_version", 18) && send_version(pool, val)) { if (!strncasecmp(buf, "client.get_version", 18) && send_version(pool, val)) {
ret = true; ret = true;
json_decref(val); goto done;
return ret;
} }
if (!strncasecmp(buf, "client.show_message", 19) && show_message(pool, params)) { if (!strncasecmp(buf, "client.show_message", 19) && show_message(pool, params)) {
ret = true; ret = true;
json_decref(val); goto done;
return ret;
} }
done:
json_decref(val); json_decref(val);
return ret; return ret;
} }
@ -1894,11 +1895,11 @@ bool subscribe_extranonce(struct pool *pool)
json_error_t err; json_error_t err;
bool ret = false; bool ret = false;
sprintf(s, "{\"id\": %d, \"method\": \"mining.extranonce.subscribe\", \"params\": []}", sprintf(s, "{\"id\": %d, \"method\": \"mining.extranonce.subscribe\", \"params\": []}", swork_id++);
swork_id++);
if (!stratum_send(pool, s, strlen(s))) if (!stratum_send(pool, s, strlen(s))) {
return ret; return ret;
}
/* Parse all data in the queue and anything left should be the response */ /* Parse all data in the queue and anything left should be the response */
while (42) { while (42) {
@ -1910,13 +1911,16 @@ bool subscribe_extranonce(struct pool *pool)
} }
sret = recv_line(pool); sret = recv_line(pool);
if (!sret) if (!sret) {
return ret; return ret;
if (parse_method(pool, sret)) }
else if (parse_method(pool, sret)) {
free(sret); free(sret);
else }
else {
break; break;
} }
}
val = JSON_LOADS(sret, &err); val = JSON_LOADS(sret, &err);
free(sret); free(sret);
@ -1968,19 +1972,24 @@ bool auth_stratum(struct pool *pool)
sprintf(s, "{\"id\": %d, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", sprintf(s, "{\"id\": %d, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}",
swork_id++, pool->rpc_user, pool->rpc_pass); swork_id++, pool->rpc_user, pool->rpc_pass);
if (!stratum_send(pool, s, strlen(s))) if (!stratum_send(pool, s, strlen(s))) {
return ret; return ret;
}
/* Parse all data in the queue and anything left should be auth */ /* Parse all data in the queue and anything left should be auth */
while (42) { while (42) {
sret = recv_line(pool); sret = recv_line(pool);
if (!sret)
if (!sret) {
return ret; return ret;
if (parse_method(pool, sret)) }
else if (parse_method(pool, sret)) {
free(sret); free(sret);
else }
else {
break; break;
} }
}
val = JSON_LOADS(sret, &err); val = JSON_LOADS(sret, &err);
free(sret); free(sret);

4
winbuild/sgminer.vcxproj

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations"> <ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32"> <ProjectConfiguration Include="Debug|Win32">
@ -267,6 +267,7 @@
<ClCompile Include="..\algorithm\talkcoin.c" /> <ClCompile Include="..\algorithm\talkcoin.c" />
<ClCompile Include="..\algorithm\x14.c" /> <ClCompile Include="..\algorithm\x14.c" />
<ClCompile Include="..\algorithm\fresh.c" /> <ClCompile Include="..\algorithm\fresh.c" />
<ClCompile Include="..\algorithm\whirlcoin.c" />
<ClCompile Include="..\api.c" /> <ClCompile Include="..\api.c" />
<ClCompile Include="..\ccan\opt\helpers.c" /> <ClCompile Include="..\ccan\opt\helpers.c" />
<ClCompile Include="..\ccan\opt\opt.c" /> <ClCompile Include="..\ccan\opt\opt.c" />
@ -327,6 +328,7 @@
<ClInclude Include="..\algorithm\talkcoin.h" /> <ClInclude Include="..\algorithm\talkcoin.h" />
<ClInclude Include="..\algorithm\x14.h" /> <ClInclude Include="..\algorithm\x14.h" />
<ClInclude Include="..\algorithm\fresh.h" /> <ClInclude Include="..\algorithm\fresh.h" />
<ClInclude Include="..\algorithm\whirlcoin.h" />
<ClInclude Include="..\api.h" /> <ClInclude Include="..\api.h" />
<ClInclude Include="..\arg-nonnull.h" /> <ClInclude Include="..\arg-nonnull.h" />
<ClInclude Include="..\bench_block.h" /> <ClInclude Include="..\bench_block.h" />

8
winbuild/sgminer.vcxproj.filters

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?> <?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup> <ItemGroup>
<Filter Include="Source Files"> <Filter Include="Source Files">
@ -185,6 +185,9 @@
<ClCompile Include="..\algorithm\fresh.c"> <ClCompile Include="..\algorithm\fresh.c">
<Filter>Source Files\algorithm</Filter> <Filter>Source Files\algorithm</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\algorithm\whirlcoin.c">
<Filter>Source Files\algorithm</Filter>
</ClCompile>
<ClCompile Include="..\sph\shabal.c"> <ClCompile Include="..\sph\shabal.c">
<Filter>Source Files\sph</Filter> <Filter>Source Files\sph</Filter>
</ClCompile> </ClCompile>
@ -373,6 +376,9 @@
<ClInclude Include="..\algorithm\fresh.h"> <ClInclude Include="..\algorithm\fresh.h">
<Filter>Header Files\algorithm</Filter> <Filter>Header Files\algorithm</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\algorithm\whirlcoin.h">
<Filter>Header Files\algorithm</Filter>
</ClInclude>
<ClInclude Include="..\sph\sph_whirlpool.h"> <ClInclude Include="..\sph\sph_whirlpool.h">
<Filter>Header Files\sph</Filter> <Filter>Header Files\sph</Filter>
</ClInclude> </ClInclude>

Loading…
Cancel
Save