Browse Source

Merged develop branch with master +updates

Moved the develop code to master. Moving forward all updates will be
done on master unless it's work on a major feature.

This update contains all previous develop code as well as a few new ones
that weren't pushed yet:

* Added neoscrypt compatibility for xintensity/rawintensity
* Neoscrypt now uses correct TC if not specified or set to 0
* Reworked the application of pool settings on algorithm switch which
should resolve TC/Intensity changes between algos such as X11 and
neoscrypt
djm34
ystarnaud 10 years ago
parent
commit
4ba8a68043
  1. 1
      Makefile.am
  2. 97
      algorithm.c
  3. 1
      algorithm.h
  4. 5
      algorithm/neoscrypt.c
  5. 1
      algorithm/neoscrypt.h
  6. 171
      algorithm/whirlcoin.c
  7. 9
      algorithm/whirlcoin.h
  8. 189
      config_parser.c
  9. 4
      configure.ac
  10. 59
      driver-opencl.c
  11. 2
      findnonce.c
  12. 534
      kernel/animecoin.cl
  13. 993
      kernel/arebyp.cl
  14. 117
      kernel/bitblock.cl
  15. 123
      kernel/darkcoin-mod.cl
  16. 13
      kernel/darkcoin.cl
  17. 1853
      kernel/diamond.cl
  18. 2030
      kernel/groestlcoin.cl
  19. 41
      kernel/inkcoin.cl
  20. 119
      kernel/marucoin-mod.cl
  21. 80
      kernel/myriadcoin-groestl.cl
  22. 986
      kernel/neoscrypt.cl
  23. 1120
      kernel/quarkcoin.cl
  24. 16
      kernel/qubitcoin.cl
  25. 400
      kernel/sifcoin.cl
  26. 82
      kernel/talkcoin-mod.cl
  27. 16
      kernel/twecoin.cl
  28. 1358
      kernel/whirlcoin.cl
  29. 119
      kernel/x14.cl
  30. 17
      logging.c
  31. 19
      miner.h
  32. 129
      ocl.c
  33. 19
      pool.c
  34. 649
      sgminer.c
  35. 97
      util.c
  36. 4
      winbuild/sgminer.vcxproj
  37. 8
      winbuild/sgminer.vcxproj.filters

1
Makefile.am

@ -66,6 +66,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h @@ -66,6 +66,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
sgminer_SOURCES += algorithm/x14.c algorithm/x14.h
sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h
sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h
sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h
bin_SCRIPTS = $(top_srcdir)/kernel/*.cl

97
algorithm.c

@ -29,6 +29,7 @@ @@ -29,6 +29,7 @@
#include "algorithm/bitblock.h"
#include "algorithm/x14.h"
#include "algorithm/fresh.h"
#include "algorithm/whirlcoin.h"
#include "algorithm/neoscrypt.h"
#include "compat.h"
@ -49,7 +50,9 @@ const char *algorithm_type_str[] = { @@ -49,7 +50,9 @@ const char *algorithm_type_str[] = {
"Twecoin",
"Fugue256",
"NIST",
"Fresh"
"Fresh",
"Whirlcoin",
"Neoscrypt"
};
void sha256(const unsigned char *message, unsigned int len, unsigned char *digest)
@ -96,11 +99,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru @@ -96,11 +99,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
{
char buf[255];
sprintf(buf, " -D MAX_GLOBAL_THREADS=%u",
(unsigned int)cgpu->thread_concurrency);
sprintf(buf, " %s-D MAX_GLOBAL_THREADS=%lu ",
((cgpu->lookup_gap > 0)?" -D LOOKUP_GAP=2 ":""), (unsigned long)cgpu->thread_concurrency);
strcat(data->compiler_options, buf);
sprintf(buf, "tc%u", (unsigned int)cgpu->thread_concurrency);
sprintf(buf, "%stc%lu", ((cgpu->lookup_gap > 0)?"lg":""), (unsigned long)cgpu->thread_concurrency);
strcat(data->binary_filename, buf);
}
@ -162,11 +165,10 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb @@ -162,11 +165,10 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb
/* This looks like a unnecessary double cast, but to make sure, that
* the target's most significant entry is adressed as a 32-bit value
* and not accidently by something else the double cast seems wise.
* The compiler will get rid of it anyway.
*/
* The compiler will get rid of it anyway. */
le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]);
memcpy(clState->cldata, blk->work->data, 80);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->outputBuffer);
@ -600,6 +602,34 @@ static cl_int queue_fresh_kernel(struct __clState *clState, struct _dev_blk_ctx @@ -600,6 +602,34 @@ static cl_int queue_fresh_kernel(struct __clState *clState, struct _dev_blk_ctx
return status;
}
static cl_int queue_whirlcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel;
cl_ulong le_target;
cl_int status = 0;
le_target = *(cl_ulong *)(blk->work->device_target + 24);
flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
//clbuffer, hashes
kernel = &clState->kernel;
CL_SET_ARG_N(0,clState->CLbuffer0);
CL_SET_ARG_N(1,clState->padbuffer8);
kernel = clState->extra_kernels;
CL_SET_ARG_N(0,clState->padbuffer8);
CL_NEXTKERNEL_SET_ARG_N(0,clState->padbuffer8);
//hashes, output, target
CL_NEXTKERNEL_SET_ARG_N(0,clState->padbuffer8);
CL_SET_ARG_N(1,clState->outputBuffer);
CL_SET_ARG_N(2,le_target);
return status;
}
typedef struct _algorithm_settings_t {
const char *name; /* Human-readable identifier */
algorithm_type_t type; //common algorithm type
@ -624,23 +654,24 @@ typedef struct _algorithm_settings_t { @@ -624,23 +654,24 @@ typedef struct _algorithm_settings_t {
static algorithm_settings_t algos[] = {
// kernels starting from this will have difficulty calculated by using litecoin algorithm
#define A_SCRYPT(a) \
{ a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options}
{ a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options}
A_SCRYPT( "ckolivas" ),
A_SCRYPT( "alexkarnew" ),
A_SCRYPT( "alexkarnold" ),
A_SCRYPT( "bufius" ),
A_SCRYPT( "psw" ),
A_SCRYPT( "zuikkis" ),
A_SCRYPT( "arebyp" ),
#undef A_SCRYPT
#define A_NEOSCRYPT(a) \
{ a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options}
{ a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options}
A_NEOSCRYPT("neoscrypt"),
#undef A_NEOSCRYPT
// kernels starting from this will have difficulty calculated by using quarkcoin algorithm
#define A_QUARK(a, b) \
{ a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options }
{ a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options}
A_QUARK( "quarkcoin", quarkcoin_regenhash),
A_QUARK( "qubitcoin", qubitcoin_regenhash),
A_QUARK( "animecoin", animecoin_regenhash),
@ -649,40 +680,43 @@ static algorithm_settings_t algos[] = { @@ -649,40 +680,43 @@ static algorithm_settings_t algos[] = {
// kernels starting from this will have difficulty calculated by using bitcoin algorithm
#define A_DARK(a, b) \
{ a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options }
{ a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options}
A_DARK( "darkcoin", darkcoin_regenhash),
A_DARK( "inkcoin", inkcoin_regenhash),
A_DARK( "myriadcoin-groestl", myriadcoin_groestl_regenhash),
#undef A_DARK
{ "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, queue_sph_kernel, sha256, NULL},
{ "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL },
{ "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL},
{ "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options },
{ "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options},
{ "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options },
{ "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options },
{ "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options },
{ "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options},
{ "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options},
{ "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options},
{ "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options },
{ "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options },
{ "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options},
{ "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options},
{ "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options },
{ "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options },
{ "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options},
{ "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options},
{ "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options },
{ "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options},
{ "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL },
{ "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL},
// kernels starting from this will have difficulty calculated by using fuguecoin algorithm
#define A_FUGUE(a, b) \
{ a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL }
A_FUGUE( "fuguecoin", fuguecoin_regenhash),
A_FUGUE( "groestlcoin", groestlcoin_regenhash),
#undef A_FUGUE
#define A_FUGUE(a, b, c) \
{ a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, c, NULL}
A_FUGUE("fuguecoin", fuguecoin_regenhash, sha256),
A_FUGUE("groestlcoin", groestlcoin_regenhash, sha256),
A_FUGUE("diamond", groestlcoin_regenhash, gen_hash),
#undef A_FUGUE
{ "whirlcoin", ALGO_WHIRL, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 3, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, whirlcoin_regenhash, queue_whirlcoin_kernel, sha256, NULL},
// Terminator (do not remove)
{ NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL }
{ NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL}
};
void copy_algorithm_settings(algorithm_t* dest, const char* algo)
@ -695,7 +729,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo) @@ -695,7 +729,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo)
if (strcmp(src->name, algo) == 0)
{
strcpy(dest->name, src->name);
dest->kernelfile = src->kernelfile;
dest->kernelfile = src->kernelfile;
dest->type = src->type;
dest->diff_multiplier1 = src->diff_multiplier1;
@ -751,6 +785,7 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa @@ -751,6 +785,7 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
ALGO_ALIAS("x15old", "bitblockold");
ALGO_ALIAS("nist5", "talkcoin-mod");
ALGO_ALIAS("keccak", "maxcoin");
ALGO_ALIAS("whirlpool", "whirlcoin");
#undef ALGO_ALIAS
#undef ALGO_ALIAS_NF
@ -760,7 +795,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa @@ -760,7 +795,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
void set_algorithm(algorithm_t* algo, const char* newname_alias)
{
const char* newname;
const char *newname;
//load previous algorithm nfactor in case nfactor was applied before algorithm... or default to 10
uint8_t old_nfactor = ((algo->nfactor)?algo->nfactor:0);
//load previous kernel file name if was applied before algorithm...
@ -811,5 +847,6 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor) @@ -811,5 +847,6 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor)
bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2)
{
// return (strcmp(algo1->name, algo2->name) == 0) && (algo1->nfactor == algo2->nfactor);
return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor));
}

1
algorithm.h

@ -24,6 +24,7 @@ typedef enum { @@ -24,6 +24,7 @@ typedef enum {
ALGO_FUGUE,
ALGO_NIST,
ALGO_FRESH,
ALGO_WHIRL,
ALGO_NEOSCRYPT
} algorithm_type_t;

5
algorithm/neoscrypt.c

@ -1177,9 +1177,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { @@ -1177,9 +1177,7 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) {
r = (1 << ((profile >> 5) & 0x7));
}
uchar *stack;
stack =(uchar*)malloc((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align);
uchar stack[(N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align];
/* X = r * 2 * SCRYPT_BLOCK_SIZE */
X = (uint *) &stack[stack_align & ~(stack_align - 1)];
/* Z is a copy of X for ChaCha */
@ -1287,7 +1285,6 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) { @@ -1287,7 +1285,6 @@ void neoscrypt(const uchar *password, uchar *output, uint profile) {
}
free(stack);
}
void neoscrypt_regenhash(struct work *work)

1
algorithm/neoscrypt.h

@ -5,7 +5,6 @@ @@ -5,7 +5,6 @@
/* The neoscrypt scratch buffer needs 32kBytes memory. */
#define NEOSCRYPT_SCRATCHBUF_SIZE (32 * 1024)
/* These routines are always available. */
extern void neoscrypt_regenhash(struct work *work);
extern void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile);

171
algorithm/whirlcoin.c

@ -0,0 +1,171 @@ @@ -0,0 +1,171 @@
/*-
* Copyright 2009 Colin Percival, 2011 ArtForz
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include "config.h"
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "sph/sph_whirlpool.h"
/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
typedef struct {
sph_whirlpool1_context whirlpool1;
sph_whirlpool1_context whirlpool2;
sph_whirlpool1_context whirlpool3;
sph_whirlpool1_context whirlpool4;
} Whash_context_holder;
Whash_context_holder base_contexts;
void init_whirlcoin_hash_contexts()
{
sph_whirlpool1_init(&base_contexts.whirlpool1);
sph_whirlpool1_init(&base_contexts.whirlpool2);
sph_whirlpool1_init(&base_contexts.whirlpool3);
sph_whirlpool1_init(&base_contexts.whirlpool4);
}
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static inline void
be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
{
uint32_t i;
for (i = 0; i < len; i++)
dst[i] = htobe32(src[i]);
}
inline void whirlcoin_hash(void *state, const void *input)
{
init_whirlcoin_hash_contexts();
Whash_context_holder ctx;
uint32_t hashA[16], hashB[16];
memcpy(&ctx, &base_contexts, sizeof(base_contexts));
sph_whirlpool1 (&ctx.whirlpool1, input, 80);
sph_whirlpool1_close (&ctx.whirlpool1, hashA);
sph_whirlpool1(&ctx.whirlpool2, hashA, 64);
sph_whirlpool1_close(&ctx.whirlpool2, hashB);
sph_whirlpool1(&ctx.whirlpool3, hashB, 64);
sph_whirlpool1_close(&ctx.whirlpool3, hashA);
sph_whirlpool1(&ctx.whirlpool4, hashA, 64);
sph_whirlpool1_close(&ctx.whirlpool4, hashB);
memcpy(state, hashB, 32);
}
static const uint32_t diff1targ = 0x0000ffff;
/* Used externally as confirmation of correct OCL code */
int whirlcoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
{
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
uint32_t data[20], ohash[8];
be32enc_vect(data, (const uint32_t *)pdata, 19);
data[19] = htobe32(nonce);
whirlcoin_hash(ohash, data);
tmp_hash7 = be32toh(ohash[7]);
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
(long unsigned int)Htarg,
(long unsigned int)diff1targ,
(long unsigned int)tmp_hash7);
if (tmp_hash7 > diff1targ)
return -1;
if (tmp_hash7 > Htarg)
return 0;
return 1;
}
void whirlcoin_regenhash(struct work *work)
{
uint32_t data[20];
uint32_t *nonce = (uint32_t *)(work->data + 76);
uint32_t *ohash = (uint32_t *)(work->hash);
be32enc_vect(data, (const uint32_t *)work->data, 19);
data[19] = htobe32(*nonce);
whirlcoin_hash(ohash, data);
}
bool scanhash_whirlcoin(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
unsigned char *pdata, unsigned char __maybe_unused *phash1,
unsigned char __maybe_unused *phash, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
{
uint32_t *nonce = (uint32_t *)(pdata + 76);
uint32_t data[20];
uint32_t tmp_hash7;
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
bool ret = false;
be32enc_vect(data, (const uint32_t *)pdata, 19);
while(1) {
uint32_t ostate[8];
*nonce = ++n;
data[19] = (n);
whirlcoin_hash(ostate, data);
tmp_hash7 = (ostate[7]);
applog(LOG_INFO, "data7 %08lx",
(long unsigned int)data[7]);
if (unlikely(tmp_hash7 <= Htarg)) {
((uint32_t *)pdata)[19] = htobe32(n);
*last_nonce = n;
ret = true;
break;
}
if (unlikely((n >= max_nonce) || thr->work_restart)) {
*last_nonce = n;
break;
}
}
return ret;
}

9
algorithm/whirlcoin.h

@ -0,0 +1,9 @@ @@ -0,0 +1,9 @@
#ifndef W_H
#define W_H
#include "miner.h"
extern int whirlcoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
extern void whirlcoin_regenhash(struct work *work);
#endif /* W_H */

189
config_parser.c

@ -598,8 +598,10 @@ static struct opt_table *opt_find(struct opt_table *tbl, char *optname) @@ -598,8 +598,10 @@ static struct opt_table *opt_find(struct opt_table *tbl, char *optname)
//set url
curl_easy_setopt(curl, CURLOPT_URL, url);
//set write callback and fileinfo
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fetch_remote_config_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file);
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); // fail on 404 or other 4xx http codes
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30); // timeout after 30 secs to prevent being stuck
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &file); // stream to write data to
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, fetch_remote_config_cb); // callback function to write to config file
if((res = curl_easy_perform(curl)) != CURLE_OK)
applog(LOG_ERR, "Fetch remote file failed: %s", curl_easy_strerror(res));
@ -715,17 +717,20 @@ char *parse_config(json_t *val, const char *key, const char *parentkey, bool fil @@ -715,17 +717,20 @@ char *parse_config(json_t *val, const char *key, const char *parentkey, bool fil
if((opt = opt_find(opt_config_table, optname)) != NULL)
{
//strings
if ((opt->type & OPT_HASARG) && json_is_string(val))
if ((opt->type & OPT_HASARG) && json_is_string(val)) {
err = opt->cb_arg(json_string_value(val), opt->u.arg);
}
//boolean values
else if ((opt->type & OPT_NOARG) && json_is_true(val))
else if ((opt->type & OPT_NOARG) && json_is_true(val)) {
err = opt->cb(opt->u.arg);
else
}
else {
err = "Invalid value";
}
}
else
else {
err = "Invalid option";
}
break;
}
@ -756,37 +761,71 @@ char *load_config(const char *arg, const char *parentkey, void __maybe_unused *u @@ -756,37 +761,71 @@ char *load_config(const char *arg, const char *parentkey, void __maybe_unused *u
json_t *config;
#ifdef HAVE_LIBCURL
//if detected as url
if((strstr(arg, "http://") != NULL) || (strstr(arg, "https://") != NULL) || (strstr(arg, "ftp://") != NULL))
{
//download config file locally and reset arg to it so we can parse it
if((arg = fetch_remote_config(arg)) == NULL)
return NULL;
int retry = opt_remoteconf_retry;
const char *url;
// if detected as url
if ((strstr(arg, "http://") != NULL) || (strstr(arg, "https://") != NULL) || (strstr(arg, "ftp://") != NULL)) {
url = strdup(arg);
do {
// wait for next retry
if (retry < opt_remoteconf_retry) {
sleep(opt_remoteconf_wait);
}
// download config file locally and reset arg to it so we can parse it
if ((arg = fetch_remote_config(url)) != NULL) {
break;
}
--retry;
} while (retry);
// file not downloaded... abort
if (arg == NULL) {
// if we should use last downloaded copy...
if (opt_remoteconf_usecache) {
char *p;
// extract filename out of url
if ((p = (char *)strrchr(url, '/')) == NULL) {
quit(1, "%s: invalid URL.", url);
}
arg = p+1;
} else {
quit(1, "%s: unable to download config file.", url);
}
}
}
#endif
//most likely useless but leaving it here for now...
if(!cnfbuf)
// most likely useless but leaving it here for now...
if (!cnfbuf) {
cnfbuf = strdup(arg);
}
//no need to restrict the number of includes... if it causes problems, restore it later
// no need to restrict the number of includes... if it causes problems, restore it later
/*if(++include_count > JSON_MAX_DEPTH)
return JSON_MAX_DEPTH_ERR;
*/
//check if the file exists
if(access(arg, F_OK) == -1)
// check if the file exists
if (access(arg, F_OK) == -1) {
quit(1, "%s: file not found.", arg);
}
#if JANSSON_MAJOR_VERSION > 1
config = json_load_file(arg, 0, &err);
#else
config = json_load_file(arg, &err);
#endif
#if JANSSON_MAJOR_VERSION > 1
config = json_load_file(arg, 0, &err);
#else
config = json_load_file(arg, &err);
#endif
//if json root is not an object, error out
if(!json_is_object(config))
// if json root is not an object, error out
if (!json_is_object(config)) {
return set_last_json_error("Error: JSON decode of file \"%s\" failed:\n %s", arg, err.text);
}
config_loaded = true;
@ -1006,13 +1045,12 @@ void apply_pool_profile(struct pool *pool) @@ -1006,13 +1045,12 @@ void apply_pool_profile(struct pool *pool)
if (empty_string(pool->algorithm.kernelfile)) {
// ...but profile does, apply it to the pool
if (!empty_string(profile->algorithm.kernelfile)) {
pool->algorithm.kernelfile = profile->algorithm.kernelfile;
applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
// ...or default profile does, apply it to the pool
}
else if (!empty_string(default_profile.algorithm.kernelfile)) {
pool->algorithm.kernelfile = default_profile.algorithm.kernelfile;
applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
pool->algorithm.kernelfile = profile->algorithm.kernelfile;
applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
// ...or default profile does, apply it to the pool
} else if (!empty_string(default_profile.algorithm.kernelfile)) {
pool->algorithm.kernelfile = default_profile.algorithm.kernelfile;
applog(LOG_DEBUG, "Pool %i Kernel File set to \"%s\"", pool->pool_no, pool->algorithm.kernelfile);
}
}
@ -1034,39 +1072,76 @@ void apply_pool_profile(struct pool *pool) @@ -1034,39 +1072,76 @@ void apply_pool_profile(struct pool *pool)
}
applog(LOG_DEBUG, "Pool %i lookup gap set to \"%s\"", pool->pool_no, pool->lookup_gap);
if(pool_cmp(pool->intensity, default_profile.intensity))
{
if(!empty_string(profile->intensity))
pool->intensity = profile->intensity;
else
pool->intensity = default_profile.intensity;
}
applog(LOG_DEBUG, "Pool %i Intensity set to \"%s\"", pool->pool_no, pool->intensity);
int int_type = 0;
if(pool_cmp(pool->xintensity, default_profile.xintensity))
{
if(!empty_string(profile->xintensity))
pool->xintensity = profile->xintensity;
else
// FIXME: ifs from hell...
// First look for an existing intensity on pool
if (!empty_string(pool->rawintensity)) {
int_type = 2;
}
else if (!empty_string(pool->xintensity)) {
int_type = 1;
}
else if (!empty_string(pool->intensity)) {
int_type = 0;
}
else {
//no intensity found on pool... check if the profile has one and use it...
if (!empty_string(profile->rawintensity)) {
int_type = 2;
pool->rawintensity = profile->rawintensity;
}
else if (!empty_string(profile->xintensity)) {
int_type = 1;
pool->xintensity = profile->xintensity;
}
else if (!empty_string(profile->intensity)) {
int_type = 0;
pool->intensity = profile->intensity;
}
else {
//nothing in profile... check default profile/globals
if (!empty_string(default_profile.rawintensity)) {
int_type = 2;
pool->rawintensity = default_profile.rawintensity;
}
else if (!empty_string(default_profile.xintensity)) {
int_type = 1;
pool->xintensity = default_profile.xintensity;
}
else if (!empty_string(default_profile.intensity)) {
int_type = 0;
pool->intensity = default_profile.intensity;
}
else {
//nothing anywhere? default to sgminer default of 8
int_type = 0;
pool->intensity = strdup("8");
}
}
}
applog(LOG_DEBUG, "Pool %i XIntensity set to \"%s\"", pool->pool_no, pool->xintensity);
if(pool_cmp(pool->rawintensity, default_profile.rawintensity))
{
if(!empty_string(profile->rawintensity))
pool->rawintensity = profile->rawintensity;
else
pool->rawintensity = default_profile.rawintensity;
switch(int_type) {
case 2:
applog(LOG_DEBUG, "Pool %d Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity);
break;
case 1:
applog(LOG_DEBUG, "Pool %d XIntensity set to \"%s\"", pool->pool_no, pool->xintensity);
break;
default:
applog(LOG_DEBUG, "Pool %d Intensity set to \"%s\"", pool->pool_no, pool->intensity);
break;
}
applog(LOG_DEBUG, "Pool %i Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity);
if(pool_cmp(pool->thread_concurrency, default_profile.thread_concurrency))
{
if(!empty_string(profile->thread_concurrency))
pool->thread_concurrency = profile->thread_concurrency;
else
pool->thread_concurrency = default_profile.thread_concurrency;
/* allow empty string TC
if(!empty_string(profile->thread_concurrency))*/
pool->thread_concurrency = profile->thread_concurrency;
/* else
pool->thread_concurrency = default_profile.thread_concurrency;*/
}
applog(LOG_DEBUG, "Pool %i Thread Concurrency set to \"%s\"", pool->pool_no, pool->thread_concurrency);

4
configure.ac

@ -1,8 +1,8 @@ @@ -1,8 +1,8 @@
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
m4_define([v_maj], [5])
m4_define([v_min], [0])
m4_define([v_mic], [1])
m4_define([v_min], [1])
m4_define([v_mic], [0])
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
m4_define([v_ver], [v_maj.v_min.v_mic])
m4_define([lt_rev], m4_eval(v_maj + v_min))

59
driver-opencl.c

@ -176,30 +176,42 @@ char *set_lookup_gap(char *arg) @@ -176,30 +176,42 @@ char *set_lookup_gap(char *arg)
return NULL;
}
char *set_thread_concurrency(const char *_arg)
char *set_thread_concurrency(const char *arg)
{
int i, val = 0, device = 0;
int i, device = 0;
size_t val = 0;
char *tmpstr = strdup(arg);
char *nextptr;
char *arg = (char *)alloca(strlen(_arg) + 1);
strcpy(arg, _arg);
nextptr = strtok(arg, ",");
if (nextptr == NULL)
return "Invalid parameters for set thread concurrency";
val = atoi(nextptr);
// empty string - use 0 and let algo autodetect the TC
if (empty_string(tmpstr)) {
applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", device, val);
gpus[device++].opt_tc = val;
}
// not empty string
else {
if ((nextptr = strtok(tmpstr, ",")) == NULL) {
free(tmpstr);
return "Invalid parameters for set_thread_concurrency";
}
gpus[device++].opt_tc = val;
do {
val = (unsigned long)atol(nextptr);
while ((nextptr = strtok(NULL, ",")) != NULL) {
val = atoi(nextptr);
gpus[device++].opt_tc = val;
applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", device, val);
gpus[device++].opt_tc = val;
} while ((nextptr = strtok(NULL, ",")) != NULL);
}
// if only 1 TC was passed, assign the same worksize for all remaining GPUs
if (device == 1) {
for (i = device; i < MAX_GPUDEVICES; i++)
for (i = device; i < total_devices; ++i) {
gpus[i].opt_tc = gpus[0].opt_tc;
applog(LOG_DEBUG, "GPU %d Thread Concurrency set to %lu.", i, gpus[i].opt_tc);
}
}
free(tmpstr);
return NULL;
}
@ -1020,21 +1032,24 @@ static void set_threads_hashes(unsigned int vectors, unsigned int compute_shader @@ -1020,21 +1032,24 @@ static void set_threads_hashes(unsigned int vectors, unsigned int compute_shader
{
unsigned int threads = 0;
while (threads < minthreads) {
if (*rawintensity > 0) {
threads = *rawintensity;
} else if (*xintensity > 0) {
if (algorithm->xintensity_shift)
threads = compute_shaders * (1 << (algorithm->xintensity_shift + *xintensity));
else
threads = compute_shaders * *xintensity;
} else {
}
else if (*xintensity > 0) {
threads = compute_shaders * ((algorithm->xintensity_shift)?(1 << (algorithm->xintensity_shift + *xintensity)):*xintensity);
}
else {
threads = 1 << (algorithm->intensity_shift + *intensity);
}
if (threads < minthreads) {
if (likely(*intensity < MAX_INTENSITY))
if (likely(*intensity < MAX_INTENSITY)) {
(*intensity)++;
else
}
else {
threads = minthreads;
}
}
}

2
findnonce.c

@ -202,7 +202,7 @@ static void *postcalc_hash(void *userdata) @@ -202,7 +202,7 @@ static void *postcalc_hash(void *userdata)
if (found == 0x0F)
nonce = swab32(nonce);
applog(LOG_DEBUG, "OCL NONCE %u found in slot %d", nonce, entry);
applog(LOG_DEBUG, "[THR%d] OCL NONCE %08x (%lu) found in slot %d (found = %d)", thr->id, nonce, nonce, entry, found);
submit_nonce(thr, pcd->work, nonce);
}

534
kernel/animecoin.cl

@ -92,6 +92,14 @@ typedef long sph_s64; @@ -92,6 +92,14 @@ typedef long sph_s64;
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
#if SPH_BIG_ENDIAN
#define DEC64E(x) (x)
#define DEC64BE(x) (*(const __global sph_u64 *) (x));
@ -118,8 +126,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -118,8 +126,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16];
sph_u64 mv[16];
sph_u64 mv[16],q[32];
sph_u64 tmp;
mv[0] = DEC64LE(block + 0);
mv[1] = DEC64LE(block + 8);
@ -139,34 +147,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -139,34 +147,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
mv[13] = 0;
mv[14] = 0;
mv[15] = 0x280;
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
FOLDb;
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#undef M
#undef H
#undef dH
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x])
#define H(x) (final_b[x])
#define dH(x) (BMW_h1[x])
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
FOLDb;
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#undef M
#undef H
#undef dH
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
hash.h8[0] = SWAP8(BMW_h1[8]);
hash.h8[1] = SWAP8(BMW_h1[9]);
hash.h8[2] = SWAP8(BMW_h1[10]);
hash.h8[3] = SWAP8(BMW_h1[11]);
hash.h8[4] = SWAP8(BMW_h1[12]);
hash.h8[5] = SWAP8(BMW_h1[13]);
hash.h8[6] = SWAP8(BMW_h1[14]);
hash.h8[7] = SWAP8(BMW_h1[15]);
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = SWAP8(BMW_H[8]);
hash.h8[1] = SWAP8(BMW_H[9]);
hash.h8[2] = SWAP8(BMW_H[10]);
hash.h8[3] = SWAP8(BMW_H[11]);
hash.h8[4] = SWAP8(BMW_H[12]);
hash.h8[5] = SWAP8(BMW_H[13]);
hash.h8[6] = SWAP8(BMW_H[14]);
hash.h8[7] = SWAP8(BMW_H[15]);
}
// blake
@ -218,7 +434,6 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -218,7 +434,6 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
bool dec = ((hash.h1[7] & 0x8) != 0);
{
// groestl
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
@ -427,14 +642,13 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -427,14 +642,13 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
hash.h8[7] = (dec ? H7 : hash.h8[7]);
}
{
// bmw
sph_u64 BMW_H[16];
for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16];
sph_u64 mv[16];
sph_u64 mv[16],q[32];
sph_u64 tmp;
mv[ 0] = SWAP8(hash.h8[0]);
mv[ 1] = SWAP8(hash.h8[1]);
@ -452,34 +666,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -452,34 +666,242 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
mv[13] = 0;
mv[14] = 0;
mv[15] = 0x200;
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
FOLDb;
#undef M
#undef H
#undef dH
#define M(x) (BMW_h2[x])
#define H(x) (final_b[x])
#define dH(x) (BMW_h1[x])
FOLDb;
#undef M
#undef H
#undef dH
hash.h8[0] = (!dec ? SWAP8(BMW_h1[8]) : hash.h8[0]);
hash.h8[1] = (!dec ? SWAP8(BMW_h1[9]) : hash.h8[1]);
hash.h8[2] = (!dec ? SWAP8(BMW_h1[10]) : hash.h8[2]);
hash.h8[3] = (!dec ? SWAP8(BMW_h1[11]) : hash.h8[3]);
hash.h8[4] = (!dec ? SWAP8(BMW_h1[12]) : hash.h8[4]);
hash.h8[5] = (!dec ? SWAP8(BMW_h1[13]) : hash.h8[5]);
hash.h8[6] = (!dec ? SWAP8(BMW_h1[14]) : hash.h8[6]);
hash.h8[7] = (!dec ? SWAP8(BMW_h1[15]) : hash.h8[7]);
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = (!dec ? SWAP8(BMW_H[8]) : hash.h8[0]);
hash.h8[1] = (!dec ? SWAP8(BMW_H[9]) : hash.h8[1]);
hash.h8[2] = (!dec ? SWAP8(BMW_H[10]) : hash.h8[2]);
hash.h8[3] = (!dec ? SWAP8(BMW_H[11]) : hash.h8[3]);
hash.h8[4] = (!dec ? SWAP8(BMW_H[12]) : hash.h8[4]);
hash.h8[5] = (!dec ? SWAP8(BMW_H[13]) : hash.h8[5]);
hash.h8[6] = (!dec ? SWAP8(BMW_H[14]) : hash.h8[6]);
hash.h8[7] = (!dec ? SWAP8(BMW_H[15]) : hash.h8[7]);
}

993
kernel/arebyp.cl

@ -0,0 +1,993 @@ @@ -0,0 +1,993 @@
/*-
* Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
* 2012-2013 Con Kolivas, 2013 Alexey Karimov.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
/* N (nfactor), CPU/Memory cost parameter */
__constant uint N[] = {
0x00000001U, /* never used, padding */
0x00000002U,
0x00000004U,
0x00000008U,
0x00000010U,
0x00000020U,
0x00000040U,
0x00000080U,
0x00000100U,
0x00000200U,
0x00000400U, /* 2^10 == 1024, Litecoin scrypt default */
0x00000800U,
0x00001000U,
0x00002000U,
0x00004000U,
0x00008000U,
0x00010000U,
0x00020000U,
0x00040000U,
0x00080000U,
0x00100000U
};
/* Backwards compatibility, if NFACTOR not defined, default to 10 for scrypt */
#ifndef NFACTOR
#define NFACTOR 10
#endif
__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 };
__constant uint K[] = {
0x428a2f98U,
0x71374491U,
0xb5c0fbcfU,
0xe9b5dba5U,
0x3956c25bU,
0x59f111f1U,
0x923f82a4U,
0xab1c5ed5U,
0xd807aa98U,
0x12835b01U,
0x243185beU, // 10
0x550c7dc3U,
0x72be5d74U,
0x80deb1feU,
0x9bdc06a7U,
0xe49b69c1U,
0xefbe4786U,
0x0fc19dc6U,
0x240ca1ccU,
0x2de92c6fU,
0x4a7484aaU, // 20
0x5cb0a9dcU,
0x76f988daU,
0x983e5152U,
0xa831c66dU,
0xb00327c8U,
0xbf597fc7U,
0xc6e00bf3U,
0xd5a79147U,
0x06ca6351U,
0x14292967U, // 30
0x27b70a85U,
0x2e1b2138U,
0x4d2c6dfcU,
0x53380d13U,
0x650a7354U,
0x766a0abbU,
0x81c2c92eU,
0x92722c85U,
0xa2bfe8a1U,
0xa81a664bU, // 40
0xc24b8b70U,
0xc76c51a3U,
0xd192e819U,
0xd6990624U,
0xf40e3585U,
0x106aa070U,
0x19a4c116U,
0x1e376c08U,
0x2748774cU,
0x34b0bcb5U, // 50
0x391c0cb3U,
0x4ed8aa4aU,
0x5b9cca4fU,
0x682e6ff3U,
0x748f82eeU,
0x78a5636fU,
0x84c87814U,
0x8cc70208U,
0x90befffaU,
0xa4506cebU, // 60
0xbef9a3f7U,
0xc67178f2U,
0x98c7e2a2U,
0xfc08884dU,
0xcd2a11aeU,
0x510e527fU,
0x9b05688cU,
0xC3910C8EU,
0xfb6feee7U,
0x2a01a605U, // 70
0x0c2e12e0U,
0x4498517BU,
0x6a09e667U,
0xa4ce148bU,
0x95F61999U,
0xc19bf174U,
0xBB67AE85U,
0x3C6EF372U,
0xA54FF53AU,
0x1F83D9ABU, // 80
0x5BE0CD19U,
0x5C5C5C5CU,
0x36363636U,
0x80000000U,
0x000003FFU,
0x00000280U,
0x000004a0U,
0x00000300U
};
#define rotl(x,y) rotate(x,y)
#define Ch(x,y,z) bitselect(z,y,x)
#define Maj(x,y,z) Ch((x^z),y,z)
#define EndianSwap(n) (rotl(n & ES[0], 24U)|rotl(n & ES[1], 8U))
#define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
#define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
#define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
#define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))
#define RND(a, b, c, d, e, f, g, h, k) \
h += Tr1(e); \
h += Ch(e, f, g); \
h += k; \
d += h; \
h += Tr2(a); \
h += Maj(a, b, c);
void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
{
uint4 S0 = *state0;
uint4 S1 = *state1;
#define A S0.x
#define B S0.y
#define C S0.z
#define D S0.w
#define E S1.x
#define F S1.y
#define G S1.z
#define H S1.w
uint4 W[4];
W[ 0].x = block0.x;
RND(A,B,C,D,E,F,G,H, W[0].x+ K[0]);
W[ 0].y = block0.y;
RND(H,A,B,C,D,E,F,G, W[0].y+ K[1]);
W[ 0].z = block0.z;
RND(G,H,A,B,C,D,E,F, W[0].z+ K[2]);
W[ 0].w = block0.w;
RND(F,G,H,A,B,C,D,E, W[0].w+ K[3]);
W[ 1].x = block1.x;
RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]);
W[ 1].y = block1.y;
RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]);
W[ 1].z = block1.z;
RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]);
W[ 1].w = block1.w;
RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]);
W[ 2].x = block2.x;
RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]);
W[ 2].y = block2.y;
RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]);
W[ 2].z = block2.z;
RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]);
W[ 2].w = block2.w;
RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]);
W[ 3].x = block3.x;
RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]);
W[ 3].y = block3.y;
RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]);
W[ 3].z = block3.z;
RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]);
W[ 3].w = block3.w;
RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]);
W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]);
W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]);
W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]);
W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]);
W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]);
W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]);
W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]);
W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]);
W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]);
W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]);
W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]);
W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]);
W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]);
W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]);
W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]);
W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]);
W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]);
W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]);
W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]);
W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]);
W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]);
W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]);
W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]);
W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]);
W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]);
W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]);
W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]);
W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]);
W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]);
W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]);
W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]);
W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]);
W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]);
W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]);
W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]);
W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]);
W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]);
W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]);
W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]);
W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]);
W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]);
W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]);
W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]);
W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]);
W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]);
W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]);
W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]);
W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]);
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H
*state0 += S0;
*state1 += S1;
}
void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
{
#define A (*state0).x
#define B (*state0).y
#define C (*state0).z
#define D (*state0).w
#define E (*state1).x
#define F (*state1).y
#define G (*state1).z
#define H (*state1).w
uint4 W[4];
W[0].x = block0.x;
D= K[63] +W[0].x;
H= K[64] +W[0].x;
W[0].y = block0.y;
C= K[65] +Tr1(D)+Ch(D, K[66], K[67])+W[0].y;
G= K[68] +C+Tr2(H)+Ch(H, K[69] ,K[70]);
W[0].z = block0.z;
B= K[71] +Tr1(C)+Ch(C,D,K[66])+W[0].z;
F= K[72] +B+Tr2(G)+Maj(G,H, K[73]);
W[0].w = block0.w;
A= K[74] +Tr1(B)+Ch(B,C,D)+W[0].w;
E= K[75] +A+Tr2(F)+Maj(F,G,H);
W[1].x = block1.x;
RND(E,F,G,H,A,B,C,D, W[1].x+ K[4]);
W[1].y = block1.y;
RND(D,E,F,G,H,A,B,C, W[1].y+ K[5]);
W[1].z = block1.z;
RND(C,D,E,F,G,H,A,B, W[1].z+ K[6]);
W[1].w = block1.w;
RND(B,C,D,E,F,G,H,A, W[1].w+ K[7]);
W[2].x = block2.x;
RND(A,B,C,D,E,F,G,H, W[2].x+ K[8]);
W[2].y = block2.y;
RND(H,A,B,C,D,E,F,G, W[2].y+ K[9]);
W[2].z = block2.z;
RND(G,H,A,B,C,D,E,F, W[2].z+ K[10]);
W[2].w = block2.w;
RND(F,G,H,A,B,C,D,E, W[2].w+ K[11]);
W[3].x = block3.x;
RND(E,F,G,H,A,B,C,D, W[3].x+ K[12]);
W[3].y = block3.y;
RND(D,E,F,G,H,A,B,C, W[3].y+ K[13]);
W[3].z = block3.z;
RND(C,D,E,F,G,H,A,B, W[3].z+ K[14]);
W[3].w = block3.w;
RND(B,C,D,E,F,G,H,A, W[3].w+ K[76]);
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[15]);
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[16]);
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[17]);
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[18]);
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[19]);
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[20]);
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[21]);
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[22]);
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[23]);
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[24]);
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[25]);
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[26]);
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[27]);
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[28]);
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[29]);
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[30]);
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[31]);
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[32]);
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[33]);
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[34]);
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[35]);
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[36]);
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[37]);
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[38]);
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[39]);
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[40]);
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[41]);
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[42]);
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[43]);
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[44]);
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[45]);
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[46]);
W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
RND(A,B,C,D,E,F,G,H, W[0].x+ K[47]);
W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
RND(H,A,B,C,D,E,F,G, W[0].y+ K[48]);
W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
RND(G,H,A,B,C,D,E,F, W[0].z+ K[49]);
W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
RND(F,G,H,A,B,C,D,E, W[0].w+ K[50]);
W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
RND(E,F,G,H,A,B,C,D, W[1].x+ K[51]);
W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
RND(D,E,F,G,H,A,B,C, W[1].y+ K[52]);
W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
RND(C,D,E,F,G,H,A,B, W[1].z+ K[53]);
W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
RND(B,C,D,E,F,G,H,A, W[1].w+ K[54]);
W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
RND(A,B,C,D,E,F,G,H, W[2].x+ K[55]);
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
RND(H,A,B,C,D,E,F,G, W[2].y+ K[56]);
W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
RND(G,H,A,B,C,D,E,F, W[2].z+ K[57]);
W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
RND(F,G,H,A,B,C,D,E, W[2].w+ K[58]);
W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
RND(E,F,G,H,A,B,C,D, W[3].x+ K[59]);
W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
RND(D,E,F,G,H,A,B,C, W[3].y+ K[60]);
W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
RND(C,D,E,F,G,H,A,B, W[3].z+ K[61]);
W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
RND(B,C,D,E,F,G,H,A, W[3].w+ K[62]);
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H
*state0 += (uint4)(K[73], K[77], K[78], K[79]);
*state1 += (uint4)(K[66], K[67], K[80], K[81]);
}
__constant uint fixedW[64] =
{
0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794,
0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f,
0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c,
0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa,
0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012,
0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4,
0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848,
};
void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)
{
uint4 S0 = *state0;
uint4 S1 = *state1;
#define A S0.x
#define B S0.y
#define C S0.z
#define D S0.w
#define E S1.x
#define F S1.y
#define G S1.z
#define H S1.w
RND(A,B,C,D,E,F,G,H, fixedW[0]);
RND(H,A,B,C,D,E,F,G, fixedW[1]);
RND(G,H,A,B,C,D,E,F, fixedW[2]);
RND(F,G,H,A,B,C,D,E, fixedW[3]);
RND(E,F,G,H,A,B,C,D, fixedW[4]);
RND(D,E,F,G,H,A,B,C, fixedW[5]);
RND(C,D,E,F,G,H,A,B, fixedW[6]);
RND(B,C,D,E,F,G,H,A, fixedW[7]);
RND(A,B,C,D,E,F,G,H, fixedW[8]);
RND(H,A,B,C,D,E,F,G, fixedW[9]);
RND(G,H,A,B,C,D,E,F, fixedW[10]);
RND(F,G,H,A,B,C,D,E, fixedW[11]);
RND(E,F,G,H,A,B,C,D, fixedW[12]);
RND(D,E,F,G,H,A,B,C, fixedW[13]);
RND(C,D,E,F,G,H,A,B, fixedW[14]);
RND(B,C,D,E,F,G,H,A, fixedW[15]);
RND(A,B,C,D,E,F,G,H, fixedW[16]);
RND(H,A,B,C,D,E,F,G, fixedW[17]);
RND(G,H,A,B,C,D,E,F, fixedW[18]);
RND(F,G,H,A,B,C,D,E, fixedW[19]);
RND(E,F,G,H,A,B,C,D, fixedW[20]);
RND(D,E,F,G,H,A,B,C, fixedW[21]);
RND(C,D,E,F,G,H,A,B, fixedW[22]);
RND(B,C,D,E,F,G,H,A, fixedW[23]);
RND(A,B,C,D,E,F,G,H, fixedW[24]);
RND(H,A,B,C,D,E,F,G, fixedW[25]);
RND(G,H,A,B,C,D,E,F, fixedW[26]);
RND(F,G,H,A,B,C,D,E, fixedW[27]);
RND(E,F,G,H,A,B,C,D, fixedW[28]);
RND(D,E,F,G,H,A,B,C, fixedW[29]);
RND(C,D,E,F,G,H,A,B, fixedW[30]);
RND(B,C,D,E,F,G,H,A, fixedW[31]);
RND(A,B,C,D,E,F,G,H, fixedW[32]);
RND(H,A,B,C,D,E,F,G, fixedW[33]);
RND(G,H,A,B,C,D,E,F, fixedW[34]);
RND(F,G,H,A,B,C,D,E, fixedW[35]);
RND(E,F,G,H,A,B,C,D, fixedW[36]);
RND(D,E,F,G,H,A,B,C, fixedW[37]);
RND(C,D,E,F,G,H,A,B, fixedW[38]);
RND(B,C,D,E,F,G,H,A, fixedW[39]);
RND(A,B,C,D,E,F,G,H, fixedW[40]);
RND(H,A,B,C,D,E,F,G, fixedW[41]);
RND(G,H,A,B,C,D,E,F, fixedW[42]);
RND(F,G,H,A,B,C,D,E, fixedW[43]);
RND(E,F,G,H,A,B,C,D, fixedW[44]);
RND(D,E,F,G,H,A,B,C, fixedW[45]);
RND(C,D,E,F,G,H,A,B, fixedW[46]);
RND(B,C,D,E,F,G,H,A, fixedW[47]);
RND(A,B,C,D,E,F,G,H, fixedW[48]);
RND(H,A,B,C,D,E,F,G, fixedW[49]);
RND(G,H,A,B,C,D,E,F, fixedW[50]);
RND(F,G,H,A,B,C,D,E, fixedW[51]);
RND(E,F,G,H,A,B,C,D, fixedW[52]);
RND(D,E,F,G,H,A,B,C, fixedW[53]);
RND(C,D,E,F,G,H,A,B, fixedW[54]);
RND(B,C,D,E,F,G,H,A, fixedW[55]);
RND(A,B,C,D,E,F,G,H, fixedW[56]);
RND(H,A,B,C,D,E,F,G, fixedW[57]);
RND(G,H,A,B,C,D,E,F, fixedW[58]);
RND(F,G,H,A,B,C,D,E, fixedW[59]);
RND(E,F,G,H,A,B,C,D, fixedW[60]);
RND(D,E,F,G,H,A,B,C, fixedW[61]);
RND(C,D,E,F,G,H,A,B, fixedW[62]);
RND(B,C,D,E,F,G,H,A, fixedW[63]);
#undef A
#undef B
#undef C
#undef D
#undef E
#undef F
#undef G
#undef H
*state0 += S0;
*state1 += S1;
}
void shittify(uint4 B[8])
{
uint4 tmp[4];
tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w);
tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w);
tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w);
tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w);
#pragma unroll
for(uint i=0; i<4; ++i)
B[i] = EndianSwap(tmp[i]);
tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w);
tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w);
tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w);
tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w);
#pragma unroll
for(uint i=0; i<4; ++i)
B[i+4] = EndianSwap(tmp[i]);
}
void unshittify(uint4 B[8])
{
uint4 tmp[4];
tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w);
tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w);
tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w);
tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w);
#pragma unroll
for(uint i=0; i<4; ++i)
B[i] = EndianSwap(tmp[i]);
tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w);
tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w);
tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w);
tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w);
#pragma unroll
for(uint i=0; i<4; ++i)
B[i+4] = EndianSwap(tmp[i]);
}
#define SALSAUNROLLED
#ifdef SALSAUNROLLED
void salsa(uint4 B[8])
{
uint4 w[4];
w[0] = (B[0]^=B[4]);
w[1] = (B[1]^=B[5]);
w[2] = (B[2]^=B[6]);
w[3] = (B[3]^=B[7]);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] = (B[4]^=(B[0]+=w[0]));
w[1] = (B[5]^=(B[1]+=w[1]));
w[2] = (B[6]^=(B[2]+=w[2]));
w[3] = (B[7]^=(B[3]+=w[3]));
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
B[4] += w[0];
B[5] += w[1];
B[6] += w[2];
B[7] += w[3];
}
#else
void salsa(uint4 B[8])
{
uint4 w[4];
#pragma unroll
for(uint i=0; i<4; ++i)
w[i] = (B[i]^=B[i+4]);
#pragma unroll
for(uint i=0; i<4; ++i)
{
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
}
#pragma unroll
for(uint i=0; i<4; ++i)
w[i] = (B[i+4]^=(B[i]+=w[i]));
#pragma unroll
for(uint i=0; i<4; ++i)
{
w[0] ^= rotl(w[3] +w[2] , 7U);
w[1] ^= rotl(w[0] +w[3] , 9U);
w[2] ^= rotl(w[1] +w[0] ,13U);
w[3] ^= rotl(w[2] +w[1] ,18U);
w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
}
#pragma unroll
for(uint i=0; i<4; ++i)
B[i+4] += w[i];
}
#endif
#if (LOOKUP_GAP != 2)
wrong lookup gap!
#endif
void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
{
const uint zSIZE = 8;
const uint ySIZE = N[NFACTOR] / 2;
const uint xSIZE = CONCURRENT_THREADS;
uint4 V[8];
uint x = get_global_id(0) % xSIZE;
uint z;
uint y;
uint i;
uint CO;
ushort progress; // Progress state
ushort state;
CO = 8 * x;
for (y = 0; y < ySIZE; ++y, CO += (xSIZE - 1) * (zSIZE))
{
#pragma unroll
for (z = 0; z < zSIZE; ++z, CO++)
lookup[CO] = X[z];
// Next salsa
salsa(X);
salsa(X);
}
//------------------------------------------------------------------------------------------------------------
uint cotmp = x * zSIZE;
progress = 0;
for (i = 0; i < N[NFACTOR] + 512 + 42; i++)
{
//if (progress < 2 * N[NFACTOR])
{
y = X[7].x & (N[NFACTOR]-1);
CO = cotmp + (y / LOOKUP_GAP) * (xSIZE) * zSIZE;
state = ((progress & 1) << 1) | (y & 1);
if (state != 3)
{
#pragma unroll
for (z = 0; z < zSIZE; ++z, CO++)
V[z] = lookup[CO];
}
if (state != 1)
{
#pragma unroll
for (z = 0; z < zSIZE; ++z)
V[z] ^= X[z];
}
salsa(V);
ushort cond = (state != 1) && (progress < 2 * N[NFACTOR]);
if (cond)
{
#pragma unroll
for (z = 0; z < zSIZE; ++z)
X[z] = V[z];
}
// S/y
// 00 +2
// 01 +1
// 11 +1
// 10 error
progress += (state == 0)? 2 : 1;
}
}
}
#define SCRYPT_FOUND (0xFF)
#define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uint4 * restrict input,
volatile __global uint*restrict output, __global uint4*restrict padcache,
const uint4 midstate0, const uint4 midstate16, const uint target)
{
uint4 X[8];
uint4 tstate0, tstate1, ostate0, ostate1;
uint4 tmp0, tmp1;
uint4 data = (uint4)(input[4].x,input[4].y,input[4].z, get_global_id(0));
uint4 pad0 = midstate0, pad1 = midstate16;
SHA256(&pad0,&pad1, data, (uint4)(K[84],0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[86]));
SHA256_fresh(&ostate0,&ostate1, pad0^ K[82], pad1^ K[82], K[82], K[82]);
SHA256_fresh(&tstate0,&tstate1, pad0^ K[83], pad1^ K[83], K[83], K[83]);
tmp0 = tstate0;
tmp1 = tstate1;
SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]);
#pragma unroll
for (uint i=0; i<4; i++)
{
pad0 = tstate0;
pad1 = tstate1;
X[rotl(i,1U) ] = ostate0;
X[rotl(i,1U)+1] = ostate1;
SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
SHA256(X+rotl(i,1U),X+rotl(i,1U)+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
}
shittify(X);
scrypt_core(X,padcache);
unshittify(X);
SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
SHA256_fixed(&tmp0,&tmp1);
SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
bool result = (EndianSwap(ostate1.w) <= target);
if (result)
SETFOUND(get_global_id(0));
}

117
kernel/bitblock.cl

@ -464,69 +464,92 @@ __kernel void search2(__global hash_t* hashes) @@ -464,69 +464,92 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
// groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u];
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))

123
kernel/darkcoin-mod.cl

@ -193,7 +193,7 @@ __kernel void search1(__global hash_t* hashes) @@ -193,7 +193,7 @@ __kernel void search1(__global hash_t* hashes)
BMW_H[u] = BMW_IV512[u];
sph_u64 mv[16],q[32];
sph_u64 tmp;
sph_u64 tmp;
mv[0] = SWAP8(hash->h8[0]);
mv[1] = SWAP8(hash->h8[1]);
@ -457,74 +457,97 @@ __kernel void search2(__global hash_t* hashes) @@ -457,74 +457,97 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
// groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u];
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
barrier(CLK_GLOBAL_MEM_FENCE);
}
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search3(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
@ -840,7 +863,7 @@ __kernel void search8(__global hash_t* hashes) @@ -840,7 +863,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0];
rk01 = hash->h4[1];

13
kernel/darkcoin.cl

@ -71,16 +71,9 @@ typedef long sph_s64; @@ -71,16 +71,9 @@ typedef long sph_s64;
#define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0
#ifndef SPH_COMPACT_BLAKE_64
#define SPH_COMPACT_BLAKE_64 0
#endif
#ifndef SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL 0
#endif
#ifndef SPH_KECCAK_UNROLL
#define SPH_KECCAK_UNROLL 0
#endif
#define SPH_COMPACT_BLAKE_64 0
#define SPH_LUFFA_PARALLEL 0
#define SPH_KECCAK_UNROLL 0
#include "blake.cl"
#include "bmw.cl"

1853
kernel/diamond.cl

File diff suppressed because it is too large Load Diff

2030
kernel/groestlcoin.cl

File diff suppressed because it is too large Load Diff

41
kernel/inkcoin.cl

@ -29,8 +29,8 @@ @@ -29,8 +29,8 @@
* @author phm <phm@inbox.com>
*/
#ifndef DARKCOIN_CL
#define DARKCOIN_CL
#ifndef INKCOIN_CL
#define INKCOIN_CL
#if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1
@ -54,34 +54,15 @@ typedef long sph_s64; @@ -54,34 +54,15 @@ typedef long sph_s64;
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
#define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1
#define SPH_KECCAK_64 1
#define SPH_JH_64 1
#define SPH_SIMD_NOCOPY 0
#define SPH_KECCAK_NOCOPY 0
#define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0
#ifndef SPH_COMPACT_BLAKE_64
#define SPH_COMPACT_BLAKE_64 0
#endif
#ifndef SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL 0
#endif
#ifndef SPH_KECCAK_UNROLL
#define SPH_KECCAK_UNROLL 0
#endif
#include "shavite.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
@ -97,6 +78,14 @@ typedef long sph_s64; @@ -97,6 +78,14 @@ typedef long sph_s64;
#define DEC32LE(x) (*(const __global sph_u32 *) (x));
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
// __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
{
@ -244,4 +233,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -244,4 +233,4 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
output[output[0xFF]++] = SWAP4(gid);
}
#endif // DARKCOIN_CL
#endif // INKCOIN_CL

119
kernel/marucoin-mod.cl

@ -460,69 +460,92 @@ __kernel void search2(__global hash_t* hashes) @@ -460,69 +460,92 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
// groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u];
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -843,7 +866,7 @@ __kernel void search8(__global hash_t* hashes) @@ -843,7 +866,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0];
rk01 = hash->h4[1];

80
kernel/myriadcoin-groestl.cl

@ -54,23 +54,19 @@ typedef long sph_s64; @@ -54,23 +54,19 @@ typedef long sph_s64;
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
#define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1
#define SPH_SIMD_NOCOPY 0
#define SPH_CUBEHASH_UNROLL 0
#ifndef SPH_LUFFA_PARALLEL
#define SPH_LUFFA_PARALLEL 0
#endif
#include "groestl.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
@ -84,6 +80,14 @@ typedef long sph_s64; @@ -84,6 +80,14 @@ typedef long sph_s64;
#define DEC64E(x) (*(const __global sph_u64 *) (x));
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
#define ROL32(x, n) rotate(x, (uint) n)
#define SHR(x, n) ((x) >> n)
#define SWAP32(a) (as_uint(as_uchar4(a).wzyx))
@ -138,34 +142,41 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -138,34 +142,41 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
ulong h8[8];
} hash;
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T4_L[i] = T4[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
// groestl
sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
@ -193,20 +204,33 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -193,20 +204,33 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
m[13] = 0;
m[14] = 0;
m[15] = 0x100000000000000;
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
PERM_BIG_P(g);
PERM_BIG_Q(m);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash.h8[u] = ENC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
uint temp1;
uint W0 = SWAP32(hash.h4[0x0]);
uint W1 = SWAP32(hash.h4[0x1]);

986
kernel/neoscrypt.cl

File diff suppressed because it is too large Load Diff

1120
kernel/quarkcoin.cl

File diff suppressed because it is too large Load Diff

16
kernel/qubitcoin.cl

@ -54,13 +54,13 @@ typedef long sph_s64; @@ -54,13 +54,13 @@ typedef long sph_s64;
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
#define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1
@ -88,6 +88,14 @@ typedef long sph_s64; @@ -88,6 +88,14 @@ typedef long sph_s64;
#define DEC32BE(x) SWAP4(*(const __global sph_u32 *) (x));
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
{

400
kernel/sifcoin.cl

@ -54,13 +54,13 @@ typedef long sph_s64; @@ -54,13 +54,13 @@ typedef long sph_s64;
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
#define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1
@ -100,6 +100,14 @@ typedef long sph_s64; @@ -100,6 +100,14 @@ typedef long sph_s64;
#define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x));
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global unsigned char* block, volatile __global uint* output, const ulong target)
{
@ -164,8 +172,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -164,8 +172,8 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16];
sph_u64 mv[16];
sph_u64 mv[16],q[32];
sph_u64 tmp;
mv[ 0] = SWAP8(hash.h8[0]);
mv[ 1] = SWAP8(hash.h8[1]);
@ -183,78 +191,330 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp @@ -183,78 +191,330 @@ __kernel void search(__global unsigned char* block, volatile __global uint* outp
mv[13] = 0;
mv[14] = 0;
mv[15] = 0x200;
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
FOLDb;
#undef M
#undef H
#undef dH
#define M(x) (BMW_h2[x])
#define H(x) (final_b[x])
#define dH(x) (BMW_h1[x])
FOLDb;
#undef M
#undef H
#undef dH
hash.h8[0] = SWAP8(BMW_h1[8]);
hash.h8[1] = SWAP8(BMW_h1[9]);
hash.h8[2] = SWAP8(BMW_h1[10]);
hash.h8[3] = SWAP8(BMW_h1[11]);
hash.h8[4] = SWAP8(BMW_h1[12]);
hash.h8[5] = SWAP8(BMW_h1[13]);
hash.h8[6] = SWAP8(BMW_h1[14]);
hash.h8[7] = SWAP8(BMW_h1[15]);
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash.h8[0] = SWAP8(BMW_H[8]);
hash.h8[1] = SWAP8(BMW_H[9]);
hash.h8[2] = SWAP8(BMW_H[10]);
hash.h8[3] = SWAP8(BMW_H[11]);
hash.h8[4] = SWAP8(BMW_H[12]);
hash.h8[5] = SWAP8(BMW_H[13]);
hash.h8[6] = SWAP8(BMW_H[14]);
hash.h8[7] = SWAP8(BMW_H[15]);
}
// groestl
{
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16];
m[0] = DEC64E(hash.h8[0]);
m[1] = DEC64E(hash.h8[1]);
m[2] = DEC64E(hash.h8[2]);
m[3] = DEC64E(hash.h8[3]);
m[4] = DEC64E(hash.h8[4]);
m[5] = DEC64E(hash.h8[5]);
m[6] = DEC64E(hash.h8[6]);
m[7] = DEC64E(hash.h8[7]);
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++)
hash.h8[u] = DEC64E(H[u + 8]);
sph_u64 g[16], m[16];
m[0] = DEC64E(hash.h8[0]);
m[1] = DEC64E(hash.h8[1]);
m[2] = DEC64E(hash.h8[2]);
m[3] = DEC64E(hash.h8[3]);
m[4] = DEC64E(hash.h8[4]);
m[5] = DEC64E(hash.h8[5]);
m[6] = DEC64E(hash.h8[6]);
m[7] = DEC64E(hash.h8[7]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash.h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
// jh

82
kernel/talkcoin-mod.cl

@ -68,10 +68,7 @@ typedef int sph_s32; @@ -68,10 +68,7 @@ typedef int sph_s32;
#define SPH_JH_64 1
#define SPH_KECCAK_64 1
#define SPH_KECCAK_NOCOPY 0
#ifndef SPH_COMPACT_BLAKE_64
#define SPH_COMPACT_BLAKE_64 0
#endif
#define SPH_COMPACT_BLAKE_64 0
#ifndef SPH_KECCAK_UNROLL
#define SPH_KECCAK_UNROLL 0
#endif
@ -162,44 +159,48 @@ __kernel void search1(__global hash_t* hashes) @@ -162,44 +159,48 @@ __kernel void search1(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T4_L[i] = T4[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
#else
H[15] = (sph_u64)512;
#endif
#endif
sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]);
@ -211,9 +212,9 @@ __kernel void search1(__global hash_t* hashes) @@ -211,9 +212,9 @@ __kernel void search1(__global hash_t* hashes)
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
@ -222,27 +223,28 @@ __kernel void search1(__global hash_t* hashes) @@ -222,27 +223,28 @@ __kernel void search1(__global hash_t* hashes)
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
xH[u] = H[u];
PERM_BIG_P(xH);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
}
// jh

16
kernel/twecoin.cl

@ -20,13 +20,13 @@ typedef long sph_s64; @@ -20,13 +20,13 @@ typedef long sph_s64;
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
#define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_HAMSI_EXPAND_SMALL 1
@ -47,6 +47,14 @@ typedef long sph_s64; @@ -47,6 +47,14 @@ typedef long sph_s64;
#define sph_bswap32(x) SWAP4(x)
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
static void sph_enc32be(void *dst, sph_u32 val)
{
#if defined SPH_UPTR

1358
kernel/whirlcoin.cl

File diff suppressed because it is too large Load Diff

119
kernel/x14.cl

@ -463,69 +463,92 @@ __kernel void search2(__global hash_t* hashes) @@ -463,69 +463,92 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
// groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16];
//#pragma unroll 15
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u];
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
//#pragma unroll 16
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
//#pragma unroll 8
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -846,7 +869,7 @@ __kernel void search8(__global hash_t* hashes) @@ -846,7 +869,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0];
rk01 = hash->h4[1];

17
logging.c

@ -59,12 +59,20 @@ void applogsiz(int prio, int size, const char* fmt, ...) @@ -59,12 +59,20 @@ void applogsiz(int prio, int size, const char* fmt, ...)
/* high-level logging function, based on global opt_log_level */
void vapplogsiz(int prio, int size, const char* fmt, va_list args)
{
if (opt_debug || prio != LOG_DEBUG) {
if ((opt_debug || prio != LOG_DEBUG)) {
char *tmp42 = (char *)calloc(size + 1, 1);
vsnprintf(tmp42, size, fmt, args);
_applog(prio, tmp42, false);
free(tmp42);
}
#ifdef DEV_DEBUG_MODE
else if(prio == LOG_DEBUG) {
char *tmp42 = (char *)calloc(size + 1, 1);
vsnprintf(tmp42, size, fmt, args);
__debug("", tmp42);
free(tmp42);
}
#endif
}
/*
@ -80,6 +88,13 @@ void _applog(int prio, const char *str, bool force) @@ -80,6 +88,13 @@ void _applog(int prio, const char *str, bool force)
if (0) {}
#endif
else {
#ifdef DEV_DEBUG_MODE
if(prio == LOG_DEBUG) {
__debug("", str);
}
#endif
bool write_console = opt_debug_console || (opt_verbose && prio != LOG_DEBUG) || prio <= opt_log_level;
bool write_stderr = !isatty(fileno((FILE *)stderr));
if (!(write_console || write_stderr))

19
miner.h

@ -266,6 +266,11 @@ DRIVER_PARSE_COMMANDS(DRIVER_PROTOTYPE) @@ -266,6 +266,11 @@ DRIVER_PARSE_COMMANDS(DRIVER_PROTOTYPE)
#define strtobool(str) ((str && (!strcasecmp(str, "true") || !strcasecmp(str, "yes") || !strcasecmp(str, "1")))?true:false)
#endif
extern int opt_remoteconf_retry;
extern int opt_remoteconf_wait;
extern bool opt_remoteconf_usecache;
enum alive {
LIFE_WELL,
LIFE_SICK,
@ -1025,6 +1030,7 @@ extern char *sgminer_path; @@ -1025,6 +1030,7 @@ extern char *sgminer_path;
extern int opt_shares;
extern bool opt_fail_only;
extern int opt_fail_switch_delay;
extern int opt_watchpool_refresh;
extern bool opt_autofan;
extern bool opt_autoengine;
extern bool use_curses;
@ -1099,8 +1105,8 @@ extern pthread_cond_t restart_cond; @@ -1099,8 +1105,8 @@ extern pthread_cond_t restart_cond;
extern void clear_stratum_shares(struct pool *pool);
extern void clear_pool_work(struct pool *pool);
extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2);
extern void set_target_neoscrypt(unsigned char *target, double diff);
extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2, const int thr_id);
extern void set_target_neoscrypt(unsigned char *target, double diff, const int thr_id);
extern void kill_work(void);
@ -1274,6 +1280,7 @@ struct pool { @@ -1274,6 +1280,7 @@ struct pool {
bool remove_at_start;
bool removed;
bool lp_started;
bool backup;
char *hdr_path;
char *lp_url;
@ -1481,7 +1488,13 @@ extern void _wlogprint(const char *str); @@ -1481,7 +1488,13 @@ extern void _wlogprint(const char *str);
extern int curses_int(const char *query);
extern char *curses_input(const char *query);
extern void kill_work(void);
extern void switch_pools(struct pool *selected);
//helper macro to preserve existing code
#ifndef switch_pools
#define switch_pools(p) __switch_pools(p, TRUE)
#endif
extern void __switch_pools(struct pool *selected, bool saveprio);
extern void discard_work(struct work *work);
extern void remove_pool(struct pool *pool);
//extern void write_config(FILE *fcfg);

129
ocl.c

@ -299,8 +299,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -299,8 +299,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
char filename[255];
char strbuf[32];
sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile)?cgpu->algorithm.kernelfile:cgpu->algorithm.name));
strcpy(filename, strbuf);
applog(LOG_DEBUG, "Using source file %s", filename);
/* For some reason 2 vectors is still better even if the card says
@ -347,54 +348,111 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -347,54 +348,111 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
// neoscrypt calculates TC differently
if (!safe_cmp(cgpu->algorithm.name, "neoscrypt")) {
int max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
size_t glob_thread_count = 1UL << max_int;
// if TC is entered by user, use that value... otherwise use default
cgpu->thread_concurrency = ((cgpu->opt_tc) ? cgpu->opt_tc : ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count));
size_t glob_thread_count;
long max_int;
unsigned char type = 0;
// if TC * scratchbuf size is too big for memory... reduce to max
if (((uint64_t)cgpu->thread_concurrency * NEOSCRYPT_SCRATCHBUF_SIZE) >(uint64_t)cgpu->max_alloc) {
/* Selected intensity will not run on this GPU. Not enough memory.
* Adapt the memory setting. */
glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
// determine which intensity type to use
// raw intensity is the same as TC so use either or setting...
if (cgpu->rawintensity > 0 || cgpu->opt_tc) {
/* Find highest significant bit in glob_thread_count, which gives
* the intensity. */
while (max_int && ((1U << max_int) & glob_thread_count) == 0) {
--max_int;
if (cgpu->opt_tc) {
glob_thread_count = cgpu->rawintensity = cgpu->opt_tc;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_INTENSITY) {
applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
max_int = MIN_INTENSITY;
else {
glob_thread_count = cgpu->rawintensity;
}
cgpu->intensity = max_int;
cgpu->thread_concurrency = 1U << max_int;
max_int = glob_thread_count;
type = 2;
}
else if (cgpu->xintensity > 0) {
glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift)?(1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)):cgpu->xintensity);
max_int = cgpu->xintensity;
type = 1;
}
else {
glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
max_int = ((cgpu->dynamic)?MAX_INTENSITY:cgpu->intensity);
}
glob_thread_count = ((glob_thread_count < cgpu->work_size)?cgpu->work_size:glob_thread_count);
// if TC * scratchbuf size is too big for memory... reduce to max
if ((glob_thread_count * NEOSCRYPT_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
/* Selected intensity will not run on this GPU. Not enough memory.
* Adapt the memory setting. */
// depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
switch (type) {
//raw intensity
case 2:
while ((glob_thread_count * NEOSCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
--glob_thread_count;
}
max_int = glob_thread_count;
cgpu->rawintensity = glob_thread_count;
break;
//x intensity
case 1:
glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
max_int = glob_thread_count / clState->compute_shaders;
while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
--max_int;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_XINTENSITY) {
applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
max_int = MIN_XINTENSITY;
}
cgpu->xintensity = max_int;
glob_thread_count = clState->compute_shaders * (1UL << max_int);
break;
default:
glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
--max_int;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_INTENSITY) {
applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
max_int = MIN_INTENSITY;
}
cgpu->intensity = max_int;
glob_thread_count = 1UL << max_int;
break;
}
}
// TC is glob thread count
cgpu->thread_concurrency = glob_thread_count;
applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
}
else if (!cgpu->opt_tc) {
} else if (!cgpu->opt_tc) {
unsigned int sixtyfours;
sixtyfours = cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
cgpu->thread_concurrency = sixtyfours * 64;
if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
if (cgpu->thread_concurrency > cgpu->shaders * 5) {
cgpu->thread_concurrency = cgpu->shaders * 5;
}
}
applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
} else {
cgpu->thread_concurrency = cgpu->opt_tc;
}
else {
cgpu->thread_concurrency = cgpu->opt_tc;
}
cl_uint slot, cpnd;
@ -420,7 +478,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -420,7 +478,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
build_data->opencl_version = get_opencl_version(devices[gpu]);
build_data->patch_bfi = needs_bfi_patch(build_data);
strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile)?cgpu->algorithm.kernelfile:cgpu->algorithm.name));
strcat(build_data->binary_filename, name);
if (clState->goffset)
strcat(build_data->binary_filename, "g");
@ -491,20 +549,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -491,20 +549,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
bufsize = NEOSCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
/* This is the input buffer. For neoscrypt this is guaranteed to be
* 80 bytes only. */
* 80 bytes only. */
readbufsize = 80;
applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
// scrypt/n-scrypt
}
else {
// scrypt/n-scrypt
} else {
size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0));
bufsize = 128 * ipt * cgpu->thread_concurrency;
applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
}
}
else {
bufsize = (size_t)algorithm->rw_buffer_size;
} else {
bufsize = (size_t) algorithm->rw_buffer_size;
applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
}
@ -539,7 +595,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -539,7 +595,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
applog(LOG_DEBUG, "Using output buffer sized %lu", BUFFERSIZE);
clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
if (status != CL_SUCCESS) {
applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
return NULL;

19
pool.c

@ -33,18 +33,21 @@ @@ -33,18 +33,21 @@
#include <string.h>
char* get_pool_name(struct pool *pool) {
if (opt_incognito)
return "<pool>";
if (opt_incognito) {
return "<pool>";
}
if (strcmp(pool->name, "") == 0)
return pool->sockaddr_url;
if (empty_string(pool->name)) {
return pool->sockaddr_url;
}
return pool->name;
return pool->name;
}
char* get_pool_user(struct pool *pool) {
if (opt_incognito)
return "<user>";
if (opt_incognito) {
return "<user>";
}
return pool->rpc_user;
return pool->rpc_user;
}

649
sgminer.c

File diff suppressed because it is too large Load Diff

97
util.c

@ -1444,9 +1444,9 @@ char *recv_line(struct pool *pool) @@ -1444,9 +1444,9 @@ char *recv_line(struct pool *pool)
}
buflen = strlen(pool->sockbuf);
tok = strtok(pool->sockbuf, "\n");
if (!tok) {
applog(LOG_DEBUG, "Failed to parse a \\n terminated string in recv_line");
if ((tok = strtok(pool->sockbuf, "\n")) == NULL) {
applog(LOG_DEBUG, "Failed to parse a \\n terminated string in recv_line: buffer = %s", pool->sockbuf);
goto out;
}
sret = strdup(tok);
@ -1675,12 +1675,16 @@ static bool parse_diff(struct pool *pool, json_t *val) @@ -1675,12 +1675,16 @@ static bool parse_diff(struct pool *pool, json_t *val)
if (old_diff != diff) {
int idiff = diff;
if ((double)idiff == diff)
if ((double)idiff == diff) {
applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %d", get_pool_name(pool), idiff);
else
}
else {
applog(pool == current_pool() ? LOG_NOTICE : LOG_DEBUG, "%s difficulty changed to %.3f", get_pool_name(pool), diff);
} else
}
}
else {
applog(LOG_DEBUG, "%s difficulty set to %f", get_pool_name(pool), diff);
}
return true;
}
@ -1806,83 +1810,80 @@ bool parse_method(struct pool *pool, char *s) @@ -1806,83 +1810,80 @@ bool parse_method(struct pool *pool, char *s)
bool ret = false;
char *buf;
if (!s)
if (!s) {
return ret;
}
val = JSON_LOADS(s, &err);
if (!val) {
if (!(val = JSON_LOADS(s, &err))) {
applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text);
return ret;
}
method = json_object_get(val, "method");
if (!method) {
json_decref(val);
return ret;
if (!(method = json_object_get(val, "method"))) {
goto done;
}
err_val = json_object_get(val, "error");
params = json_object_get(val, "params");
if (err_val && !json_is_null(err_val)) {
char *ss;
if (err_val)
if (err_val) {
ss = json_dumps(err_val, JSON_INDENT(3));
else
}
else {
ss = strdup("(unknown reason)");
}
applog(LOG_INFO, "JSON-RPC method decode failed: %s", ss);
json_decref(val);
free(ss);
return ret;
goto done;
}
buf = (char *)json_string_value(method);
if (!buf) {
json_decref(val);
return ret;
goto done;
}
if (!strncasecmp(buf, "mining.notify", 13)) {
if (parse_notify(pool, params))
if (parse_notify(pool, params)) {
pool->stratum_notify = ret = true;
else
}
else {
pool->stratum_notify = ret = false;
json_decref(val);
return ret;
}
goto done;
}
if (!strncasecmp(buf, "mining.set_difficulty", 21) && parse_diff(pool, params)) {
ret = true;
json_decref(val);
return ret;
goto done;
}
if (!strncasecmp(buf, "mining.set_extranonce", 21) && parse_extranonce(pool, params)) {
ret = true;
json_decref(val);
return ret;
goto done;
}
if (!strncasecmp(buf, "client.reconnect", 16) && parse_reconnect(pool, params)) {
ret = true;
json_decref(val);
return ret;
goto done;
}
if (!strncasecmp(buf, "client.get_version", 18) && send_version(pool, val)) {
ret = true;
json_decref(val);
return ret;
goto done;
}
if (!strncasecmp(buf, "client.show_message", 19) && show_message(pool, params)) {
ret = true;
json_decref(val);
return ret;
goto done;
}
done:
json_decref(val);
return ret;
}
@ -1894,11 +1895,11 @@ bool subscribe_extranonce(struct pool *pool) @@ -1894,11 +1895,11 @@ bool subscribe_extranonce(struct pool *pool)
json_error_t err;
bool ret = false;
sprintf(s, "{\"id\": %d, \"method\": \"mining.extranonce.subscribe\", \"params\": []}",
swork_id++);
sprintf(s, "{\"id\": %d, \"method\": \"mining.extranonce.subscribe\", \"params\": []}", swork_id++);
if (!stratum_send(pool, s, strlen(s)))
if (!stratum_send(pool, s, strlen(s))) {
return ret;
}
/* Parse all data in the queue and anything left should be the response */
while (42) {
@ -1910,12 +1911,15 @@ bool subscribe_extranonce(struct pool *pool) @@ -1910,12 +1911,15 @@ bool subscribe_extranonce(struct pool *pool)
}
sret = recv_line(pool);
if (!sret)
if (!sret) {
return ret;
if (parse_method(pool, sret))
}
else if (parse_method(pool, sret)) {
free(sret);
else
}
else {
break;
}
}
val = JSON_LOADS(sret, &err);
@ -1968,18 +1972,23 @@ bool auth_stratum(struct pool *pool) @@ -1968,18 +1972,23 @@ bool auth_stratum(struct pool *pool)
sprintf(s, "{\"id\": %d, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}",
swork_id++, pool->rpc_user, pool->rpc_pass);
if (!stratum_send(pool, s, strlen(s)))
if (!stratum_send(pool, s, strlen(s))) {
return ret;
}
/* Parse all data in the queue and anything left should be auth */
while (42) {
sret = recv_line(pool);
if (!sret)
if (!sret) {
return ret;
if (parse_method(pool, sret))
}
else if (parse_method(pool, sret)) {
free(sret);
else
}
else {
break;
}
}
val = JSON_LOADS(sret, &err);

4
winbuild/sgminer.vcxproj

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
@ -267,6 +267,7 @@ @@ -267,6 +267,7 @@
<ClCompile Include="..\algorithm\talkcoin.c" />
<ClCompile Include="..\algorithm\x14.c" />
<ClCompile Include="..\algorithm\fresh.c" />
<ClCompile Include="..\algorithm\whirlcoin.c" />
<ClCompile Include="..\api.c" />
<ClCompile Include="..\ccan\opt\helpers.c" />
<ClCompile Include="..\ccan\opt\opt.c" />
@ -327,6 +328,7 @@ @@ -327,6 +328,7 @@
<ClInclude Include="..\algorithm\talkcoin.h" />
<ClInclude Include="..\algorithm\x14.h" />
<ClInclude Include="..\algorithm\fresh.h" />
<ClInclude Include="..\algorithm\whirlcoin.h" />
<ClInclude Include="..\api.h" />
<ClInclude Include="..\arg-nonnull.h" />
<ClInclude Include="..\bench_block.h" />

8
winbuild/sgminer.vcxproj.filters

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
@ -185,6 +185,9 @@ @@ -185,6 +185,9 @@
<ClCompile Include="..\algorithm\fresh.c">
<Filter>Source Files\algorithm</Filter>
</ClCompile>
<ClCompile Include="..\algorithm\whirlcoin.c">
<Filter>Source Files\algorithm</Filter>
</ClCompile>
<ClCompile Include="..\sph\shabal.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
@ -373,6 +376,9 @@ @@ -373,6 +376,9 @@
<ClInclude Include="..\algorithm\fresh.h">
<Filter>Header Files\algorithm</Filter>
</ClInclude>
<ClInclude Include="..\algorithm\whirlcoin.h">
<Filter>Header Files\algorithm</Filter>
</ClInclude>
<ClInclude Include="..\sph\sph_whirlpool.h">
<Filter>Header Files\sph</Filter>
</ClInclude>

Loading…
Cancel
Save