Browse Source

X14 Implementation

Added X14 and cleaned up the X13/X15 kernels so all 3 offshoots are in
sync. New option "--hamsi-short" or "hamsi-short":true to add a small
boost. May not work on all GPUs.
djm34
ystarnaud 11 years ago
parent
commit
5c9126fd61
  1. 1
      Makefile.am
  2. 110
      algorithm.c
  3. 2
      algorithm.h
  4. 247
      algorithm/x14.c
  5. 10
      algorithm/x14.h
  6. 199
      kernel/bitblock.cl
  7. 701
      kernel/bitblockold.cl
  8. 110
      kernel/darkcoin-mod.cl
  9. 436
      kernel/marucoin-mod.cl
  10. 588
      kernel/marucoin-modold.cl
  11. 1338
      kernel/x14.cl
  12. 1294
      kernel/x14old.cl
  13. 1
      miner.h
  14. 6
      sgminer.c
  15. 2
      winbuild/sgminer.vcxproj
  16. 6
      winbuild/sgminer.vcxproj.filters

1
Makefile.am

@ -63,6 +63,7 @@ sgminer_SOURCES += algorithm/marucoin.c algorithm/marucoin.h
sgminer_SOURCES += algorithm/maxcoin.c algorithm/maxcoin.h sgminer_SOURCES += algorithm/maxcoin.c algorithm/maxcoin.h
sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
sgminer_SOURCES += algorithm/x14.c algorithm/x14.h
bin_SCRIPTS = $(top_srcdir)/kernel/*.cl bin_SCRIPTS = $(top_srcdir)/kernel/*.cl

110
algorithm.c

@ -27,6 +27,7 @@
#include "algorithm/maxcoin.h" #include "algorithm/maxcoin.h"
#include "algorithm/talkcoin.h" #include "algorithm/talkcoin.h"
#include "algorithm/bitblock.h" #include "algorithm/bitblock.h"
#include "algorithm/x14.h"
#include "compat.h" #include "compat.h"
@ -40,6 +41,8 @@ const char *algorithm_type_str[] = {
"NScrypt", "NScrypt",
"X11", "X11",
"X13", "X13",
"X14",
"X15",
"Keccak", "Keccak",
"Quarkcoin", "Quarkcoin",
"Twecoin", "Twecoin",
@ -91,11 +94,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
static void append_hamsi_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm) static void append_hamsi_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
{ {
char buf[255]; char buf[255];
sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d", sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d%s ",
opt_hamsi_expand_big); opt_hamsi_expand_big, ((opt_hamsi_short)?" -D SPH_HAMSI_SHORT=1 ":""));
strcat(data->compiler_options, buf); strcat(data->compiler_options, buf);
sprintf(buf, "big%u", (unsigned int)opt_hamsi_expand_big); sprintf(buf, "big%u%s", (unsigned int)opt_hamsi_expand_big, ((opt_hamsi_short)?"hs":""));
strcat(data->binary_filename, buf); strcat(data->binary_filename, buf);
} }
@ -419,6 +422,100 @@ static cl_int queue_talkcoin_mod_kernel(struct __clState *clState, struct _dev_b
return status; return status;
} }
static cl_int queue_x14_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel;
unsigned int num;
cl_ulong le_target;
cl_int status = 0;
le_target = *(cl_ulong *)(blk->work->device_target + 24);
flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
// blake - search
kernel = &clState->kernel;
num = 0;
CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->padbuffer8);
// bmw - search1
kernel = clState->extra_kernels;
CL_SET_ARG_0(clState->padbuffer8);
// groestl - search2
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// skein - search3
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// jh - search4
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// keccak - search5
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// luffa - search6
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// cubehash - search7
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// shavite - search8
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// simd - search9
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// echo - search10
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// hamsi - search11
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// fugue - search12
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// shabal - search13
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(le_target);
return status;
}
static cl_int queue_x14_old_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel;
unsigned int num;
cl_ulong le_target;
cl_int status = 0;
le_target = *(cl_ulong *)(blk->work->device_target + 24);
flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
// blake - search
kernel = &clState->kernel;
num = 0;
CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->padbuffer8);
// bmw - search1
kernel = clState->extra_kernels;
CL_SET_ARG_0(clState->padbuffer8);
// groestl - search2
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// skein - search3
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// jh - search4
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// keccak - search5
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// luffa - search6
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// cubehash - search7
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// shavite - search8
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// simd - search9
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// combined echo, hamsi, fugue - shabal - search10
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(le_target);
return status;
}
typedef struct _algorithm_settings_t { typedef struct _algorithm_settings_t {
const char *name; /* Human-readable identifier */ const char *name; /* Human-readable identifier */
algorithm_type_t type; //common algorithm type algorithm_type_t type; //common algorithm type
@ -477,8 +574,13 @@ static algorithm_settings_t algos[] = {
{ "marucoin-mod", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_hamsi_compiler_options}, { "marucoin-mod", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_hamsi_compiler_options},
{ "marucoin-modold", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_hamsi_compiler_options}, { "marucoin-modold", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_hamsi_compiler_options},
{ "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL}, { "x14", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_hamsi_compiler_options},
{ "x14old", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_hamsi_compiler_options},
{ "bitblock", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_hamsi_compiler_options},
{ "bitblockold", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_hamsi_compiler_options},
{ "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
// kernels starting from this will have difficulty calculated by using fuguecoin algorithm // kernels starting from this will have difficulty calculated by using fuguecoin algorithm
#define A_FUGUE(a, b) \ #define A_FUGUE(a, b) \
{ a, ALGO_FUGUE, 1, 256, 256, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL} { a, ALGO_FUGUE, 1, 256, 256, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL}

2
algorithm.h

@ -16,6 +16,8 @@ typedef enum {
ALGO_NSCRYPT, ALGO_NSCRYPT,
ALGO_X11, ALGO_X11,
ALGO_X13, ALGO_X13,
ALGO_X14,
ALGO_X15,
ALGO_KECCAK, ALGO_KECCAK,
ALGO_QUARK, ALGO_QUARK,
ALGO_TWE, ALGO_TWE,

247
algorithm/x14.c

@ -0,0 +1,247 @@
/*-
* Copyright 2009 Colin Percival, 2011 ArtForz
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include "config.h"
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_skein.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
typedef struct {
sph_blake512_context blake1;
sph_bmw512_context bmw1;
sph_groestl512_context groestl1;
sph_skein512_context skein1;
sph_jh512_context jh1;
sph_keccak512_context keccak1;
sph_luffa512_context luffa1;
sph_cubehash512_context cubehash1;
sph_shavite512_context shavite1;
sph_simd512_context simd1;
sph_echo512_context echo1;
sph_hamsi512_context hamsi1;
sph_fugue512_context fugue1;
sph_shabal512_context shabal1;
} Xhash_context_holder;
static Xhash_context_holder base_contexts;
void init_X14hash_contexts()
{
sph_blake512_init(&base_contexts.blake1);
sph_bmw512_init(&base_contexts.bmw1);
sph_groestl512_init(&base_contexts.groestl1);
sph_skein512_init(&base_contexts.skein1);
sph_jh512_init(&base_contexts.jh1);
sph_keccak512_init(&base_contexts.keccak1);
sph_luffa512_init(&base_contexts.luffa1);
sph_cubehash512_init(&base_contexts.cubehash1);
sph_shavite512_init(&base_contexts.shavite1);
sph_simd512_init(&base_contexts.simd1);
sph_echo512_init(&base_contexts.echo1);
sph_hamsi512_init(&base_contexts.hamsi1);
sph_fugue512_init(&base_contexts.fugue1);
sph_shabal512_init(&base_contexts.shabal1);
}
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static inline void be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
{
uint32_t i;
for (i = 0; i < len; i++)
dst[i] = htobe32(src[i]);
}
inline void x14hash(void *state, const void *input)
{
init_X14hash_contexts();
Xhash_context_holder ctx;
uint32_t hashA[16], hashB[16];
//blake-bmw-groestl-sken-jh-meccak-luffa-cubehash-shivite-simd-echo
memcpy(&ctx, &base_contexts, sizeof(base_contexts));
sph_blake512 (&ctx.blake1, input, 80);
sph_blake512_close (&ctx.blake1, hashA);
sph_bmw512 (&ctx.bmw1, hashA, 64);
sph_bmw512_close(&ctx.bmw1, hashB);
sph_groestl512 (&ctx.groestl1, hashB, 64);
sph_groestl512_close(&ctx.groestl1, hashA);
sph_skein512 (&ctx.skein1, hashA, 64);
sph_skein512_close(&ctx.skein1, hashB);
sph_jh512 (&ctx.jh1, hashB, 64);
sph_jh512_close(&ctx.jh1, hashA);
sph_keccak512 (&ctx.keccak1, hashA, 64);
sph_keccak512_close(&ctx.keccak1, hashB);
sph_luffa512 (&ctx.luffa1, hashB, 64);
sph_luffa512_close (&ctx.luffa1, hashA);
sph_cubehash512 (&ctx.cubehash1, hashA, 64);
sph_cubehash512_close(&ctx.cubehash1, hashB);
sph_shavite512 (&ctx.shavite1, hashB, 64);
sph_shavite512_close(&ctx.shavite1, hashA);
sph_simd512 (&ctx.simd1, hashA, 64);
sph_simd512_close(&ctx.simd1, hashB);
sph_echo512 (&ctx.echo1, hashB, 64);
sph_echo512_close(&ctx.echo1, hashA);
sph_hamsi512 (&ctx.hamsi1, hashA, 64);
sph_hamsi512_close(&ctx.hamsi1, hashB);
sph_fugue512 (&ctx.fugue1, hashB, 64);
sph_fugue512_close(&ctx.fugue1, hashA);
sph_shabal512 (&ctx.shabal1, (const unsigned char*)hashA, 64);
sph_shabal512_close(&ctx.shabal1, hashB);
memcpy(state, hashB, 32);
}
static const uint32_t diff1targ = 0x0000ffff;
/* Used externally as confirmation of correct OCL code */
int x14_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
{
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
uint32_t data[20], ohash[8];
be32enc_vect(data, (const uint32_t *)pdata, 19);
data[19] = htobe32(nonce);
x14hash(ohash, data);
tmp_hash7 = be32toh(ohash[7]);
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
(long unsigned int)Htarg,
(long unsigned int)diff1targ,
(long unsigned int)tmp_hash7);
if (tmp_hash7 > diff1targ)
return -1;
if (tmp_hash7 > Htarg)
return 0;
return 1;
}
void x14_regenhash(struct work *work)
{
uint32_t data[20];
uint32_t *nonce = (uint32_t *)(work->data + 76);
uint32_t *ohash = (uint32_t *)(work->hash);
be32enc_vect(data, (const uint32_t *)work->data, 19);
data[19] = htobe32(*nonce);
x14hash(ohash, data);
}
static inline void be32enc(void *pp, uint32_t x)
{
uint8_t *p = (uint8_t *)pp;
p[3] = x & 0xff;
p[2] = (x >> 8) & 0xff;
p[1] = (x >> 16) & 0xff;
p[0] = (x >> 24) & 0xff;
}
bool scanhash_x14(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
unsigned char *pdata, unsigned char __maybe_unused *phash1,
unsigned char __maybe_unused *phash, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
{
uint32_t *nonce = (uint32_t *)(pdata + 76);
uint32_t data[20];
uint32_t tmp_hash7;
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
bool ret = false;
be32enc_vect(data, (const uint32_t *)pdata, 19);
while(1)
{
uint32_t ostate[8];
*nonce = ++n;
data[19] = (n);
x14hash(ostate, data);
tmp_hash7 = (ostate[7]);
applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]);
if(unlikely(tmp_hash7 <= Htarg))
{
((uint32_t *)pdata)[19] = htobe32(n);
*last_nonce = n;
ret = true;
break;
}
if (unlikely((n >= max_nonce) || thr->work_restart))
{
*last_nonce = n;
break;
}
}
return ret;
}

10
algorithm/x14.h

@ -0,0 +1,10 @@
#ifndef X14_H
#define X14_H
#include "miner.h"
extern int x14_test(unsigned char *pdata, const unsigned char *ptarget,
uint32_t nonce);
extern void x14_regenhash(struct work *work);
#endif /* X14_H */

199
kernel/bitblock.cl

@ -72,13 +72,17 @@ typedef int sph_s32;
#define SPH_SIMD_NOCOPY 0 #define SPH_SIMD_NOCOPY 0
#define SPH_KECCAK_NOCOPY 0 #define SPH_KECCAK_NOCOPY 0
#define SPH_COMPACT_BLAKE_64 0 #define SPH_COMPACT_BLAKE_64 0
#define SPH_LUFFA_PARALLEL 1 #define SPH_LUFFA_PARALLEL 0
#define SPH_SMALL_FOOTPRINT_GROESTL 0 #define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0 #define SPH_CUBEHASH_UNROLL 0
#define SPH_KECCAK_UNROLL 1 #define SPH_KECCAK_UNROLL 1
#ifndef SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 1 #define SPH_HAMSI_EXPAND_BIG 1
#define SPH_HAMSI_SHORT 1 #endif
#ifndef SPH_HAMSI_SHORT
#define SPH_HAMSI_SHORT 1
#endif
#include "blake.cl" #include "blake.cl"
#include "bmw.cl" #include "bmw.cl"
@ -128,8 +132,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// blake
// blake
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -144,6 +148,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
M0 = DEC64BE(block + 0); M0 = DEC64BE(block + 0);
M1 = DEC64BE(block + 8); M1 = DEC64BE(block + 8);
M2 = DEC64BE(block + 16); M2 = DEC64BE(block + 16);
@ -186,7 +191,7 @@ __kernel void search1(__global hash_t* hashes)
// bmw // bmw
sph_u64 BMW_H[16]; sph_u64 BMW_H[16];
#pragma unroll 16 #pragma unroll 16
for(unsigned u = 0; u < 16; u++) for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u]; BMW_H[u] = BMW_IV512[u];
@ -243,7 +248,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2 #pragma unroll 2
for(int i=0;i<2;i++) for(int i=0;i<2;i++)
{ {
q[i+16] = q[i+16] =
@ -267,7 +272,7 @@ __kernel void search1(__global hash_t* hashes)
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
} }
#pragma unroll 4 #pragma unroll 4
for(int i=2;i<6;i++) for(int i=2;i<6;i++)
{ {
q[i+16] = CONST_EXP2 + q[i+16] = CONST_EXP2 +
@ -275,7 +280,7 @@ __kernel void search1(__global hash_t* hashes)
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
} }
#pragma unroll 3 #pragma unroll 3
for(int i=6;i<9;i++) for(int i=6;i<9;i++)
{ {
q[i+16] = CONST_EXP2 + q[i+16] = CONST_EXP2 +
@ -283,7 +288,7 @@ __kernel void search1(__global hash_t* hashes)
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
} }
#pragma unroll 4 #pragma unroll 4
for(int i=9;i<13;i++) for(int i=9;i<13;i++)
{ {
q[i+16] = CONST_EXP2 + q[i+16] = CONST_EXP2 +
@ -291,7 +296,7 @@ __kernel void search1(__global hash_t* hashes)
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
} }
#pragma unroll 3 #pragma unroll 3
for(int i=13;i<16;i++) for(int i=13;i<16;i++)
{ {
q[i+16] = CONST_EXP2 + q[i+16] = CONST_EXP2 +
@ -320,7 +325,7 @@ __kernel void search1(__global hash_t* hashes)
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]); BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16 #pragma unroll 16
for(int i=0;i<16;i++) for(int i=0;i<16;i++)
{ {
mv[i] = BMW_H[i]; mv[i] = BMW_H[i];
@ -360,7 +365,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]); tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0]; q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2 #pragma unroll 2
for(int i=0;i<2;i++) for(int i=0;i<2;i++)
{ {
q[i+16] = q[i+16] =
@ -384,7 +389,7 @@ __kernel void search1(__global hash_t* hashes)
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
} }
#pragma unroll 4 #pragma unroll 4
for(int i=2;i<6;i++) for(int i=2;i<6;i++)
{ {
q[i+16] = CONST_EXP2 + q[i+16] = CONST_EXP2 +
@ -392,7 +397,7 @@ __kernel void search1(__global hash_t* hashes)
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]); SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
} }
#pragma unroll 3 #pragma unroll 3
for(int i=6;i<9;i++) for(int i=6;i<9;i++)
{ {
q[i+16] = CONST_EXP2 + q[i+16] = CONST_EXP2 +
@ -400,7 +405,7 @@ __kernel void search1(__global hash_t* hashes)
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]); SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
} }
#pragma unroll 4 #pragma unroll 4
for(int i=9;i<13;i++) for(int i=9;i<13;i++)
{ {
q[i+16] = CONST_EXP2 + q[i+16] = CONST_EXP2 +
@ -408,7 +413,7 @@ __kernel void search1(__global hash_t* hashes)
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]); SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
} }
#pragma unroll 3 #pragma unroll 3
for(int i=13;i<16;i++) for(int i=13;i<16;i++)
{ {
q[i+16] = CONST_EXP2 + q[i+16] = CONST_EXP2 +
@ -418,6 +423,7 @@ __kernel void search1(__global hash_t* hashes)
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31]; XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]); BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]); BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]); BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
@ -454,94 +460,67 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
#if !SPH_SMALL_FOOTPRINT_GROESTL __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_C[i] = T0[i]; T0_L[i] = T0[i];
T4_C[i] = T4[i]; T4_L[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL T1_L[i] = T1[i];
T1_C[i] = T1[i]; T2_L[i] = T2[i];
T2_C[i] = T2[i]; T3_L[i] = T3[i];
T3_C[i] = T3[i]; T5_L[i] = T5[i];
T5_C[i] = T5[i]; T6_L[i] = T6[i];
T6_C[i] = T6[i]; T7_L[i] = T7[i];
T7_C[i] = T7[i];
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); // groestl barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_C #define T0 T0_L
#define T1 T1_C #define T1 T1_L
#define T2 T2_C #define T2 T2_L
#define T3 T3_C #define T3 T3_L
#define T4 T4_C #define T4 T4_L
#define T5 T5_C #define T5 T5_L
#define T6 T6_C #define T6 T6_L
#define T7 T7_C #define T7 T7_L
// groestl // groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]); g[0] = m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]); g[1] = m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]); g[2] = m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]); g[3] = m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]); g[4] = m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]); g[5] = m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]); g[6] = m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]); g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
for (unsigned int u = 0; u < 16; u ++) g[9] = m[9] = 0;
g[u] = m[u] ^ H[u]; g[10] = m[10] = 0;
g[11] = m[11] = 0;
m[8] = 0x80; g[8] = m[8] ^ H[8]; g[12] = m[12] = 0;
m[9] = 0; g[9] = m[9] ^ H[9]; g[13] = m[13] = 0;
m[10] = 0; g[10] = m[10] ^ H[10]; g[14] = m[14] = 0;
m[11] = 0; g[11] = m[11] ^ H[11]; g[15] = 0x102000000000000;
m[12] = 0; g[12] = m[12] ^ H[12]; m[15] = 0x100000000000000;
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16]; sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u]; xH[u] = H[u] ^= g[u] ^ m[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 8; u < 16; u ++)
H[u] ^= xH[u]; hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -566,10 +545,14 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8(hash->h8[5]); m5 = SWAP8(hash->h8[5]);
m6 = SWAP8(hash->h8[6]); m6 = SWAP8(hash->h8[6]);
m7 = SWAP8(hash->h8[7]); m7 = SWAP8(hash->h8[7]);
UBI_BIG(480, 64); UBI_BIG(480, 64);
bcount = 0; bcount = 0;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
UBI_BIG(510, 8); UBI_BIG(510, 8);
hash->h8[0] = SWAP8(h0); hash->h8[0] = SWAP8(h0);
hash->h8[1] = SWAP8(h1); hash->h8[1] = SWAP8(h1);
hash->h8[2] = SWAP8(h2); hash->h8[2] = SWAP8(h2);
@ -669,6 +652,7 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8(hash->h8[7]); a21 ^= SWAP8(hash->h8[7]);
a31 ^= 0x8000000000000001; a31 ^= 0x8000000000000001;
KECCAK_F_1600; KECCAK_F_1600;
// Finalize the "lane complement" // Finalize the "lane complement"
a10 = ~a10; a10 = ~a10;
a20 = ~a20; a20 = ~a20;
@ -830,6 +814,7 @@ __kernel void search8(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
int init = get_local_id(0); int init = get_local_id(0);
@ -858,7 +843,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0]; rk00 = hash->h4[0];
rk01 = hash->h4[1]; rk01 = hash->h4[1];
@ -913,10 +898,8 @@ __kernel void search9(__global hash_t* hashes)
// simd // simd
s32 q[256]; s32 q[256];
unsigned char x[128]; unsigned char x[128];
for(unsigned int i = 0; i < 64; i++) for(unsigned int i = 0; i < 64; i++)
x[i] = hash->h1[i]; x[i] = hash->h1[i];
for(unsigned int i = 64; i < 128; i++) for(unsigned int i = 64; i < 128; i++)
x[i] = 0; x[i] = 0;
@ -926,7 +909,8 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);
FFT256(0, 1, 0, ll1); FFT256(0, 1, 0, ll1);
for (int i = 0; i < 256; i ++) { for (int i = 0; i < 256; i ++)
{
s32 tq; s32 tq;
tq = q[i] + yoff_b_n[i]; tq = q[i] + yoff_b_n[i];
@ -962,14 +946,17 @@ __kernel void search9(__global hash_t* hashes)
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
IF, 4, 13, PP8_4_); IF, 4, 13, PP8_4_);
STEP_BIG( STEP_BIG(
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
IF, 13, 10, PP8_5_); IF, 13, 10, PP8_5_);
STEP_BIG( STEP_BIG(
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
IF, 10, 25, PP8_6_); IF, 10, 25, PP8_6_);
STEP_BIG( STEP_BIG(
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
@ -988,22 +975,27 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
STEP_BIG( STEP_BIG(
COPY_A0, COPY_A1, COPY_A2, COPY_A3, COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7, COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4, 13, PP8_4_); IF, 4, 13, PP8_4_);
STEP_BIG( STEP_BIG(
COPY_B0, COPY_B1, COPY_B2, COPY_B3, COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7, COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13, 10, PP8_5_); IF, 13, 10, PP8_5_);
STEP_BIG( STEP_BIG(
COPY_C0, COPY_C1, COPY_C2, COPY_C3, COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7, COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10, 25, PP8_6_); IF, 10, 25, PP8_6_);
STEP_BIG( STEP_BIG(
COPY_D0, COPY_D1, COPY_D2, COPY_D3, COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7, COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25, 4, PP8_0_); IF, 25, 4, PP8_0_);
#undef q #undef q
hash->h4[0] = A0; hash->h4[0] = A0;
@ -1116,18 +1108,16 @@ __kernel void search11(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 T512_L[1024]; __local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0]; __constant const sph_u32 *T512_C = &T512[0][0];
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 1024; i += step) for (int i = init; i < 1024; i += step)
{
T512_L[i] = T512_C[i]; T512_L[i] = T512_C[i];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
{
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
@ -1136,26 +1126,31 @@ __kernel void search11(__global hash_t* hashes)
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) hash->h1[i + u] #define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8) { for(int i = 0; i < 64; i += 8) {
INPUT_BIG_LOCAL; INPUT_BIG_LOCAL;
P_BIG; P_BIG;
T_BIG; T_BIG;
} }
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0) #undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG_LOCAL; INPUT_BIG_LOCAL;
P_BIG; P_BIG;
T_BIG; T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0) #undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG_LOCAL; INPUT_BIG_LOCAL;
PF_BIG; PF_BIG;
T_BIG; T_BIG;
for (unsigned u = 0; u < 16; u ++) for (unsigned u = 0; u < 16; u ++)
hash->h4[u] = h[u]; hash->h4[u] = h[u];
}
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -1362,6 +1357,7 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
LT6[i] = plain_T6[i]; LT6[i] = plain_T6[i];
LT7[i] = plain_T7[i]; LT7[i] = plain_T7[i];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
@ -1390,7 +1386,9 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
n6 ^= h6; n6 ^= h6;
n7 ^= h7; n7 ^= h7;
for (unsigned r = 0; r < 10; r ++) { #pragma unroll 10
for (unsigned r = 0; r < 10; r ++)
{
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]); ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
@ -1430,7 +1428,9 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
n6 ^= h6; n6 ^= h6;
n7 ^= h7; n7 ^= h7;
for (unsigned r = 0; r < 10; r ++) { #pragma unroll 10
for (unsigned r = 0; r < 10; r ++)
{
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
ROUND_KSCHED(LT, h, tmp, plain_RC[r]); ROUND_KSCHED(LT, h, tmp, plain_RC[r]);
@ -1451,11 +1451,6 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
for (unsigned i = 0; i < 8; i ++) for (unsigned i = 0; i < 8; i ++)
hash->h8[i] = state[i]; hash->h8[i] = state[i];
// for(uint i = 0; i < 8; i++)
// output[(NUMHASH * 9) * 15 + gid * 9 + i] = hash->h8[i];
// output[(NUMHASH * 9) * 15 + gid * 9 + 8] = nonce;
bool result = (hash->h8[3] <= target); bool result = (hash->h8[3] <= target);
if (result) if (result)
output[atomic_inc(output+0xFF)] = SWAP4(gid); output[atomic_inc(output+0xFF)] = SWAP4(gid);

701
kernel/bitblockold.cl

File diff suppressed because it is too large Load Diff

110
kernel/darkcoin-mod.cl

@ -95,8 +95,6 @@
#include "shavite.cl" #include "shavite.cl"
#include "simd.cl" #include "simd.cl"
#include "echo.cl" #include "echo.cl"
#include "hamsi.cl"
#include "fugue.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx) #define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) #define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
@ -456,91 +454,67 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
#if !SPH_SMALL_FOOTPRINT_GROESTL __local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_C[i] = T0[i]; T0_L[i] = T0[i];
T4_C[i] = T4[i]; T4_L[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL T1_L[i] = T1[i];
T1_C[i] = T1[i]; T2_L[i] = T2[i];
T2_C[i] = T2[i]; T3_L[i] = T3[i];
T3_C[i] = T3[i]; T5_L[i] = T5[i];
T5_C[i] = T5[i]; T6_L[i] = T6[i];
T6_C[i] = T6[i]; T7_L[i] = T7[i];
T7_C[i] = T7[i];
#endif
} }
barrier(CLK_LOCAL_MEM_FENCE); // groestl barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++) #define T0 T0_L
H[u] = 0; #define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
#if USE_LE // groestl
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40); sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]); g[0] = m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]); g[1] = m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]); g[2] = m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]); g[3] = m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]); g[4] = m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]); g[5] = m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]); g[6] = m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]); g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
for (unsigned int u = 0; u < 16; u ++) g[9] = m[9] = 0;
g[u] = m[u] ^ H[u]; g[10] = m[10] = 0;
g[11] = m[11] = 0;
m[8] = 0x80; g[8] = m[8] ^ H[8]; g[12] = m[12] = 0;
m[9] = 0; g[9] = m[9] ^ H[9]; g[13] = m[13] = 0;
m[10] = 0; g[10] = m[10] ^ H[10]; g[14] = m[14] = 0;
m[11] = 0; g[11] = m[11] ^ H[11]; g[15] = 0x102000000000000;
m[12] = 0; g[12] = m[12] ^ H[12]; m[15] = 0x100000000000000;
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16]; sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u]; xH[u] = H[u] ^= g[u] ^ m[u];
PERM_BIG_P(xH);
for (unsigned int u = 0; u < 16; u ++) PERM_BIG_P(xH);
H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++) for (unsigned int u = 8; u < 16; u ++)
hash->h8[u] = DEC64E(H[u + 8]); hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -863,7 +837,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0]; rk00 = hash->h4[0];
rk01 = hash->h4[1]; rk01 = hash->h4[1];

436
kernel/marucoin-mod.cl

@ -70,12 +70,17 @@ typedef long sph_s64;
#define SPH_SIMD_NOCOPY 0 #define SPH_SIMD_NOCOPY 0
#define SPH_KECCAK_NOCOPY 0 #define SPH_KECCAK_NOCOPY 0
#define SPH_COMPACT_BLAKE_64 0 #define SPH_COMPACT_BLAKE_64 0
#define SPH_LUFFA_PARALLEL 1 #define SPH_LUFFA_PARALLEL 0
#define SPH_SMALL_FOOTPRINT_GROESTL 0 #define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0 #define SPH_CUBEHASH_UNROLL 0
#define SPH_KECCAK_UNROLL 1 #define SPH_KECCAK_UNROLL 1
#define SPH_HAMSI_EXPAND_BIG 1 #if !defined SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 1
#endif
#ifndef SPH_HAMSI_SHORT
#define SPH_HAMSI_SHORT 1
#endif
#include "blake.cl" #include "blake.cl"
#include "bmw.cl" #include "bmw.cl"
@ -113,8 +118,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// blake
// blake
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -123,13 +128,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;;
if ((T0 = SPH_T64(T0 + 1024)) < 1024) if ((T0 = SPH_T64(T0 + 1024)) < 1024)
{
T1 = SPH_T64(T1 + 1); T1 = SPH_T64(T1 + 1);
}
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
M0 = DEC64BE(block + 0); M0 = DEC64BE(block + 0);
M1 = DEC64BE(block + 8); M1 = DEC64BE(block + 8);
M2 = DEC64BE(block + 16); M2 = DEC64BE(block + 16);
@ -160,7 +165,6 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
hash->h8[6] = H6; hash->h8[6] = H6;
hash->h8[7] = H7; hash->h8[7] = H7;
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -169,58 +173,269 @@ __kernel void search1(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// bmw // bmw
sph_u64 BMW_H[16]; sph_u64 BMW_H[16];
#pragma unroll 16
for(unsigned u = 0; u < 16; u++) for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u]; BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16]; sph_u64 mv[16],q[32];
sph_u64 mv[16]; sph_u64 tmp;
mv[ 0] = SWAP8(hash->h8[0]); mv[0] = SWAP8(hash->h8[0]);
mv[ 1] = SWAP8(hash->h8[1]); mv[1] = SWAP8(hash->h8[1]);
mv[ 2] = SWAP8(hash->h8[2]); mv[2] = SWAP8(hash->h8[2]);
mv[ 3] = SWAP8(hash->h8[3]); mv[3] = SWAP8(hash->h8[3]);
mv[ 4] = SWAP8(hash->h8[4]); mv[4] = SWAP8(hash->h8[4]);
mv[ 5] = SWAP8(hash->h8[5]); mv[5] = SWAP8(hash->h8[5]);
mv[ 6] = SWAP8(hash->h8[6]); mv[6] = SWAP8(hash->h8[6]);
mv[ 7] = SWAP8(hash->h8[7]); mv[7] = SWAP8(hash->h8[7]);
mv[ 8] = 0x80; mv[8] = 0x80;
mv[ 9] = 0; mv[9] = 0;
mv[10] = 0; mv[10] = 0;
mv[11] = 0; mv[11] = 0;
mv[12] = 0; mv[12] = 0;
mv[13] = 0; mv[13] = 0;
mv[14] = 0; mv[14] = 0;
mv[15] = 0x200; mv[15] = SPH_C64(512);
#define M(x) (mv[x])
#define H(x) (BMW_H[x]) tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
#define dH(x) (BMW_h2[x]) q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
FOLDb; #pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#undef M #pragma unroll 3
#undef H for(int i=6;i<9;i++)
#undef dH {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x]) #pragma unroll 4
#define H(x) (final_b[x]) for(int i=9;i<13;i++)
#define dH(x) (BMW_h1[x]) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
FOLDb; sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#undef M #undef M
#undef H XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
#undef dH XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
hash->h8[0] = SWAP8(BMW_h1[8]); BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
hash->h8[1] = SWAP8(BMW_h1[9]); BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
hash->h8[2] = SWAP8(BMW_h1[10]); BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
hash->h8[3] = SWAP8(BMW_h1[11]); BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
hash->h8[4] = SWAP8(BMW_h1[12]); BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
hash->h8[5] = SWAP8(BMW_h1[13]); BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
hash->h8[6] = SWAP8(BMW_h1[14]); BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
hash->h8[7] = SWAP8(BMW_h1[15]); BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash->h8[0] = SWAP8(BMW_H[8]);
hash->h8[1] = SWAP8(BMW_H[9]);
hash->h8[2] = SWAP8(BMW_H[10]);
hash->h8[3] = SWAP8(BMW_H[11]);
hash->h8[4] = SWAP8(BMW_H[12]);
hash->h8[5] = SWAP8(BMW_H[13]);
hash->h8[6] = SWAP8(BMW_H[14]);
hash->h8[7] = SWAP8(BMW_H[15]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -239,27 +454,27 @@ __kernel void search2(__global hash_t* hashes)
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_L[i] = T0[i]; T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i]; T1_L[i] = T1[i];
T2_L[i] = T2[i]; T2_L[i] = T2[i];
T3_L[i] = T3[i]; T3_L[i] = T3[i];
T4_L[i] = T4[i];
T5_L[i] = T5[i]; T5_L[i] = T5[i];
T6_L[i] = T6[i]; T6_L[i] = T6[i];
T7_L[i] = T7[i]; T7_L[i] = T7[i];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L #define T0 T0_L
#define T1 T1_L #define T1 T1_L
#define T2 T2_L #define T2 T2_L
#define T3 T3_L #define T3 T3_L
#define T4 T4_L #define T4 T4_L
#define T5 T5_L #define T5 T5_L
#define T6 T6_L #define T6 T6_L
#define T7 T7_L #define T7 T7_L
// groestl // groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000}; sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
@ -287,9 +502,12 @@ __kernel void search2(__global hash_t* hashes)
sph_u64 xH[16]; sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u] ^= g[u] ^ m[u]; xH[u] = H[u] ^= g[u] ^ m[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
for (unsigned int u = 8; u < 16; u ++) for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]); hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -313,10 +531,14 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8(hash->h8[5]); m5 = SWAP8(hash->h8[5]);
m6 = SWAP8(hash->h8[6]); m6 = SWAP8(hash->h8[6]);
m7 = SWAP8(hash->h8[7]); m7 = SWAP8(hash->h8[7]);
UBI_BIG(480, 64); UBI_BIG(480, 64);
bcount = 0; bcount = 0;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
UBI_BIG(510, 8); UBI_BIG(510, 8);
hash->h8[0] = SWAP8(h0); hash->h8[0] = SWAP8(h0);
hash->h8[1] = SWAP8(h1); hash->h8[1] = SWAP8(h1);
hash->h8[2] = SWAP8(h2); hash->h8[2] = SWAP8(h2);
@ -343,7 +565,8 @@ __kernel void search4(__global hash_t* hashes)
for(int i = 0; i < 2; i++) for(int i = 0; i < 2; i++)
{ {
if (i == 0) { if (i == 0)
{
h0h ^= DEC64E(hash->h8[0]); h0h ^= DEC64E(hash->h8[0]);
h0l ^= DEC64E(hash->h8[1]); h0l ^= DEC64E(hash->h8[1]);
h1h ^= DEC64E(hash->h8[2]); h1h ^= DEC64E(hash->h8[2]);
@ -352,7 +575,9 @@ __kernel void search4(__global hash_t* hashes)
h2l ^= DEC64E(hash->h8[5]); h2l ^= DEC64E(hash->h8[5]);
h3h ^= DEC64E(hash->h8[6]); h3h ^= DEC64E(hash->h8[6]);
h3l ^= DEC64E(hash->h8[7]); h3l ^= DEC64E(hash->h8[7]);
} else if(i == 1) { }
else if(i == 1)
{
h4h ^= DEC64E(hash->h8[0]); h4h ^= DEC64E(hash->h8[0]);
h4l ^= DEC64E(hash->h8[1]); h4l ^= DEC64E(hash->h8[1]);
h5h ^= DEC64E(hash->h8[2]); h5h ^= DEC64E(hash->h8[2]);
@ -382,7 +607,6 @@ __kernel void search4(__global hash_t* hashes)
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search5(__global hash_t* hashes) __kernel void search5(__global hash_t* hashes)
{ {
@ -414,6 +638,7 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8(hash->h8[7]); a21 ^= SWAP8(hash->h8[7]);
a31 ^= 0x8000000000000001; a31 ^= 0x8000000000000001;
KECCAK_F_1600; KECCAK_F_1600;
// Finalize the "lane complement" // Finalize the "lane complement"
a10 = ~a10; a10 = ~a10;
a20 = ~a20; a20 = ~a20;
@ -460,7 +685,8 @@ __kernel void search6(__global hash_t* hashes)
MI5; MI5;
LUFFA_P5; LUFFA_P5;
if(i == 0) { if(i == 0)
{
M0 = hash->h4[9]; M0 = hash->h4[9];
M1 = hash->h4[8]; M1 = hash->h4[8];
M2 = hash->h4[11]; M2 = hash->h4[11];
@ -469,12 +695,16 @@ __kernel void search6(__global hash_t* hashes)
M5 = hash->h4[12]; M5 = hash->h4[12];
M6 = hash->h4[15]; M6 = hash->h4[15];
M7 = hash->h4[14]; M7 = hash->h4[14];
} else if(i == 1) { }
else if(i == 1)
{
M0 = 0x80000000; M0 = 0x80000000;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
} else if(i == 2) { }
else if(i == 2)
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
} else if(i == 3) { else if(i == 3)
{
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
@ -524,10 +754,12 @@ __kernel void search7(__global hash_t* hashes)
x6 ^= SWAP4(hash->h4[7]); x6 ^= SWAP4(hash->h4[7]);
x7 ^= SWAP4(hash->h4[6]); x7 ^= SWAP4(hash->h4[6]);
for (int i = 0; i < 13; i ++) { for (int i = 0; i < 13; i ++)
{
SIXTEEN_ROUNDS; SIXTEEN_ROUNDS;
if (i == 0) { if (i == 0)
{
x0 ^= SWAP4(hash->h4[9]); x0 ^= SWAP4(hash->h4[9]);
x1 ^= SWAP4(hash->h4[8]); x1 ^= SWAP4(hash->h4[8]);
x2 ^= SWAP4(hash->h4[11]); x2 ^= SWAP4(hash->h4[11]);
@ -536,12 +768,12 @@ __kernel void search7(__global hash_t* hashes)
x5 ^= SWAP4(hash->h4[12]); x5 ^= SWAP4(hash->h4[12]);
x6 ^= SWAP4(hash->h4[15]); x6 ^= SWAP4(hash->h4[15]);
x7 ^= SWAP4(hash->h4[14]); x7 ^= SWAP4(hash->h4[14]);
} else if(i == 1) { }
else if(i == 1)
x0 ^= 0x80; x0 ^= 0x80;
} else if (i == 2) { else if (i == 2)
xv ^= SPH_C32(1); xv ^= SPH_C32(1);
} }
}
hash->h4[0] = x0; hash->h4[0] = x0;
hash->h4[1] = x1; hash->h4[1] = x1;
@ -568,6 +800,7 @@ __kernel void search8(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
int init = get_local_id(0); int init = get_local_id(0);
@ -596,7 +829,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0]; rk00 = hash->h4[0];
rk01 = hash->h4[1]; rk01 = hash->h4[1];
@ -662,7 +895,8 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);
FFT256(0, 1, 0, ll1); FFT256(0, 1, 0, ll1);
for (int i = 0; i < 256; i ++) { for (int i = 0; i < 256; i ++)
{
s32 tq; s32 tq;
tq = q[i] + yoff_b_n[i]; tq = q[i] + yoff_b_n[i];
@ -698,14 +932,17 @@ __kernel void search9(__global hash_t* hashes)
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
IF, 4, 13, PP8_4_); IF, 4, 13, PP8_4_);
STEP_BIG( STEP_BIG(
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
IF, 13, 10, PP8_5_); IF, 13, 10, PP8_5_);
STEP_BIG( STEP_BIG(
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
IF, 10, 25, PP8_6_); IF, 10, 25, PP8_6_);
STEP_BIG( STEP_BIG(
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
@ -724,22 +961,27 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
STEP_BIG( STEP_BIG(
COPY_A0, COPY_A1, COPY_A2, COPY_A3, COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7, COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4, 13, PP8_4_); IF, 4, 13, PP8_4_);
STEP_BIG( STEP_BIG(
COPY_B0, COPY_B1, COPY_B2, COPY_B3, COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7, COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13, 10, PP8_5_); IF, 13, 10, PP8_5_);
STEP_BIG( STEP_BIG(
COPY_C0, COPY_C1, COPY_C2, COPY_C3, COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7, COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10, 25, PP8_6_); IF, 10, 25, PP8_6_);
STEP_BIG( STEP_BIG(
COPY_D0, COPY_D1, COPY_D2, COPY_D3, COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7, COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25, 4, PP8_0_); IF, 25, 4, PP8_0_);
#undef q #undef q
hash->h4[0] = A0; hash->h4[0] = A0;
@ -766,9 +1008,8 @@ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search10(__global hash_t* hashes) __kernel void search10(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
uint offset = get_global_offset(0);
__global hash_t *hash = &(hashes[gid-offset]);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
@ -785,9 +1026,8 @@ __kernel void search10(__global hash_t* hashes)
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < 8; i++)
hash->h8[i] = hashes[gid-offset].h8[i];
// echo // echo
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1;
@ -833,27 +1073,18 @@ __kernel void search10(__global hash_t* hashes)
WF0 = 0x200; WF0 = 0x200;
WF1 = 0; WF1 = 0;
for (unsigned u = 0; u < 10; u ++) { for (unsigned u = 0; u < 10; u ++)
BIG_ROUND; BIG_ROUND;
}
hash->h8[0] ^= Vb00 ^ W00 ^ W80;
hash->h8[1] ^= Vb01 ^ W01 ^ W81;
hash->h8[2] ^= Vb10 ^ W10 ^ W90;
hash->h8[3] ^= Vb11 ^ W11 ^ W91;
hash->h8[4] ^= Vb20 ^ W20 ^ WA0;
hash->h8[5] ^= Vb21 ^ W21 ^ WA1;
hash->h8[6] ^= Vb30 ^ W30 ^ WB0;
hash->h8[7] ^= Vb31 ^ W31 ^ WB1;
hash->h8[0] = Vb00 ^ hash->h8[0] ^ W00 ^ W80;
hash->h8[1] = Vb01 ^ hash->h8[1] ^ W01 ^ W81;
hash->h8[2] = Vb10 ^ hash->h8[2] ^ W10 ^ W90;
hash->h8[3] = Vb11 ^ hash->h8[3] ^ W11 ^ W91;
hash->h8[4] = Vb20 ^ hash->h8[4] ^ W20 ^ WA0;
hash->h8[5] = Vb21 ^ hash->h8[5] ^ W21 ^ WA1;
hash->h8[6] = Vb30 ^ hash->h8[6] ^ W30 ^ WB0;
hash->h8[7] = Vb31 ^ hash->h8[7] ^ W31 ^ WB1;
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -862,18 +1093,16 @@ __kernel void search11(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 T512_L[1024]; __local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0]; __constant const sph_u32 *T512_C = &T512[0][0];
int init = get_local_id(0); int init = get_local_id(0);
int step = get_local_size(0); int step = get_local_size(0);
for (int i = init; i < 1024; i += step) for (int i = init; i < 1024; i += step)
{
T512_L[i] = T512_C[i]; T512_L[i] = T512_C[i];
}
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
{
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11]; sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
@ -882,26 +1111,31 @@ __kernel void search11(__global hash_t* hashes)
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) hash->h1[i + u] #define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8) { for(int i = 0; i < 64; i += 8) {
INPUT_BIG_LOCAL; INPUT_BIG_LOCAL;
P_BIG; P_BIG;
T_BIG; T_BIG;
} }
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0) #undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG_LOCAL; INPUT_BIG_LOCAL;
P_BIG; P_BIG;
T_BIG; T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0) #undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG_LOCAL; INPUT_BIG_LOCAL;
PF_BIG; PF_BIG;
T_BIG; T_BIG;
for (unsigned u = 0; u < 16; u ++) for (unsigned u = 0; u < 16; u ++)
hash->h4[u] = h[u]; hash->h4[u] = h[u];
}
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -931,7 +1165,7 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
sph_u32 S30, S31, S32, S33, S34, S35; sph_u32 S30, S31, S32, S33, S34, S35;
ulong fc_bit_count = (sph_u64) 64 << 3; ulong fc_bit_count = (sph_u64) 0x200;
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);

588
kernel/marucoin-modold.cl

@ -75,8 +75,8 @@ typedef long sph_s64;
#define SPH_GROESTL_BIG_ENDIAN 0 #define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0 #define SPH_CUBEHASH_UNROLL 0
#define SPH_KECCAK_UNROLL 0 #define SPH_KECCAK_UNROLL 0
#if !defined SPH_HAMSI_EXPAND_BIG #ifndef SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 4 #define SPH_HAMSI_EXPAND_BIG 4
#endif #endif
#include "blake.cl" #include "blake.cl"
@ -115,8 +115,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// blake
// blake
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B); sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1); sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F); sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -125,13 +125,13 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;; sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;;
if ((T0 = SPH_T64(T0 + 1024)) < 1024) if ((T0 = SPH_T64(T0 + 1024)) < 1024)
{
T1 = SPH_T64(T1 + 1); T1 = SPH_T64(T1 + 1);
}
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
M0 = DEC64BE(block + 0); M0 = DEC64BE(block + 0);
M1 = DEC64BE(block + 8); M1 = DEC64BE(block + 8);
M2 = DEC64BE(block + 16); M2 = DEC64BE(block + 16);
@ -170,58 +170,269 @@ __kernel void search1(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// bmw // bmw
sph_u64 BMW_H[16]; sph_u64 BMW_H[16];
#pragma unroll 16
for(unsigned u = 0; u < 16; u++) for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u]; BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16]; sph_u64 mv[16],q[32];
sph_u64 mv[16]; sph_u64 tmp;
mv[ 0] = SWAP8(hash->h8[0]); mv[0] = SWAP8(hash->h8[0]);
mv[ 1] = SWAP8(hash->h8[1]); mv[1] = SWAP8(hash->h8[1]);
mv[ 2] = SWAP8(hash->h8[2]); mv[2] = SWAP8(hash->h8[2]);
mv[ 3] = SWAP8(hash->h8[3]); mv[3] = SWAP8(hash->h8[3]);
mv[ 4] = SWAP8(hash->h8[4]); mv[4] = SWAP8(hash->h8[4]);
mv[ 5] = SWAP8(hash->h8[5]); mv[5] = SWAP8(hash->h8[5]);
mv[ 6] = SWAP8(hash->h8[6]); mv[6] = SWAP8(hash->h8[6]);
mv[ 7] = SWAP8(hash->h8[7]); mv[7] = SWAP8(hash->h8[7]);
mv[ 8] = 0x80; mv[8] = 0x80;
mv[ 9] = 0; mv[9] = 0;
mv[10] = 0; mv[10] = 0;
mv[11] = 0; mv[11] = 0;
mv[12] = 0; mv[12] = 0;
mv[13] = 0; mv[13] = 0;
mv[14] = 0; mv[14] = 0;
mv[15] = 0x200; mv[15] = SPH_C64(512);
#define M(x) (mv[x])
#define H(x) (BMW_H[x]) tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
#define dH(x) (BMW_h2[x]) q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
FOLDb; #pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#undef M #pragma unroll 3
#undef H for(int i=6;i<9;i++)
#undef dH {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x]) #pragma unroll 3
#define H(x) (final_b[x]) for(int i=6;i<9;i++)
#define dH(x) (BMW_h1[x]) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
FOLDb; #pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#undef M #pragma unroll 3
#undef H for(int i=13;i<16;i++)
#undef dH {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
hash->h8[0] = SWAP8(BMW_h1[8]); XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
hash->h8[1] = SWAP8(BMW_h1[9]); XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
hash->h8[2] = SWAP8(BMW_h1[10]);
hash->h8[3] = SWAP8(BMW_h1[11]); BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
hash->h8[4] = SWAP8(BMW_h1[12]); BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
hash->h8[5] = SWAP8(BMW_h1[13]); BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
hash->h8[6] = SWAP8(BMW_h1[14]); BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
hash->h8[7] = SWAP8(BMW_h1[15]); BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash->h8[0] = SWAP8(BMW_H[8]);
hash->h8[1] = SWAP8(BMW_H[9]);
hash->h8[2] = SWAP8(BMW_H[10]);
hash->h8[3] = SWAP8(BMW_H[11]);
hash->h8[4] = SWAP8(BMW_H[12]);
hash->h8[5] = SWAP8(BMW_H[13]);
hash->h8[6] = SWAP8(BMW_H[14]);
hash->h8[7] = SWAP8(BMW_H[15]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -240,67 +451,59 @@ __kernel void search2(__global hash_t* hashes)
for (int i = init; i < 256; i += step) for (int i = init; i < 256; i += step)
{ {
T0_L[i] = T0[i]; T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i]; T1_L[i] = T1[i];
T2_L[i] = T2[i]; T2_L[i] = T2[i];
T3_L[i] = T3[i]; T3_L[i] = T3[i];
T4_L[i] = T4[i];
T5_L[i] = T5[i]; T5_L[i] = T5[i];
T6_L[i] = T6[i]; T6_L[i] = T6[i];
T7_L[i] = T7[i]; T7_L[i] = T7[i];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L #define T0 T0_L
#define T1 T1_L #define T1 T1_L
#define T2 T2_L #define T2 T2_L
#define T3 T3_L #define T3 T3_L
#define T4 T4_L #define T4 T4_L
#define T5 T5_L #define T5 T5_L
#define T6 T6_L #define T6 T6_L
#define T7 T7_L #define T7 T7_L
// groestl // groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 g[16], m[16]; sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]); g[0] = m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]); g[1] = m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]); g[2] = m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]); g[3] = m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]); g[4] = m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]); g[5] = m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]); g[6] = m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]); g[7] = m[7] = DEC64E(hash->h8[7]);
for (unsigned int u = 0; u < 16; u ++) g[8] = m[8] = 0x80;
g[u] = m[u] ^ H[u]; g[9] = m[9] = 0;
m[8] = 0x80; g[8] = m[8] ^ H[8]; g[10] = m[10] = 0;
m[9] = 0; g[9] = m[9] ^ H[9]; g[11] = m[11] = 0;
m[10] = 0; g[10] = m[10] ^ H[10]; g[12] = m[12] = 0;
m[11] = 0; g[11] = m[11] ^ H[11]; g[13] = m[13] = 0;
m[12] = 0; g[12] = m[12] ^ H[12]; g[14] = m[14] = 0;
m[13] = 0; g[13] = m[13] ^ H[13]; g[15] = 0x102000000000000;
m[14] = 0; g[14] = m[14] ^ H[14]; m[15] = 0x100000000000000;
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g); PERM_BIG_P(g);
PERM_BIG_Q(m); PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16]; sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++) for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u]; xH[u] = H[u] ^= g[u] ^ m[u];
PERM_BIG_P(xH); PERM_BIG_P(xH);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u]; for (unsigned int u = 8; u < 16; u ++)
for (unsigned int u = 0; u < 8; u ++) hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
hash->h8[u] = DEC64E(H[u + 8]);
barrier(CLK_GLOBAL_MEM_FENCE); barrier(CLK_GLOBAL_MEM_FENCE);
} }
@ -325,10 +528,14 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8(hash->h8[5]); m5 = SWAP8(hash->h8[5]);
m6 = SWAP8(hash->h8[6]); m6 = SWAP8(hash->h8[6]);
m7 = SWAP8(hash->h8[7]); m7 = SWAP8(hash->h8[7]);
UBI_BIG(480, 64); UBI_BIG(480, 64);
bcount = 0; bcount = 0;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0; m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
UBI_BIG(510, 8); UBI_BIG(510, 8);
hash->h8[0] = SWAP8(h0); hash->h8[0] = SWAP8(h0);
hash->h8[1] = SWAP8(h1); hash->h8[1] = SWAP8(h1);
hash->h8[2] = SWAP8(h2); hash->h8[2] = SWAP8(h2);
@ -355,7 +562,8 @@ __kernel void search4(__global hash_t* hashes)
for(int i = 0; i < 2; i++) for(int i = 0; i < 2; i++)
{ {
if (i == 0) { if (i == 0)
{
h0h ^= DEC64E(hash->h8[0]); h0h ^= DEC64E(hash->h8[0]);
h0l ^= DEC64E(hash->h8[1]); h0l ^= DEC64E(hash->h8[1]);
h1h ^= DEC64E(hash->h8[2]); h1h ^= DEC64E(hash->h8[2]);
@ -364,7 +572,9 @@ __kernel void search4(__global hash_t* hashes)
h2l ^= DEC64E(hash->h8[5]); h2l ^= DEC64E(hash->h8[5]);
h3h ^= DEC64E(hash->h8[6]); h3h ^= DEC64E(hash->h8[6]);
h3l ^= DEC64E(hash->h8[7]); h3l ^= DEC64E(hash->h8[7]);
} else if(i == 1) { }
else if(i == 1)
{
h4h ^= DEC64E(hash->h8[0]); h4h ^= DEC64E(hash->h8[0]);
h4l ^= DEC64E(hash->h8[1]); h4l ^= DEC64E(hash->h8[1]);
h5h ^= DEC64E(hash->h8[2]); h5h ^= DEC64E(hash->h8[2]);
@ -425,6 +635,7 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8(hash->h8[7]); a21 ^= SWAP8(hash->h8[7]);
a31 ^= 0x8000000000000001; a31 ^= 0x8000000000000001;
KECCAK_F_1600; KECCAK_F_1600;
// Finalize the "lane complement" // Finalize the "lane complement"
a10 = ~a10; a10 = ~a10;
a20 = ~a20; a20 = ~a20;
@ -471,7 +682,8 @@ __kernel void search6(__global hash_t* hashes)
MI5; MI5;
LUFFA_P5; LUFFA_P5;
if(i == 0) { if(i == 0)
{
M0 = hash->h4[9]; M0 = hash->h4[9];
M1 = hash->h4[8]; M1 = hash->h4[8];
M2 = hash->h4[11]; M2 = hash->h4[11];
@ -480,12 +692,16 @@ __kernel void search6(__global hash_t* hashes)
M5 = hash->h4[12]; M5 = hash->h4[12];
M6 = hash->h4[15]; M6 = hash->h4[15];
M7 = hash->h4[14]; M7 = hash->h4[14];
} else if(i == 1) { }
else if(i == 1)
{
M0 = 0x80000000; M0 = 0x80000000;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
} else if(i == 2) { }
else if(i == 2)
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0; M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
} else if(i == 3) { else if(i == 3)
{
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40; hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41; hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42; hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
@ -535,10 +751,12 @@ __kernel void search7(__global hash_t* hashes)
x6 ^= SWAP4(hash->h4[7]); x6 ^= SWAP4(hash->h4[7]);
x7 ^= SWAP4(hash->h4[6]); x7 ^= SWAP4(hash->h4[6]);
for (int i = 0; i < 13; i ++) { for (int i = 0; i < 13; i ++)
{
SIXTEEN_ROUNDS; SIXTEEN_ROUNDS;
if (i == 0) { if (i == 0)
{
x0 ^= SWAP4(hash->h4[9]); x0 ^= SWAP4(hash->h4[9]);
x1 ^= SWAP4(hash->h4[8]); x1 ^= SWAP4(hash->h4[8]);
x2 ^= SWAP4(hash->h4[11]); x2 ^= SWAP4(hash->h4[11]);
@ -547,12 +765,12 @@ __kernel void search7(__global hash_t* hashes)
x5 ^= SWAP4(hash->h4[12]); x5 ^= SWAP4(hash->h4[12]);
x6 ^= SWAP4(hash->h4[15]); x6 ^= SWAP4(hash->h4[15]);
x7 ^= SWAP4(hash->h4[14]); x7 ^= SWAP4(hash->h4[14]);
} else if(i == 1) { }
else if(i == 1)
x0 ^= 0x80; x0 ^= 0x80;
} else if (i == 2) { else if (i == 2)
xv ^= SPH_C32(1); xv ^= SPH_C32(1);
} }
}
hash->h4[0] = x0; hash->h4[0] = x0;
hash->h4[1] = x1; hash->h4[1] = x1;
@ -579,6 +797,7 @@ __kernel void search8(__global hash_t* hashes)
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]); __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
int init = get_local_id(0); int init = get_local_id(0);
@ -607,7 +826,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0; sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0]; rk00 = hash->h4[0];
rk01 = hash->h4[1]; rk01 = hash->h4[1];
@ -673,7 +892,8 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22); u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);
FFT256(0, 1, 0, ll1); FFT256(0, 1, 0, ll1);
for (int i = 0; i < 256; i ++) { for (int i = 0; i < 256; i ++)
{
s32 tq; s32 tq;
tq = q[i] + yoff_b_n[i]; tq = q[i] + yoff_b_n[i];
@ -709,14 +929,17 @@ __kernel void search9(__global hash_t* hashes)
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
IF, 4, 13, PP8_4_); IF, 4, 13, PP8_4_);
STEP_BIG( STEP_BIG(
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
IF, 13, 10, PP8_5_); IF, 13, 10, PP8_5_);
STEP_BIG( STEP_BIG(
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
IF, 10, 25, PP8_6_); IF, 10, 25, PP8_6_);
STEP_BIG( STEP_BIG(
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
@ -735,22 +958,27 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
STEP_BIG( STEP_BIG(
COPY_A0, COPY_A1, COPY_A2, COPY_A3, COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7, COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4, 13, PP8_4_); IF, 4, 13, PP8_4_);
STEP_BIG( STEP_BIG(
COPY_B0, COPY_B1, COPY_B2, COPY_B3, COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7, COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13, 10, PP8_5_); IF, 13, 10, PP8_5_);
STEP_BIG( STEP_BIG(
COPY_C0, COPY_C1, COPY_C2, COPY_C3, COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7, COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10, 25, PP8_6_); IF, 10, 25, PP8_6_);
STEP_BIG( STEP_BIG(
COPY_D0, COPY_D1, COPY_D2, COPY_D3, COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7, COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25, 4, PP8_0_); IF, 25, 4, PP8_0_);
#undef q #undef q
hash->h4[0] = A0; hash->h4[0] = A0;
@ -778,7 +1006,7 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
{ {
uint gid = get_global_id(0); uint gid = get_global_id(0);
uint offset = get_global_offset(0); uint offset = get_global_offset(0);
hash_t hash; __global hash_t *hash = &(hashes[gid-offset]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256]; __local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
@ -795,14 +1023,23 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < 8; i++) { //mixtab
hash.h8[i] = hashes[gid-offset].h8[i]; __local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256];
for (int i = init; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i];
mixtab1[i] = mixtab1_c[i];
mixtab2[i] = mixtab2_c[i];
mixtab3[i] = mixtab3_c[i];
} }
// echo barrier(CLK_LOCAL_MEM_FENCE);
{ for (int i = 0; i < 8; i++)
hash->h8[i] = hashes[gid-offset].h8[i];
// echo
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1; sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1;
sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71; sph_u64 Vb00, Vb01, Vb10, Vb11, Vb20, Vb21, Vb30, Vb31, Vb40, Vb41, Vb50, Vb51, Vb60, Vb61, Vb70, Vb71;
Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL; Vb00 = Vb10 = Vb20 = Vb30 = Vb40 = Vb50 = Vb60 = Vb70 = 512UL;
@ -829,14 +1066,14 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
W61 = Vb61; W61 = Vb61;
W70 = Vb70; W70 = Vb70;
W71 = Vb71; W71 = Vb71;
W80 = hash.h8[0]; W80 = hash->h8[0];
W81 = hash.h8[1]; W81 = hash->h8[1];
W90 = hash.h8[2]; W90 = hash->h8[2];
W91 = hash.h8[3]; W91 = hash->h8[3];
WA0 = hash.h8[4]; WA0 = hash->h8[4];
WA1 = hash.h8[5]; WA1 = hash->h8[5];
WB0 = hash.h8[6]; WB0 = hash->h8[6];
WB1 = hash.h8[7]; WB1 = hash->h8[7];
WC0 = 0x80; WC0 = 0x80;
WC1 = 0; WC1 = 0;
WD0 = 0; WD0 = 0;
@ -846,24 +1083,26 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
WF0 = 0x200; WF0 = 0x200;
WF1 = 0; WF1 = 0;
for (unsigned u = 0; u < 10; u ++) { for (unsigned u = 0; u < 10; u ++)
BIG_ROUND; BIG_ROUND;
}
hash.h8[0] ^= Vb00 ^ W00 ^ W80;
hash.h8[1] ^= Vb01 ^ W01 ^ W81;
hash.h8[2] ^= Vb10 ^ W10 ^ W90;
hash.h8[3] ^= Vb11 ^ W11 ^ W91;
hash.h8[4] ^= Vb20 ^ W20 ^ WA0;
hash.h8[5] ^= Vb21 ^ W21 ^ WA1;
hash.h8[6] ^= Vb30 ^ W30 ^ WB0;
hash.h8[7] ^= Vb31 ^ W31 ^ WB1;
} hash->h8[0] ^= Vb00 ^ W00 ^ W80;
hash->h8[1] ^= Vb01 ^ W01 ^ W81;
hash->h8[2] ^= Vb10 ^ W10 ^ W90;
hash->h8[3] ^= Vb11 ^ W11 ^ W91;
hash->h8[4] ^= Vb20 ^ W20 ^ WA0;
hash->h8[5] ^= Vb21 ^ W21 ^ WA1;
hash->h8[6] ^= Vb30 ^ W30 ^ WB0;
hash->h8[7] ^= Vb31 ^ W31 ^ WB1;
// hamsi // hamsi
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
{ for (int i = init; i < 1024; i += step)
T512_L[i] = T512_C[i];
barrier(CLK_LOCAL_MEM_FENCE);
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3]; sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7]; sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
@ -873,51 +1112,39 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) hash.h1[i + u] #define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8) {
INPUT_BIG; for(int i = 0; i < 64; i += 8)
{
INPUT_BIG_LOCAL;
P_BIG; P_BIG;
T_BIG; T_BIG;
} }
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0) #undef buf
INPUT_BIG; #define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG_LOCAL;
P_BIG; P_BIG;
T_BIG; T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0) #undef buf
INPUT_BIG; #define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG_LOCAL;
PF_BIG; PF_BIG;
T_BIG; T_BIG;
for (unsigned u = 0; u < 16; u ++) for (unsigned u = 0; u < 16; u ++)
hash.h4[u] = h[u]; hash->h4[u] = h[u];
}
//mixtab
__local sph_u32 mixtab0[256], mixtab1[256], mixtab2[256], mixtab3[256];
init = get_local_id(0);
step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
mixtab0[i] = mixtab0_c[i];
mixtab1[i] = mixtab1_c[i];
mixtab2[i] = mixtab2_c[i];
mixtab3[i] = mixtab3_c[i];
}
barrier(CLK_GLOBAL_MEM_FENCE);
// fugue // fugue
{
sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
sph_u32 S30, S31, S32, S33, S34, S35; sph_u32 S30, S31, S32, S33, S34, S35;
ulong fc_bit_count = (sph_u64) 64 << 3; ulong fc_bit_count = (sph_u64) 0x200;
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027);
@ -925,22 +1152,25 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f); S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567); S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
FUGUE512_3((hash.h4[0x0]), (hash.h4[0x1]), (hash.h4[0x2])); FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2]));
FUGUE512_3((hash.h4[0x3]), (hash.h4[0x4]), (hash.h4[0x5])); FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5]));
FUGUE512_3((hash.h4[0x6]), (hash.h4[0x7]), (hash.h4[0x8])); FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8]));
FUGUE512_3((hash.h4[0x9]), (hash.h4[0xA]), (hash.h4[0xB])); FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB]));
FUGUE512_3((hash.h4[0xC]), (hash.h4[0xD]), (hash.h4[0xE])); FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE]));
FUGUE512_3((hash.h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x); FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
// apply round shift if necessary // apply round shift if necessary
int i; int i;
for (i = 0; i < 32; i ++) { for (i = 0; i < 32; i ++)
{
ROR3; ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03); SMIX(S00, S01, S02, S03);
} }
for (i = 0; i < 13; i ++) {
for (i = 0; i < 13; i ++)
{
S04 ^= S00; S04 ^= S00;
S09 ^= S00; S09 ^= S00;
S18 ^= S00; S18 ^= S00;
@ -971,26 +1201,24 @@ __kernel void search10(__global hash_t* hashes, __global uint* output, const ulo
S18 ^= S00; S18 ^= S00;
S27 ^= S00; S27 ^= S00;
hash.h4[0] = SWAP4(S01); hash->h4[0] = SWAP4(S01);
hash.h4[1] = SWAP4(S02); hash->h4[1] = SWAP4(S02);
hash.h4[2] = SWAP4(S03); hash->h4[2] = SWAP4(S03);
hash.h4[3] = SWAP4(S04); hash->h4[3] = SWAP4(S04);
hash.h4[4] = SWAP4(S09); hash->h4[4] = SWAP4(S09);
hash.h4[5] = SWAP4(S10); hash->h4[5] = SWAP4(S10);
hash.h4[6] = SWAP4(S11); hash->h4[6] = SWAP4(S11);
hash.h4[7] = SWAP4(S12); hash->h4[7] = SWAP4(S12);
hash.h4[8] = SWAP4(S18); hash->h4[8] = SWAP4(S18);
hash.h4[9] = SWAP4(S19); hash->h4[9] = SWAP4(S19);
hash.h4[10] = SWAP4(S20); hash->h4[10] = SWAP4(S20);
hash.h4[11] = SWAP4(S21); hash->h4[11] = SWAP4(S21);
hash.h4[12] = SWAP4(S27); hash->h4[12] = SWAP4(S27);
hash.h4[13] = SWAP4(S28); hash->h4[13] = SWAP4(S28);
hash.h4[14] = SWAP4(S29); hash->h4[14] = SWAP4(S29);
hash.h4[15] = SWAP4(S30); hash->h4[15] = SWAP4(S30);
} bool result = (hash->h8[3] <= target);
bool result = (hash.h8[3] <= target);
if (result) if (result)
output[atomic_inc(output+0xFF)] = SWAP4(gid); output[atomic_inc(output+0xFF)] = SWAP4(gid);

1338
kernel/x14.cl

File diff suppressed because it is too large Load Diff

1294
kernel/x14old.cl

File diff suppressed because it is too large Load Diff

1
miner.h

@ -1033,6 +1033,7 @@ extern int swork_id;
extern int opt_tcp_keepalive; extern int opt_tcp_keepalive;
extern bool opt_incognito; extern bool opt_incognito;
extern int opt_hamsi_expand_big; extern int opt_hamsi_expand_big;
extern bool opt_hamsi_short;
#if LOCK_TRACKING #if LOCK_TRACKING
extern pthread_mutex_t lockstat_lock; extern pthread_mutex_t lockstat_lock;

6
sgminer.c

@ -192,6 +192,7 @@ int nDevs;
int opt_dynamic_interval = 7; int opt_dynamic_interval = 7;
int opt_g_threads = -1; int opt_g_threads = -1;
int opt_hamsi_expand_big = 4; int opt_hamsi_expand_big = 4;
bool opt_hamsi_short = false;
bool opt_restart = true; bool opt_restart = true;
struct list_head scan_devices; struct list_head scan_devices;
@ -1459,7 +1460,10 @@ struct opt_table opt_config_table[] = {
"Set GPU lookup gap for scrypt mining, comma separated"), "Set GPU lookup gap for scrypt mining, comma separated"),
OPT_WITH_ARG("--hamsi-expand-big", OPT_WITH_ARG("--hamsi-expand-big",
set_int_1_to_10, opt_show_intval, &opt_hamsi_expand_big, set_int_1_to_10, opt_show_intval, &opt_hamsi_expand_big,
"Set SPH_HAMSI_EXPAND_BIG for X13 algorithms (1 or 4 are common)"), "Set SPH_HAMSI_EXPAND_BIG for X13 derived algorithms (1 or 4 are common)"),
OPT_WITHOUT_ARG("--hamsi-short",
opt_set_bool, &opt_hamsi_short,
"Set SPH_HAMSI_SHORT for X13 derived algorithms (Can give better hashrate for some GPUs)"),
#ifdef HAVE_CURSES #ifdef HAVE_CURSES
OPT_WITHOUT_ARG("--incognito", OPT_WITHOUT_ARG("--incognito",
opt_set_bool, &opt_incognito, opt_set_bool, &opt_incognito,

2
winbuild/sgminer.vcxproj

@ -263,6 +263,7 @@
<ClCompile Include="..\algorithm\animecoin.c" /> <ClCompile Include="..\algorithm\animecoin.c" />
<ClCompile Include="..\algorithm\bitblock.c" /> <ClCompile Include="..\algorithm\bitblock.c" />
<ClCompile Include="..\algorithm\talkcoin.c" /> <ClCompile Include="..\algorithm\talkcoin.c" />
<ClCompile Include="..\algorithm\x14.c" />
<ClCompile Include="..\api.c" /> <ClCompile Include="..\api.c" />
<ClCompile Include="..\ccan\opt\helpers.c" /> <ClCompile Include="..\ccan\opt\helpers.c" />
<ClCompile Include="..\ccan\opt\opt.c" /> <ClCompile Include="..\ccan\opt\opt.c" />
@ -329,6 +330,7 @@
<ClInclude Include="..\algorithm\animecoin.h" /> <ClInclude Include="..\algorithm\animecoin.h" />
<ClInclude Include="..\algorithm\bitblock.h" /> <ClInclude Include="..\algorithm\bitblock.h" />
<ClInclude Include="..\algorithm\talkcoin.h" /> <ClInclude Include="..\algorithm\talkcoin.h" />
<ClInclude Include="..\algorithm\x14.h" />
<ClInclude Include="..\api.h" /> <ClInclude Include="..\api.h" />
<ClInclude Include="..\arg-nonnull.h" /> <ClInclude Include="..\arg-nonnull.h" />
<ClInclude Include="..\bench_block.h" /> <ClInclude Include="..\bench_block.h" />

6
winbuild/sgminer.vcxproj.filters

@ -227,6 +227,9 @@
<ClCompile Include="..\algorithm\bitblock.c"> <ClCompile Include="..\algorithm\bitblock.c">
<Filter>Source Files\algorithm</Filter> <Filter>Source Files\algorithm</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\algorithm\x14.c">
<Filter>Source Files\algorithm</Filter>
</ClCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClInclude Include="..\adl.h"> <ClInclude Include="..\adl.h">
@ -412,6 +415,9 @@
<ClInclude Include="..\algorithm\bitblock.h"> <ClInclude Include="..\algorithm\bitblock.h">
<Filter>Header Files\algorithm</Filter> <Filter>Header Files\algorithm</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\algorithm\x14.h">
<Filter>Header Files\algorithm</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<None Include="README.txt" /> <None Include="README.txt" />

Loading…
Cancel
Save