Browse Source

X14 Implementation

Added X14 and cleaned up the X13/X15 kernels so all 3 offshoots are in
sync. New option "--hamsi-short" or "hamsi-short":true to add a small
boost. May not work on all GPUs.
djm34
ystarnaud 11 years ago
parent
commit
5c9126fd61
  1. 1
      Makefile.am
  2. 110
      algorithm.c
  3. 2
      algorithm.h
  4. 247
      algorithm/x14.c
  5. 10
      algorithm/x14.h
  6. 637
      kernel/bitblock.cl
  7. 2171
      kernel/bitblockold.cl
  8. 112
      kernel/darkcoin-mod.cl
  9. 1614
      kernel/marucoin-mod.cl
  10. 1788
      kernel/marucoin-modold.cl
  11. 1338
      kernel/x14.cl
  12. 1294
      kernel/x14old.cl
  13. 1
      miner.h
  14. 6
      sgminer.c
  15. 2
      winbuild/sgminer.vcxproj
  16. 6
      winbuild/sgminer.vcxproj.filters

1
Makefile.am

@ -63,6 +63,7 @@ sgminer_SOURCES += algorithm/marucoin.c algorithm/marucoin.h @@ -63,6 +63,7 @@ sgminer_SOURCES += algorithm/marucoin.c algorithm/marucoin.h
sgminer_SOURCES += algorithm/maxcoin.c algorithm/maxcoin.h
sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
sgminer_SOURCES += algorithm/x14.c algorithm/x14.h
bin_SCRIPTS = $(top_srcdir)/kernel/*.cl

110
algorithm.c

@ -27,6 +27,7 @@ @@ -27,6 +27,7 @@
#include "algorithm/maxcoin.h"
#include "algorithm/talkcoin.h"
#include "algorithm/bitblock.h"
#include "algorithm/x14.h"
#include "compat.h"
@ -40,6 +41,8 @@ const char *algorithm_type_str[] = { @@ -40,6 +41,8 @@ const char *algorithm_type_str[] = {
"NScrypt",
"X11",
"X13",
"X14",
"X15",
"Keccak",
"Quarkcoin",
"Twecoin",
@ -91,11 +94,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru @@ -91,11 +94,11 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
static void append_hamsi_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
{
char buf[255];
sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d",
opt_hamsi_expand_big);
sprintf(buf, " -D SPH_HAMSI_EXPAND_BIG=%d%s ",
opt_hamsi_expand_big, ((opt_hamsi_short)?" -D SPH_HAMSI_SHORT=1 ":""));
strcat(data->compiler_options, buf);
sprintf(buf, "big%u", (unsigned int)opt_hamsi_expand_big);
sprintf(buf, "big%u%s", (unsigned int)opt_hamsi_expand_big, ((opt_hamsi_short)?"hs":""));
strcat(data->binary_filename, buf);
}
@ -419,6 +422,100 @@ static cl_int queue_talkcoin_mod_kernel(struct __clState *clState, struct _dev_b @@ -419,6 +422,100 @@ static cl_int queue_talkcoin_mod_kernel(struct __clState *clState, struct _dev_b
return status;
}
static cl_int queue_x14_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel;
unsigned int num;
cl_ulong le_target;
cl_int status = 0;
le_target = *(cl_ulong *)(blk->work->device_target + 24);
flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
// blake - search
kernel = &clState->kernel;
num = 0;
CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->padbuffer8);
// bmw - search1
kernel = clState->extra_kernels;
CL_SET_ARG_0(clState->padbuffer8);
// groestl - search2
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// skein - search3
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// jh - search4
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// keccak - search5
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// luffa - search6
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// cubehash - search7
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// shavite - search8
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// simd - search9
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// echo - search10
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// hamsi - search11
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// fugue - search12
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// shabal - search13
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(le_target);
return status;
}
static cl_int queue_x14_old_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel;
unsigned int num;
cl_ulong le_target;
cl_int status = 0;
le_target = *(cl_ulong *)(blk->work->device_target + 24);
flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
// blake - search
kernel = &clState->kernel;
num = 0;
CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->padbuffer8);
// bmw - search1
kernel = clState->extra_kernels;
CL_SET_ARG_0(clState->padbuffer8);
// groestl - search2
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// skein - search3
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// jh - search4
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// keccak - search5
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// luffa - search6
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// cubehash - search7
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// shavite - search8
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// simd - search9
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// combined echo, hamsi, fugue - shabal - search10
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(le_target);
return status;
}
typedef struct _algorithm_settings_t {
const char *name; /* Human-readable identifier */
algorithm_type_t type; //common algorithm type
@ -477,8 +574,13 @@ static algorithm_settings_t algos[] = { @@ -477,8 +574,13 @@ static algorithm_settings_t algos[] = {
{ "marucoin-mod", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_hamsi_compiler_options},
{ "marucoin-modold", ALGO_X13, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_hamsi_compiler_options},
{ "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
{ "x14", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_hamsi_compiler_options},
{ "x14old", ALGO_X14, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_hamsi_compiler_options},
{ "bitblock", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_hamsi_compiler_options},
{ "bitblockold", ALGO_X15, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_hamsi_compiler_options},
{ "talkcoin-mod", ALGO_NIST, 1, 1, 1, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, NULL},
// kernels starting from this will have difficulty calculated by using fuguecoin algorithm
#define A_FUGUE(a, b) \
{ a, ALGO_FUGUE, 1, 256, 256, 0, 0, 0xFF, 0x00000000ffff0000ULL, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, sha256, NULL}

2
algorithm.h

@ -16,6 +16,8 @@ typedef enum { @@ -16,6 +16,8 @@ typedef enum {
ALGO_NSCRYPT,
ALGO_X11,
ALGO_X13,
ALGO_X14,
ALGO_X15,
ALGO_KECCAK,
ALGO_QUARK,
ALGO_TWE,

247
algorithm/x14.c

@ -0,0 +1,247 @@ @@ -0,0 +1,247 @@
/*-
* Copyright 2009 Colin Percival, 2011 ArtForz
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#include "config.h"
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_skein.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
typedef struct {
sph_blake512_context blake1;
sph_bmw512_context bmw1;
sph_groestl512_context groestl1;
sph_skein512_context skein1;
sph_jh512_context jh1;
sph_keccak512_context keccak1;
sph_luffa512_context luffa1;
sph_cubehash512_context cubehash1;
sph_shavite512_context shavite1;
sph_simd512_context simd1;
sph_echo512_context echo1;
sph_hamsi512_context hamsi1;
sph_fugue512_context fugue1;
sph_shabal512_context shabal1;
} Xhash_context_holder;
static Xhash_context_holder base_contexts;
void init_X14hash_contexts()
{
sph_blake512_init(&base_contexts.blake1);
sph_bmw512_init(&base_contexts.bmw1);
sph_groestl512_init(&base_contexts.groestl1);
sph_skein512_init(&base_contexts.skein1);
sph_jh512_init(&base_contexts.jh1);
sph_keccak512_init(&base_contexts.keccak1);
sph_luffa512_init(&base_contexts.luffa1);
sph_cubehash512_init(&base_contexts.cubehash1);
sph_shavite512_init(&base_contexts.shavite1);
sph_simd512_init(&base_contexts.simd1);
sph_echo512_init(&base_contexts.echo1);
sph_hamsi512_init(&base_contexts.hamsi1);
sph_fugue512_init(&base_contexts.fugue1);
sph_shabal512_init(&base_contexts.shabal1);
}
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static inline void be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
{
uint32_t i;
for (i = 0; i < len; i++)
dst[i] = htobe32(src[i]);
}
inline void x14hash(void *state, const void *input)
{
init_X14hash_contexts();
Xhash_context_holder ctx;
uint32_t hashA[16], hashB[16];
//blake-bmw-groestl-sken-jh-meccak-luffa-cubehash-shivite-simd-echo
memcpy(&ctx, &base_contexts, sizeof(base_contexts));
sph_blake512 (&ctx.blake1, input, 80);
sph_blake512_close (&ctx.blake1, hashA);
sph_bmw512 (&ctx.bmw1, hashA, 64);
sph_bmw512_close(&ctx.bmw1, hashB);
sph_groestl512 (&ctx.groestl1, hashB, 64);
sph_groestl512_close(&ctx.groestl1, hashA);
sph_skein512 (&ctx.skein1, hashA, 64);
sph_skein512_close(&ctx.skein1, hashB);
sph_jh512 (&ctx.jh1, hashB, 64);
sph_jh512_close(&ctx.jh1, hashA);
sph_keccak512 (&ctx.keccak1, hashA, 64);
sph_keccak512_close(&ctx.keccak1, hashB);
sph_luffa512 (&ctx.luffa1, hashB, 64);
sph_luffa512_close (&ctx.luffa1, hashA);
sph_cubehash512 (&ctx.cubehash1, hashA, 64);
sph_cubehash512_close(&ctx.cubehash1, hashB);
sph_shavite512 (&ctx.shavite1, hashB, 64);
sph_shavite512_close(&ctx.shavite1, hashA);
sph_simd512 (&ctx.simd1, hashA, 64);
sph_simd512_close(&ctx.simd1, hashB);
sph_echo512 (&ctx.echo1, hashB, 64);
sph_echo512_close(&ctx.echo1, hashA);
sph_hamsi512 (&ctx.hamsi1, hashA, 64);
sph_hamsi512_close(&ctx.hamsi1, hashB);
sph_fugue512 (&ctx.fugue1, hashB, 64);
sph_fugue512_close(&ctx.fugue1, hashA);
sph_shabal512 (&ctx.shabal1, (const unsigned char*)hashA, 64);
sph_shabal512_close(&ctx.shabal1, hashB);
memcpy(state, hashB, 32);
}
static const uint32_t diff1targ = 0x0000ffff;
/* Used externally as confirmation of correct OCL code */
int x14_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
{
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
uint32_t data[20], ohash[8];
be32enc_vect(data, (const uint32_t *)pdata, 19);
data[19] = htobe32(nonce);
x14hash(ohash, data);
tmp_hash7 = be32toh(ohash[7]);
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
(long unsigned int)Htarg,
(long unsigned int)diff1targ,
(long unsigned int)tmp_hash7);
if (tmp_hash7 > diff1targ)
return -1;
if (tmp_hash7 > Htarg)
return 0;
return 1;
}
void x14_regenhash(struct work *work)
{
uint32_t data[20];
uint32_t *nonce = (uint32_t *)(work->data + 76);
uint32_t *ohash = (uint32_t *)(work->hash);
be32enc_vect(data, (const uint32_t *)work->data, 19);
data[19] = htobe32(*nonce);
x14hash(ohash, data);
}
static inline void be32enc(void *pp, uint32_t x)
{
uint8_t *p = (uint8_t *)pp;
p[3] = x & 0xff;
p[2] = (x >> 8) & 0xff;
p[1] = (x >> 16) & 0xff;
p[0] = (x >> 24) & 0xff;
}
bool scanhash_x14(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
unsigned char *pdata, unsigned char __maybe_unused *phash1,
unsigned char __maybe_unused *phash, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
{
uint32_t *nonce = (uint32_t *)(pdata + 76);
uint32_t data[20];
uint32_t tmp_hash7;
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
bool ret = false;
be32enc_vect(data, (const uint32_t *)pdata, 19);
while(1)
{
uint32_t ostate[8];
*nonce = ++n;
data[19] = (n);
x14hash(ostate, data);
tmp_hash7 = (ostate[7]);
applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]);
if(unlikely(tmp_hash7 <= Htarg))
{
((uint32_t *)pdata)[19] = htobe32(n);
*last_nonce = n;
ret = true;
break;
}
if (unlikely((n >= max_nonce) || thr->work_restart))
{
*last_nonce = n;
break;
}
}
return ret;
}

10
algorithm/x14.h

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
#ifndef X14_H
#define X14_H
#include "miner.h"
extern int x14_test(unsigned char *pdata, const unsigned char *ptarget,
uint32_t nonce);
extern void x14_regenhash(struct work *work);
#endif /* X14_H */

637
kernel/bitblock.cl

@ -72,13 +72,17 @@ typedef int sph_s32; @@ -72,13 +72,17 @@ typedef int sph_s32;
#define SPH_SIMD_NOCOPY 0
#define SPH_KECCAK_NOCOPY 0
#define SPH_COMPACT_BLAKE_64 0
#define SPH_LUFFA_PARALLEL 1
#define SPH_LUFFA_PARALLEL 0
#define SPH_SMALL_FOOTPRINT_GROESTL 0
#define SPH_GROESTL_BIG_ENDIAN 0
#define SPH_CUBEHASH_UNROLL 0
#define SPH_KECCAK_UNROLL 1
#ifndef SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 1
#define SPH_HAMSI_SHORT 1
#endif
#ifndef SPH_HAMSI_SHORT
#define SPH_HAMSI_SHORT 1
#endif
#include "blake.cl"
#include "bmw.cl"
@ -126,10 +130,10 @@ typedef union { @@ -126,10 +130,10 @@ typedef union {
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global unsigned char* block, __global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// blake
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// blake
sph_u64 H0 = SPH_C64(0x6A09E667F3BCC908), H1 = SPH_C64(0xBB67AE8584CAA73B);
sph_u64 H2 = SPH_C64(0x3C6EF372FE94F82B), H3 = SPH_C64(0xA54FF53A5F1D36F1);
sph_u64 H4 = SPH_C64(0x510E527FADE682D1), H5 = SPH_C64(0x9B05688C2B3E6C1F);
@ -144,6 +148,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -144,6 +148,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
sph_u64 V8, V9, VA, VB, VC, VD, VE, VF;
M0 = DEC64BE(block + 0);
M1 = DEC64BE(block + 8);
M2 = DEC64BE(block + 16);
@ -180,18 +185,18 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -180,18 +185,18 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search1(__global hash_t* hashes)
{
uint gid = get_global_id(0);
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// bmw
sph_u64 BMW_H[16];
#pragma unroll 16
#pragma unroll 16
for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u];
sph_u64 mv[16],q[32];
sph_u64 tmp;
sph_u64 tmp;
mv[0] = SWAP8(hash->h8[0]);
mv[1] = SWAP8(hash->h8[1]);
@ -211,7 +216,7 @@ __kernel void search1(__global hash_t* hashes) @@ -211,7 +216,7 @@ __kernel void search1(__global hash_t* hashes)
mv[15] = SPH_C64(512);
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
@ -221,7 +226,7 @@ __kernel void search1(__global hash_t* hashes) @@ -221,7 +226,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
@ -231,7 +236,7 @@ __kernel void search1(__global hash_t* hashes) @@ -231,7 +236,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
@ -243,60 +248,60 @@ __kernel void search1(__global hash_t* hashes) @@ -243,60 +248,60 @@ __kernel void search1(__global hash_t* hashes)
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
@ -320,7 +325,7 @@ __kernel void search1(__global hash_t* hashes) @@ -320,7 +325,7 @@ __kernel void search1(__global hash_t* hashes)
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
@ -328,7 +333,7 @@ __kernel void search1(__global hash_t* hashes) @@ -328,7 +333,7 @@ __kernel void search1(__global hash_t* hashes)
}
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
@ -338,7 +343,7 @@ __kernel void search1(__global hash_t* hashes) @@ -338,7 +343,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
@ -348,7 +353,7 @@ __kernel void search1(__global hash_t* hashes) @@ -348,7 +353,7 @@ __kernel void search1(__global hash_t* hashes)
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
@ -360,64 +365,65 @@ __kernel void search1(__global hash_t* hashes) @@ -360,64 +365,65 @@ __kernel void search1(__global hash_t* hashes)
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
@ -454,94 +460,67 @@ __kernel void search2(__global hash_t* hashes) @@ -454,94 +460,67 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
}
barrier(CLK_LOCAL_MEM_FENCE); // groestl
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
// groestl
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
PERM_BIG_P(g);
PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
xH[u] = H[u] ^= g[u] ^ m[u];
PERM_BIG_P(xH);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -566,10 +545,14 @@ __kernel void search3(__global hash_t* hashes) @@ -566,10 +545,14 @@ __kernel void search3(__global hash_t* hashes)
m5 = SWAP8(hash->h8[5]);
m6 = SWAP8(hash->h8[6]);
m7 = SWAP8(hash->h8[7]);
UBI_BIG(480, 64);
bcount = 0;
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = 0;
UBI_BIG(510, 8);
hash->h8[0] = SWAP8(h0);
hash->h8[1] = SWAP8(h1);
hash->h8[2] = SWAP8(h2);
@ -588,7 +571,7 @@ __kernel void search4(__global hash_t* hashes) @@ -588,7 +571,7 @@ __kernel void search4(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// jh
// jh
sph_u64 h0h = C64e(0x6fd14b963e00aa17), h0l = C64e(0x636a2e057a15d543), h1h = C64e(0x8a225e8d0c97ef0b), h1l = C64e(0xe9341259f2b3c361), h2h = C64e(0x891da0c1536f801e), h2l = C64e(0x2aa9056bea2b6d80), h3h = C64e(0x588eccdb2075baa6), h3l = C64e(0xa90f3a76baf83bf7);
sph_u64 h4h = C64e(0x0169e60541e34a69), h4l = C64e(0x46b58a8e2e6fe65a), h5h = C64e(0x1047a7d0c1843c24), h5l = C64e(0x3b6e71b12d5ac199), h6h = C64e(0xcf57f6ec9db1f856), h6l = C64e(0xa706887c5716b156), h7h = C64e(0xe3c2fcdfe68517fb), h7l = C64e(0x545a4678cc8cdd4b);
@ -669,6 +652,7 @@ __kernel void search5(__global hash_t* hashes) @@ -669,6 +652,7 @@ __kernel void search5(__global hash_t* hashes)
a21 ^= SWAP8(hash->h8[7]);
a31 ^= 0x8000000000000001;
KECCAK_F_1600;
// Finalize the "lane complement"
a10 = ~a10;
a20 = ~a20;
@ -830,6 +814,7 @@ __kernel void search8(__global hash_t* hashes) @@ -830,6 +814,7 @@ __kernel void search8(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 AES0[256], AES1[256], AES2[256], AES3[256];
int init = get_local_id(0);
@ -858,7 +843,7 @@ __kernel void search8(__global hash_t* hashes) @@ -858,7 +843,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0];
rk01 = hash->h4[1];
@ -913,10 +898,8 @@ __kernel void search9(__global hash_t* hashes) @@ -913,10 +898,8 @@ __kernel void search9(__global hash_t* hashes)
// simd
s32 q[256];
unsigned char x[128];
for(unsigned int i = 0; i < 64; i++)
x[i] = hash->h1[i];
for(unsigned int i = 64; i < 128; i++)
x[i] = 0;
@ -926,14 +909,15 @@ __kernel void search9(__global hash_t* hashes) @@ -926,14 +909,15 @@ __kernel void search9(__global hash_t* hashes)
u32 D0 = C32(0x09254899), D1 = C32(0xD699C7BC), D2 = C32(0x9019B6DC), D3 = C32(0x2B9022E4), D4 = C32(0x8FA14956), D5 = C32(0x21BF9BD3), D6 = C32(0xB94D0943), D7 = C32(0x6FFDDC22);
FFT256(0, 1, 0, ll1);
for (int i = 0; i < 256; i ++) {
s32 tq;
tq = q[i] + yoff_b_n[i];
tq = REDS2(tq);
tq = REDS1(tq);
tq = REDS1(tq);
q[i] = (tq <= 128 ? tq : tq - 257);
for (int i = 0; i < 256; i ++)
{
s32 tq;
tq = q[i] + yoff_b_n[i];
tq = REDS2(tq);
tq = REDS1(tq);
tq = REDS1(tq);
q[i] = (tq <= 128 ? tq : tq - 257);
}
A0 ^= hash->h4[0];
@ -959,21 +943,24 @@ __kernel void search9(__global hash_t* hashes) @@ -959,21 +943,24 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
STEP_BIG(
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
IF, 4, 13, PP8_4_);
C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
IF, 4, 13, PP8_4_);
STEP_BIG(
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
IF, 13, 10, PP8_5_);
C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
IF, 13, 10, PP8_5_);
STEP_BIG(
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
IF, 10, 25, PP8_6_);
C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
IF, 10, 25, PP8_6_);
STEP_BIG(
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
IF, 25, 4, PP8_0_);
C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
IF, 25, 4, PP8_0_);
u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7;
u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7;
@ -988,22 +975,27 @@ __kernel void search9(__global hash_t* hashes) @@ -988,22 +975,27 @@ __kernel void search9(__global hash_t* hashes)
ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
STEP_BIG(
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4, 13, PP8_4_);
COPY_A0, COPY_A1, COPY_A2, COPY_A3,
COPY_A4, COPY_A5, COPY_A6, COPY_A7,
IF, 4, 13, PP8_4_);
STEP_BIG(
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13, 10, PP8_5_);
COPY_B0, COPY_B1, COPY_B2, COPY_B3,
COPY_B4, COPY_B5, COPY_B6, COPY_B7,
IF, 13, 10, PP8_5_);
STEP_BIG(
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10, 25, PP8_6_);
COPY_C0, COPY_C1, COPY_C2, COPY_C3,
COPY_C4, COPY_C5, COPY_C6, COPY_C7,
IF, 10, 25, PP8_6_);
STEP_BIG(
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25, 4, PP8_0_);
COPY_D0, COPY_D1, COPY_D2, COPY_D3,
COPY_D4, COPY_D5, COPY_D6, COPY_D7,
IF, 25, 4, PP8_0_);
#undef q
hash->h4[0] = A0;
@ -1114,49 +1106,52 @@ __kernel void search10(__global hash_t* hashes) @@ -1114,49 +1106,52 @@ __kernel void search10(__global hash_t* hashes)
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search11(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
__local sph_u32 T512_L[1024];
__constant const sph_u32 *T512_C = &T512[0][0];
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 1024; i += step)
{
T512_L[i] = T512_C[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 1024; i += step)
T512_L[i] = T512_C[i];
{
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15];
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8) {
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG_LOCAL;
PF_BIG;
T_BIG;
for (unsigned u = 0; u < 16; u ++)
hash->h4[u] = h[u];
}
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
sph_u32 c8 = HAMSI_IV512[8], c9 = HAMSI_IV512[9], cA = HAMSI_IV512[10], cB = HAMSI_IV512[11];
sph_u32 cC = HAMSI_IV512[12], cD = HAMSI_IV512[13], cE = HAMSI_IV512[14], cF = HAMSI_IV512[15];
sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8) {
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG_LOCAL;
P_BIG;
T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG_LOCAL;
PF_BIG;
T_BIG;
for (unsigned u = 0; u < 16; u ++)
hash->h4[u] = h[u];
barrier(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
@ -1346,115 +1341,115 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo @@ -1346,115 +1341,115 @@ __kernel void search14(__global hash_t* hashes, __global uint* output, const ulo
uint offset = get_global_offset(0);
__global hash_t *hash = &(hashes[gid-offset]);
__local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256];
__local sph_u64 LT0[256], LT1[256], LT2[256], LT3[256], LT4[256], LT5[256], LT6[256], LT7[256];
int init = get_local_id(0);
int step = get_local_size(0);
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
LT0[i] = plain_T0[i];
LT1[i] = plain_T1[i];
LT2[i] = plain_T2[i];
LT3[i] = plain_T3[i];
LT4[i] = plain_T4[i];
LT5[i] = plain_T5[i];
LT6[i] = plain_T6[i];
LT7[i] = plain_T7[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = init; i < 256; i += step)
{
LT0[i] = plain_T0[i];
LT1[i] = plain_T1[i];
LT2[i] = plain_T2[i];
LT3[i] = plain_T3[i];
LT4[i] = plain_T4[i];
LT5[i] = plain_T5[i];
LT6[i] = plain_T6[i];
LT7[i] = plain_T7[i];
}
barrier(CLK_LOCAL_MEM_FENCE);
// whirlpool
sph_u64 n0, n1, n2, n3, n4, n5, n6, n7;
sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;
sph_u64 state[8];
n0 = (hash->h8[0]);
n1 = (hash->h8[1]);
n2 = (hash->h8[2]);
n3 = (hash->h8[3]);
n4 = (hash->h8[4]);
n5 = (hash->h8[5]);
n6 = (hash->h8[6]);
n7 = (hash->h8[7]);
h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0;
n0 ^= h0;
n1 ^= h1;
n2 ^= h2;
n3 ^= h3;
n4 ^= h4;
n5 ^= h5;
n6 ^= h6;
n7 ^= h7;
for (unsigned r = 0; r < 10; r ++) {
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
TRANSFER(h, tmp);
ROUND_WENC(plain_T, n, h, tmp);
TRANSFER(n, tmp);
}
state[0] = n0 ^ (hash->h8[0]);
state[1] = n1 ^ (hash->h8[1]);
state[2] = n2 ^ (hash->h8[2]);
state[3] = n3 ^ (hash->h8[3]);
state[4] = n4 ^ (hash->h8[4]);
state[5] = n5 ^ (hash->h8[5]);
state[6] = n6 ^ (hash->h8[6]);
state[7] = n7 ^ (hash->h8[7]);
n0 = 0x80;
n1 = n2 = n3 = n4 = n5 = n6 = 0;
n7 = 0x2000000000000;
h0 = state[0];
h1 = state[1];
h2 = state[2];
h3 = state[3];
h4 = state[4];
h5 = state[5];
h6 = state[6];
h7 = state[7];
n0 ^= h0;
n1 ^= h1;
n2 ^= h2;
n3 ^= h3;
n4 ^= h4;
n5 ^= h5;
n6 ^= h6;
n7 ^= h7;
for (unsigned r = 0; r < 10; r ++) {
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
ROUND_KSCHED(LT, h, tmp, plain_RC[r]);
TRANSFER(h, tmp);
ROUND_WENC(plain_T, n, h, tmp);
TRANSFER(n, tmp);
}
// whirlpool
sph_u64 n0, n1, n2, n3, n4, n5, n6, n7;
sph_u64 h0, h1, h2, h3, h4, h5, h6, h7;
sph_u64 state[8];
n0 = (hash->h8[0]);
n1 = (hash->h8[1]);
n2 = (hash->h8[2]);
n3 = (hash->h8[3]);
n4 = (hash->h8[4]);
n5 = (hash->h8[5]);
n6 = (hash->h8[6]);
n7 = (hash->h8[7]);
h0 = h1 = h2 = h3 = h4 = h5 = h6 = h7 = 0;
n0 ^= h0;
n1 ^= h1;
n2 ^= h2;
n3 ^= h3;
n4 ^= h4;
n5 ^= h5;
n6 ^= h6;
n7 ^= h7;
#pragma unroll 10
for (unsigned r = 0; r < 10; r ++)
{
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
state[0] ^= n0 ^ 0x80;
state[1] ^= n1;
state[2] ^= n2;
state[3] ^= n3;
state[4] ^= n4;
state[5] ^= n5;
state[6] ^= n6;
state[7] ^= n7 ^ 0x2000000000000;
ROUND_KSCHED(plain_T, h, tmp, plain_RC[r]);
TRANSFER(h, tmp);
ROUND_WENC(plain_T, n, h, tmp);
TRANSFER(n, tmp);
}
for (unsigned i = 0; i < 8; i ++)
hash->h8[i] = state[i];
state[0] = n0 ^ (hash->h8[0]);
state[1] = n1 ^ (hash->h8[1]);
state[2] = n2 ^ (hash->h8[2]);
state[3] = n3 ^ (hash->h8[3]);
state[4] = n4 ^ (hash->h8[4]);
state[5] = n5 ^ (hash->h8[5]);
state[6] = n6 ^ (hash->h8[6]);
state[7] = n7 ^ (hash->h8[7]);
n0 = 0x80;
n1 = n2 = n3 = n4 = n5 = n6 = 0;
n7 = 0x2000000000000;
h0 = state[0];
h1 = state[1];
h2 = state[2];
h3 = state[3];
h4 = state[4];
h5 = state[5];
h6 = state[6];
h7 = state[7];
n0 ^= h0;
n1 ^= h1;
n2 ^= h2;
n3 ^= h3;
n4 ^= h4;
n5 ^= h5;
n6 ^= h6;
n7 ^= h7;
#pragma unroll 10
for (unsigned r = 0; r < 10; r ++)
{
sph_u64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// for(uint i = 0; i < 8; i++)
// output[(NUMHASH * 9) * 15 + gid * 9 + i] = hash->h8[i];
ROUND_KSCHED(LT, h, tmp, plain_RC[r]);
TRANSFER(h, tmp);
ROUND_WENC(plain_T, n, h, tmp);
TRANSFER(n, tmp);
}
// output[(NUMHASH * 9) * 15 + gid * 9 + 8] = nonce;
state[0] ^= n0 ^ 0x80;
state[1] ^= n1;
state[2] ^= n2;
state[3] ^= n3;
state[4] ^= n4;
state[5] ^= n5;
state[6] ^= n6;
state[7] ^= n7 ^ 0x2000000000000;
for (unsigned i = 0; i < 8; i ++)
hash->h8[i] = state[i];
bool result = (hash->h8[3] <= target);
if (result)

2171
kernel/bitblockold.cl

File diff suppressed because it is too large Load Diff

112
kernel/darkcoin-mod.cl

@ -95,8 +95,6 @@ @@ -95,8 +95,6 @@
#include "shavite.cl"
#include "simd.cl"
#include "echo.cl"
#include "hamsi.cl"
#include "fugue.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
@ -181,7 +179,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -181,7 +179,7 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search1(__global hash_t* hashes)
{
uint gid = get_global_id(0);
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// bmw
@ -456,91 +454,67 @@ __kernel void search2(__global hash_t* hashes) @@ -456,91 +454,67 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
T0_L[i] = T0[i];
T4_L[i] = T4[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
}
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
sph_u64 H[16];
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
H[15] = (sph_u64)512;
#endif
// groestl
sph_u64 H[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x0002000000000000};
sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]);
m[1] = DEC64E(hash->h8[1]);
m[2] = DEC64E(hash->h8[2]);
m[3] = DEC64E(hash->h8[3]);
m[4] = DEC64E(hash->h8[4]);
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
m[11] = 0; g[11] = m[11] ^ H[11];
m[12] = 0; g[12] = m[12] ^ H[12];
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
g[0] = m[0] = DEC64E(hash->h8[0]);
g[1] = m[1] = DEC64E(hash->h8[1]);
g[2] = m[2] = DEC64E(hash->h8[2]);
g[3] = m[3] = DEC64E(hash->h8[3]);
g[4] = m[4] = DEC64E(hash->h8[4]);
g[5] = m[5] = DEC64E(hash->h8[5]);
g[6] = m[6] = DEC64E(hash->h8[6]);
g[7] = m[7] = DEC64E(hash->h8[7]);
g[8] = m[8] = 0x80;
g[9] = m[9] = 0;
g[10] = m[10] = 0;
g[11] = m[11] = 0;
g[12] = m[12] = 0;
g[13] = m[13] = 0;
g[14] = m[14] = 0;
g[15] = 0x102000000000000;
m[15] = 0x100000000000000;
PERM_BIG_P(g);
PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
xH[u] = H[u] ^= g[u] ^ m[u];
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
PERM_BIG_P(xH);
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
for (unsigned int u = 8; u < 16; u ++)
hash->h8[u-8] = DEC64E(H[u] ^ xH[u]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -863,7 +837,7 @@ __kernel void search8(__global hash_t* hashes) @@ -863,7 +837,7 @@ __kernel void search8(__global hash_t* hashes)
sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
sph_u32 sc_count0 = (64 << 3), sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
sph_u32 sc_count0 = 0x200, sc_count1 = 0, sc_count2 = 0, sc_count3 = 0;
rk00 = hash->h4[0];
rk01 = hash->h4[1];

1614
kernel/marucoin-mod.cl

File diff suppressed because it is too large Load Diff

1788
kernel/marucoin-modold.cl

File diff suppressed because it is too large Load Diff

1338
kernel/x14.cl

File diff suppressed because it is too large Load Diff

1294
kernel/x14old.cl

File diff suppressed because it is too large Load Diff

1
miner.h

@ -1033,6 +1033,7 @@ extern int swork_id; @@ -1033,6 +1033,7 @@ extern int swork_id;
extern int opt_tcp_keepalive;
extern bool opt_incognito;
extern int opt_hamsi_expand_big;
extern bool opt_hamsi_short;
#if LOCK_TRACKING
extern pthread_mutex_t lockstat_lock;

6
sgminer.c

@ -192,6 +192,7 @@ int nDevs; @@ -192,6 +192,7 @@ int nDevs;
int opt_dynamic_interval = 7;
int opt_g_threads = -1;
int opt_hamsi_expand_big = 4;
bool opt_hamsi_short = false;
bool opt_restart = true;
struct list_head scan_devices;
@ -1459,7 +1460,10 @@ struct opt_table opt_config_table[] = { @@ -1459,7 +1460,10 @@ struct opt_table opt_config_table[] = {
"Set GPU lookup gap for scrypt mining, comma separated"),
OPT_WITH_ARG("--hamsi-expand-big",
set_int_1_to_10, opt_show_intval, &opt_hamsi_expand_big,
"Set SPH_HAMSI_EXPAND_BIG for X13 algorithms (1 or 4 are common)"),
"Set SPH_HAMSI_EXPAND_BIG for X13 derived algorithms (1 or 4 are common)"),
OPT_WITHOUT_ARG("--hamsi-short",
opt_set_bool, &opt_hamsi_short,
"Set SPH_HAMSI_SHORT for X13 derived algorithms (Can give better hashrate for some GPUs)"),
#ifdef HAVE_CURSES
OPT_WITHOUT_ARG("--incognito",
opt_set_bool, &opt_incognito,

2
winbuild/sgminer.vcxproj

@ -263,6 +263,7 @@ @@ -263,6 +263,7 @@
<ClCompile Include="..\algorithm\animecoin.c" />
<ClCompile Include="..\algorithm\bitblock.c" />
<ClCompile Include="..\algorithm\talkcoin.c" />
<ClCompile Include="..\algorithm\x14.c" />
<ClCompile Include="..\api.c" />
<ClCompile Include="..\ccan\opt\helpers.c" />
<ClCompile Include="..\ccan\opt\opt.c" />
@ -329,6 +330,7 @@ @@ -329,6 +330,7 @@
<ClInclude Include="..\algorithm\animecoin.h" />
<ClInclude Include="..\algorithm\bitblock.h" />
<ClInclude Include="..\algorithm\talkcoin.h" />
<ClInclude Include="..\algorithm\x14.h" />
<ClInclude Include="..\api.h" />
<ClInclude Include="..\arg-nonnull.h" />
<ClInclude Include="..\bench_block.h" />

6
winbuild/sgminer.vcxproj.filters

@ -227,6 +227,9 @@ @@ -227,6 +227,9 @@
<ClCompile Include="..\algorithm\bitblock.c">
<Filter>Source Files\algorithm</Filter>
</ClCompile>
<ClCompile Include="..\algorithm\x14.c">
<Filter>Source Files\algorithm</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\adl.h">
@ -412,6 +415,9 @@ @@ -412,6 +415,9 @@
<ClInclude Include="..\algorithm\bitblock.h">
<Filter>Header Files\algorithm</Filter>
</ClInclude>
<ClInclude Include="..\algorithm\x14.h">
<Filter>Header Files\algorithm</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="README.txt" />

Loading…
Cancel
Save