Browse Source

Merge pull request #282 from ystarnaud/x11mod

Updated x11 and x13 kernels
djm34
ystarnaud 11 years ago
parent
commit
707ddd6c4a
  1. 4
      api.c
  2. 161
      config_parser.c
  3. 7
      config_parser.h
  4. 3
      driver-opencl.c
  5. 8163
      kernel/darkcoin-mod.cl
  6. 504
      kernel/marucoin-mod.cl
  7. 2
      miner.h
  8. 102
      sgminer.c

4
api.c

@ -3585,7 +3585,7 @@ void api(int api_thr_id) @@ -3585,7 +3585,7 @@ void api(int api_thr_id)
struct sockaddr_in cli;
socklen_t clisiz;
char cmdbuf[100];
char *cmd = NULL, *cmdptr, *cmdsbuf;
char *cmd = NULL, *cmdptr, *cmdsbuf = NULL;
char *param;
bool addrok;
char group;
@ -3593,7 +3593,7 @@ void api(int api_thr_id) @@ -3593,7 +3593,7 @@ void api(int api_thr_id)
json_t *json_config = NULL;
json_t *json_val;
bool isjson;
bool did, isjoin, firstjoin;
bool did, isjoin = false, firstjoin;
int i;
SOCKETTYPE *apisock;

161
config_parser.c

@ -83,6 +83,7 @@ static struct profile *add_profile() @@ -83,6 +83,7 @@ static struct profile *add_profile()
//default profile name is the profile index
sprintf(buf, "%d", profile->profile_no);
profile->name = strdup(buf);
profile->algorithm.name[0] = '\0';
profiles = (struct profile **)realloc(profiles, sizeof(struct profile *) * (total_profiles + 2));
profiles[total_profiles++] = profile;
@ -146,6 +147,22 @@ static struct profile *get_profile(char *name) @@ -146,6 +147,22 @@ static struct profile *get_profile(char *name)
}
/******* Default profile functions used during config parsing *****/
char *set_default_algorithm(const char *arg)
{
set_algorithm(&default_profile.algorithm, arg);
applog(LOG_INFO, "Set default algorithm to %s", default_profile.algorithm.name);
return NULL;
}
char *set_default_nfactor(const char *arg)
{
set_algorithm_nfactor(&default_profile.algorithm, (const uint8_t) atoi(arg));
applog(LOG_INFO, "Set algorithm N-factor to %d (N to %d)", default_profile.algorithm.nfactor);
return NULL;
}
char *set_default_devices(const char *arg)
{
default_profile.devices = arg;
@ -696,19 +713,24 @@ void load_default_profile() @@ -696,19 +713,24 @@ void load_default_profile()
//apply default settings
void apply_defaults()
{
set_algorithm(&opt_algorithm, default_profile.algorithm.name);
//if no algorithm specified, use scrypt as default
if (empty_string(default_profile.algorithm.name))
set_algorithm(&default_profile.algorithm, "scrypt");
if (!empty_string(default_profile.devices))
//by default all unless specified
if (empty_string(default_profile.devices))
default_profile.devices = strdup("all");
set_devices((char *)default_profile.devices);
if (!empty_string(default_profile.intensity))
set_intensity(default_profile.intensity);
if (!empty_string(default_profile.xintensity))
set_xintensity(default_profile.xintensity);
//set raw intensity first
if (!empty_string(default_profile.rawintensity))
set_rawintensity(default_profile.rawintensity);
//then try xintensity
else if (!empty_string(default_profile.xintensity))
set_xintensity(default_profile.xintensity);
//then try intensity
else if (!empty_string(default_profile.intensity))
set_intensity(default_profile.intensity);
if (!empty_string(default_profile.lookup_gap))
set_lookup_gap((char *)default_profile.lookup_gap);
@ -758,68 +780,167 @@ void apply_pool_profile(struct pool *pool) @@ -758,68 +780,167 @@ void apply_pool_profile(struct pool *pool)
{
struct profile *profile;
//if the pool has a profile set
//if the pool has a profile set load it
if(!empty_string(pool->profile))
{
applog(LOG_DEBUG, "Loading settings from profile \"%s\" for pool %i", pool->profile, pool->pool_no);
//find profile and apply settings to the pool
if((profile = get_profile(pool->profile)))
if(!(profile = get_profile(pool->profile)))
{
//if not found, remove profile name and use default profile.
applog(LOG_DEBUG, "Profile load failed for pool %i: profile %s not found. Using default profile.", pool->pool_no, pool->profile);
//remove profile name
pool->profile[0] = '\0';
profile = &default_profile;
}
}
//no profile specified in pool, use default profile
else
{
applog(LOG_DEBUG, "Loading settings from default_profile for pool %i", pool->profile, pool->pool_no);
profile = &default_profile;
}
//only apply profiles settings not already defined in the pool
//if no algorithm is specified, use profile's or default profile's
if(empty_string(pool->algorithm.name))
{
if(!empty_string(profile->algorithm.name))
pool->algorithm = profile->algorithm;
else
pool->algorithm = default_profile.algorithm;
}
applog(LOG_DEBUG, "Pool %i Algorithm set to \"%s\"", pool->pool_no, pool->algorithm.name);
if(pool_cmp(pool->devices, default_profile.devices))
{
if(!empty_string(profile->devices))
pool->devices = profile->devices;
else
pool->devices = default_profile.devices;
}
applog(LOG_DEBUG, "Pool %i devices set to \"%s\"", pool->pool_no, pool->devices);
if(pool_cmp(pool->lookup_gap, default_profile.lookup_gap))
{
if(!empty_string(profile->lookup_gap))
pool->lookup_gap = profile->lookup_gap;
else
pool->lookup_gap = default_profile.lookup_gap;
}
applog(LOG_DEBUG, "Pool %i lookup gap set to \"%s\"", pool->pool_no, pool->lookup_gap);
if(pool_cmp(pool->intensity, default_profile.intensity))
{
if(!empty_string(profile->intensity))
pool->intensity = profile->intensity;
else
pool->intensity = default_profile.intensity;
}
applog(LOG_DEBUG, "Pool %i Intensity set to \"%s\"", pool->pool_no, pool->intensity);
if(pool_cmp(pool->xintensity, default_profile.xintensity))
{
if(!empty_string(profile->xintensity))
pool->xintensity = profile->xintensity;
else
pool->xintensity = default_profile.xintensity;
}
applog(LOG_DEBUG, "Pool %i XIntensity set to \"%s\"", pool->pool_no, pool->xintensity);
if(pool_cmp(pool->rawintensity, default_profile.rawintensity))
{
if(!empty_string(profile->rawintensity))
pool->rawintensity = profile->rawintensity;
else
pool->rawintensity = default_profile.rawintensity;
}
applog(LOG_DEBUG, "Pool %i Raw Intensity set to \"%s\"", pool->pool_no, pool->rawintensity);
if(pool_cmp(pool->thread_concurrency, default_profile.thread_concurrency))
{
if(!empty_string(profile->thread_concurrency))
pool->thread_concurrency = profile->thread_concurrency;
else
pool->thread_concurrency = default_profile.thread_concurrency;
}
applog(LOG_DEBUG, "Pool %i Thread Concurrency set to \"%s\"", pool->pool_no, pool->thread_concurrency);
#ifdef HAVE_ADL
#ifdef HAVE_ADL
if(pool_cmp(pool->gpu_engine, default_profile.gpu_engine))
{
if(!empty_string(profile->gpu_engine))
pool->gpu_engine = profile->gpu_engine;
else
pool->gpu_engine = default_profile.gpu_engine;
}
applog(LOG_DEBUG, "Pool %i GPU Clock set to \"%s\"", pool->pool_no, pool->gpu_engine);
if(pool_cmp(pool->gpu_memclock, default_profile.gpu_memclock))
{
if(!empty_string(profile->gpu_memclock))
pool->gpu_memclock = profile->gpu_memclock;
else
pool->gpu_memclock = default_profile.gpu_memclock;
}
applog(LOG_DEBUG, "Pool %i GPU Memory clock set to \"%s\"", pool->pool_no, pool->gpu_memclock);
if(pool_cmp(pool->gpu_threads, default_profile.gpu_threads))
{
if(!empty_string(profile->gpu_threads))
pool->gpu_threads = profile->gpu_threads;
else
pool->gpu_threads = default_profile.gpu_threads;
}
applog(LOG_DEBUG, "Pool %i GPU Threads set to \"%s\"", pool->pool_no, pool->gpu_threads);
if(pool_cmp(pool->gpu_fan, default_profile.gpu_fan))
{
if(!empty_string(profile->gpu_fan))
pool->gpu_fan = profile->gpu_fan;
else
pool->gpu_fan = default_profile.gpu_fan;
}
applog(LOG_DEBUG, "Pool %i GPU Fan set to \"%s\"", pool->pool_no, pool->gpu_fan);
if(pool_cmp(pool->gpu_powertune, default_profile.gpu_powertune))
{
if(!empty_string(profile->gpu_powertune))
pool->gpu_powertune = profile->gpu_powertune;
else
pool->gpu_powertune = default_profile.gpu_powertune;
}
applog(LOG_DEBUG, "Pool %i GPU Powertune set to \"%s\"", pool->pool_no, pool->gpu_powertune);
if(pool_cmp(pool->gpu_vddc, default_profile.gpu_vddc))
{
if(!empty_string(profile->gpu_vddc))
pool->gpu_vddc = profile->gpu_vddc;
else
pool->gpu_vddc = default_profile.gpu_vddc;
}
applog(LOG_DEBUG, "Pool %i GPU Vddc set to \"%s\"", pool->pool_no, pool->gpu_vddc);
#endif
#endif
if(pool_cmp(pool->shaders, default_profile.shaders))
{
if(!empty_string(profile->shaders))
pool->shaders = profile->shaders;
else
pool->shaders = default_profile.shaders;
}
applog(LOG_DEBUG, "Pool %i Shaders set to \"%s\"", pool->pool_no, pool->shaders);
if(pool_cmp(pool->worksize, default_profile.worksize))
{
if(!empty_string(profile->worksize))
pool->worksize = profile->worksize;
applog(LOG_DEBUG, "Pool %i Worksize set to \"%s\"", pool->pool_no, pool->worksize);
}
else
{
applog(LOG_DEBUG, "Profile load failed for pool %i: profile %s not found.", pool->pool_no, pool->profile);
//remove profile name
pool->profile[0] = '\0';
}
pool->worksize = default_profile.worksize;
}
applog(LOG_DEBUG, "Pool %i Worksize set to \"%s\"", pool->pool_no, pool->worksize);
}
//helper function to add json values to pool object
@ -1291,7 +1412,7 @@ void write_config(const char *filename) @@ -1291,7 +1412,7 @@ void write_config(const char *filename)
else
{
//save algorithm name
if(json_object_set(config, "algorithm", json_string(opt_algorithm.name)) == -1)
if(json_object_set(config, "algorithm", json_string(default_profile.algorithm.name)) == -1)
{
applog(LOG_ERR, "Error: config_parser::write_config():\n json_object_set() failed on algorithm");
return;

7
config_parser.h

@ -12,7 +12,10 @@ @@ -12,7 +12,10 @@
#define empty_string(str) ((str && str[0] != '\0')?0:1)
#endif
#ifndef safe_cmp
#define safe_cmp(val1, val2) (((val1 && strcmp(val1, val2) != 0) || empty_string(val1))?1:0)
#define safe_cmp(val1, val2) (((val1 && strcasecmp(val1, val2) != 0) || empty_string(val1))?1:0)
#endif
#ifndef pool_cmp
#define pool_cmp(val1, val2) (((val1 && val2 && strcasecmp(val1, val2) == 0) || empty_string(val1))?1:0)
#endif
#ifndef isnull
#define isnull(str, default_str) ((str == NULL)?default_str:str)
@ -60,6 +63,8 @@ extern struct profile **profiles; @@ -60,6 +63,8 @@ extern struct profile **profiles;
extern int total_profiles;
/* option parser functions */
extern char *set_default_algorithm(const char *arg);
extern char *set_default_nfactor(const char *arg);
extern char *set_default_devices(const char *arg);
extern char *set_default_lookup_gap(const char *arg);
extern char *set_default_intensity(const char *arg);

3
driver-opencl.c

@ -28,6 +28,7 @@ @@ -28,6 +28,7 @@
#include "compat.h"
#include "miner.h"
#include "config_parser.h"
#include "driver-opencl.h"
#include "findnonce.h"
#include "ocl.h"
@ -1161,7 +1162,7 @@ static void opencl_detect(bool hotplug) @@ -1161,7 +1162,7 @@ static void opencl_detect(bool hotplug)
cgpu->threads = 1;
#endif
cgpu->virtual_gpu = i;
cgpu->algorithm = opt_algorithm;
cgpu->algorithm = default_profile.algorithm;
add_cgpu(cgpu);
}

8163
kernel/darkcoin-mod.cl

File diff suppressed because it is too large Load Diff

504
kernel/marucoin-mod.cl

@ -34,9 +34,9 @@ @@ -34,9 +34,9 @@
#define X13MOD_CL
#if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1
#define SPH_LITTLE_ENDIAN 1
#else
#define SPH_BIG_ENDIAN 1
#define SPH_BIG_ENDIAN 1
#endif
#define SPH_UPTR sph_u64
@ -44,24 +44,24 @@ @@ -44,24 +44,24 @@
typedef unsigned int sph_u32;
typedef int sph_s32;
#ifndef __OPENCL_VERSION__
typedef unsigned long long sph_u64;
typedef long long sph_s64;
typedef unsigned long long sph_u64;
typedef long long sph_s64;
#else
typedef unsigned long sph_u64;
typedef long sph_s64;
typedef unsigned long sph_u64;
typedef long sph_s64;
#endif
#define SPH_64 1
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) (as_uint(x))
#define SPH_ROTL32(x, n) rotate(as_uint(x), as_uint(n))
#define SPH_T32(x) ((as_uint(x)) & SPH_C32(0xFFFFFFFF))
#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n)))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) (as_ulong(x))
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_T64(x) ((as_ulong(x)) & SPH_C64(0xFFFFFFFFFFFFFFFF))
#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n))))
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
#define SPH_ECHO_64 1
@ -76,7 +76,7 @@ typedef long sph_s64; @@ -76,7 +76,7 @@ typedef long sph_s64;
#define SPH_CUBEHASH_UNROLL 0
#define SPH_KECCAK_UNROLL 0
#if !defined SPH_HAMSI_EXPAND_BIG
#define SPH_HAMSI_EXPAND_BIG 4
#define SPH_HAMSI_EXPAND_BIG 4
#endif
#include "blake.cl"
@ -104,6 +104,14 @@ typedef long sph_s64; @@ -104,6 +104,14 @@ typedef long sph_s64;
#define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x));
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + SPH_ROTL64(q[i+1], 5) + q[i+2] + SPH_ROTL64(q[i+3], 11) + \
q[i+4] + SPH_ROTL64(q[i+5], 27) + q[i+6] + SPH_ROTL64(q[i+7], 32) + \
q[i+8] + SPH_ROTL64(q[i+9], 37) + q[i+10] + SPH_ROTL64(q[i+11], 43) + \
q[i+12] + SPH_ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
typedef union {
unsigned char h1[64];
uint h4[16];
@ -125,9 +133,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes) @@ -125,9 +133,8 @@ __kernel void search(__global unsigned char* block, __global hash_t* hashes)
sph_u64 T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + (80 << 3), T1 = 0xFFFFFFFFFFFFFFFF;;
if ((T0 = SPH_T64(T0 + 1024)) < 1024)
{
T1 = SPH_T64(T1 + 1);
}
sph_u64 M0, M1, M2, M3, M4, M5, M6, M7;
sph_u64 M8, M9, MA, MB, MC, MD, ME, MF;
sph_u64 V0, V1, V2, V3, V4, V5, V6, V7;
@ -170,58 +177,268 @@ __kernel void search1(__global hash_t* hashes) @@ -170,58 +177,268 @@ __kernel void search1(__global hash_t* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
// bmw
sph_u64 BMW_H[16];
#pragma unroll 16
for(unsigned u = 0; u < 16; u++)
BMW_H[u] = BMW_IV512[u];
sph_u64 BMW_h1[16], BMW_h2[16];
sph_u64 mv[16];
mv[ 0] = SWAP8(hash->h8[0]);
mv[ 1] = SWAP8(hash->h8[1]);
mv[ 2] = SWAP8(hash->h8[2]);
mv[ 3] = SWAP8(hash->h8[3]);
mv[ 4] = SWAP8(hash->h8[4]);
mv[ 5] = SWAP8(hash->h8[5]);
mv[ 6] = SWAP8(hash->h8[6]);
mv[ 7] = SWAP8(hash->h8[7]);
mv[ 8] = 0x80;
mv[ 9] = 0;
sph_u64 mv[16],q[32];
sph_u64 tmp;
mv[0] = SWAP8(hash->h8[0]);
mv[1] = SWAP8(hash->h8[1]);
mv[2] = SWAP8(hash->h8[2]);
mv[3] = SWAP8(hash->h8[3]);
mv[4] = SWAP8(hash->h8[4]);
mv[5] = SWAP8(hash->h8[5]);
mv[6] = SWAP8(hash->h8[6]);
mv[7] = SWAP8(hash->h8[7]);
mv[8] = 0x80;
mv[9] = 0;
mv[10] = 0;
mv[11] = 0;
mv[12] = 0;
mv[13] = 0;
mv[14] = 0;
mv[15] = 0x200;
#define M(x) (mv[x])
#define H(x) (BMW_H[x])
#define dH(x) (BMW_h2[x])
mv[15] = SPH_C64(512);
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
FOLDb;
sph_u64 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
sph_u64 XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
#pragma unroll 16
for(int i=0;i<16;i++)
{
mv[i] = BMW_H[i];
BMW_H[i] = 0xaaaaaaaaaaaaaaa0ull + (sph_u64)i;
}
#undef M
#undef H
#undef dH
tmp = (mv[5] ^ BMW_H[5]) - (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]) + (mv[14] ^ BMW_H[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[1];
tmp = (mv[6] ^ BMW_H[6]) - (mv[8] ^ BMW_H[8]) + (mv[11] ^ BMW_H[11]) + (mv[14] ^ BMW_H[14]) - (mv[15] ^ BMW_H[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[2];
tmp = (mv[0] ^ BMW_H[0]) + (mv[7] ^ BMW_H[7]) + (mv[9] ^ BMW_H[9]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[3];
tmp = (mv[0] ^ BMW_H[0]) - (mv[1] ^ BMW_H[1]) + (mv[8] ^ BMW_H[8]) - (mv[10] ^ BMW_H[10]) + (mv[13] ^ BMW_H[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[4];
tmp = (mv[1] ^ BMW_H[1]) + (mv[2] ^ BMW_H[2]) + (mv[9] ^ BMW_H[9]) - (mv[11] ^ BMW_H[11]) - (mv[14] ^ BMW_H[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + BMW_H[5];
tmp = (mv[3] ^ BMW_H[3]) - (mv[2] ^ BMW_H[2]) + (mv[10] ^ BMW_H[10]) - (mv[12] ^ BMW_H[12]) + (mv[15] ^ BMW_H[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[6];
tmp = (mv[4] ^ BMW_H[4]) - (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) - (mv[11] ^ BMW_H[11]) + (mv[13] ^ BMW_H[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[7];
tmp = (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[5] ^ BMW_H[5]) - (mv[12] ^ BMW_H[12]) - (mv[14] ^ BMW_H[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[8];
tmp = (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) - (mv[6] ^ BMW_H[6]) + (mv[13] ^ BMW_H[13]) - (mv[15] ^ BMW_H[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[9];
tmp = (mv[0] ^ BMW_H[0]) - (mv[3] ^ BMW_H[3]) + (mv[6] ^ BMW_H[6]) - (mv[7] ^ BMW_H[7]) + (mv[14] ^ BMW_H[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + BMW_H[10];
tmp = (mv[8] ^ BMW_H[8]) - (mv[1] ^ BMW_H[1]) - (mv[4] ^ BMW_H[4]) - (mv[7] ^ BMW_H[7]) + (mv[15] ^ BMW_H[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[11];
tmp = (mv[8] ^ BMW_H[8]) - (mv[0] ^ BMW_H[0]) - (mv[2] ^ BMW_H[2]) - (mv[5] ^ BMW_H[5]) + (mv[9] ^ BMW_H[9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 13) ^ SPH_ROTL64(tmp, 43)) + BMW_H[12];
tmp = (mv[1] ^ BMW_H[1]) + (mv[3] ^ BMW_H[3]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[10] ^ BMW_H[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ SPH_ROTL64(tmp, 19) ^ SPH_ROTL64(tmp, 53)) + BMW_H[13];
tmp = (mv[2] ^ BMW_H[2]) + (mv[4] ^ BMW_H[4]) + (mv[7] ^ BMW_H[7]) + (mv[10] ^ BMW_H[10]) + (mv[11] ^ BMW_H[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ SPH_ROTL64(tmp, 28) ^ SPH_ROTL64(tmp, 59)) + BMW_H[14];
tmp = (mv[3] ^ BMW_H[3]) - (mv[5] ^ BMW_H[5]) + (mv[8] ^ BMW_H[8]) - (mv[11] ^ BMW_H[11]) - (mv[12] ^ BMW_H[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + BMW_H[15];
tmp = (mv[12] ^ BMW_H[12]) - (mv[4] ^ BMW_H[4]) - (mv[6] ^ BMW_H[6]) - (mv[9] ^ BMW_H[9]) + (mv[13] ^ BMW_H[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ SPH_ROTL64(tmp, 4) ^ SPH_ROTL64(tmp, 37)) + BMW_H[0];
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ SPH_ROTL64(q[i], 13) ^ SPH_ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ SPH_ROTL64(q[i+1], 19) ^ SPH_ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ SPH_ROTL64(q[i+2], 28) ^ SPH_ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ SPH_ROTL64(q[i+3], 4) ^ SPH_ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ SPH_ROTL64(q[i+4], 13) ^ SPH_ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ SPH_ROTL64(q[i+5], 19) ^ SPH_ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ SPH_ROTL64(q[i+6], 28) ^ SPH_ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ SPH_ROTL64(q[i+7], 4) ^ SPH_ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ SPH_ROTL64(q[i+8], 13) ^ SPH_ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ SPH_ROTL64(q[i+9], 19) ^ SPH_ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ SPH_ROTL64(q[i+10], 28) ^ SPH_ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ SPH_ROTL64(q[i+11], 4) ^ SPH_ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ SPH_ROTL64(q[i+12], 13) ^ SPH_ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ SPH_ROTL64(q[i+13], 19) ^ SPH_ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ SPH_ROTL64(q[i+14], 28) ^ SPH_ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ SPH_ROTL64(q[i+15], 4) ^ SPH_ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
#define M(x) (BMW_h2[x])
#define H(x) (final_b[x])
#define dH(x) (BMW_h1[x])
#pragma unroll 4
for(int i=2;i<6;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i+10], i+11) ) ^ BMW_H[i+7]);
}
FOLDb;
#pragma unroll 3
for(int i=6;i<9;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i+7]);
}
#undef M
#undef H
#undef dH
#pragma unroll 4
for(int i=9;i<13;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i+3], i+4) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
hash->h8[0] = SWAP8(BMW_h1[8]);
hash->h8[1] = SWAP8(BMW_h1[9]);
hash->h8[2] = SWAP8(BMW_h1[10]);
hash->h8[3] = SWAP8(BMW_h1[11]);
hash->h8[4] = SWAP8(BMW_h1[12]);
hash->h8[5] = SWAP8(BMW_h1[13]);
hash->h8[6] = SWAP8(BMW_h1[14]);
hash->h8[7] = SWAP8(BMW_h1[15]);
#pragma unroll 3
for(int i=13;i<16;i++)
{
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + SPH_ROTL64(mv[i], i+1) +
SPH_ROTL64(mv[i-13], (i-13)+1) - SPH_ROTL64(mv[i-6], (i-6)+1) ) ^ BMW_H[i-9]);
}
XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
BMW_H[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ mv[0]) + ( XL64 ^ q[24] ^ q[0]);
BMW_H[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ mv[1]) + ( XL64 ^ q[25] ^ q[1]);
BMW_H[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ mv[2]) + ( XL64 ^ q[26] ^ q[2]);
BMW_H[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ mv[3]) + ( XL64 ^ q[27] ^ q[3]);
BMW_H[4] = (SHR(XH64, 3) ^ q[20] ^ mv[4]) + ( XL64 ^ q[28] ^ q[4]);
BMW_H[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ mv[5]) + ( XL64 ^ q[29] ^ q[5]);
BMW_H[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ mv[6]) + ( XL64 ^ q[30] ^ q[6]);
BMW_H[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ mv[7]) + ( XL64 ^ q[31] ^ q[7]);
BMW_H[8] = SPH_ROTL64(BMW_H[4], 9) + ( XH64 ^ q[24] ^ mv[8]) + (SHL(XL64,8) ^ q[23] ^ q[8]);
BMW_H[9] = SPH_ROTL64(BMW_H[5],10) + ( XH64 ^ q[25] ^ mv[9]) + (SHR(XL64,6) ^ q[16] ^ q[9]);
BMW_H[10] = SPH_ROTL64(BMW_H[6],11) + ( XH64 ^ q[26] ^ mv[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
BMW_H[11] = SPH_ROTL64(BMW_H[7],12) + ( XH64 ^ q[27] ^ mv[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
BMW_H[12] = SPH_ROTL64(BMW_H[0],13) + ( XH64 ^ q[28] ^ mv[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
BMW_H[13] = SPH_ROTL64(BMW_H[1],14) + ( XH64 ^ q[29] ^ mv[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
BMW_H[14] = SPH_ROTL64(BMW_H[2],15) + ( XH64 ^ q[30] ^ mv[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
BMW_H[15] = SPH_ROTL64(BMW_H[3],16) + ( XH64 ^ q[31] ^ mv[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
hash->h8[0] = SWAP8(BMW_H[8]);
hash->h8[1] = SWAP8(BMW_H[9]);
hash->h8[2] = SWAP8(BMW_H[10]);
hash->h8[3] = SWAP8(BMW_H[11]);
hash->h8[4] = SWAP8(BMW_H[12]);
hash->h8[5] = SWAP8(BMW_H[13]);
hash->h8[6] = SWAP8(BMW_H[14]);
hash->h8[7] = SWAP8(BMW_H[15]);
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -232,43 +449,53 @@ __kernel void search2(__global hash_t* hashes) @@ -232,43 +449,53 @@ __kernel void search2(__global hash_t* hashes)
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__local sph_u64 T0_L[256], T1_L[256], T2_L[256], T3_L[256], T4_L[256], T5_L[256], T6_L[256], T7_L[256];
#if !SPH_SMALL_FOOTPRINT_GROESTL
__local sph_u64 T0_C[256], T1_C[256], T2_C[256], T3_C[256];
__local sph_u64 T4_C[256], T5_C[256], T6_C[256], T7_C[256];
#else
__local sph_u64 T0_C[256], T4_C[256];
#endif
int init = get_local_id(0);
int step = get_local_size(0);
for (int i = init; i < 256; i += step)
{
T0_L[i] = T0[i];
T1_L[i] = T1[i];
T2_L[i] = T2[i];
T3_L[i] = T3[i];
T4_L[i] = T4[i];
T5_L[i] = T5[i];
T6_L[i] = T6[i];
T7_L[i] = T7[i];
T0_C[i] = T0[i];
T4_C[i] = T4[i];
#if !SPH_SMALL_FOOTPRINT_GROESTL
T1_C[i] = T1[i];
T2_C[i] = T2[i];
T3_C[i] = T3[i];
T5_C[i] = T5[i];
T6_C[i] = T6[i];
T7_C[i] = T7[i];
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
#define T0 T0_L
#define T1 T1_L
#define T2 T2_L
#define T3 T3_L
#define T4 T4_L
#define T5 T5_L
#define T6 T6_L
#define T7 T7_L
barrier(CLK_LOCAL_MEM_FENCE); // groestl
#define T0 T0_C
#define T1 T1_C
#define T2 T2_C
#define T3 T3_C
#define T4 T4_C
#define T5 T5_C
#define T6 T6_C
#define T7 T7_C
// groestl
sph_u64 H[16];
for (unsigned int u = 0; u < 15; u ++)
H[u] = 0;
#if USE_LE
#if USE_LE
H[15] = ((sph_u64)(512 & 0xFF) << 56) | ((sph_u64)(512 & 0xFF00) << 40);
#else
#else
H[15] = (sph_u64)512;
#endif
#endif
sph_u64 g[16], m[16];
m[0] = DEC64E(hash->h8[0]);
@ -279,8 +506,10 @@ __kernel void search2(__global hash_t* hashes) @@ -279,8 +506,10 @@ __kernel void search2(__global hash_t* hashes)
m[5] = DEC64E(hash->h8[5]);
m[6] = DEC64E(hash->h8[6]);
m[7] = DEC64E(hash->h8[7]);
for (unsigned int u = 0; u < 16; u ++)
g[u] = m[u] ^ H[u];
m[8] = 0x80; g[8] = m[8] ^ H[8];
m[9] = 0; g[9] = m[9] ^ H[9];
m[10] = 0; g[10] = m[10] ^ H[10];
@ -289,16 +518,23 @@ __kernel void search2(__global hash_t* hashes) @@ -289,16 +518,23 @@ __kernel void search2(__global hash_t* hashes)
m[13] = 0; g[13] = m[13] ^ H[13];
m[14] = 0; g[14] = m[14] ^ H[14];
m[15] = 0x100000000000000; g[15] = m[15] ^ H[15];
PERM_BIG_P(g);
PERM_BIG_Q(m);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= g[u] ^ m[u];
sph_u64 xH[16];
for (unsigned int u = 0; u < 16; u ++)
xH[u] = H[u];
PERM_BIG_P(xH);
for (unsigned int u = 0; u < 16; u ++)
H[u] ^= xH[u];
for (unsigned int u = 0; u < 8; u ++)
hash->h8[u] = DEC64E(H[u + 8]);
@ -355,7 +591,8 @@ __kernel void search4(__global hash_t* hashes) @@ -355,7 +591,8 @@ __kernel void search4(__global hash_t* hashes)
for(int i = 0; i < 2; i++)
{
if (i == 0) {
if (i == 0)
{
h0h ^= DEC64E(hash->h8[0]);
h0l ^= DEC64E(hash->h8[1]);
h1h ^= DEC64E(hash->h8[2]);
@ -364,7 +601,9 @@ __kernel void search4(__global hash_t* hashes) @@ -364,7 +601,9 @@ __kernel void search4(__global hash_t* hashes)
h2l ^= DEC64E(hash->h8[5]);
h3h ^= DEC64E(hash->h8[6]);
h3l ^= DEC64E(hash->h8[7]);
} else if(i == 1) {
}
else if(i == 1)
{
h4h ^= DEC64E(hash->h8[0]);
h4l ^= DEC64E(hash->h8[1]);
h5h ^= DEC64E(hash->h8[2]);
@ -471,7 +710,8 @@ __kernel void search6(__global hash_t* hashes) @@ -471,7 +710,8 @@ __kernel void search6(__global hash_t* hashes)
MI5;
LUFFA_P5;
if(i == 0) {
if(i == 0)
{
M0 = hash->h4[9];
M1 = hash->h4[8];
M2 = hash->h4[11];
@ -480,12 +720,16 @@ __kernel void search6(__global hash_t* hashes) @@ -480,12 +720,16 @@ __kernel void search6(__global hash_t* hashes)
M5 = hash->h4[12];
M6 = hash->h4[15];
M7 = hash->h4[14];
} else if(i == 1) {
}
else if(i == 1)
{
M0 = 0x80000000;
M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
} else if(i == 2) {
}
else if(i == 2)
M0 = M1 = M2 = M3 = M4 = M5 = M6 = M7 = 0;
} else if(i == 3) {
else if(i == 3)
{
hash->h4[1] = V00 ^ V10 ^ V20 ^ V30 ^ V40;
hash->h4[0] = V01 ^ V11 ^ V21 ^ V31 ^ V41;
hash->h4[3] = V02 ^ V12 ^ V22 ^ V32 ^ V42;
@ -535,10 +779,12 @@ __kernel void search7(__global hash_t* hashes) @@ -535,10 +779,12 @@ __kernel void search7(__global hash_t* hashes)
x6 ^= SWAP4(hash->h4[7]);
x7 ^= SWAP4(hash->h4[6]);
for (int i = 0; i < 13; i ++) {
for (int i = 0; i < 13; i ++)
{
SIXTEEN_ROUNDS;
if (i == 0) {
if (i == 0)
{
x0 ^= SWAP4(hash->h4[9]);
x1 ^= SWAP4(hash->h4[8]);
x2 ^= SWAP4(hash->h4[11]);
@ -547,12 +793,12 @@ __kernel void search7(__global hash_t* hashes) @@ -547,12 +793,12 @@ __kernel void search7(__global hash_t* hashes)
x5 ^= SWAP4(hash->h4[12]);
x6 ^= SWAP4(hash->h4[15]);
x7 ^= SWAP4(hash->h4[14]);
} else if(i == 1) {
}
else if(i == 1)
x0 ^= 0x80;
} else if (i == 2) {
else if (i == 2)
xv ^= SPH_C32(1);
}
}
hash->h4[0] = x0;
hash->h4[1] = x1;
@ -662,8 +908,10 @@ __kernel void search9(__global hash_t* hashes) @@ -662,8 +908,10 @@ __kernel void search9(__global hash_t* hashes)
// simd
s32 q[256];
unsigned char x[128];
for(unsigned int i = 0; i < 64; i++)
x[i] = hash->h1[i];
for(unsigned int i = 64; i < 128; i++)
x[i] = 0;
@ -796,9 +1044,8 @@ __kernel void search10(__global hash_t* hashes) @@ -796,9 +1044,8 @@ __kernel void search10(__global hash_t* hashes)
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = 0; i < 8; i++) {
for (int i = 0; i < 8; i++)
hash.h8[i] = hashes[gid-offset].h8[i];
}
// echo
sph_u64 W00, W01, W10, W11, W20, W21, W30, W31, W40, W41, W50, W51, W60, W61, W70, W71, W80, W81, W90, W91, WA0, WA1, WB0, WB1, WC0, WC1, WD0, WD1, WE0, WE1, WF0, WF1;
@ -844,9 +1091,8 @@ __kernel void search10(__global hash_t* hashes) @@ -844,9 +1091,8 @@ __kernel void search10(__global hash_t* hashes)
WF0 = 0x200;
WF1 = 0;
for (unsigned u = 0; u < 10; u ++) {
for (unsigned u = 0; u < 10; u ++)
BIG_ROUND;
}
hashp->h8[0] = hash.h8[0] ^ Vb00 ^ W00 ^ W80;
hashp->h8[1] = hash.h8[1] ^ Vb01 ^ W01 ^ W81;
@ -865,8 +1111,7 @@ __kernel void search11(__global hash_t* hashes) @@ -865,8 +1111,7 @@ __kernel void search11(__global hash_t* hashes)
{
uint gid = get_global_id(0);
uint offset = get_global_offset(0);
hash_t hash;
__global hash_t *hashp = &(hashes[gid-offset]);
__global hash_t *hash = &(hashes[gid-offset]);
sph_u32 c0 = HAMSI_IV512[0], c1 = HAMSI_IV512[1], c2 = HAMSI_IV512[2], c3 = HAMSI_IV512[3];
sph_u32 c4 = HAMSI_IV512[4], c5 = HAMSI_IV512[5], c6 = HAMSI_IV512[6], c7 = HAMSI_IV512[7];
@ -876,29 +1121,28 @@ __kernel void search11(__global hash_t* hashes) @@ -876,29 +1121,28 @@ __kernel void search11(__global hash_t* hashes)
sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
sph_u32 h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
for (int i = 0; i < 8; i++) {
hash.h8[i] = hashes[gid-offset].h8[i];
}
#define buf(u) hash.h1[i + u]
for(int i = 0; i < 64; i += 8) {
#define buf(u) hash->h1[i + u]
for(int i = 0; i < 64; i += 8)
{
INPUT_BIG;
P_BIG;
T_BIG;
}
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
#undef buf
#define buf(u) (u == 0 ? 0x80 : 0)
INPUT_BIG;
P_BIG;
T_BIG;
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
#undef buf
#define buf(u) (u == 6 ? 2 : 0)
INPUT_BIG;
PF_BIG;
T_BIG;
for (unsigned u = 0; u < 16; u ++)
hashp->h4[u] = h[u];
for(unsigned u = 0; u < 16; u ++)
hash->h4[u] = h[u];
barrier(CLK_GLOBAL_MEM_FENCE);
}
@ -908,12 +1152,7 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo @@ -908,12 +1152,7 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo
{
uint gid = get_global_id(0);
uint offset = get_global_offset(0);
hash_t hash;
__global hash_t *hashp = &(hashes[gid-offset]);
for (int i = 0; i < 8; i++) {
hash.h8[i] = hashes[gid-offset].h8[i];
}
__global hash_t *hash = &(hashes[gid-offset]);
// fugue
sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
@ -929,22 +1168,25 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo @@ -929,22 +1168,25 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo
S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
FUGUE512_3((hash.h4[0x0]), (hash.h4[0x1]), (hash.h4[0x2]));
FUGUE512_3((hash.h4[0x3]), (hash.h4[0x4]), (hash.h4[0x5]));
FUGUE512_3((hash.h4[0x6]), (hash.h4[0x7]), (hash.h4[0x8]));
FUGUE512_3((hash.h4[0x9]), (hash.h4[0xA]), (hash.h4[0xB]));
FUGUE512_3((hash.h4[0xC]), (hash.h4[0xD]), (hash.h4[0xE]));
FUGUE512_3((hash.h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
FUGUE512_3((hash->h4[0x0]), (hash->h4[0x1]), (hash->h4[0x2]));
FUGUE512_3((hash->h4[0x3]), (hash->h4[0x4]), (hash->h4[0x5]));
FUGUE512_3((hash->h4[0x6]), (hash->h4[0x7]), (hash->h4[0x8]));
FUGUE512_3((hash->h4[0x9]), (hash->h4[0xA]), (hash->h4[0xB]));
FUGUE512_3((hash->h4[0xC]), (hash->h4[0xD]), (hash->h4[0xE]));
FUGUE512_3((hash->h4[0xF]), as_uint2(fc_bit_count).y, as_uint2(fc_bit_count).x);
// apply round shift if necessary
int i;
for (i = 0; i < 32; i ++) {
for (i = 0; i < 32; i ++)
{
ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03);
}
for (i = 0; i < 13; i ++) {
for (i = 0; i < 13; i ++)
{
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
@ -975,24 +1217,24 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo @@ -975,24 +1217,24 @@ __kernel void search12(__global hash_t* hashes, __global uint* output, const ulo
S18 ^= S00;
S27 ^= S00;
hash.h4[0] = SWAP4(S01);
hash.h4[1] = SWAP4(S02);
hash.h4[2] = SWAP4(S03);
hash.h4[3] = SWAP4(S04);
hash.h4[4] = SWAP4(S09);
hash.h4[5] = SWAP4(S10);
hash.h4[6] = SWAP4(S11);
hash.h4[7] = SWAP4(S12);
hash.h4[8] = SWAP4(S18);
hash.h4[9] = SWAP4(S19);
hash.h4[10] = SWAP4(S20);
hash.h4[11] = SWAP4(S21);
hash.h4[12] = SWAP4(S27);
hash.h4[13] = SWAP4(S28);
hash.h4[14] = SWAP4(S29);
hash.h4[15] = SWAP4(S30);
bool result = (hash.h8[3] <= target);
hash->h4[0] = SWAP4(S01);
hash->h4[1] = SWAP4(S02);
hash->h4[2] = SWAP4(S03);
hash->h4[3] = SWAP4(S04);
hash->h4[4] = SWAP4(S09);
hash->h4[5] = SWAP4(S10);
hash->h4[6] = SWAP4(S11);
hash->h4[7] = SWAP4(S12);
hash->h4[8] = SWAP4(S18);
hash->h4[9] = SWAP4(S19);
hash->h4[10] = SWAP4(S20);
hash->h4[11] = SWAP4(S21);
hash->h4[12] = SWAP4(S27);
hash->h4[13] = SWAP4(S28);
hash->h4[14] = SWAP4(S29);
hash->h4[15] = SWAP4(S30);
bool result = (hash->h8[3] <= target);
if (result)
output[atomic_inc(output+0xFF)] = SWAP4(gid);

2
miner.h

@ -1008,7 +1008,7 @@ extern int opt_queue; @@ -1008,7 +1008,7 @@ extern int opt_queue;
extern int opt_scantime;
extern int opt_expiry;
extern algorithm_t opt_algorithm;
//extern algorithm_t opt_algorithm;
extern cglock_t control_lock;
extern pthread_mutex_t hash_lock;

102
sgminer.c

@ -91,7 +91,7 @@ int opt_queue = 1; @@ -91,7 +91,7 @@ int opt_queue = 1;
int opt_scantime = 7;
int opt_expiry = 28;
algorithm_t opt_algorithm;
//algorithm_t opt_algorithm;
unsigned long long global_hashrate;
unsigned long global_quota_gcd = 1;
@ -513,9 +513,7 @@ struct pool *add_pool(void) @@ -513,9 +513,7 @@ struct pool *add_pool(void)
buf[0] = '\0';
pool->name = strdup(buf);
pool->profile = strdup(buf); //profile blank by default
/* Algorithm */
pool->algorithm = opt_algorithm;
pool->algorithm.name[0] = '\0'; //blank algorithm name
pools = (struct pool **)realloc(pools, sizeof(struct pool *) * (total_pools + 2));
pools[total_pools++] = pool;
@ -637,7 +635,7 @@ char *set_devices(char *arg) @@ -637,7 +635,7 @@ char *set_devices(char *arg)
int i, val1 = 0, val2 = 0;
char *nextptr;
if (*arg)
if(*arg)
{
if (*arg == '?')
{
@ -645,9 +643,9 @@ char *set_devices(char *arg) @@ -645,9 +643,9 @@ char *set_devices(char *arg)
return NULL;
}
//all devices enabled
else if(*arg == '*')
else if(*arg == '*' || !strcasecmp(arg, "all"))
{
applog(LOG_DEBUG, "set_devices(%s)", arg);
applog(LOG_DEBUG, "set_devices(all)");
opt_devs_enabled = 0;
return NULL;
}
@ -1165,31 +1163,6 @@ static void load_temp_cutoffs() @@ -1165,31 +1163,6 @@ static void load_temp_cutoffs()
}
}
static char *set_algo(const char *arg)
{
if ((json_array_index < 0) || (total_pools == 0)) {
set_algorithm(&opt_algorithm, arg);
applog(LOG_INFO, "Set default algorithm to %s", opt_algorithm.name);
} else {
set_pool_algorithm(arg);
}
return NULL;
}
static char *set_nfactor(const char *arg)
{
if ((json_array_index < 0) || (total_pools == 0)) {
set_algorithm_nfactor(&opt_algorithm, (const uint8_t) atoi(arg));
applog(LOG_INFO, "Set algorithm N-factor to %d (N to %d)",
opt_algorithm.nfactor, opt_algorithm.n);
} else {
set_pool_nfactor(arg);
}
return NULL;
}
static char *set_api_allow(const char *arg)
{
opt_set_charp(arg, &opt_api_allow);
@ -1251,7 +1224,7 @@ char *set_difficulty_multiplier(char *arg) @@ -1251,7 +1224,7 @@ char *set_difficulty_multiplier(char *arg)
/* These options are available from config file or commandline */
struct opt_table opt_config_table[] = {
OPT_WITH_ARG("--algorithm",
set_algo, NULL, NULL,
set_default_algorithm, NULL, NULL,
"Set mining algorithm and most common defaults, default: scrypt"),
OPT_WITH_ARG("--api-allow",
set_api_allow, NULL, NULL,
@ -1437,7 +1410,7 @@ struct opt_table opt_config_table[] = { @@ -1437,7 +1410,7 @@ struct opt_table opt_config_table[] = {
opt_set_bool, &opt_delaynet,
"Impose small delays in networking to not overload slow routers"),
OPT_WITH_ARG("--nfactor",
set_nfactor, NULL, NULL,
set_default_nfactor, NULL, NULL,
"Override default scrypt N-factor parameter."),
#ifdef HAVE_ADL
OPT_WITHOUT_ARG("--no-adl",
@ -5925,8 +5898,10 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) @@ -5925,8 +5898,10 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
bool soft_restart = !work->pool->gpu_threads;
rd_lock(&mining_thr_lock);
// Shutdown all threads first (necessary)
if (soft_restart) {
for (i = 0; i < start_threads; i++) {
if (soft_restart)
{
for (i = 0; i < start_threads; i++)
{
struct thr_info *thr = mining_thr[i];
thr->cgpu->drv->thread_shutdown(thr);
}
@ -5955,23 +5930,38 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) @@ -5955,23 +5930,38 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
else if(!empty_string(default_profile.lookup_gap))
set_lookup_gap((char *)default_profile.lookup_gap);
//intensity
if(!empty_string(work->pool->intensity))
set_intensity(work->pool->intensity);
else if(!empty_string(default_profile.intensity))
set_intensity(default_profile.intensity);
//xintensity
if (!empty_string(work->pool->xintensity))
set_xintensity(work->pool->xintensity);
else if(!empty_string(default_profile.xintensity))
set_xintensity(default_profile.xintensity);
//raw intensity
//raw intensity from pool
if (!empty_string(work->pool->rawintensity))
set_rawintensity(work->pool->rawintensity);
//raw intensity from default profile
else if(!empty_string(default_profile.rawintensity))
set_rawintensity(default_profile.rawintensity);
//if no rawintensity is set try xintensity
else
{
//xintensity from pool
if (!empty_string(work->pool->xintensity))
set_xintensity(work->pool->xintensity);
//xintensity from default profile
else if(!empty_string(default_profile.xintensity))
set_xintensity(default_profile.xintensity);
//no xintensity set try intensity
else
{
//intensity from pool
if(!empty_string(work->pool->intensity))
set_intensity(work->pool->intensity);
//intensity from defaults
else if(!empty_string(default_profile.intensity))
set_intensity(default_profile.intensity);
//nothing set anywhere, use 8
else
{
default_profile.intensity = strdup("8");
set_intensity(default_profile.intensity);
}
}
}
//shaders
if (!empty_string(work->pool->shaders))
@ -5985,7 +5975,7 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) @@ -5985,7 +5975,7 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
else if(!empty_string(default_profile.worksize))
set_worksize((char *)default_profile.worksize);
#ifdef HAVE_ADL
#ifdef HAVE_ADL
//GPU clock
if(!empty_string(work->pool->gpu_engine))
set_gpu_engine(work->pool->gpu_engine);
@ -6025,14 +6015,15 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) @@ -6025,14 +6015,15 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
set_powertune(i, gpus[i].gpu_powertune);
set_vddc(i, gpus[i].gpu_vddc);
}
#endif
#endif
// Change algorithm for each thread (thread_prepare calls initCl)
for (i = 0; i < start_threads; i++)
{
struct thr_info *thr = mining_thr[i];
thr->cgpu->algorithm = work->pool->algorithm;
if (soft_restart) {
if (soft_restart)
{
thr->cgpu->drv->thread_prepare(thr);
thr->cgpu->drv->thread_init(thr);
}
@ -6040,6 +6031,7 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) @@ -6040,6 +6031,7 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
// Necessary because algorithms can have dramatically different diffs
thr->cgpu->drv->working_diff = 1;
}
rd_unlock(&mining_thr_lock);
// Finish switching pools
@ -6111,7 +6103,9 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work) @@ -6111,7 +6103,9 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
pthread_cond_broadcast(&algo_switch_wait_cond);
mutex_unlock(&algo_switch_wait_lock);
// Not all threads are waiting, join the waiting list
} else {
}
else
{
mutex_unlock(&algo_switch_lock);
pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
// Wait for signal to start working again
@ -7785,7 +7779,7 @@ int main(int argc, char *argv[]) @@ -7785,7 +7779,7 @@ int main(int argc, char *argv[])
#endif
/* Default algorithm specified in algorithm.c ATM */
set_algorithm(&opt_algorithm, "scrypt");
set_algorithm(&default_profile.algorithm, "scrypt");
devcursor = 8;
logstart = devcursor + 1;

Loading…
Cancel
Save