diff --git a/kernel/alexkarnew.cl b/kernel/alexkarnew.cl index 757e8114..43486c3e 100644 --- a/kernel/alexkarnew.cl +++ b/kernel/alexkarnew.cl @@ -28,6 +28,11 @@ * online backup system. */ +/* Backwards compatibility, if NFACTOR not defined, default to 1024 scrypt */ +#ifndef NFACTOR +#define NFACTOR 1024 +#endif + __constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 }; __constant uint K[] = { 0x428a2f98U, @@ -761,7 +766,7 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup) uint CO=rotl(x,3U); uint CO_tmp=rotl(xSIZE,3U); - for(uint y=0; y<1024/LOOKUP_GAP; ++y, CO+=CO_tmp) + for(uint y=0; y0)); + const uint ySIZE = (NFACTOR/LOOKUP_GAP+(NFACTOR%LOOKUP_GAP>0)); const uint xSIZE = CONCURRENT_THREADS; uint x = get_global_id(0)%xSIZE; - for(uint y=0; y<1024/LOOKUP_GAP; ++y) + for(uint y=0; y0)); + const uint ySIZE = (NFACTOR/LOOKUP_GAP+(NFACTOR%LOOKUP_GAP>0)); const uint xSIZE = CONCURRENT_THREADS; uint x = get_global_id(0)%xSIZE; - for(uint y=0; y<1024/LOOKUP_GAP; ++y) + for(uint y=0; y0)); + const uint ySIZE = (NFACTOR/LOOKUP_GAP+(NFACTOR%LOOKUP_GAP>0)); const uint xSIZE = CONCURRENT_THREADS; uint x = get_global_id(0)%xSIZE; - for(uint y=0; y<1024/LOOKUP_GAP; ++y) + for(uint y=0; y<(NFACTOR/LOOKUP_GAP); ++y) { for(uint z=0; zopt_tc) { unsigned int sixtyfours; - sixtyfours = cgpu->max_alloc / 131072 / 64 - 1; + sixtyfours = cgpu->max_alloc / 131072 / 64 / (nfactor/1024)- 1; cgpu->thread_concurrency = sixtyfours * 64; if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; @@ -521,7 +522,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) if (clState->goffset) strcat(binaryfilename, "g"); - sprintf(numbuf, "lg%utc%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency); + sprintf(numbuf, "lg%utc%un%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency,opt_nfactor); strcat(binaryfilename, numbuf); sprintf(numbuf, "w%d", (int)clState->wsize); @@ -587,8 +588,8 @@ build: /* create a cl program executable for all the devices specified */ char *CompilerOptions = (char *)calloc(1, 256); - sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d", - cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize); + sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d -D NFACTOR=%d", + cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize,(unsigned int)nfactor); applog(LOG_DEBUG, "Setting worksize to %d", (int)(clState->wsize)); if (clState->vwidth > 1) @@ -777,7 +778,7 @@ built: return NULL; } - size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0)); + size_t ipt = (nfactor / cgpu->lookup_gap + (nfactor % cgpu->lookup_gap > 0)); size_t bufsize = 128 * ipt * cgpu->thread_concurrency; /* Use the max alloc value which has been rounded to a power of diff --git a/scrypt.c b/scrypt.c index d135a72d..b2cc005d 100644 --- a/scrypt.c +++ b/scrypt.c @@ -356,7 +356,7 @@ salsa20_8(uint32_t B[16], const uint32_t Bx[16]) /* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes */ -static void scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad, uint32_t *ostate) +static void scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad, uint32_t *ostate, const cl_uint n) { uint32_t * V; uint32_t X[32]; @@ -370,7 +370,7 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad, uint PBKDF2_SHA256_80_128(input, X); - for (i = 0; i < 1024; i += 2) { + for (i = 0; i < n; i += 2) { memcpy(&V[i * 32], X, 128); salsa20_8(&X[0], &X[16]); @@ -381,8 +381,8 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad, uint salsa20_8(&X[0], &X[16]); salsa20_8(&X[16], &X[0]); } - for (i = 0; i < 1024; i += 2) { - j = X[16] & 1023; + for (i = 0; i < n; i += 2) { + j = X[16] & (n-1); p2 = (uint64_t *)(&V[j * 32]); for(k = 0; k < 16; k++) p1[k] ^= p2[k]; @@ -390,7 +390,7 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad, uint salsa20_8(&X[0], &X[16]); salsa20_8(&X[16], &X[0]); - j = X[16] & 1023; + j = X[16] & (n-1); p2 = (uint64_t *)(&V[j * 32]); for(k = 0; k < 16; k++) p1[k] ^= p2[k]; @@ -403,7 +403,8 @@ static void scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad, uint } /* 131583 rounded up to 4 byte alignment */ -#define SCRATCHBUF_SIZE (131584) +//#define SCRATCHBUF_SIZE (131584) +//#define SCRATCHBUF_SIZE (262207) void scrypt_regenhash(struct work *work) { @@ -411,17 +412,19 @@ void scrypt_regenhash(struct work *work) char *scratchbuf; uint32_t *nonce = (uint32_t *)(work->data + 76); uint32_t *ohash = (uint32_t *)(work->hash); - + be32enc_vect(data, (const uint32_t *)work->data, 19); data[19] = htobe32(*nonce); - scratchbuf = (char *)alloca(SCRATCHBUF_SIZE); - scrypt_1024_1_1_256_sp(data, scratchbuf, ohash); + //scratchbuf = alloca(SCRATCHBUF_SIZE); + scratchbuf = (char *)alloca((1 << opt_nfactor) * 128 + 512); + scrypt_1024_1_1_256_sp(data, scratchbuf, ohash, (1 << opt_nfactor)); flip32(ohash, ohash); } static const uint32_t diff1targ = 0x0000ffff; /* Used externally as confirmation of correct OCL code */ +/* int scrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) { uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); @@ -489,3 +492,4 @@ bool scanhash_scrypt(struct thr_info *thr, const unsigned char __maybe_unused *p free(scratchbuf);; return ret; } +*/ diff --git a/sgminer.c b/sgminer.c index 98e4051c..f16026e4 100644 --- a/sgminer.c +++ b/sgminer.c @@ -92,6 +92,7 @@ int opt_log_interval = 5; int opt_queue = 1; int opt_scantime = 7; int opt_expiry = 28; +int opt_nfactor = 11; static const bool opt_time = true; unsigned long long global_hashrate; unsigned long global_quota_gcd = 1; @@ -1105,6 +1106,9 @@ static struct opt_table opt_config_table[] = { opt_set_bool, &opt_compact, "Use compact display without per device statistics"), #endif + OPT_WITH_ARG("--nfactor", + set_int_0_to_9999, opt_show_intval, &opt_nfactor, + "Set scrypt nfactor, default: 10. Currently use 11 for vertcoin!"), OPT_WITHOUT_ARG("--debug|-D", enable_debug, &opt_debug, "Enable debug output"),