From 176cdf8bbc7b7dbbbccbf7f169a50afd5399e980 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 13 Jul 2012 14:44:52 +1000 Subject: [PATCH 001/178] Begin import of scrypt opencl kernel from reaper. --- configure.ac | 21 ++ driver-opencl.c | 15 + miner.h | 1 + scrypt120713.cl | 751 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 788 insertions(+) create mode 100644 scrypt120713.cl diff --git a/configure.ac b/configure.ac index 08449fea..75f680df 100644 --- a/configure.ac +++ b/configure.ac @@ -173,6 +173,8 @@ AC_ARG_ENABLE([adl], [adl=$enableval] ) +scrypt="no" + if test "$found_opencl" = 1; then if test "x$adl" != xno; then AC_CHECK_FILE([ADL_SDK/adl_sdk.h], have_adl=true, have_adl=false,) @@ -183,10 +185,20 @@ if test "$found_opencl" = 1; then DLOPEN_FLAGS="" fi fi + + AC_ARG_ENABLE([scrypt], + [AC_HELP_STRING([--enable-scrypt],[Compile support for scrypt litecoin mining (default disabled)])], + [scrypt=$enableval] + ) + if test "x$scrypt" = xyes; then + AC_DEFINE([USE_SCRYPT], [1], [Defined to 1 if scrypt support is wanted]) + fi else DLOPEN_FLAGS="" fi +AM_CONDITIONAL([HAS_SCRYPT], [test x$scrypt = xyes]) + bitforce="no" AC_ARG_ENABLE([bitforce], @@ -381,6 +393,7 @@ AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120223"], [Filename for phatk kernel AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120327"], [Filename for poclbm kernel]) AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120427"], [Filename for diakgcn kernel]) AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120328"], [Filename for diablo kernel]) +AC_DEFINE_UNQUOTED([SCRYPT_KERNNAME], ["scrypt120713"], [Filename for scrypt kernel]) AC_SUBST(OPENCL_LIBS) @@ -425,17 +438,25 @@ echo " curses.TUI...........: $cursesmsg" if test "x$opencl" != xno; then if test $found_opencl = 1; then echo " OpenCL...............: FOUND. 
GPU mining support enabled" + if test "x$scrypt" != xno; then + echo " scrypt...............: Enabled" + else + echo " scrypt...............: Disabled" + fi + else echo " OpenCL...............: NOT FOUND. GPU mining support DISABLED" if test "x$cpumining$bitforce$icarus$ztex$modminer" = xnonononono; then AC_MSG_ERROR([No mining configured in]) fi + echo " scrypt...............: Disabled (needs OpenCL)" fi else echo " OpenCL...............: Detection overrided. GPU mining support DISABLED" if test "x$cpumining$bitforce$icarus$ztex$modminer" = xnonononono; then AC_MSG_ERROR([No mining configured in]) fi + echo " scrypt...............: Disabled (needs OpenCL)" fi if test "x$adl" != xno; then diff --git a/driver-opencl.c b/driver-opencl.c index 880a4dac..b77614d3 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -137,6 +137,8 @@ static enum cl_kernels select_kernel(char *arg) return KL_POCLBM; if (!strcmp(arg, "phatk")) return KL_PHATK; + if (!strcmp(arg, "scrypt")) + return KL_SCRYPT; return KL_NONE; } @@ -986,6 +988,12 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t return status; } +static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) +{ + cl_int status = 0; + + return status; +} static void set_threads_hashes(unsigned int vectors, unsigned int *threads, int64_t *hashes, size_t *globalThreads, unsigned int minthreads, int intensity) @@ -1250,8 +1258,12 @@ static bool opencl_thread_prepare(struct thr_info *thr) case KL_PHATK: cgpu->kname = "phatk"; break; + case KL_SCRYPT: + cgpu->kname = "scrypt"; + break; case KL_POCLBM: cgpu->kname = "poclbm"; + break; default: break; } @@ -1290,6 +1302,9 @@ static bool opencl_thread_init(struct thr_info *thr) case KL_DIAKGCN: thrdata->queue_kernel_parameters = &queue_diakgcn_kernel; break; + case KL_SCRYPT: + thrdata->queue_kernel_parameters = &queue_scrypt_kernel; + break; default: case KL_DIABLO: thrdata->queue_kernel_parameters = &queue_diablo_kernel; 
diff --git a/miner.h b/miner.h index 120d7633..a9fa29c1 100644 --- a/miner.h +++ b/miner.h @@ -262,6 +262,7 @@ enum cl_kernels { KL_PHATK, KL_DIAKGCN, KL_DIABLO, + KL_SCRYPT, }; enum dev_reason { diff --git a/scrypt120713.cl b/scrypt120713.cl new file mode 100644 index 00000000..8826d0a3 --- /dev/null +++ b/scrypt120713.cl @@ -0,0 +1,751 @@ +#define rotl(x,y) rotate(x,y) +#define Ch(x,y,z) bitselect(z,y,x) +#define Maj(x,y,z) Ch((x^z),y,z) + +uint4 EndianSwap4(uint4 n) +{ + return rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U); +} + +#define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U)) +#define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U)) +#define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U)) +#define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U)) + +#define RND(a, b, c, d, e, f, g, h, k) \ + h += Tr1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += Tr2(a) + Maj(a, b, c); + +void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) +{ + uint4 S0 = *state0; + uint4 S1 = *state1; + +#define A S0.x +#define B S0.y +#define C S0.z +#define D S0.w +#define E S1.x +#define F S1.y +#define G S1.z +#define H S1.w + + uint4 W[4]; + + W[ 0].x = block0.x; + RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U); + W[ 0].y = block0.y; + RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U); + W[ 0].z = block0.z; + RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU); + W[ 0].w = block0.w; + RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U); + + W[ 1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); + W[ 1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); + W[ 1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); + W[ 1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); + + W[ 2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); + W[ 2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); + W[ 2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); + W[ 2].w = block2.w; + 
RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); + + W[ 3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); + W[ 3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); + W[ 3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); + W[ 3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); + + W[ 0].y += Wr1(W[ 
3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + 
RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + + *state0 += S0; + *state1 += S1; +} + +void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) +{ +#define A (*state0).x +#define B (*state0).y +#define C (*state0).z +#define D (*state0).w +#define E (*state1).x +#define F (*state1).y +#define G (*state1).z +#define H (*state1).w + + uint4 W[4]; + + W[ 0].x = block0.x; + D=0x98c7e2a2U+W[0].x; + H=0xfc08884dU+W[0].x; + + W[ 0].y = block0.y; + C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y; + G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U); + + W[ 0].z = block0.z; + B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z; + F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U); + + W[ 0].w = block0.w; + A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; + E=0x95F61999U+A+Tr2(F)+Maj(F,G,H); + + W[ 1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); + 
W[ 1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); + W[ 1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); + W[ 1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); + + W[ 2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); + W[ 2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); + W[ 2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); + W[ 2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); + + W[ 3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); + W[ 3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); + W[ 3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); + W[ 3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, 
W[3].x+0xc6e00bf3U); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); + + W[ 0].y += 
Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + + *state0 += (uint4)(0x6A09E667U,0xBB67AE85U,0x3C6EF372U,0xA54FF53AU); + *state1 += (uint4)(0x510E527FU,0x9B05688CU,0x1F83D9ABU,0x5BE0CD19U); +} + +__constant uint fixedW[64] = +{ + 0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, + 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794, + 0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f, + 
0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c, + 0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa, + 0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012, + 0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4, + 0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848, +}; + +void SHA256_fixed(uint4*restrict state0,uint4*restrict state1) +{ + uint4 S0 = *state0; + uint4 S1 = *state1; + +#define A S0.x +#define B S0.y +#define C S0.z +#define D S0.w +#define E S1.x +#define F S1.y +#define G S1.z +#define H S1.w + + RND(A,B,C,D,E,F,G,H, fixedW[0]); + RND(H,A,B,C,D,E,F,G, fixedW[1]); + RND(G,H,A,B,C,D,E,F, fixedW[2]); + RND(F,G,H,A,B,C,D,E, fixedW[3]); + RND(E,F,G,H,A,B,C,D, fixedW[4]); + RND(D,E,F,G,H,A,B,C, fixedW[5]); + RND(C,D,E,F,G,H,A,B, fixedW[6]); + RND(B,C,D,E,F,G,H,A, fixedW[7]); + RND(A,B,C,D,E,F,G,H, fixedW[8]); + RND(H,A,B,C,D,E,F,G, fixedW[9]); + RND(G,H,A,B,C,D,E,F, fixedW[10]); + RND(F,G,H,A,B,C,D,E, fixedW[11]); + RND(E,F,G,H,A,B,C,D, fixedW[12]); + RND(D,E,F,G,H,A,B,C, fixedW[13]); + RND(C,D,E,F,G,H,A,B, fixedW[14]); + RND(B,C,D,E,F,G,H,A, fixedW[15]); + RND(A,B,C,D,E,F,G,H, fixedW[16]); + RND(H,A,B,C,D,E,F,G, fixedW[17]); + RND(G,H,A,B,C,D,E,F, fixedW[18]); + RND(F,G,H,A,B,C,D,E, fixedW[19]); + RND(E,F,G,H,A,B,C,D, fixedW[20]); + RND(D,E,F,G,H,A,B,C, fixedW[21]); + RND(C,D,E,F,G,H,A,B, fixedW[22]); + RND(B,C,D,E,F,G,H,A, fixedW[23]); + RND(A,B,C,D,E,F,G,H, fixedW[24]); + RND(H,A,B,C,D,E,F,G, fixedW[25]); + RND(G,H,A,B,C,D,E,F, fixedW[26]); + RND(F,G,H,A,B,C,D,E, fixedW[27]); + RND(E,F,G,H,A,B,C,D, fixedW[28]); + RND(D,E,F,G,H,A,B,C, fixedW[29]); + RND(C,D,E,F,G,H,A,B, fixedW[30]); + RND(B,C,D,E,F,G,H,A, fixedW[31]); + RND(A,B,C,D,E,F,G,H, fixedW[32]); + RND(H,A,B,C,D,E,F,G, fixedW[33]); + RND(G,H,A,B,C,D,E,F, fixedW[34]); + RND(F,G,H,A,B,C,D,E, fixedW[35]); + 
RND(E,F,G,H,A,B,C,D, fixedW[36]); + RND(D,E,F,G,H,A,B,C, fixedW[37]); + RND(C,D,E,F,G,H,A,B, fixedW[38]); + RND(B,C,D,E,F,G,H,A, fixedW[39]); + RND(A,B,C,D,E,F,G,H, fixedW[40]); + RND(H,A,B,C,D,E,F,G, fixedW[41]); + RND(G,H,A,B,C,D,E,F, fixedW[42]); + RND(F,G,H,A,B,C,D,E, fixedW[43]); + RND(E,F,G,H,A,B,C,D, fixedW[44]); + RND(D,E,F,G,H,A,B,C, fixedW[45]); + RND(C,D,E,F,G,H,A,B, fixedW[46]); + RND(B,C,D,E,F,G,H,A, fixedW[47]); + RND(A,B,C,D,E,F,G,H, fixedW[48]); + RND(H,A,B,C,D,E,F,G, fixedW[49]); + RND(G,H,A,B,C,D,E,F, fixedW[50]); + RND(F,G,H,A,B,C,D,E, fixedW[51]); + RND(E,F,G,H,A,B,C,D, fixedW[52]); + RND(D,E,F,G,H,A,B,C, fixedW[53]); + RND(C,D,E,F,G,H,A,B, fixedW[54]); + RND(B,C,D,E,F,G,H,A, fixedW[55]); + RND(A,B,C,D,E,F,G,H, fixedW[56]); + RND(H,A,B,C,D,E,F,G, fixedW[57]); + RND(G,H,A,B,C,D,E,F, fixedW[58]); + RND(F,G,H,A,B,C,D,E, fixedW[59]); + RND(E,F,G,H,A,B,C,D, fixedW[60]); + RND(D,E,F,G,H,A,B,C, fixedW[61]); + RND(C,D,E,F,G,H,A,B, fixedW[62]); + RND(B,C,D,E,F,G,H,A, fixedW[63]); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + *state0 += S0; + *state1 += S1; +} + +void shittify(uint4 B[8]) +{ + uint4 tmp[4]; + tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w); + tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w); + tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w); + tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i] = EndianSwap4(tmp[i]); + + tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w); + tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w); + tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w); + tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] = EndianSwap4(tmp[i]); +} + +void unshittify(uint4 B[8]) +{ + uint4 tmp[4]; + tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w); + tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w); + tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w); + tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w); + +#pragma unroll + for(uint i=0; 
i<4; ++i) + B[i] = EndianSwap4(tmp[i]); + + tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w); + tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w); + tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w); + tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] = EndianSwap4(tmp[i]); +} + +void salsa(uint4 B[8]) +{ + uint4 w[4]; + +#pragma unroll + for(uint i=0; i<4; ++i) + w[i] = (B[i]^=B[i+4]); + +#pragma unroll + for(uint i=0; i<4; ++i) + { + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + } + +#pragma unroll + for(uint i=0; i<4; ++i) + w[i] = (B[i+4]^=(B[i]+=w[i])); + +#pragma unroll + for(uint i=0; i<4; ++i) + { + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + } + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] += w[i]; +} + +#define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE) +#define CO Coord(z,x,y) + +void scrypt_core(uint4 X[8], __global uint4*restrict lookup) +{ + shittify(X); + const uint zSIZE = 8; + const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0)); + const uint xSIZE = CONCURRENT_THREADS; + uint x = get_global_id(0)%xSIZE; + + for(uint y=0; y<1024/LOOKUP_GAP; ++y) + { +#pragma unroll + for(uint z=0; z Date: Fri, 13 Jul 2012 14:47:02 +1000 Subject: [PATCH 002/178] Enable completely compiling scrypt out. 
--- driver-opencl.c | 9 +++++++++ miner.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/driver-opencl.c b/driver-opencl.c index b77614d3..e8a901dc 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -137,8 +137,10 @@ static enum cl_kernels select_kernel(char *arg) return KL_POCLBM; if (!strcmp(arg, "phatk")) return KL_PHATK; +#ifdef HAVE_SCRYPT if (!strcmp(arg, "scrypt")) return KL_SCRYPT; +#endif return KL_NONE; } @@ -988,12 +990,15 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t return status; } +#ifdef HAVE_SCRYPT static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) { cl_int status = 0; return status; } +#endif + static void set_threads_hashes(unsigned int vectors, unsigned int *threads, int64_t *hashes, size_t *globalThreads, unsigned int minthreads, int intensity) @@ -1258,9 +1263,11 @@ static bool opencl_thread_prepare(struct thr_info *thr) case KL_PHATK: cgpu->kname = "phatk"; break; +#ifdef HAVE_SCRYPT case KL_SCRYPT: cgpu->kname = "scrypt"; break; +#endif case KL_POCLBM: cgpu->kname = "poclbm"; break; @@ -1302,9 +1309,11 @@ static bool opencl_thread_init(struct thr_info *thr) case KL_DIAKGCN: thrdata->queue_kernel_parameters = &queue_diakgcn_kernel; break; +#ifdef HAVE_SCRYPT case KL_SCRYPT: thrdata->queue_kernel_parameters = &queue_scrypt_kernel; break; +#endif default: case KL_DIABLO: thrdata->queue_kernel_parameters = &queue_diablo_kernel; diff --git a/miner.h b/miner.h index a9fa29c1..74824730 100644 --- a/miner.h +++ b/miner.h @@ -262,7 +262,9 @@ enum cl_kernels { KL_PHATK, KL_DIAKGCN, KL_DIABLO, +#ifdef HAVE_SCRYPT KL_SCRYPT, +#endif }; enum dev_reason { From dd740caa98d30055184ea59d93286981a1bcfcb5 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 13 Jul 2012 19:02:43 +1000 Subject: [PATCH 003/178] Provide initial support for the scrypt kernel to compile with and mine scrypt with the --scrypt option. 
--- cgminer.c | 10 +++++++++- driver-opencl.c | 44 +++++++++++++++++++++++--------------------- miner.h | 7 ++++++- ocl.c | 20 ++++++++++++++++---- 4 files changed, 54 insertions(+), 27 deletions(-) diff --git a/cgminer.c b/cgminer.c index 55d3943b..c0084ab7 100644 --- a/cgminer.c +++ b/cgminer.c @@ -107,6 +107,9 @@ int opt_dynamic_interval = 7; int nDevs; int opt_g_threads = 2; int gpu_threads; +#ifdef USE_SCRYPT +bool opt_scrypt; +#endif #endif bool opt_restart = true; static bool opt_nogpu; @@ -863,7 +866,7 @@ static struct opt_table opt_config_table[] = { #ifdef HAVE_OPENCL OPT_WITH_ARG("--kernel|-k", set_kernel, NULL, NULL, - "Override kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"), + "Override sha256 kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"), #endif #ifdef USE_ICARUS OPT_WITH_ARG("--icarus-timing", @@ -953,6 +956,11 @@ static struct opt_table opt_config_table[] = { OPT_WITH_ARG("--sched-stop", set_schedtime, NULL, &schedstop, "Set a time of day in HH:MM to stop mining (will quit without a start time)"), +#ifdef USE_SCRYPT + OPT_WITHOUT_ARG("--scrypt", + opt_set_bool, &opt_scrypt, + "Use the scrypt algorithm for mining (litecoin only)"), +#endif OPT_WITH_ARG("--sharelog", set_sharelog, NULL, NULL, "Append share log to file"), diff --git a/driver-opencl.c b/driver-opencl.c index e8a901dc..0d77b4af 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -137,7 +137,7 @@ static enum cl_kernels select_kernel(char *arg) return KL_POCLBM; if (!strcmp(arg, "phatk")) return KL_PHATK; -#ifdef HAVE_SCRYPT +#ifdef USE_SCRYPT if (!strcmp(arg, "scrypt")) return KL_SCRYPT; #endif @@ -150,6 +150,8 @@ char *set_kernel(char *arg) int i, device = 0; char *nextptr; + if (opt_scrypt) + return "Cannot use sha256 kernel with scrypt"; nextptr = strtok(arg, ","); if (nextptr == NULL) return "Invalid parameters for set kernel"; @@ -990,7 +992,7 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx 
*blk, cl_uint t return status; } -#ifdef HAVE_SCRYPT +#ifdef USE_SCRYPT static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) { cl_int status = 0; @@ -1254,25 +1256,25 @@ static bool opencl_thread_prepare(struct thr_info *thr) if (!cgpu->kname) { switch (clStates[i]->chosen_kernel) { - case KL_DIABLO: - cgpu->kname = "diablo"; - break; - case KL_DIAKGCN: - cgpu->kname = "diakgcn"; - break; - case KL_PHATK: - cgpu->kname = "phatk"; - break; -#ifdef HAVE_SCRYPT - case KL_SCRYPT: - cgpu->kname = "scrypt"; - break; + case KL_DIABLO: + cgpu->kname = "diablo"; + break; + case KL_DIAKGCN: + cgpu->kname = "diakgcn"; + break; + case KL_PHATK: + cgpu->kname = "phatk"; + break; +#ifdef USE_SCRYPT + case KL_SCRYPT: + cgpu->kname = "scrypt"; + break; #endif - case KL_POCLBM: - cgpu->kname = "poclbm"; - break; - default: - break; + case KL_POCLBM: + cgpu->kname = "poclbm"; + break; + default: + break; } } applog(LOG_INFO, "initCl() finished. Found %s", name); @@ -1309,7 +1311,7 @@ static bool opencl_thread_init(struct thr_info *thr) case KL_DIAKGCN: thrdata->queue_kernel_parameters = &queue_diakgcn_kernel; break; -#ifdef HAVE_SCRYPT +#ifdef USE_SCRYPT case KL_SCRYPT: thrdata->queue_kernel_parameters = &queue_scrypt_kernel; break; diff --git a/miner.h b/miner.h index 74824730..c1cbe74e 100644 --- a/miner.h +++ b/miner.h @@ -262,7 +262,7 @@ enum cl_kernels { KL_PHATK, KL_DIAKGCN, KL_DIABLO, -#ifdef HAVE_SCRYPT +#ifdef USE_SCRYPT KL_SCRYPT, #endif }; @@ -620,6 +620,11 @@ extern bool use_syslog; extern struct thr_info *thr_info; extern struct cgpu_info gpus[MAX_GPUDEVICES]; extern int gpu_threads; +#ifdef USE_SCRYPT +extern bool opt_scrypt; +#else +#define opt_scrypt (0) +#endif extern double total_secs; extern int mining_threads; extern struct cgpu_info *cpus; diff --git a/ocl.c b/ocl.c index 464cb4e1..7b802571 100644 --- a/ocl.c +++ b/ocl.c @@ -354,8 +354,11 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) char numbuf[10]; if 
(gpus[gpu].kernel == KL_NONE) { - /* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */ - if (!strstr(name, "Tahiti") && + if (opt_scrypt) { + applog(LOG_INFO, "Selecting scrypt kernel"); + clState->chosen_kernel = KL_SCRYPT; + } else if (!strstr(name, "Tahiti") && + /* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */ (strstr(vbuff, "844.4") || // Linux 64 bit ATI 2.6 SDK strstr(vbuff, "851.4") || // Windows 64 bit "" strstr(vbuff, "831.4") || @@ -407,6 +410,10 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) strcpy(filename, DIAKGCN_KERNNAME".cl"); strcpy(binaryfilename, DIAKGCN_KERNNAME); break; + case KL_SCRYPT: + strcpy(filename, SCRYPT_KERNNAME".cl"); + strcpy(binaryfilename, SCRYPT_KERNNAME); + break; case KL_NONE: /* Shouldn't happen */ case KL_DIABLO: strcpy(filename, DIABLO_KERNNAME".cl"); @@ -528,8 +535,13 @@ build: /* create a cl program executable for all the devices specified */ char *CompilerOptions = calloc(1, 256); - sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", - (int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth); + if (opt_scrypt) { + sprintf(CompilerOptions, "-D LOOKUP_GAP=2 -D CONCURRENT_THREADS=6144 -D WORKSIZE=%d", + (int)clState->wsize); + } else { + sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", + (int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth); + } applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize); if (clState->vwidth > 1) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth); From 2ed4072b5e02d06be905ee6588c02c51324d21ba Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 13 Jul 2012 19:18:11 +1000 Subject: [PATCH 004/178] Use cgminer specific output array entries in scrypt kernel. 
--- scrypt120713.cl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index 8826d0a3..faedd34b 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -685,12 +685,16 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup) unshittify(X); } +#define FOUND (0x80) +#define NFLAG (0x7F) + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global uint4*restrict input, __global uint*restrict output, __global uint4*restrict padcache, uint4 pad0, uint4 pad1) { + uint gid = get_global_id(0); uint4 X[8]; uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1; - uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,get_global_id(0)); + uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid); SHA256(&pad0,&pad1, data, (uint4)(0x80000000U,0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x280)); SHA256_fresh(&ostate0,&ostate1, pad0^0x5C5C5C5CU, pad1^0x5C5C5C5CU, 0x5C5C5C5CU, 0x5C5C5C5CU); @@ -718,7 +722,7 @@ __kernel void search(__global uint4*restrict input, __global uint*restrict outpu SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U)); if ((ostate1.w&0xFFFF) == 0) - output[get_global_id(0)&255] = get_global_id(0); + output[FOUND] = output[NFLAG & gid] = gid; } /*- From b085c338f64b8438f39dc70e7b05a9f27c22bfbf Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 13 Jul 2012 20:28:36 +1000 Subject: [PATCH 005/178] Make scrypt buffers and midstate compatible with cgminer. 
--- driver-opencl.c | 23 ++++++++++++++++++++++- findnonce.c | 4 ++++ miner.h | 3 +++ ocl.c | 6 +++++- ocl.h | 4 ++++ 5 files changed, 38 insertions(+), 2 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 0d77b4af..d50f1ffa 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -995,8 +995,25 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t #ifdef USE_SCRYPT static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) { + cl_uint4 *midstate = (cl_uint4 *)blk->midstate; + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; cl_int status = 0; + int i; + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->padbuffer8); + CL_SET_VARG(4, &midstate[0]); + CL_SET_VARG(4, &midstate[16]); + +#if 0 + clSetKernelArg(clState->kernel,0,sizeof(cl_mem), &clState->CLbuffer[0]); + clSetKernelArg(clState->kernel,1,sizeof(cl_mem), &clState->CLbuffer[1]); + clSetKernelArg(clState->kernel,2,sizeof(cl_mem), &clState->padbuffer8); + clSetKernelArg(clState->kernel,3,sizeof(cl_uint4), &midstate[0]); + clSetKernelArg(clState->kernel,4,sizeof(cl_uint4), &midstate[16]); +#endif return status; } #endif @@ -1330,6 +1347,10 @@ static bool opencl_thread_init(struct thr_info *thr) return false; } +#ifdef USE_SCRYPT + if (opt_scrypt) + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, BUFFERSIZE, blank_res, 0, NULL,NULL); +#endif status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, BUFFERSIZE, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { @@ -1456,7 +1477,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, BUFFERSIZE, thrdata->res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { - applog(LOG_ERR, "Error: clEnqueueReadBuffer failed. 
(clEnqueueReadBuffer)"); + applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status); return -1; } diff --git a/findnonce.c b/findnonce.c index 98d7f0e7..4e40de53 100644 --- a/findnonce.c +++ b/findnonce.c @@ -127,6 +127,10 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { blk->fiveA = blk->ctx_f + SHA256_K[5]; blk->sixA = blk->ctx_g + SHA256_K[6]; blk->sevenA = blk->ctx_h + SHA256_K[7]; + +#ifdef USE_SCRYPT + blk->midstate = (unsigned char *)state; +#endif } #define P(t) (W[(t)&0xF] = W[(t-16)&0xF] + (rotate(W[(t-15)&0xF], 25) ^ rotate(W[(t-15)&0xF], 14) ^ (W[(t-15)&0xF] >> 3)) + W[(t-7)&0xF] + (rotate(W[(t-2)&0xF], 15) ^ rotate(W[(t-2)&0xF], 13) ^ (W[(t-2)&0xF] >> 10))) diff --git a/miner.h b/miner.h index c1cbe74e..1261ea52 100644 --- a/miner.h +++ b/miner.h @@ -672,6 +672,9 @@ typedef struct { cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17; cl_uint zeroA, zeroB; cl_uint oneA, twoA, threeA, fourA, fiveA, sixA, sevenA; +#ifdef USE_SCRYPT + unsigned char *midstate; +#endif } dev_blk_ctx; #else typedef struct { diff --git a/ocl.c b/ocl.c index 7b802571..c6944e61 100644 --- a/ocl.c +++ b/ocl.c @@ -536,7 +536,7 @@ build: char *CompilerOptions = calloc(1, 256); if (opt_scrypt) { - sprintf(CompilerOptions, "-D LOOKUP_GAP=2 -D CONCURRENT_THREADS=6144 -D WORKSIZE=%d", + sprintf(CompilerOptions, "-D LOOKUP_GAP=2 -D CONCURRENT_THREADS=512 -D WORKSIZE=%d", (int)clState->wsize); } else { sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", @@ -732,6 +732,10 @@ built: return NULL; } +#ifdef USE_SCRYPT + if (opt_scrypt) + clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); +#endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status); diff --git a/ocl.h b/ocl.h index 2f2f2c24..fddcc67f 100644 --- a/ocl.h +++ 
b/ocl.h @@ -19,6 +19,10 @@ typedef struct { cl_command_queue commandQueue; cl_program program; cl_mem outputBuffer; +#ifdef USE_SCRYPT + cl_mem CLbuffer0; + cl_mem padbuffer8; +#endif bool hasBitAlign; bool hasOpenCL11plus; bool goffset; From 0f43eb5eb7b4c89d0190aa11edb67d2a7c546fb0 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 13 Jul 2012 20:35:44 +1000 Subject: [PATCH 006/178] Don't test nonce with sha and various fixes for scrypt. --- cgminer.c | 3 +++ miner.h | 2 -- ocl.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cgminer.c b/cgminer.c index c0084ab7..83439f77 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3971,6 +3971,9 @@ bool hashtest(const struct work *work) bool test_nonce(struct work *work, uint32_t nonce) { + if (opt_scrypt) + return true; + work->data[64 + 12 + 0] = (nonce >> 0) & 0xff; work->data[64 + 12 + 1] = (nonce >> 8) & 0xff; work->data[64 + 12 + 2] = (nonce >> 16) & 0xff; diff --git a/miner.h b/miner.h index 1261ea52..da62dcb6 100644 --- a/miner.h +++ b/miner.h @@ -262,9 +262,7 @@ enum cl_kernels { KL_PHATK, KL_DIAKGCN, KL_DIABLO, -#ifdef USE_SCRYPT KL_SCRYPT, -#endif }; enum dev_reason { diff --git a/ocl.c b/ocl.c index c6944e61..3faba2be 100644 --- a/ocl.c +++ b/ocl.c @@ -536,7 +536,7 @@ build: char *CompilerOptions = calloc(1, 256); if (opt_scrypt) { - sprintf(CompilerOptions, "-D LOOKUP_GAP=2 -D CONCURRENT_THREADS=512 -D WORKSIZE=%d", + sprintf(CompilerOptions, "-D LOOKUP_GAP=2 -D CONCURRENT_THREADS=6144 -D WORKSIZE=%d", (int)clState->wsize); } else { sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", From 1aacfe52795a70a4357bce09625fafc17ebac7ad Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 13 Jul 2012 20:38:26 +1000 Subject: [PATCH 007/178] Don't check postcalc nonce with sha256 in scrypt. 
--- findnonce.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/findnonce.c b/findnonce.c index 4e40de53..17c4189d 100644 --- a/findnonce.c +++ b/findnonce.c @@ -232,8 +232,13 @@ static void *postcalc_hash(void *userdata) pthread_detach(pthread_self()); for (entry = 0; entry < FOUND; entry++) { - if (pcd->res[entry]) - send_nonce(pcd, pcd->res[entry]); + if (pcd->res[entry]) { +#ifdef USE_SCRYPT + if (opt_scrypt) + submit_nonce(thr, work, entry); + else +#endif + send_nonce(pcd, pcd->res[entry]); nonces++; } From 8fd3bf74df6513fd43c78ce120b3c0806c03d9dd Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 13 Jul 2012 20:40:37 +1000 Subject: [PATCH 008/178] Build fix for opt scrypt. --- findnonce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/findnonce.c b/findnonce.c index 17c4189d..bbbd56f6 100644 --- a/findnonce.c +++ b/findnonce.c @@ -235,11 +235,12 @@ static void *postcalc_hash(void *userdata) if (pcd->res[entry]) { #ifdef USE_SCRYPT if (opt_scrypt) - submit_nonce(thr, work, entry); + submit_nonce(thr, pcd->work, entry); else #endif send_nonce(pcd, pcd->res[entry]); nonces++; + } } free(pcd); From e0296c411bd4e9a86e85dc87bc6d9f68c7dd3980 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 13 Jul 2012 21:35:25 +1000 Subject: [PATCH 009/178] Set up buffer8 for scrypt. 
--- ocl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocl.c b/ocl.c index 3faba2be..48d78b48 100644 --- a/ocl.c +++ b/ocl.c @@ -733,8 +733,10 @@ built: } #ifdef USE_SCRYPT - if (opt_scrypt) + if (opt_scrypt) { clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); + clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, 402653184, NULL, &status); + } #endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { From aabc723326a2c2f80b48f43e753bfca47e0cd156 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 00:30:25 +1000 Subject: [PATCH 010/178] Make sure goffset is set for scrypt and drop padbuffer8 to something manageable for now. --- driver-opencl.c | 2 +- ocl.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index d50f1ffa..5e79d2ba 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1470,7 +1470,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { - applog(LOG_ERR, "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)"); + applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. 
(clEnqueueNDRangeKernel)", status); return -1; } diff --git a/ocl.c b/ocl.c index 48d78b48..95f25623 100644 --- a/ocl.c +++ b/ocl.c @@ -428,8 +428,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) gpus[gpu].vwidth = preferred_vwidth; } - if ((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) && - clState->vwidth == 1 && clState->hasOpenCL11plus) + if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) && + clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt) clState->goffset = true; if (gpus[gpu].work_size && gpus[gpu].work_size <= clState->max_work_size) @@ -536,7 +536,7 @@ build: char *CompilerOptions = calloc(1, 256); if (opt_scrypt) { - sprintf(CompilerOptions, "-D LOOKUP_GAP=2 -D CONCURRENT_THREADS=6144 -D WORKSIZE=%d", + sprintf(CompilerOptions, "-D LOOKUP_GAP=1 -D CONCURRENT_THREADS=1 -D WORKSIZE=%d", (int)clState->wsize); } else { sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", @@ -735,7 +735,7 @@ built: #ifdef USE_SCRYPT if (opt_scrypt) { clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); - clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, 402653184, NULL, &status); + clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, 131072, NULL, &status); } #endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); From ea444d0239df054f18e617ecf54836eecae9a626 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 00:59:38 +1000 Subject: [PATCH 011/178] Fix nonce submission code for scrypt. 
--- cgminer.c | 6 +++--- findnonce.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cgminer.c b/cgminer.c index 83439f77..f1f6c8c3 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3971,14 +3971,14 @@ bool hashtest(const struct work *work) bool test_nonce(struct work *work, uint32_t nonce) { - if (opt_scrypt) - return true; - work->data[64 + 12 + 0] = (nonce >> 0) & 0xff; work->data[64 + 12 + 1] = (nonce >> 8) & 0xff; work->data[64 + 12 + 2] = (nonce >> 16) & 0xff; work->data[64 + 12 + 3] = (nonce >> 24) & 0xff; + if (opt_scrypt) + return true; + return hashtest(work); } diff --git a/findnonce.c b/findnonce.c index bbbd56f6..ce282dc3 100644 --- a/findnonce.c +++ b/findnonce.c @@ -235,7 +235,7 @@ static void *postcalc_hash(void *userdata) if (pcd->res[entry]) { #ifdef USE_SCRYPT if (opt_scrypt) - submit_nonce(thr, pcd->work, entry); + submit_nonce(thr, pcd->work, pcd->res[entry]); else #endif send_nonce(pcd, pcd->res[entry]); From 8230ab05498e600fb20c8263452c85b0f0ba80c7 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 01:10:50 +1000 Subject: [PATCH 012/178] Display in debug mode when we're making the midstate locally. 
--- cgminer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cgminer.c b/cgminer.c index f1f6c8c3..e01b7236 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1239,6 +1239,7 @@ static bool work_decode(const json_t *val, struct work *work) if (likely(!jobj_binary(val, "midstate", work->midstate, sizeof(work->midstate), false))) { + applog(LOG_DEBUG, "Calculating midstate locally"); // Calculate it ourselves union { unsigned char c[64]; From b347a178f6c75636d0f64aa1a65ab19eda32628a Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Thu, 12 Jul 2012 16:57:50 +0000 Subject: [PATCH 013/178] bitforce: Use "full work" vs "nonce range" for kernel name Also move these string constants to #defines --- driver-bitforce.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 31892297..a5751bbe 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -32,6 +32,9 @@ #define tv_to_ms(tval) (tval.tv_sec * 1000 + tval.tv_usec / 1000) #define TIME_AVG_CONSTANT 8 +#define KNAME_WORK "full work" +#define KNAME_RANGE "nonce range" + struct device_api bitforce_api; #define BFopen(devpath) serial_open(devpath, 0, -1, true) @@ -93,10 +96,10 @@ static bool bitforce_detect_one(const char *devpath) if (opt_bfl_noncerange) { bitforce->nonce_range = true; bitforce->sleep_ms = BITFORCE_SLEEP_MS; - bitforce->kname = "Mini-rig"; + bitforce->kname = KNAME_RANGE; } else { bitforce->sleep_ms = BITFORCE_SLEEP_MS * 5; - bitforce->kname = "Single"; + bitforce->kname = KNAME_WORK; } if (likely((!memcmp(pdevbuf, ">>>ID: ", 7)) && (s = strstr(pdevbuf + 3, ">>>")))) { @@ -290,7 +293,7 @@ re_send: applog(LOG_WARNING, "BFL%i: Does not support nonce range, disabling", bitforce->device_id); bitforce->nonce_range = false; bitforce->sleep_ms *= 5; - bitforce->kname = "Single"; + bitforce->kname = KNAME_WORK; goto re_send; } applog(LOG_ERR, "BFL%i: Error: Send work reports: %s", bitforce->device_id, pdevbuf); @@ -438,7 +441,7 @@ static int64_t 
bitforce_get_result(struct thr_info *thr, struct work *work) bitforce->nonce_range = false; work->blk.nonce = 0xffffffff; bitforce->sleep_ms *= 5; - bitforce->kname = "Single"; + bitforce->kname = KNAME_WORK; } submit_nonce(thr, work, nonce); From 41daf99537ab51bfa1ab9899ef1c745510ce181e Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 09:45:55 +1000 Subject: [PATCH 014/178] Calculate midstate in separate function and remove likely/unlikely macros since they're dependent on pools, not code design. --- cgminer.c | 45 +++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/cgminer.c b/cgminer.c index e01b7236..64d04310 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1230,6 +1230,27 @@ static bool jobj_binary(const json_t *obj, const char *key, return true; } +static void calc_midstate(struct work *work) +{ + union { + unsigned char c[64]; + uint32_t i[16]; + } data; + int swapcounter; + + for (swapcounter = 0; swapcounter < 16; swapcounter++) + data.i[swapcounter] = swab32(((uint32_t*) (work->data))[swapcounter]); + sha2_context ctx; + sha2_starts( &ctx, 0 ); + sha2_update( &ctx, data.c, 64 ); + memcpy(work->midstate, ctx.state, sizeof(work->midstate)); +#if defined(__BIG_ENDIAN__) || defined(MIPSEB) + int i; + for (i = 0; i < 8; i++) + (((uint32_t*) (work->midstate))[i]) = swab32(((uint32_t*) (work->midstate))[i]); +#endif +} + static bool work_decode(const json_t *val, struct work *work) { if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data), true))) { @@ -1237,29 +1258,13 @@ static bool work_decode(const json_t *val, struct work *work) goto err_out; } - if (likely(!jobj_binary(val, "midstate", - work->midstate, sizeof(work->midstate), false))) { - applog(LOG_DEBUG, "Calculating midstate locally"); + if (!jobj_binary(val, "midstate", work->midstate, sizeof(work->midstate), false)) { // Calculate it ourselves - union { - unsigned char c[64]; - uint32_t i[16]; - } data; - int 
swapcounter; - for (swapcounter = 0; swapcounter < 16; swapcounter++) - data.i[swapcounter] = swab32(((uint32_t*) (work->data))[swapcounter]); - sha2_context ctx; - sha2_starts( &ctx, 0 ); - sha2_update( &ctx, data.c, 64 ); - memcpy(work->midstate, ctx.state, sizeof(work->midstate)); -#if defined(__BIG_ENDIAN__) || defined(MIPSEB) - int i; - for (i = 0; i < 8; i++) - (((uint32_t*) (work->midstate))[i]) = swab32(((uint32_t*) (work->midstate))[i]); -#endif + applog(LOG_DEBUG, "Calculating midstate locally"); + calc_midstate(work); } - if (likely(!jobj_binary(val, "hash1", work->hash1, sizeof(work->hash1), false))) { + if (!jobj_binary(val, "hash1", work->hash1, sizeof(work->hash1), false)) { // Always the same anyway memcpy(work->hash1, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0", 64); } From 238db52aa6cc9b657316369996d2873482be583e Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 11:02:22 +1000 Subject: [PATCH 015/178] Make dynamic and scrypt opencl calls blocking. 
--- driver-opencl.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 5e79d2ba..ffe2fda7 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1395,6 +1395,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, _clState *clState = clStates[thr_id]; const cl_kernel *kernel = &clState->kernel; const int dynamic_us = opt_dynamic_interval * 1000; + cl_bool blocking; cl_int status; size_t globalThreads[1]; @@ -1402,14 +1403,20 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, unsigned int threads; int64_t hashes; + if (gpu->dynamic || opt_scrypt) + blocking = CL_TRUE; + else + blocking = CL_FALSE; + /* This finish flushes the readbuffer set with CL_FALSE later */ - clFinish(clState->commandQueue); - gettimeofday(&gpu->tv_gpuend, NULL); + if (!blocking) + clFinish(clState->commandQueue); if (gpu->dynamic) { struct timeval diff; suseconds_t gpu_us; + gettimeofday(&gpu->tv_gpuend, NULL); timersub(&gpu->tv_gpuend, &gpu->tv_gpustart, &diff); gpu_us = diff.tv_sec * 1000000 + diff.tv_usec; if (likely(gpu_us >= 0)) { @@ -1440,7 +1447,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, /* MAXBUFFERS entry is used as a flag to say nonces exist */ if (thrdata->res[FOUND]) { /* Clear the buffer again */ - status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, + status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0, BUFFERSIZE, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); @@ -1455,7 +1462,8 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, postcalc_hash_async(thr, work, thrdata->res); } memset(thrdata->res, 0, BUFFERSIZE); - clFinish(clState->commandQueue); + if (!blocking) + clFinish(clState->commandQueue); } gettimeofday(&gpu->tv_gpustart, NULL); @@ -1474,7 +1482,7 @@ 
static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, return -1; } - status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, + status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0, BUFFERSIZE, thrdata->res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status); From bd10764e769deef3a15d04bd3c4919e7cb045fc7 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 11:07:15 +1000 Subject: [PATCH 016/178] Cope with when we cannot set intensity low enough to meet dynamic interval by inducing a forced sleep. --- driver-opencl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver-opencl.c b/driver-opencl.c index ffe2fda7..c0869594 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1428,6 +1428,8 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, if (gpu->gpu_us_average > dynamic_us) { if (gpu->intensity > MIN_INTENSITY) --gpu->intensity; + else + nmsleep(opt_dynamic_interval / 2 ? : 1); } else if (gpu->gpu_us_average < dynamic_us / 2) { if (gpu->intensity < MAX_INTENSITY) ++gpu->intensity; From 243d005b1baa22ba62046ba52ea8ee187385037c Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 16:21:27 +1000 Subject: [PATCH 017/178] Set scrypt settings and buffer size in ocl.c code to be future modifiable. 
--- driver-opencl.c | 4 ++-- ocl.c | 23 ++++++++++++++++++----- ocl.h | 2 ++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index c0869594..2ebb54f5 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1004,8 +1004,8 @@ static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t CL_SET_ARG(clState->CLbuffer0); CL_SET_ARG(clState->outputBuffer); CL_SET_ARG(clState->padbuffer8); - CL_SET_VARG(4, &midstate[0]); - CL_SET_VARG(4, &midstate[16]); + CL_SET_ARG(midstate[0]); + CL_SET_ARG(midstate[16]); #if 0 clSetKernelArg(clState->kernel,0,sizeof(cl_mem), &clState->CLbuffer[0]); diff --git a/ocl.c b/ocl.c index 95f25623..2fc7e35f 100644 --- a/ocl.c +++ b/ocl.c @@ -464,6 +464,13 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) return NULL; } +#ifdef USE_SCRYPT + if (opt_scrypt) { + clState->lookup_gap = 1; + clState->thread_concurrency = 1; + } +#endif + strcat(binaryfilename, name); if (clState->goffset) strcat(binaryfilename, "g"); @@ -535,10 +542,13 @@ build: /* create a cl program executable for all the devices specified */ char *CompilerOptions = calloc(1, 256); - if (opt_scrypt) { - sprintf(CompilerOptions, "-D LOOKUP_GAP=1 -D CONCURRENT_THREADS=1 -D WORKSIZE=%d", - (int)clState->wsize); - } else { +#ifdef USE_SCRYPT + if (opt_scrypt) + sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d", + (int)clState->lookup_gap, (int)clState->thread_concurrency, (int)clState->wsize); + else +#endif + { sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", (int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth); } @@ -734,8 +744,11 @@ built: #ifdef USE_SCRYPT if (opt_scrypt) { + size_t ipt = (1024 / clState->lookup_gap + (1024 % clState->lookup_gap > 0)); + size_t bufsize = 128 * ipt * clState->thread_concurrency; + clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); - 
clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, 131072, NULL, &status); + clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); } #endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); diff --git a/ocl.h b/ocl.h index fddcc67f..b15c8893 100644 --- a/ocl.h +++ b/ocl.h @@ -22,6 +22,8 @@ typedef struct { #ifdef USE_SCRYPT cl_mem CLbuffer0; cl_mem padbuffer8; + size_t lookup_gap; + size_t thread_concurrency; #endif bool hasBitAlign; bool hasOpenCL11plus; From a5ebb71216eb19153ced15ecce2eb84239dc55db Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 22:01:20 +1000 Subject: [PATCH 018/178] Add cpumining capability for scrypt. --- Makefile.am | 4 + cgminer.c | 5 + driver-cpu.c | 21 ++- driver-cpu.h | 5 + scrypt.c | 452 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 486 insertions(+), 1 deletion(-) create mode 100644 scrypt.c diff --git a/Makefile.am b/Makefile.am index d17414df..a784ef91 100644 --- a/Makefile.am +++ b/Makefile.am @@ -56,6 +56,10 @@ cgminer_SOURCES += \ # the CPU portion extracted from original main.c cgminer_SOURCES += driver-cpu.h driver-cpu.c +if HAS_SCRYPT +cgminer_SOURCES += scrypt.c +endif + if HAS_YASM AM_CFLAGS = -DHAS_YASM if HAVE_x86_64 diff --git a/cgminer.c b/cgminer.c index 64d04310..f5044e22 100644 --- a/cgminer.c +++ b/cgminer.c @@ -5198,6 +5198,11 @@ int main(int argc, char *argv[]) opt_log_output = true; #ifdef WANT_CPUMINE +#ifdef USE_SCRYPT + if (opt_scrypt) + set_scrypt_algo(&opt_algo); + else +#endif if (0 <= opt_bench_algo) { double rate = bench_algo_stage3(opt_bench_algo); diff --git a/driver-cpu.c b/driver-cpu.c index 09ca478f..bd502b35 100644 --- a/driver-cpu.c +++ b/driver-cpu.c @@ -131,6 +131,9 @@ extern bool scanhash_sse2_32(struct thr_info*, const unsigned char *pmidstate, u uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce); +extern bool 
scanhash_scrypt(struct thr_info *thr, int thr_id, unsigned char *pdata, unsigned char *scratchbuf, + const unsigned char *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); @@ -161,6 +164,9 @@ const char *algo_names[] = { #ifdef WANT_ALTIVEC_4WAY [ALGO_ALTIVEC_4WAY] = "altivec_4way", #endif +#ifdef WANT_SCRYPT + [ALGO_SCRYPT] = "scrypt", +#endif }; static const sha256_func sha256_funcs[] = { @@ -185,7 +191,10 @@ static const sha256_func sha256_funcs[] = { [ALGO_SSE2_64] = (sha256_func)scanhash_sse2_64, #endif #ifdef WANT_X8664_SSE4 - [ALGO_SSE4_64] = (sha256_func)scanhash_sse4_64 + [ALGO_SSE4_64] = (sha256_func)scanhash_sse4_64, +#endif +#ifdef WANT_SCRYPT + [ALGO_SCRYPT] = (sha256_func)scanhash_scrypt #endif }; #endif @@ -662,6 +671,9 @@ char *set_algo(const char *arg, enum sha256_algos *algo) { enum sha256_algos i; + if (opt_scrypt) + return "Can only use scrypt algorithm"; + if (!strcmp(arg, "auto")) { *algo = pick_fastest_algo(); return NULL; @@ -676,6 +688,13 @@ char *set_algo(const char *arg, enum sha256_algos *algo) return "Unknown algorithm"; } +#ifdef WANT_SCRYPT +void set_scrypt_algo(enum sha256_algos *algo) +{ + *algo = ALGO_SCRYPT; +} +#endif + void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo) { strncpy(buf, algo_names[*algo], OPT_SHOW_LEN); diff --git a/driver-cpu.h b/driver-cpu.h index ced400a5..3cf268b2 100644 --- a/driver-cpu.h +++ b/driver-cpu.h @@ -34,6 +34,10 @@ #define WANT_X8664_SSE4 1 #endif +#ifdef USE_SCRYPT +#define WANT_SCRYPT +#endif + enum sha256_algos { ALGO_C, /* plain C */ ALGO_4WAY, /* parallel SSE2 */ @@ -44,6 +48,7 @@ enum sha256_algos { ALGO_SSE2_64, /* SSE2 for x86_64 */ ALGO_SSE4_64, /* SSE4 for x86_64 */ ALGO_ALTIVEC_4WAY, /* parallel Altivec */ + ALGO_SCRYPT, /* scrypt */ }; extern const char *algo_names[]; diff --git a/scrypt.c b/scrypt.c new file mode 100644 index 00000000..569eeb9e --- /dev/null +++ b/scrypt.c @@ -0,0 +1,452 @@ +/*- + * Copyright 2009 Colin Percival, 2011 ArtForz + * All rights 
reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include "config.h" +#include "miner.h" + +#include <stdlib.h> +#include <stdint.h> +#include <string.h> + +#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) + +typedef struct SHA256Context { + uint32_t state[8]; + uint32_t buf[16]; +} SHA256_CTX; + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4.
+ */ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = byteswap(src[i]); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + + /* 1. Prepare message schedule W. */ + if(swap) + for (i = 0; i < 16; i++) + W[i] = byteswap(block[i]); + else + memcpy(W, block, 64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
*/ + RNDr(S, W, 0, 0x428a2f98); + RNDr(S, W, 1, 0x71374491); + RNDr(S, W, 2, 0xb5c0fbcf); + RNDr(S, W, 3, 0xe9b5dba5); + RNDr(S, W, 4, 0x3956c25b); + RNDr(S, W, 5, 0x59f111f1); + RNDr(S, W, 6, 0x923f82a4); + RNDr(S, W, 7, 0xab1c5ed5); + RNDr(S, W, 8, 0xd807aa98); + RNDr(S, W, 9, 0x12835b01); + RNDr(S, W, 10, 0x243185be); + RNDr(S, W, 11, 0x550c7dc3); + RNDr(S, W, 12, 0x72be5d74); + RNDr(S, W, 13, 0x80deb1fe); + RNDr(S, W, 14, 0x9bdc06a7); + RNDr(S, W, 15, 0xc19bf174); + RNDr(S, W, 16, 0xe49b69c1); + RNDr(S, W, 17, 0xefbe4786); + RNDr(S, W, 18, 0x0fc19dc6); + RNDr(S, W, 19, 0x240ca1cc); + RNDr(S, W, 20, 0x2de92c6f); + RNDr(S, W, 21, 0x4a7484aa); + RNDr(S, W, 22, 0x5cb0a9dc); + RNDr(S, W, 23, 0x76f988da); + RNDr(S, W, 24, 0x983e5152); + RNDr(S, W, 25, 0xa831c66d); + RNDr(S, W, 26, 0xb00327c8); + RNDr(S, W, 27, 0xbf597fc7); + RNDr(S, W, 28, 0xc6e00bf3); + RNDr(S, W, 29, 0xd5a79147); + RNDr(S, W, 30, 0x06ca6351); + RNDr(S, W, 31, 0x14292967); + RNDr(S, W, 32, 0x27b70a85); + RNDr(S, W, 33, 0x2e1b2138); + RNDr(S, W, 34, 0x4d2c6dfc); + RNDr(S, W, 35, 0x53380d13); + RNDr(S, W, 36, 0x650a7354); + RNDr(S, W, 37, 0x766a0abb); + RNDr(S, W, 38, 0x81c2c92e); + RNDr(S, W, 39, 0x92722c85); + RNDr(S, W, 40, 0xa2bfe8a1); + RNDr(S, W, 41, 0xa81a664b); + RNDr(S, W, 42, 0xc24b8b70); + RNDr(S, W, 43, 0xc76c51a3); + RNDr(S, W, 44, 0xd192e819); + RNDr(S, W, 45, 0xd6990624); + RNDr(S, W, 46, 0xf40e3585); + RNDr(S, W, 47, 0x106aa070); + RNDr(S, W, 48, 0x19a4c116); + RNDr(S, W, 49, 0x1e376c08); + RNDr(S, W, 50, 0x2748774c); + RNDr(S, W, 51, 0x34b0bcb5); + RNDr(S, W, 52, 0x391c0cb3); + RNDr(S, W, 53, 0x4ed8aa4a); + RNDr(S, W, 54, 0x5b9cca4f); + RNDr(S, W, 55, 0x682e6ff3); + RNDr(S, W, 56, 0x748f82ee); + RNDr(S, W, 57, 0x78a5636f); + RNDr(S, W, 58, 0x84c87814); + RNDr(S, W, 59, 0x8cc70208); + RNDr(S, W, 60, 0x90befffa); + RNDr(S, W, 61, 0xa4506ceb); + RNDr(S, W, 62, 0xbef9a3f7); + RNDr(S, W, 63, 0xc67178f2); + + /* 4. 
Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static inline void +SHA256_InitState(uint32_t * state) +{ + /* Magic initialization constants */ + state[0] = 0x6A09E667; + state[1] = 0xBB67AE85; + state[2] = 0x3C6EF372; + state[3] = 0xA54FF53A; + state[4] = 0x510E527F; + state[5] = 0x9B05688C; + state[6] = 0x1F83D9AB; + state[7] = 0x5BE0CD19; +} + +static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000}; +static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300}; + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +static inline void +PBKDF2_SHA256_80_128(const uint32_t * passwd, uint32_t * buf) +{ + SHA256_CTX PShictx, PShoctx; + uint32_t tstate[8]; + uint32_t ihash[8]; + uint32_t i; + uint32_t pad[16]; + + static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000}; + + /* If Klen > 64, the key is really SHA256(K). */ + SHA256_InitState(tstate); + SHA256_Transform(tstate, passwd, 1); + memcpy(pad, passwd+16, 16); + memcpy(pad+4, passwdpad, 48); + SHA256_Transform(tstate, pad, 1); + memcpy(ihash, tstate, 32); + + SHA256_InitState(PShictx.state); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + SHA256_Transform(PShictx.state, pad, 0); + SHA256_Transform(PShictx.state, passwd, 1); + be32enc_vect(PShictx.buf, passwd+16, 4); + be32enc_vect(PShictx.buf+5, innerpad, 11); + + SHA256_InitState(PShoctx.state); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + SHA256_Transform(PShoctx.state, pad, 0); + memcpy(PShoctx.buf+8, outerpad, 32); + + /* Iterate through the blocks. 
*/ + for (i = 0; i < 4; i++) { + uint32_t istate[8]; + uint32_t ostate[8]; + + memcpy(istate, PShictx.state, 32); + PShictx.buf[4] = i + 1; + SHA256_Transform(istate, PShictx.buf, 0); + memcpy(PShoctx.buf, istate, 32); + + memcpy(ostate, PShoctx.state, 32); + SHA256_Transform(ostate, PShoctx.buf, 0); + be32enc_vect(buf+i*8, ostate, 8); + } +} + + +static inline uint32_t +PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt) +{ + uint32_t tstate[8]; + uint32_t ostate[8]; + uint32_t ihash[8]; + uint32_t i; + + /* Compute HMAC state after processing P and S. */ + uint32_t pad[16]; + + static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620}; + + /* If Klen > 64, the key is really SHA256(K). */ + SHA256_InitState(tstate); + SHA256_Transform(tstate, passwd, 1); + memcpy(pad, passwd+16, 16); + memcpy(pad+4, passwdpad, 48); + SHA256_Transform(tstate, pad, 1); + memcpy(ihash, tstate, 32); + + SHA256_InitState(ostate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 16; i++) + pad[i] = 0x5c5c5c5c; + SHA256_Transform(ostate, pad, 0); + + SHA256_InitState(tstate); + for (i = 0; i < 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 16; i++) + pad[i] = 0x36363636; + SHA256_Transform(tstate, pad, 0); + SHA256_Transform(tstate, salt, 1); + SHA256_Transform(tstate, salt+16, 1); + SHA256_Transform(tstate, ihash_finalblk, 0); + memcpy(pad, tstate, 32); + memcpy(pad+8, outerpad, 32); + + /* Feed the inner hash to the outer SHA256 operation. */ + SHA256_Transform(ostate, pad, 0); + /* Finish the outer SHA256 operation. */ + return byteswap(ostate[7]); +} + + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. 
+ */ +static inline void +salsa20_8(uint32_t B[16], const uint32_t Bx[16]) +{ + uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; + size_t i; + + x00 = (B[ 0] ^= Bx[ 0]); + x01 = (B[ 1] ^= Bx[ 1]); + x02 = (B[ 2] ^= Bx[ 2]); + x03 = (B[ 3] ^= Bx[ 3]); + x04 = (B[ 4] ^= Bx[ 4]); + x05 = (B[ 5] ^= Bx[ 5]); + x06 = (B[ 6] ^= Bx[ 6]); + x07 = (B[ 7] ^= Bx[ 7]); + x08 = (B[ 8] ^= Bx[ 8]); + x09 = (B[ 9] ^= Bx[ 9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. */ + x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); + x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); + x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); + x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); + + /* Operate on rows. 
*/ + x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); + x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); + x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); + x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); +#undef R + } + B[ 0] += x00; + B[ 1] += x01; + B[ 2] += x02; + B[ 3] += x03; + B[ 4] += x04; + B[ 5] += x05; + B[ 6] += x06; + B[ 7] += x07; + B[ 8] += x08; + B[ 9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +} + +/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output + scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes + */ +static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad) +{ + uint32_t * V; + uint32_t X[32]; + uint32_t i; + uint32_t j; + uint32_t k; + uint64_t *p1, *p2; + + p1 = (uint64_t *)X; + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + PBKDF2_SHA256_80_128(input, X); + + for (i = 0; i < 1024; i += 2) { + memcpy(&V[i * 32], X, 128); + + salsa20_8(&X[0], &X[16]); + salsa20_8(&X[16], &X[0]); + + memcpy(&V[(i + 1) * 32], X, 128); + + salsa20_8(&X[0], &X[16]); + salsa20_8(&X[16], &X[0]); + } + for (i = 0; i < 1024; i += 2) { + j = X[16] & 1023; + p2 = (uint64_t *)(&V[j * 32]); + for(k = 0; k < 16; k++) + p1[k] ^= p2[k]; + + salsa20_8(&X[0], &X[16]); + salsa20_8(&X[16], &X[0]); + + j = X[16] & 1023; + p2 = (uint64_t *)(&V[j * 32]); + for(k = 0; k < 16; k++) + p1[k] ^= p2[k]; + + salsa20_8(&X[0], &X[16]); + salsa20_8(&X[16], &X[0]); + } + + return PBKDF2_SHA256_80_128_32(input, X); +} + +bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsigned char *pdata, + unsigned char *phash1, unsigned char *phash, + const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, + uint32_t 
n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + unsigned char *scratchbuf; + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = ((const uint32_t *)ptarget)[7]; + bool ret = false; + int i; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + scratchbuf = malloc(131583); + if (unlikely(!scratchbuf)) { + applog(LOG_ERR, "Failed to malloc scratchbuf in scanhash_scrypt"); + return ret; + } + + while(1) { + *nonce = ++n; + data[19] = n; + tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); + + if (tmp_hash7 <= Htarg) { + ((uint32_t *)pdata)[19] = byteswap(n); + *last_nonce = n; + ret = true; + break; + } + + if ((n >= max_nonce) || thr->work_restart) { + *last_nonce = n; + break; + } + } +out_ret: + return ret; +} + From 54f1b80824eecdd53a22aaa347f792b5b6c8e647 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 22:19:55 +1000 Subject: [PATCH 019/178] Free the scratchbuf memory allocated in scrypt and don't check if CPUs are sick since they can't be. Prepare for khash hash rates in display. --- cgminer.c | 14 +++++++++++--- scrypt.c | 5 +++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/cgminer.c b/cgminer.c index f5044e22..6002927e 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3394,9 +3394,13 @@ static void hashmeter(int thr_id, struct timeval *diff, double utility, efficiency = 0.0; static double local_mhashes_done = 0; static double rolling = 0; - double local_mhashes = (double)hashes_done / 1000000.0; + double local_mhashes; bool showlog = false; + if (opt_scrypt) + local_mhashes = (double)hashes_done / 1000.0; + else + local_mhashes = (double)hashes_done / 1000000.0; /* Update the last time this thread reported in */ if (thr_id >= 0) { gettimeofday(&thr_info[thr_id].last, NULL); @@ -3472,9 +3476,9 @@ static void hashmeter(int thr_id, struct timeval *diff, utility = total_accepted / ( total_secs ? total_secs : 1 ) * 60; efficiency = total_getworks ? 
total_accepted * 100.0 / total_getworks : 0.0; - sprintf(statusline, "%s(%ds):%.1f (avg):%.1f Mh/s | Q:%d A:%d R:%d HW:%d E:%.0f%% U:%.1f/m", + sprintf(statusline, "%s(%ds):%.1f (avg):%.1f %sh/s | Q:%d A:%d R:%d HW:%d E:%.0f%% U:%.1f/m", want_per_device_stats ? "ALL " : "", - opt_log_interval, rolling, total_mhashes_done / total_secs, + opt_log_interval, rolling, total_mhashes_done / total_secs, opt_scrypt ? "K" : "M", total_getworks, total_accepted, total_rejected, hw_errors, efficiency, utility); @@ -4604,6 +4608,10 @@ static void *watchdog_thread(void __maybe_unused *userdata) if (thr->getwork || *denable == DEV_DISABLED) continue; +#ifdef WANT_CPUMINE + if (!strcmp(cgpu->api->dname, "cpu")) + continue; +#endif if (cgpu->rolling < WATCHDOG_LOW_HASH) cgpu->low_count++; else diff --git a/scrypt.c b/scrypt.c index 569eeb9e..4334bcf0 100644 --- a/scrypt.c +++ b/scrypt.c @@ -434,19 +434,20 @@ bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsig data[19] = n; tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); - if (tmp_hash7 <= Htarg) { + if (unlikely(tmp_hash7 <= Htarg)) { ((uint32_t *)pdata)[19] = byteswap(n); *last_nonce = n; ret = true; break; } - if ((n >= max_nonce) || thr->work_restart) { + if (unlikely((n >= max_nonce) || thr->work_restart)) { *last_nonce = n; break; } } out_ret: + free(scratchbuf);; return ret; } From 6c6c285268671bba441d4cf527e9fa0a588630a6 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 22:25:41 +1000 Subject: [PATCH 020/178] Show Khash hashrates when scrypt is in use. 
--- cgminer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cgminer.c b/cgminer.c index 6002927e..a5529f28 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1372,10 +1372,11 @@ static void get_statline(char *buf, struct cgpu_info *cgpu) cgpu->api->get_statline_before(buf, cgpu); else tailsprintf(buf, " | "); - tailsprintf(buf, "(%ds):%.1f (avg):%.1f Mh/s | A:%d R:%d HW:%d U:%.1f/m", + tailsprintf(buf, "(%ds):%.1f (avg):%.1f %sh/s | A:%d R:%d HW:%d U:%.1f/m", opt_log_interval, cgpu->rolling, cgpu->total_mhashes / total_secs, + opt_scrypt ? "K" : "M", cgpu->accepted, cgpu->rejected, cgpu->hw_errors, @@ -1471,8 +1472,9 @@ static void curses_print_devstatus(int thr_id) adj_width(cgpu->rejected, &rwidth); adj_width(cgpu->hw_errors, &hwwidth); adj_width(cgpu->utility, &uwidth); - wprintw(statuswin, "/%5.1fMh/s | A:%*d R:%*d HW:%*d U:%*.2f/m", + wprintw(statuswin, "/%5.1f%sh/s | A:%*d R:%*d HW:%*d U:%*.2f/m", cgpu->total_mhashes / total_secs, + opt_scrypt ? "K" : "M", awidth, cgpu->accepted, rwidth, cgpu->rejected, hwwidth, cgpu->hw_errors, @@ -4703,7 +4705,7 @@ static void print_summary(void) #endif applog(LOG_WARNING, "Runtime: %d hrs : %d mins : %d secs", hours, mins, secs); if (total_secs) - applog(LOG_WARNING, "Average hashrate: %.1f Megahash/s", total_mhashes_done / total_secs); + applog(LOG_WARNING, "Average hashrate: %.1f %shash/s", total_mhashes_done / total_secs, opt_scrypt? "Kilo" : "Mega"); applog(LOG_WARNING, "Solved blocks: %d", found_blocks); applog(LOG_WARNING, "Queued work requests: %d", total_getworks); applog(LOG_WARNING, "Share submissions: %d", total_accepted + total_rejected); From fdcaea1c13838e0dd20a15ecd86b9c662b947dec Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 14 Jul 2012 22:37:32 +1000 Subject: [PATCH 021/178] Start with smaller amount of hashes in cpu mining to enable scrypt to return today sometime. 
--- driver-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver-cpu.c b/driver-cpu.c index bd502b35..8ffc7802 100644 --- a/driver-cpu.c +++ b/driver-cpu.c @@ -777,7 +777,7 @@ static bool cpu_thread_prepare(struct thr_info *thr) static uint64_t cpu_can_limit_work(struct thr_info *thr) { - return 0xfffff; + return 0xffff; } static bool cpu_thread_init(struct thr_info *thr) From 46592a24f4d91991f3302e0b39bfc10cfe01255a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 15 Jul 2012 13:20:13 +1000 Subject: [PATCH 022/178] Use uint16 in SHA256 in scrypt kernel. --- scrypt120713.cl | 258 ++++++++++++++++++++++++------------------------ 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index faedd34b..47eb7e9c 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -31,187 +31,187 @@ void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, con #define G S1.z #define H S1.w - uint4 W[4]; + uint16 W; - W[ 0].x = block0.x; - RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U); - W[ 0].y = block0.y; - RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U); - W[ 0].z = block0.z; - RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU); - W[ 0].w = block0.w; - RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U); + W.s0 = block0.x; + RND(A,B,C,D,E,F,G,H, W.s0+0x428a2f98U); + W.s1 = block0.y; + RND(H,A,B,C,D,E,F,G, W.s1+0x71374491U); + W.s2 = block0.z; + RND(G,H,A,B,C,D,E,F, W.s2+0xb5c0fbcfU); + W.s3 = block0.w; + RND(F,G,H,A,B,C,D,E, W.s3+0xe9b5dba5U); - W[ 1].x = block1.x; - RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); - W[ 1].y = block1.y; - RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); - W[ 1].z = block1.z; - RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); - W[ 1].w = block1.w; - RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); + W.s4 = block1.x; + RND(E,F,G,H,A,B,C,D, W.s4+0x3956c25bU); + W.s5 = block1.y; + RND(D,E,F,G,H,A,B,C, W.s5+0x59f111f1U); + W.s6 = block1.z; + RND(C,D,E,F,G,H,A,B, W.s6+0x923f82a4U); + W.s7 = block1.w; + RND(B,C,D,E,F,G,H,A, 
W.s7+0xab1c5ed5U); - W[ 2].x = block2.x; - RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); - W[ 2].y = block2.y; - RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); - W[ 2].z = block2.z; - RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); - W[ 2].w = block2.w; - RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); + W.s8 = block2.x; + RND(A,B,C,D,E,F,G,H, W.s8+0xd807aa98U); + W.s9 = block2.y; + RND(H,A,B,C,D,E,F,G, W.s9+0x12835b01U); + W.sa = block2.z; + RND(G,H,A,B,C,D,E,F, W.sa+0x243185beU); + W.sb = block2.w; + RND(F,G,H,A,B,C,D,E, W.sb+0x550c7dc3U); - W[ 3].x = block3.x; - RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); - W[ 3].y = block3.y; - RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); - W[ 3].z = block3.z; - RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); - W[ 3].w = block3.w; - RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); + W.sc = block3.x; + RND(E,F,G,H,A,B,C,D, W.sc+0x72be5d74U); + W.sd = block3.y; + RND(D,E,F,G,H,A,B,C, W.sd+0x80deb1feU); + W.se = block3.z; + RND(C,D,E,F,G,H,A,B, W.se+0x9bdc06a7U); + W.sf = block3.w; + RND(B,C,D,E,F,G,H,A, W.sf+0xc19bf174U); - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); + W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); + RND(A,B,C,D,E,F,G,H, W.s0+0xe49b69c1U); - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); + W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); + RND(H,A,B,C,D,E,F,G, W.s1+0xefbe4786U); - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); + W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); + RND(G,H,A,B,C,D,E,F, W.s2+0x0fc19dc6U); - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); + W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); + RND(F,G,H,A,B,C,D,E, W.s3+0x240ca1ccU); - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); + W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); + RND(E,F,G,H,A,B,C,D, W.s4+0x2de92c6fU); - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); - RND(D,E,F,G,H,A,B,C, 
W[1].y+0x4a7484aaU); + W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); + RND(D,E,F,G,H,A,B,C, W.s5+0x4a7484aaU); - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); + W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); + RND(C,D,E,F,G,H,A,B, W.s6+0x5cb0a9dcU); - W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); + W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); + RND(B,C,D,E,F,G,H,A, W.s7+0x76f988daU); - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); + W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); + RND(A,B,C,D,E,F,G,H, W.s8+0x983e5152U); - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); + W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); + RND(H,A,B,C,D,E,F,G, W.s9+0xa831c66dU); - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); + W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); + RND(G,H,A,B,C,D,E,F, W.sa+0xb00327c8U); - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); + W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); + RND(F,G,H,A,B,C,D,E, W.sb+0xbf597fc7U); - W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); + W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); + RND(E,F,G,H,A,B,C,D, W.sc+0xc6e00bf3U); - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); + W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); + RND(D,E,F,G,H,A,B,C, W.sd+0xd5a79147U); - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); + W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); + RND(C,D,E,F,G,H,A,B, W.se+0x06ca6351U); - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); + W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); + RND(B,C,D,E,F,G,H,A, W.sf+0x14292967U); - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); + W.s0 += Wr1(W.se) 
+ W.s9 + Wr2(W.s1); + RND(A,B,C,D,E,F,G,H, W.s0+0x27b70a85U); - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); + W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); + RND(H,A,B,C,D,E,F,G, W.s1+0x2e1b2138U); - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); + W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); + RND(G,H,A,B,C,D,E,F, W.s2+0x4d2c6dfcU); - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); + W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); + RND(F,G,H,A,B,C,D,E, W.s3+0x53380d13U); - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); + W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); + RND(E,F,G,H,A,B,C,D, W.s4+0x650a7354U); - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); + W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); + RND(D,E,F,G,H,A,B,C, W.s5+0x766a0abbU); - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); + W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); + RND(C,D,E,F,G,H,A,B, W.s6+0x81c2c92eU); - W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); + W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); + RND(B,C,D,E,F,G,H,A, W.s7+0x92722c85U); - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); + W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); + RND(A,B,C,D,E,F,G,H, W.s8+0xa2bfe8a1U); - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); + W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); + RND(H,A,B,C,D,E,F,G, W.s9+0xa81a664bU); - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); + W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); + RND(G,H,A,B,C,D,E,F, W.sa+0xc24b8b70U); - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); + W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); + 
RND(F,G,H,A,B,C,D,E, W.sb+0xc76c51a3U); - W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); + W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); + RND(E,F,G,H,A,B,C,D, W.sc+0xd192e819U); - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); + W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); + RND(D,E,F,G,H,A,B,C, W.sd+0xd6990624U); - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); + W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); + RND(C,D,E,F,G,H,A,B, W.se+0xf40e3585U); - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); + W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); + RND(B,C,D,E,F,G,H,A, W.sf+0x106aa070U); - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); + W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); + RND(A,B,C,D,E,F,G,H, W.s0+0x19a4c116U); - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); + W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); + RND(H,A,B,C,D,E,F,G, W.s1+0x1e376c08U); - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); + W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); + RND(G,H,A,B,C,D,E,F, W.s2+0x2748774cU); - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); + W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); + RND(F,G,H,A,B,C,D,E, W.s3+0x34b0bcb5U); - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); + W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); + RND(E,F,G,H,A,B,C,D, W.s4+0x391c0cb3U); - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); + W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); + RND(D,E,F,G,H,A,B,C, W.s5+0x4ed8aa4aU); - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); + W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); + RND(C,D,E,F,G,H,A,B, W.s6+0x5b9cca4fU); - 
W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); + W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); + RND(B,C,D,E,F,G,H,A, W.s7+0x682e6ff3U); - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); + W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); + RND(A,B,C,D,E,F,G,H, W.s8+0x748f82eeU); - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); + W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); + RND(H,A,B,C,D,E,F,G, W.s9+0x78a5636fU); - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); + W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); + RND(G,H,A,B,C,D,E,F, W.sa+0x84c87814U); - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); + W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); + RND(F,G,H,A,B,C,D,E, W.sb+0x8cc70208U); - W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); + W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); + RND(E,F,G,H,A,B,C,D, W.sc+0x90befffaU); - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); + W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); + RND(D,E,F,G,H,A,B,C, W.sd+0xa4506cebU); - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); + W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); + RND(C,D,E,F,G,H,A,B, W.se+0xbef9a3f7U); - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); + W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); + RND(B,C,D,E,F,G,H,A, W.sf+0xc67178f2U); #undef A #undef B From cb5fed893f25aae8a64042c0375d7fa66b213111 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 15 Jul 2012 13:23:52 +1000 Subject: [PATCH 023/178] Get rid of spaces in arrays in scrypt kernel. 
--- scrypt120713.cl | 130 ++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index 47eb7e9c..900ccce5 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -239,191 +239,191 @@ void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block uint4 W[4]; - W[ 0].x = block0.x; + W[0].x = block0.x; D=0x98c7e2a2U+W[0].x; H=0xfc08884dU+W[0].x; - W[ 0].y = block0.y; + W[0].y = block0.y; C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y; G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U); - W[ 0].z = block0.z; + W[0].z = block0.z; B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z; F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U); - W[ 0].w = block0.w; + W[0].w = block0.w; A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; E=0x95F61999U+A+Tr2(F)+Maj(F,G,H); - W[ 1].x = block1.x; + W[1].x = block1.x; RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); - W[ 1].y = block1.y; + W[1].y = block1.y; RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); - W[ 1].z = block1.z; + W[1].z = block1.z; RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); - W[ 1].w = block1.w; + W[1].w = block1.w; RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); - W[ 2].x = block2.x; + W[2].x = block2.x; RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); - W[ 2].y = block2.y; + W[2].y = block2.y; RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); - W[ 2].z = block2.z; + W[2].z = block2.z; RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); - W[ 2].w = block2.w; + W[2].w = block2.w; RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); - W[ 3].x = block3.x; + W[3].x = block3.x; RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); - W[ 3].y = block3.y; + W[3].y = block3.y; RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); - W[ 3].z = block3.z; + W[3].z = block3.z; RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); - W[ 3].w = block3.w; + W[3].w = block3.w; RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); 
RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); - W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); - W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); 
RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); - W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); - W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); 
RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); - W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); 
RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); - W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); #undef A @@ -709,7 +709,7 @@ __kernel void search(__global uint4*restrict input, __global uint*restrict outpu { pad0 = tstate0; pad1 = tstate1; - X[i*2 ] = ostate0; + X[i*2 ] = ostate0; X[i*2+1] = ostate1; SHA256(&pad0,&pad1, data, (uint4)(i+1,0x80000000U,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x4a0U)); From 6a02f8d94ae665859ee081c0741c7ec56ff7ba77 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 15 Jul 2012 13:28:56 +1000 Subject: [PATCH 024/178] Use uint16 in SHA256_fresh in scrypt kernel. 
--- scrypt120713.cl | 260 ++++++++++++++++++++++++------------------------ 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index 900ccce5..768a8661 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -237,194 +237,194 @@ void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block #define G (*state1).z #define H (*state1).w - uint4 W[4]; + uint16 W; - W[0].x = block0.x; - D=0x98c7e2a2U+W[0].x; - H=0xfc08884dU+W[0].x; + W.s0 = block0.x; + D=0x98c7e2a2U+W.s0; + H=0xfc08884dU+W.s0; - W[0].y = block0.y; - C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y; + W.s1 = block0.y; + C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W.s1; G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U); - W[0].z = block0.z; - B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z; + W.s2 = block0.z; + B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W.s2; F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U); - W[0].w = block0.w; - A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; + W.s3 = block0.w; + A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W.s3; E=0x95F61999U+A+Tr2(F)+Maj(F,G,H); - W[1].x = block1.x; - RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); - W[1].y = block1.y; - RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); - W[1].z = block1.z; - RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); - W[1].w = block1.w; - RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); + W.s4 = block1.x; + RND(E,F,G,H,A,B,C,D, W.s4+0x3956c25bU); + W.s5 = block1.y; + RND(D,E,F,G,H,A,B,C, W.s5+0x59f111f1U); + W.s6 = block1.z; + RND(C,D,E,F,G,H,A,B, W.s6+0x923f82a4U); + W.s7 = block1.w; + RND(B,C,D,E,F,G,H,A, W.s7+0xab1c5ed5U); - W[2].x = block2.x; - RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); - W[2].y = block2.y; - RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); - W[2].z = block2.z; - RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); - W[2].w = block2.w; - RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); + W.s8 = block2.x; + RND(A,B,C,D,E,F,G,H, W.s8+0xd807aa98U); + W.s9 = block2.y; + RND(H,A,B,C,D,E,F,G, W.s9+0x12835b01U); + 
W.sa = block2.z; + RND(G,H,A,B,C,D,E,F, W.sa+0x243185beU); + W.sb = block2.w; + RND(F,G,H,A,B,C,D,E, W.sb+0x550c7dc3U); - W[3].x = block3.x; - RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); - W[3].y = block3.y; - RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); - W[3].z = block3.z; - RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); - W[3].w = block3.w; - RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); + W.sc = block3.x; + RND(E,F,G,H,A,B,C,D, W.sc+0x72be5d74U); + W.sd = block3.y; + RND(D,E,F,G,H,A,B,C, W.sd+0x80deb1feU); + W.se = block3.z; + RND(C,D,E,F,G,H,A,B, W.se+0x9bdc06a7U); + W.sf = block3.w; + RND(B,C,D,E,F,G,H,A, W.sf+0xc19bf174U); - W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); + W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); + RND(A,B,C,D,E,F,G,H, W.s0+0xe49b69c1U); - W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); + W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); + RND(H,A,B,C,D,E,F,G, W.s1+0xefbe4786U); - W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); + W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); + RND(G,H,A,B,C,D,E,F, W.s2+0x0fc19dc6U); - W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); + W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); + RND(F,G,H,A,B,C,D,E, W.s3+0x240ca1ccU); - W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); + W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); + RND(E,F,G,H,A,B,C,D, W.s4+0x2de92c6fU); - W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); + W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); + RND(D,E,F,G,H,A,B,C, W.s5+0x4a7484aaU); - W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); + W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); + RND(C,D,E,F,G,H,A,B, W.s6+0x5cb0a9dcU); - W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); + W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); + RND(B,C,D,E,F,G,H,A, 
W.s7+0x76f988daU); - W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); + W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); + RND(A,B,C,D,E,F,G,H, W.s8+0x983e5152U); - W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); + W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); + RND(H,A,B,C,D,E,F,G, W.s9+0xa831c66dU); - W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); + W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); + RND(G,H,A,B,C,D,E,F, W.sa+0xb00327c8U); - W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); + W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); + RND(F,G,H,A,B,C,D,E, W.sb+0xbf597fc7U); - W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); + W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); + RND(E,F,G,H,A,B,C,D, W.sc+0xc6e00bf3U); - W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); + W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); + RND(D,E,F,G,H,A,B,C, W.sd+0xd5a79147U); - W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); + W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); + RND(C,D,E,F,G,H,A,B, W.se+0x06ca6351U); - W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); + W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); + RND(B,C,D,E,F,G,H,A, W.sf+0x14292967U); - W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); + W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); + RND(A,B,C,D,E,F,G,H, W.s0+0x27b70a85U); - W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); + W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); + RND(H,A,B,C,D,E,F,G, W.s1+0x2e1b2138U); - W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); + W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); + RND(G,H,A,B,C,D,E,F, W.s2+0x4d2c6dfcU); - W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); - 
RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); + W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); + RND(F,G,H,A,B,C,D,E, W.s3+0x53380d13U); - W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); + W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); + RND(E,F,G,H,A,B,C,D, W.s4+0x650a7354U); - W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); + W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); + RND(D,E,F,G,H,A,B,C, W.s5+0x766a0abbU); - W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); + W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); + RND(C,D,E,F,G,H,A,B, W.s6+0x81c2c92eU); - W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); + W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); + RND(B,C,D,E,F,G,H,A, W.s7+0x92722c85U); - W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); + W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); + RND(A,B,C,D,E,F,G,H, W.s8+0xa2bfe8a1U); - W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); + W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); + RND(H,A,B,C,D,E,F,G, W.s9+0xa81a664bU); - W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); + W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); + RND(G,H,A,B,C,D,E,F, W.sa+0xc24b8b70U); - W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); + W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); + RND(F,G,H,A,B,C,D,E, W.sb+0xc76c51a3U); - W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); + W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); + RND(E,F,G,H,A,B,C,D, W.sc+0xd192e819U); - W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); + W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); + RND(D,E,F,G,H,A,B,C, W.sd+0xd6990624U); - W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); + W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); + 
RND(C,D,E,F,G,H,A,B, W.se+0xf40e3585U); - W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); + W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); + RND(B,C,D,E,F,G,H,A, W.sf+0x106aa070U); - W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); + W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); + RND(A,B,C,D,E,F,G,H, W.s0+0x19a4c116U); - W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); + W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); + RND(H,A,B,C,D,E,F,G, W.s1+0x1e376c08U); - W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); + W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); + RND(G,H,A,B,C,D,E,F, W.s2+0x2748774cU); - W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); + W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); + RND(F,G,H,A,B,C,D,E, W.s3+0x34b0bcb5U); - W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); + W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); + RND(E,F,G,H,A,B,C,D, W.s4+0x391c0cb3U); - W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); + W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); + RND(D,E,F,G,H,A,B,C, W.s5+0x4ed8aa4aU); - W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); + W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); + RND(C,D,E,F,G,H,A,B, W.s6+0x5b9cca4fU); - W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); + W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); + RND(B,C,D,E,F,G,H,A, W.s7+0x682e6ff3U); - W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); + W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); + RND(A,B,C,D,E,F,G,H, W.s8+0x748f82eeU); - W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); + W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); + RND(H,A,B,C,D,E,F,G, W.s9+0x78a5636fU); - W[2].z += Wr1(W[2].x) + W[0].w + 
Wr2(W[2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); + W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); + RND(G,H,A,B,C,D,E,F, W.sa+0x84c87814U); - W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); + W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); + RND(F,G,H,A,B,C,D,E, W.sb+0x8cc70208U); - W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); + W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); + RND(E,F,G,H,A,B,C,D, W.sc+0x90befffaU); - W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); + W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); + RND(D,E,F,G,H,A,B,C, W.sd+0xa4506cebU); - W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); + W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); + RND(C,D,E,F,G,H,A,B, W.se+0xbef9a3f7U); - W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); + W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); + RND(B,C,D,E,F,G,H,A, W.sf+0xc67178f2U); #undef A #undef B From 884f83f3138bc5a6d502614e0a64ea8763dfd8cb Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 15 Jul 2012 13:31:03 +1000 Subject: [PATCH 025/178] Allow more platforms to be probed if first does not return GPUs. --- ocl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocl.c b/ocl.c index 2fc7e35f..6ddf5998 100644 --- a/ocl.c +++ b/ocl.c @@ -118,6 +118,8 @@ int clDevicesNum(void) { status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); + if (i < numPlatforms - 1) + continue; return -1; } applog(LOG_INFO, "Platform %d devices: %d", i, numDevices); From 53e9c61c021a18d32bc5bf33c8ea15ac61778629 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 15 Jul 2012 13:40:11 +1000 Subject: [PATCH 026/178] Find the gpu platform with the most devices and use that if no platform option is passed. 
--- ocl.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ocl.c b/ocl.c index 6ddf5998..a06fc5e3 100644 --- a/ocl.c +++ b/ocl.c @@ -33,7 +33,7 @@ #include "findnonce.h" #include "ocl.h" -int opt_platform_id; +int opt_platform_id = -1; char *file_contents(const char *filename, int *length) { @@ -80,7 +80,7 @@ int clDevicesNum(void) { cl_uint numPlatforms; cl_platform_id *platforms; cl_platform_id platform = NULL; - unsigned int most_devices = 0, i; + unsigned int most_devices = 0, i, mdplatform; status = clGetPlatformIDs(0, NULL, &numPlatforms); /* If this fails, assume no GPUs. */ @@ -123,8 +123,10 @@ int clDevicesNum(void) { return -1; } applog(LOG_INFO, "Platform %d devices: %d", i, numDevices); - if (numDevices > most_devices) + if (numDevices > most_devices) { most_devices = numDevices; + mdplatform = i; + } if (numDevices) { unsigned int j; char pbuff[256]; @@ -139,6 +141,9 @@ int clDevicesNum(void) { } } + if (opt_platform_id < 0) + opt_platform_id = mdplatform;; + return most_devices; } From 04edf4bfa20ad39118507e9e60be6e18b867be2c Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 15 Jul 2012 13:40:56 +1000 Subject: [PATCH 027/178] Temporarily set opencl to use all devices to allow debugging of scrypt kernel rapidly. 
--- ocl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocl.c b/ocl.c index a06fc5e3..341d5c88 100644 --- a/ocl.c +++ b/ocl.c @@ -115,7 +115,7 @@ int clDevicesNum(void) { status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform %d version: %s", i, pbuff); - status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); if (i < numPlatforms - 1) @@ -132,7 +132,7 @@ int clDevicesNum(void) { char pbuff[256]; cl_device_id *devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); - clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); for (j = 0; j < numDevices; j++) { clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); applog(LOG_INFO, "\t%i\t%s", j, pbuff); @@ -255,7 +255,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform version: %s", vbuff); - status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); + status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); return NULL; @@ -266,7 +266,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) /* Now, get the device list data */ - status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); + status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status); return NULL; @@ -303,7 +303,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) 
cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; - clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); + clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status); return NULL; From a1edc7dbcb9b0026e6b1f2f93e71aa3b1f00403f Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Sun, 15 Jul 2012 05:37:49 +0000 Subject: [PATCH 028/178] Bugfix: Fix build without curses but with OpenCL --- adl.c | 12 +++++++++++- cgminer.c | 5 ++++- driver-opencl.c | 5 +++-- logging.c | 2 +- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/adl.c b/adl.c index 8573e16f..85ec0aaf 100644 --- a/adl.c +++ b/adl.c @@ -33,6 +33,10 @@ #endif #include "adl_functions.h" +#ifndef HAVE_CURSES +#define wlogprint(...) applog(LOG_WARNING, __VA_ARGS__) +#endif + bool adl_active; bool opt_reorder = false; @@ -764,6 +768,7 @@ bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vdd return true; } +#ifdef HAVE_CURSES static void get_enginerange(int gpu, int *imin, int *imax) { struct gpu_adl *ga; @@ -776,6 +781,7 @@ static void get_enginerange(int gpu, int *imin, int *imax) *imin = ga->lpOdParameters.sEngineClock.iMin / 100; *imax = ga->lpOdParameters.sEngineClock.iMax / 100; } +#endif int set_engineclock(int gpu, int iEngineClock) { @@ -824,6 +830,7 @@ out: return ret; } +#ifdef HAVE_CURSES static void get_memoryrange(int gpu, int *imin, int *imax) { struct gpu_adl *ga; @@ -836,6 +843,7 @@ static void get_memoryrange(int gpu, int *imin, int *imax) *imin = ga->lpOdParameters.sMemoryClock.iMin / 100; *imax = ga->lpOdParameters.sMemoryClock.iMax / 100; } +#endif int set_memoryclock(int gpu, int iMemoryClock) { @@ -876,6 +884,7 @@ out: return ret; } +#ifdef HAVE_CURSES static void get_vddcrange(int gpu, float *imin, float *imax) { struct gpu_adl *ga; @@ 
-889,7 +898,6 @@ static void get_vddcrange(int gpu, float *imin, float *imax) *imax = (float)ga->lpOdParameters.sVddc.iMax / 1000; } -#ifdef HAVE_CURSES static float curses_float(const char *query) { float ret; @@ -997,6 +1005,7 @@ int set_fanspeed(int gpu, int iFanSpeed) return ret; } +#ifdef HAVE_CURSES static int set_powertune(int gpu, int iPercentage) { struct gpu_adl *ga; @@ -1018,6 +1027,7 @@ static int set_powertune(int gpu, int iPercentage) unlock_adl(); return ret; } +#endif /* Returns whether the fanspeed is optimal already or not. The fan_window bool * tells us whether the current fanspeed is in the target range for fanspeeds. diff --git a/cgminer.c b/cgminer.c index 55d3943b..ea540815 100644 --- a/cgminer.c +++ b/cgminer.c @@ -202,10 +202,11 @@ enum pool_strategy pool_strategy = POOL_FAILOVER; int opt_rotate_period; static int total_urls, total_users, total_passes, total_userpasses; +static #ifndef HAVE_CURSES const #endif -static bool curses_active; +bool curses_active; static char current_block[37]; static char *current_hash; @@ -1312,8 +1313,10 @@ double total_secs = 0.1; static char statusline[256]; /* logstart is where the log window should start */ static int devcursor, logstart, logcursor; +#ifdef HAVE_CURSES /* statusy is where the status window goes up to in cases where it won't fit at startup */ static int statusy; +#endif #ifdef HAVE_OPENCL struct cgpu_info gpus[MAX_GPUDEVICES]; /* Maximum number apparently possible */ #endif diff --git a/driver-opencl.c b/driver-opencl.c index 880a4dac..2cf44852 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1210,16 +1210,17 @@ static bool opencl_thread_prepare(struct thr_info *thr) applog(LOG_INFO, "Init GPU thread %i GPU %i virtual GPU %i", i, gpu, virtual_gpu); clStates[i] = initCl(virtual_gpu, name, sizeof(name)); if (!clStates[i]) { +#ifdef HAVE_CURSES if (use_curses) enable_curses(); +#endif applog(LOG_ERR, "Failed to init GPU thread %d, disabling device %d", i, gpu); if (!failmessage) { - char 
*buf; - applog(LOG_ERR, "Restarting the GPU from the menu will not fix this."); applog(LOG_ERR, "Try restarting cgminer."); failmessage = true; #ifdef HAVE_CURSES + char *buf; if (use_curses) { buf = curses_input("Press enter to continue"); if (buf) diff --git a/logging.c b/logging.c index 31956637..7d8a4309 100644 --- a/logging.c +++ b/logging.c @@ -18,7 +18,7 @@ bool opt_log_output = false; /* per default priorities higher than LOG_NOTICE are logged */ int opt_log_level = LOG_NOTICE; -static void my_log_curses(int prio, char *f, va_list ap) +static void my_log_curses(__maybe_unused int prio, char *f, va_list ap) { #ifdef HAVE_CURSES extern bool use_curses; From 3d1b4d637435fdebbd5cf342f5dfc6e75aac5ad9 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 11:17:27 +1000 Subject: [PATCH 029/178] Revert "Use uint16 in SHA256_fresh in scrypt kernel." This reverts commit 6a02f8d94ae665859ee081c0741c7ec56ff7ba77. --- scrypt120713.cl | 260 ++++++++++++++++++++++++------------------------ 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index 768a8661..900ccce5 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -237,194 +237,194 @@ void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block #define G (*state1).z #define H (*state1).w - uint16 W; + uint4 W[4]; - W.s0 = block0.x; - D=0x98c7e2a2U+W.s0; - H=0xfc08884dU+W.s0; + W[0].x = block0.x; + D=0x98c7e2a2U+W[0].x; + H=0xfc08884dU+W[0].x; - W.s1 = block0.y; - C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W.s1; + W[0].y = block0.y; + C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y; G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U); - W.s2 = block0.z; - B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W.s2; + W[0].z = block0.z; + B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z; F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U); - W.s3 = block0.w; - A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W.s3; + W[0].w = block0.w; + 
A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; E=0x95F61999U+A+Tr2(F)+Maj(F,G,H); - W.s4 = block1.x; - RND(E,F,G,H,A,B,C,D, W.s4+0x3956c25bU); - W.s5 = block1.y; - RND(D,E,F,G,H,A,B,C, W.s5+0x59f111f1U); - W.s6 = block1.z; - RND(C,D,E,F,G,H,A,B, W.s6+0x923f82a4U); - W.s7 = block1.w; - RND(B,C,D,E,F,G,H,A, W.s7+0xab1c5ed5U); + W[1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); + W[1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); + W[1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); + W[1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); - W.s8 = block2.x; - RND(A,B,C,D,E,F,G,H, W.s8+0xd807aa98U); - W.s9 = block2.y; - RND(H,A,B,C,D,E,F,G, W.s9+0x12835b01U); - W.sa = block2.z; - RND(G,H,A,B,C,D,E,F, W.sa+0x243185beU); - W.sb = block2.w; - RND(F,G,H,A,B,C,D,E, W.sb+0x550c7dc3U); + W[2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); + W[2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); + W[2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); + W[2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); - W.sc = block3.x; - RND(E,F,G,H,A,B,C,D, W.sc+0x72be5d74U); - W.sd = block3.y; - RND(D,E,F,G,H,A,B,C, W.sd+0x80deb1feU); - W.se = block3.z; - RND(C,D,E,F,G,H,A,B, W.se+0x9bdc06a7U); - W.sf = block3.w; - RND(B,C,D,E,F,G,H,A, W.sf+0xc19bf174U); + W[3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); + W[3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); + W[3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); + W[3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); - W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); - RND(A,B,C,D,E,F,G,H, W.s0+0xe49b69c1U); + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); - W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); - RND(H,A,B,C,D,E,F,G, W.s1+0xefbe4786U); + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); - W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); - RND(G,H,A,B,C,D,E,F, 
W.s2+0x0fc19dc6U); + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); - W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); - RND(F,G,H,A,B,C,D,E, W.s3+0x240ca1ccU); + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); - W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); - RND(E,F,G,H,A,B,C,D, W.s4+0x2de92c6fU); + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); - W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); - RND(D,E,F,G,H,A,B,C, W.s5+0x4a7484aaU); + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); - W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); - RND(C,D,E,F,G,H,A,B, W.s6+0x5cb0a9dcU); + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); - W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); - RND(B,C,D,E,F,G,H,A, W.s7+0x76f988daU); + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); - W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); - RND(A,B,C,D,E,F,G,H, W.s8+0x983e5152U); + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); - W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); - RND(H,A,B,C,D,E,F,G, W.s9+0xa831c66dU); + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); - W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); - RND(G,H,A,B,C,D,E,F, W.sa+0xb00327c8U); + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); - W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); - RND(F,G,H,A,B,C,D,E, W.sb+0xbf597fc7U); + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); - W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); - RND(E,F,G,H,A,B,C,D, W.sc+0xc6e00bf3U); + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); - W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); - RND(D,E,F,G,H,A,B,C, W.sd+0xd5a79147U); + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + 
RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); - W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); - RND(C,D,E,F,G,H,A,B, W.se+0x06ca6351U); + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); - W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); - RND(B,C,D,E,F,G,H,A, W.sf+0x14292967U); + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); - W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); - RND(A,B,C,D,E,F,G,H, W.s0+0x27b70a85U); + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); - W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); - RND(H,A,B,C,D,E,F,G, W.s1+0x2e1b2138U); + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); - W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); - RND(G,H,A,B,C,D,E,F, W.s2+0x4d2c6dfcU); + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); - W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); - RND(F,G,H,A,B,C,D,E, W.s3+0x53380d13U); + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); - W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); - RND(E,F,G,H,A,B,C,D, W.s4+0x650a7354U); + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); - W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); - RND(D,E,F,G,H,A,B,C, W.s5+0x766a0abbU); + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); - W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); - RND(C,D,E,F,G,H,A,B, W.s6+0x81c2c92eU); + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); - W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); - RND(B,C,D,E,F,G,H,A, W.s7+0x92722c85U); + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); - W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); - RND(A,B,C,D,E,F,G,H, W.s8+0xa2bfe8a1U); + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); - W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); - 
RND(H,A,B,C,D,E,F,G, W.s9+0xa81a664bU); + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); - W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); - RND(G,H,A,B,C,D,E,F, W.sa+0xc24b8b70U); + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); - W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); - RND(F,G,H,A,B,C,D,E, W.sb+0xc76c51a3U); + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); - W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); - RND(E,F,G,H,A,B,C,D, W.sc+0xd192e819U); + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); - W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); - RND(D,E,F,G,H,A,B,C, W.sd+0xd6990624U); + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); - W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); - RND(C,D,E,F,G,H,A,B, W.se+0xf40e3585U); + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); - W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); - RND(B,C,D,E,F,G,H,A, W.sf+0x106aa070U); + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); - W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); - RND(A,B,C,D,E,F,G,H, W.s0+0x19a4c116U); + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); - W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); - RND(H,A,B,C,D,E,F,G, W.s1+0x1e376c08U); + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); - W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); - RND(G,H,A,B,C,D,E,F, W.s2+0x2748774cU); + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); - W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); - RND(F,G,H,A,B,C,D,E, W.s3+0x34b0bcb5U); + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); - W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); - RND(E,F,G,H,A,B,C,D, W.s4+0x391c0cb3U); + W[1].x += Wr1(W[0].z) + W[3].y + 
Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); - W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); - RND(D,E,F,G,H,A,B,C, W.s5+0x4ed8aa4aU); + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); - W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); - RND(C,D,E,F,G,H,A,B, W.s6+0x5b9cca4fU); + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); - W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); - RND(B,C,D,E,F,G,H,A, W.s7+0x682e6ff3U); + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); - W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); - RND(A,B,C,D,E,F,G,H, W.s8+0x748f82eeU); + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); - W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); - RND(H,A,B,C,D,E,F,G, W.s9+0x78a5636fU); + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); - W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); - RND(G,H,A,B,C,D,E,F, W.sa+0x84c87814U); + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); - W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); - RND(F,G,H,A,B,C,D,E, W.sb+0x8cc70208U); + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); - W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); - RND(E,F,G,H,A,B,C,D, W.sc+0x90befffaU); + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); - W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); - RND(D,E,F,G,H,A,B,C, W.sd+0xa4506cebU); + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); - W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); - RND(C,D,E,F,G,H,A,B, W.se+0xbef9a3f7U); + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); - W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); - RND(B,C,D,E,F,G,H,A, W.sf+0xc67178f2U); + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); #undef A #undef B From 
a9a0bba18b8f22eacd788e20a9af9539b87680dc Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 11:53:18 +1000 Subject: [PATCH 030/178] Set the correct data for cldata and prepare for pad8 fixes. --- driver-opencl.c | 17 +++++++++++++---- ocl.c | 3 ++- ocl.h | 2 ++ 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 2ebb54f5..773bf8ed 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1001,6 +1001,8 @@ static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t cl_int status = 0; int i; + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); + CL_SET_ARG(clState->CLbuffer0); CL_SET_ARG(clState->outputBuffer); CL_SET_ARG(clState->padbuffer8); @@ -1309,7 +1311,7 @@ static bool opencl_thread_init(struct thr_info *thr) struct cgpu_info *gpu = thr->cgpu; struct opencl_thread_data *thrdata; _clState *clState = clStates[thr_id]; - cl_int status; + cl_int status = 0; thrdata = calloc(1, sizeof(*thrdata)); thr->cgpu_data = thrdata; @@ -1348,10 +1350,13 @@ static bool opencl_thread_init(struct thr_info *thr) } #ifdef USE_SCRYPT - if (opt_scrypt) - status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, BUFFERSIZE, blank_res, 0, NULL,NULL); + if (opt_scrypt) { + if (clState->padbufsize > BUFFERSIZE) + blank_res = realloc(blank_res, clState->padbufsize); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->padbuffer8, true, 0, clState->padbufsize, blank_res, 0, NULL,NULL); + } #endif - status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, + status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, BUFFERSIZE, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); @@ -1440,6 +1445,10 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, 
localThreads[0], gpu->intensity); if (hashes > gpu->max_hashes) gpu->max_hashes = hashes; + +#ifdef USE_SCRYPT + clState->cldata = work->data; +#endif status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); diff --git a/ocl.c b/ocl.c index 341d5c88..675a31c6 100644 --- a/ocl.c +++ b/ocl.c @@ -754,8 +754,9 @@ built: size_t ipt = (1024 / clState->lookup_gap + (1024 % clState->lookup_gap > 0)); size_t bufsize = 128 * ipt * clState->thread_concurrency; - clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); + clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 80, NULL, &status); clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); + clState->padbufsize = bufsize; } #endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); diff --git a/ocl.h b/ocl.h index b15c8893..56fa9b0d 100644 --- a/ocl.h +++ b/ocl.h @@ -24,6 +24,8 @@ typedef struct { cl_mem padbuffer8; size_t lookup_gap; size_t thread_concurrency; + size_t padbufsize; + void * cldata; #endif bool hasBitAlign; bool hasOpenCL11plus; From 6ac14f4280ff070733fce6b5b38893d3335c1cf9 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 13:14:21 +1000 Subject: [PATCH 031/178] Don't enqueuewrite buffer at all for pad8 and pass work details around for scrypt in dev_blk. 
--- cgminer.c | 6 +++++- driver-opencl.c | 23 +++++++++-------------- findnonce.c | 7 ++----- miner.h | 2 +- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/cgminer.c b/cgminer.c index a5529f28..14183ec4 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3983,11 +3983,15 @@ bool hashtest(const struct work *work) bool test_nonce(struct work *work, uint32_t nonce) { + uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12); + + *work_nonce = htobe32(nonce); +#if 0 work->data[64 + 12 + 0] = (nonce >> 0) & 0xff; work->data[64 + 12 + 1] = (nonce >> 8) & 0xff; work->data[64 + 12 + 2] = (nonce >> 16) & 0xff; work->data[64 + 12 + 3] = (nonce >> 24) & 0xff; - +#endif if (opt_scrypt) return true; diff --git a/driver-opencl.c b/driver-opencl.c index 773bf8ed..4ddf3365 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -993,14 +993,14 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t } #ifdef USE_SCRYPT -static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) +static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) { - cl_uint4 *midstate = (cl_uint4 *)blk->midstate; + cl_uint4 *midstate = (cl_uint4 *)blk->work->midstate; cl_kernel *kernel = &clState->kernel; unsigned int num = 0; cl_int status = 0; - int i; + clState->cldata = blk->work->data; status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); CL_SET_ARG(clState->CLbuffer0); @@ -1349,13 +1349,6 @@ static bool opencl_thread_init(struct thr_info *thr) return false; } -#ifdef USE_SCRYPT - if (opt_scrypt) { - if (clState->padbufsize > BUFFERSIZE) - blank_res = realloc(blank_res, clState->padbufsize); - status = clEnqueueWriteBuffer(clState->commandQueue, clState->padbuffer8, true, 0, clState->padbufsize, blank_res, 0, NULL,NULL); - } -#endif status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, BUFFERSIZE, 
blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { @@ -1385,7 +1378,12 @@ static void opencl_free_work(struct thr_info *thr, struct work *work) static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work) { - precalc_hash(&work->blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64)); +#ifdef USE_SCRYPT + if (opt_scrypt) + work->blk.work = work; + else +#endif + precalc_hash(&work->blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64)); return true; } @@ -1446,9 +1444,6 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, if (hashes > gpu->max_hashes) gpu->max_hashes = hashes; -#ifdef USE_SCRYPT - clState->cldata = work->data; -#endif status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); diff --git a/findnonce.c b/findnonce.c index ce282dc3..d0e19176 100644 --- a/findnonce.c +++ b/findnonce.c @@ -45,7 +45,8 @@ const uint32_t SHA256_K[64] = { d = d + h; \ h = h + (rotate(a, 30) ^ rotate(a, 19) ^ rotate(a, 10)) + ((a & b) | (c & (a | b))) -void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { +void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) +{ cl_uint A, B, C, D, E, F, G, H; A = state[0]; @@ -127,10 +128,6 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { blk->fiveA = blk->ctx_f + SHA256_K[5]; blk->sixA = blk->ctx_g + SHA256_K[6]; blk->sevenA = blk->ctx_h + SHA256_K[7]; - -#ifdef USE_SCRYPT - blk->midstate = (unsigned char *)state; -#endif } #define P(t) (W[(t)&0xF] = W[(t-16)&0xF] + (rotate(W[(t-15)&0xF], 25) ^ rotate(W[(t-15)&0xF], 14) ^ (W[(t-15)&0xF] >> 3)) + W[(t-7)&0xF] + (rotate(W[(t-2)&0xF], 15) ^ rotate(W[(t-2)&0xF], 13) ^ (W[(t-2)&0xF] >> 10))) diff --git a/miner.h b/miner.h index da62dcb6..5cc683ce 100644 --- a/miner.h +++ b/miner.h @@ -671,7 +671,7 @@ typedef struct { cl_uint zeroA, 
zeroB; cl_uint oneA, twoA, threeA, fourA, fiveA, sixA, sevenA; #ifdef USE_SCRYPT - unsigned char *midstate; + struct work *work; #endif } dev_blk_ctx; #else From 5cd4bbd21cfd81bc14ae4182af9c91581466f282 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 13:18:43 +1000 Subject: [PATCH 032/178] Get rid of stuff. --- cgminer.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cgminer.c b/cgminer.c index 14183ec4..7953baaf 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3986,12 +3986,7 @@ bool test_nonce(struct work *work, uint32_t nonce) uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12); *work_nonce = htobe32(nonce); -#if 0 - work->data[64 + 12 + 0] = (nonce >> 0) & 0xff; - work->data[64 + 12 + 1] = (nonce >> 8) & 0xff; - work->data[64 + 12 + 2] = (nonce >> 16) & 0xff; - work->data[64 + 12 + 3] = (nonce >> 24) & 0xff; -#endif + if (opt_scrypt) return true; From 76c37891ea5267ce71809123c1af03247000c3f5 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 13:19:42 +1000 Subject: [PATCH 033/178] Handle KL_SCRYPT in config write. --- cgminer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cgminer.c b/cgminer.c index 7953baaf..990a397c 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2851,6 +2851,11 @@ void write_config(FILE *fcfg) case KL_DIABLO: fprintf(fcfg, "diablo"); break; +#ifdef USE_SCRYPT + case KL_SCRYPT: + fprintf(fcfg, "scrypt"); + break; +#endif } } #ifdef HAVE_ADL From 428d5e5d4d601e4d52d529938cfc9d65965ee338 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 13:22:35 +1000 Subject: [PATCH 034/178] Limit scrypt to 1 vector. 
--- ocl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocl.c b/ocl.c index 675a31c6..16f23f0a 100644 --- a/ocl.c +++ b/ocl.c @@ -420,6 +420,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) case KL_SCRYPT: strcpy(filename, SCRYPT_KERNNAME".cl"); strcpy(binaryfilename, SCRYPT_KERNNAME); + /* Scrypt only supports vector 1 */ + gpus[gpu].vwidth = 1; break; case KL_NONE: /* Shouldn't happen */ case KL_DIABLO: From 861f4329b2cae683eaa9d10ab3ac26971ce3d7ba Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 13:39:30 +1000 Subject: [PATCH 035/178] Fix external scrypt algo missing. --- driver-cpu.h | 1 + 1 file changed, 1 insertion(+) diff --git a/driver-cpu.h b/driver-cpu.h index 3cf268b2..e4b44527 100644 --- a/driver-cpu.h +++ b/driver-cpu.h @@ -60,5 +60,6 @@ extern void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo); extern char *force_nthreads_int(const char *arg, int *i); extern void init_max_name_len(); extern double bench_algo_stage3(enum sha256_algos algo); +extern void set_scrypt_algo(enum sha256_algos *algo); #endif /* __DEVICE_CPU_H__ */ From f99ac0ca7813a68d610e9449459d1236662f4647 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 15 Jul 2012 13:31:03 +1000 Subject: [PATCH 036/178] Allow more platforms to be probed if first does not return GPUs. 
--- ocl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocl.c b/ocl.c index 464cb4e1..5dfef36d 100644 --- a/ocl.c +++ b/ocl.c @@ -118,6 +118,8 @@ int clDevicesNum(void) { status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); + if (i < numPlatforms - 1) + continue; return -1; } applog(LOG_INFO, "Platform %d devices: %d", i, numDevices); From ffd21f8db31952d0ec1f01701e1d1061027fa63a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 15 Jul 2012 13:40:11 +1000 Subject: [PATCH 037/178] Find the gpu platform with the most devices and use that if no platform option is passed. --- ocl.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ocl.c b/ocl.c index 5dfef36d..f4c763ca 100644 --- a/ocl.c +++ b/ocl.c @@ -33,7 +33,7 @@ #include "findnonce.h" #include "ocl.h" -int opt_platform_id; +int opt_platform_id = -1; char *file_contents(const char *filename, int *length) { @@ -80,7 +80,7 @@ int clDevicesNum(void) { cl_uint numPlatforms; cl_platform_id *platforms; cl_platform_id platform = NULL; - unsigned int most_devices = 0, i; + unsigned int most_devices = 0, i, mdplatform; status = clGetPlatformIDs(0, NULL, &numPlatforms); /* If this fails, assume no GPUs. */ @@ -123,8 +123,10 @@ int clDevicesNum(void) { return -1; } applog(LOG_INFO, "Platform %d devices: %d", i, numDevices); - if (numDevices > most_devices) + if (numDevices > most_devices) { most_devices = numDevices; + mdplatform = i; + } if (numDevices) { unsigned int j; char pbuff[256]; @@ -139,6 +141,9 @@ int clDevicesNum(void) { } } + if (opt_platform_id < 0) + opt_platform_id = mdplatform;; + return most_devices; } From 07292f73a1d9ab71c01d71150d15e4a4df56dc85 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 17:05:08 +1000 Subject: [PATCH 038/178] Initialise mdplatform. 
--- ocl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocl.c b/ocl.c index f4c763ca..c4480af8 100644 --- a/ocl.c +++ b/ocl.c @@ -80,7 +80,7 @@ int clDevicesNum(void) { cl_uint numPlatforms; cl_platform_id *platforms; cl_platform_id platform = NULL; - unsigned int most_devices = 0, i, mdplatform; + unsigned int most_devices = 0, i, mdplatform = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); /* If this fails, assume no GPUs. */ From 471daecb5f9ab39d2cc3820525d1904d03c5dc0c Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 17:05:08 +1000 Subject: [PATCH 039/178] Initialise mdplatform. --- ocl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocl.c b/ocl.c index 16f23f0a..5dcced71 100644 --- a/ocl.c +++ b/ocl.c @@ -80,7 +80,7 @@ int clDevicesNum(void) { cl_uint numPlatforms; cl_platform_id *platforms; cl_platform_id platform = NULL; - unsigned int most_devices = 0, i, mdplatform; + unsigned int most_devices = 0, i, mdplatform = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); /* If this fails, assume no GPUs. */ From 77b7ed4b9ea50927e4a900d442b9fc94f929abd1 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 20:25:16 +1000 Subject: [PATCH 040/178] Debug output per thread hashrate is out by a factor of 1000. 
--- cgminer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index 990a397c..52e92f58 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3423,8 +3423,8 @@ static void hashmeter(int thr_id, struct timeval *diff, double thread_rolling = 0.0; int i; - applog(LOG_DEBUG, "[thread %d: %llu hashes, %.0f khash/sec]", - thr_id, hashes_done, hashes_done / secs); + applog(LOG_DEBUG, "[thread %d: %llu hashes, %.1f khash/sec]", + thr_id, hashes_done, hashes_done / 1000 / secs); /* Rolling average for each thread and each device */ decay_time(&thr->rolling, local_mhashes / secs); From 4abecc26745c8fb473b6df67ac55f85e95a85275 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 22:03:43 +1000 Subject: [PATCH 041/178] Detach pthread from within the api thread in case it is terminated due to not being instantiated before pthread_cancel is called from main, leading to a segfault. --- cgminer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index 55d3943b..c225d317 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3349,6 +3349,7 @@ static void *api_thread(void *userdata) { struct thr_info *mythr = userdata; + pthread_detach(pthread_self()); pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); api(api_thr_id); @@ -5522,8 +5523,6 @@ begin_bench: thr = &thr_info[api_thr_id]; if (thr_info_create(thr, NULL, api_thread, thr)) quit(1, "API thread create failed"); - pthread_detach(thr->pth); - #ifdef HAVE_CURSES /* Create curses input thread for keyboard input. Create this last so From dd25454594a5c23202161e09549038dc9275f6f6 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 16 Jul 2012 22:03:43 +1000 Subject: [PATCH 042/178] Detach pthread from within the api thread in case it is terminated due to not being instantiated before pthread_cancel is called from main, leading to a segfault. 
--- cgminer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index 52e92f58..df9feee7 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3370,6 +3370,7 @@ static void *api_thread(void *userdata) { struct thr_info *mythr = userdata; + pthread_detach(pthread_self()); pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL); api(api_thr_id); @@ -5558,8 +5559,6 @@ begin_bench: thr = &thr_info[api_thr_id]; if (thr_info_create(thr, NULL, api_thread, thr)) quit(1, "API thread create failed"); - pthread_detach(thr->pth); - #ifdef HAVE_CURSES /* Create curses input thread for keyboard input. Create this last so From e17ee1e8943df0054ce426a5ae687642c2d048f5 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 18 Jul 2012 21:57:57 +1000 Subject: [PATCH 043/178] Revert "Use uint16 in SHA256 in scrypt kernel." This reverts commit 46592a24f4d91991f3302e0b39bfc10cfe01255a. --- scrypt120713.cl | 258 ++++++++++++++++++++++++------------------------ 1 file changed, 129 insertions(+), 129 deletions(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index 900ccce5..69e2db8a 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -31,187 +31,187 @@ void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, con #define G S1.z #define H S1.w - uint16 W; + uint4 W[4]; - W.s0 = block0.x; - RND(A,B,C,D,E,F,G,H, W.s0+0x428a2f98U); - W.s1 = block0.y; - RND(H,A,B,C,D,E,F,G, W.s1+0x71374491U); - W.s2 = block0.z; - RND(G,H,A,B,C,D,E,F, W.s2+0xb5c0fbcfU); - W.s3 = block0.w; - RND(F,G,H,A,B,C,D,E, W.s3+0xe9b5dba5U); + W[ 0].x = block0.x; + RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U); + W[ 0].y = block0.y; + RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U); + W[ 0].z = block0.z; + RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU); + W[ 0].w = block0.w; + RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U); - W.s4 = block1.x; - RND(E,F,G,H,A,B,C,D, W.s4+0x3956c25bU); - W.s5 = block1.y; - RND(D,E,F,G,H,A,B,C, W.s5+0x59f111f1U); - W.s6 = block1.z; - RND(C,D,E,F,G,H,A,B, 
W.s6+0x923f82a4U); - W.s7 = block1.w; - RND(B,C,D,E,F,G,H,A, W.s7+0xab1c5ed5U); + W[ 1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); + W[ 1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); + W[ 1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); + W[ 1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); - W.s8 = block2.x; - RND(A,B,C,D,E,F,G,H, W.s8+0xd807aa98U); - W.s9 = block2.y; - RND(H,A,B,C,D,E,F,G, W.s9+0x12835b01U); - W.sa = block2.z; - RND(G,H,A,B,C,D,E,F, W.sa+0x243185beU); - W.sb = block2.w; - RND(F,G,H,A,B,C,D,E, W.sb+0x550c7dc3U); + W[ 2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); + W[ 2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); + W[ 2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); + W[ 2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); - W.sc = block3.x; - RND(E,F,G,H,A,B,C,D, W.sc+0x72be5d74U); - W.sd = block3.y; - RND(D,E,F,G,H,A,B,C, W.sd+0x80deb1feU); - W.se = block3.z; - RND(C,D,E,F,G,H,A,B, W.se+0x9bdc06a7U); - W.sf = block3.w; - RND(B,C,D,E,F,G,H,A, W.sf+0xc19bf174U); + W[ 3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); + W[ 3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); + W[ 3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); + W[ 3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); - W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); - RND(A,B,C,D,E,F,G,H, W.s0+0xe49b69c1U); + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); - W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); - RND(H,A,B,C,D,E,F,G, W.s1+0xefbe4786U); + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); - W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); - RND(G,H,A,B,C,D,E,F, W.s2+0x0fc19dc6U); + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); - W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); - RND(F,G,H,A,B,C,D,E, W.s3+0x240ca1ccU); + W[ 0].w += Wr1(W[ 
0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); - W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); - RND(E,F,G,H,A,B,C,D, W.s4+0x2de92c6fU); + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); - W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); - RND(D,E,F,G,H,A,B,C, W.s5+0x4a7484aaU); + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); - W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); - RND(C,D,E,F,G,H,A,B, W.s6+0x5cb0a9dcU); + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); - W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); - RND(B,C,D,E,F,G,H,A, W.s7+0x76f988daU); + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); - W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); - RND(A,B,C,D,E,F,G,H, W.s8+0x983e5152U); + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); - W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); - RND(H,A,B,C,D,E,F,G, W.s9+0xa831c66dU); + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); - W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); - RND(G,H,A,B,C,D,E,F, W.sa+0xb00327c8U); + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); - W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); - RND(F,G,H,A,B,C,D,E, W.sb+0xbf597fc7U); + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); - W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); - RND(E,F,G,H,A,B,C,D, W.sc+0xc6e00bf3U); + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); - W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); - RND(D,E,F,G,H,A,B,C, W.sd+0xd5a79147U); + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); - W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); - RND(C,D,E,F,G,H,A,B, W.se+0x06ca6351U); + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + 
RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); - W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); - RND(B,C,D,E,F,G,H,A, W.sf+0x14292967U); + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); - W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); - RND(A,B,C,D,E,F,G,H, W.s0+0x27b70a85U); + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); - W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); - RND(H,A,B,C,D,E,F,G, W.s1+0x2e1b2138U); + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); - W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); - RND(G,H,A,B,C,D,E,F, W.s2+0x4d2c6dfcU); + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); - W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); - RND(F,G,H,A,B,C,D,E, W.s3+0x53380d13U); + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); - W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); - RND(E,F,G,H,A,B,C,D, W.s4+0x650a7354U); + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); - W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); - RND(D,E,F,G,H,A,B,C, W.s5+0x766a0abbU); + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); - W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); - RND(C,D,E,F,G,H,A,B, W.s6+0x81c2c92eU); + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); - W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); - RND(B,C,D,E,F,G,H,A, W.s7+0x92722c85U); + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); - W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); - RND(A,B,C,D,E,F,G,H, W.s8+0xa2bfe8a1U); + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); - W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); - RND(H,A,B,C,D,E,F,G, W.s9+0xa81a664bU); + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); 
- W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); - RND(G,H,A,B,C,D,E,F, W.sa+0xc24b8b70U); + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); - W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); - RND(F,G,H,A,B,C,D,E, W.sb+0xc76c51a3U); + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); - W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); - RND(E,F,G,H,A,B,C,D, W.sc+0xd192e819U); + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); - W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); - RND(D,E,F,G,H,A,B,C, W.sd+0xd6990624U); + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); - W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); - RND(C,D,E,F,G,H,A,B, W.se+0xf40e3585U); + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); - W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); - RND(B,C,D,E,F,G,H,A, W.sf+0x106aa070U); + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); - W.s0 += Wr1(W.se) + W.s9 + Wr2(W.s1); - RND(A,B,C,D,E,F,G,H, W.s0+0x19a4c116U); + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); - W.s1 += Wr1(W.sf) + W.sa + Wr2(W.s2); - RND(H,A,B,C,D,E,F,G, W.s1+0x1e376c08U); + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); - W.s2 += Wr1(W.s0) + W.sb + Wr2(W.s3); - RND(G,H,A,B,C,D,E,F, W.s2+0x2748774cU); + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); - W.s3 += Wr1(W.s1) + W.sc + Wr2(W.s4); - RND(F,G,H,A,B,C,D,E, W.s3+0x34b0bcb5U); + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); - W.s4 += Wr1(W.s2) + W.sd + Wr2(W.s5); - RND(E,F,G,H,A,B,C,D, W.s4+0x391c0cb3U); + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); - W.s5 += Wr1(W.s3) + W.se + Wr2(W.s6); - 
RND(D,E,F,G,H,A,B,C, W.s5+0x4ed8aa4aU); + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); - W.s6 += Wr1(W.s4) + W.sf + Wr2(W.s7); - RND(C,D,E,F,G,H,A,B, W.s6+0x5b9cca4fU); + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); - W.s7 += Wr1(W.s5) + W.s0 + Wr2(W.s8); - RND(B,C,D,E,F,G,H,A, W.s7+0x682e6ff3U); + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); - W.s8 += Wr1(W.s6) + W.s1 + Wr2(W.s9); - RND(A,B,C,D,E,F,G,H, W.s8+0x748f82eeU); + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); - W.s9 += Wr1(W.s7) + W.s2 + Wr2(W.sa); - RND(H,A,B,C,D,E,F,G, W.s9+0x78a5636fU); + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); - W.sa += Wr1(W.s8) + W.s3 + Wr2(W.sb); - RND(G,H,A,B,C,D,E,F, W.sa+0x84c87814U); + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); - W.sb += Wr1(W.s9) + W.s4 + Wr2(W.sc); - RND(F,G,H,A,B,C,D,E, W.sb+0x8cc70208U); + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); - W.sc += Wr1(W.sa) + W.s5 + Wr2(W.sd); - RND(E,F,G,H,A,B,C,D, W.sc+0x90befffaU); + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); - W.sd += Wr1(W.sb) + W.s6 + Wr2(W.se); - RND(D,E,F,G,H,A,B,C, W.sd+0xa4506cebU); + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); - W.se += Wr1(W.sc) + W.s7 + Wr2(W.sf); - RND(C,D,E,F,G,H,A,B, W.se+0xbef9a3f7U); + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); - W.sf += Wr1(W.sd) + W.s8 + Wr2(W.s0); - RND(B,C,D,E,F,G,H,A, W.sf+0xc67178f2U); + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); #undef A #undef B From 3e61db105d5caa73195555da5c18d3413fef4365 Mon Sep 17 00:00:00 
2001 From: Con Kolivas Date: Wed, 18 Jul 2012 21:58:27 +1000 Subject: [PATCH 044/178] Create command queue before compiling program in opencl. --- ocl.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ocl.c b/ocl.c index 5dcced71..feb6aa2c 100644 --- a/ocl.c +++ b/ocl.c @@ -309,6 +309,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) return NULL; } + ///////////////////////////////////////////////////////////////// + // Create an OpenCL command queue + ///////////////////////////////////////////////////////////////// + clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); + if (status != CL_SUCCESS) /* Try again without OOE enable */ + clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status); + return NULL; + } + /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = malloc(1024); @@ -739,18 +751,6 @@ built: return NULL; } - ///////////////////////////////////////////////////////////////// - // Create an OpenCL command queue - ///////////////////////////////////////////////////////////////// - clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); - if (status != CL_SUCCESS) /* Try again without OOE enable */ - clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); - if (status != CL_SUCCESS) { - applog(LOG_ERR, "Error %d: Creating Command Queue. 
(clCreateCommandQueue)", status); - return NULL; - } - #ifdef USE_SCRYPT if (opt_scrypt) { size_t ipt = (1024 / clState->lookup_gap + (1024 % clState->lookup_gap > 0)); From d72add9af350f62fc17b7d8119812247a3c70561 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 20 Jul 2012 16:16:18 +1000 Subject: [PATCH 045/178] Send correct values to scrypt kernel to get it finally working. --- cgminer.c | 16 ++++++++++++---- driver-opencl.c | 15 ++++----------- findnonce.c | 9 ++++++--- ocl.c | 4 ++-- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/cgminer.c b/cgminer.c index df9feee7..f2ea3826 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1711,8 +1711,13 @@ static bool submit_upstream_work(const struct work *work, CURL *curl) if (!QUIET) { hash32 = (uint32_t *)(work->hash); - sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]), - work->block? " BLOCK!" : ""); + if (opt_scrypt) { + sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[7]), (unsigned long)(hash32[6]), + work->block? " BLOCK!" : ""); + } else { + sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]), + work->block? " BLOCK!" 
: ""); + } } /* Theoretically threads could race when modifying accepted and @@ -3991,10 +3996,13 @@ bool test_nonce(struct work *work, uint32_t nonce) { uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12); + if (opt_scrypt) { + *work_nonce = nonce; + return true; + } + *work_nonce = htobe32(nonce); - if (opt_scrypt) - return true; return hashtest(work); } diff --git a/driver-opencl.c b/driver-opencl.c index 4ddf3365..3ec60dd3 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -995,7 +995,7 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t #ifdef USE_SCRYPT static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) { - cl_uint4 *midstate = (cl_uint4 *)blk->work->midstate; + char *midstate = blk->work->midstate; cl_kernel *kernel = &clState->kernel; unsigned int num = 0; cl_int status = 0; @@ -1006,16 +1006,9 @@ static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_u CL_SET_ARG(clState->CLbuffer0); CL_SET_ARG(clState->outputBuffer); CL_SET_ARG(clState->padbuffer8); - CL_SET_ARG(midstate[0]); - CL_SET_ARG(midstate[16]); - -#if 0 - clSetKernelArg(clState->kernel,0,sizeof(cl_mem), &clState->CLbuffer[0]); - clSetKernelArg(clState->kernel,1,sizeof(cl_mem), &clState->CLbuffer[1]); - clSetKernelArg(clState->kernel,2,sizeof(cl_mem), &clState->padbuffer8); - clSetKernelArg(clState->kernel,3,sizeof(cl_uint4), &midstate[0]); - clSetKernelArg(clState->kernel,4,sizeof(cl_uint4), &midstate[16]); -#endif + CL_SET_VARG(4, &midstate[0]); + CL_SET_VARG(4, &midstate[16]); + return status; } #endif diff --git a/findnonce.c b/findnonce.c index d0e19176..a11333a1 100644 --- a/findnonce.c +++ b/findnonce.c @@ -229,13 +229,16 @@ static void *postcalc_hash(void *userdata) pthread_detach(pthread_self()); for (entry = 0; entry < FOUND; entry++) { - if (pcd->res[entry]) { + uint32_t nonce = pcd->res[entry]; + + if (nonce) { + applog(LOG_DEBUG, "OCL NONCE %u", nonce); #ifdef 
USE_SCRYPT if (opt_scrypt) - submit_nonce(thr, pcd->work, pcd->res[entry]); + submit_nonce(thr, pcd->work, nonce); else #endif - send_nonce(pcd, pcd->res[entry]); + send_nonce(pcd, nonce); nonces++; } } diff --git a/ocl.c b/ocl.c index feb6aa2c..1c6d98ee 100644 --- a/ocl.c +++ b/ocl.c @@ -487,8 +487,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) #ifdef USE_SCRYPT if (opt_scrypt) { - clState->lookup_gap = 1; - clState->thread_concurrency = 1; + clState->lookup_gap = 2; + clState->thread_concurrency = 6144; } #endif From 8dc0d6e4856e2a3dbabc7a6b3b6d2242cec48f2a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 20 Jul 2012 22:07:20 +1000 Subject: [PATCH 046/178] Constify input variable in scrypt kernel. --- scrypt120713.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index 69e2db8a..695eaf3d 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -689,7 +689,7 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup) #define NFLAG (0x7F) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search(__global uint4*restrict input, __global uint*restrict output, __global uint4*restrict padcache, uint4 pad0, uint4 pad1) +__kernel void search(__global const uint4 * restrict input, __global uint*restrict output, __global uint4*restrict padcache, uint4 pad0, uint4 pad1) { uint gid = get_global_id(0); uint4 X[8]; From 537b28d53e90e24953722ec8502f71ca0367c4cd Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 20 Jul 2012 22:10:33 +1000 Subject: [PATCH 047/178] Make pad0 and pad1 local variable in scrypt kernel. 
--- scrypt120713.cl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index 695eaf3d..f7c1a6ce 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -689,12 +689,13 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup) #define NFLAG (0x7F) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search(__global const uint4 * restrict input, __global uint*restrict output, __global uint4*restrict padcache, uint4 pad0, uint4 pad1) +__kernel void search(__global const uint4 * restrict input, __global uint*restrict output, __global uint4*restrict padcache, const uint4 midstate0, const uint4 midstate16) { uint gid = get_global_id(0); uint4 X[8]; uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1; uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid); + uint4 pad0 = midstate0, pad1 = midstate16; SHA256(&pad0,&pad1, data, (uint4)(0x80000000U,0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x280)); SHA256_fresh(&ostate0,&ostate1, pad0^0x5C5C5C5CU, pad1^0x5C5C5C5CU, 0x5C5C5C5CU, 0x5C5C5C5CU); From 808f403a14267263066b0c257c262a1be6ef47ca Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 20 Jul 2012 23:44:43 +1000 Subject: [PATCH 048/178] Use cpu scrypt code to check if an scrypt share is below target before submitting it. 
--- Makefile.am | 8 ++++---- cgminer.c | 2 +- findnonce.c | 12 +++++++++--- scrypt.c | 16 +++++++++++++++- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/Makefile.am b/Makefile.am index a784ef91..1473375e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -45,6 +45,10 @@ cgminer_SOURCES += ocl.c ocl.h findnonce.c findnonce.h cgminer_SOURCES += adl.c adl.h adl_functions.h cgminer_SOURCES += *.cl +if HAS_SCRYPT +cgminer_SOURCES += scrypt.c +endif + if HAS_CPUMINE # original CPU related sources, unchanged cgminer_SOURCES += \ @@ -56,10 +60,6 @@ cgminer_SOURCES += \ # the CPU portion extracted from original main.c cgminer_SOURCES += driver-cpu.h driver-cpu.c -if HAS_SCRYPT -cgminer_SOURCES += scrypt.c -endif - if HAS_YASM AM_CFLAGS = -DHAS_YASM if HAVE_x86_64 diff --git a/cgminer.c b/cgminer.c index f2ea3826..48010128 100644 --- a/cgminer.c +++ b/cgminer.c @@ -4012,7 +4012,7 @@ bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce) /* Do one last check before attempting to submit the work */ /* Side effect: sets work->data for us */ if (!test_nonce(work, nonce)) { - applog(LOG_INFO, "Share below target"); + applog(LOG_INFO, "Pool %d share below target", work->pool->pool_no); return true; } return submit_work_sync(thr, work); diff --git a/findnonce.c b/findnonce.c index a11333a1..92e48812 100644 --- a/findnonce.c +++ b/findnonce.c @@ -220,6 +220,8 @@ static void send_nonce(struct pc_data *pcd, cl_uint nonce) } } +extern bool scrypt_scan_nonce(unsigned char *pdata, uint32_t nonce); + static void *postcalc_hash(void *userdata) { struct pc_data *pcd = (struct pc_data *)userdata; @@ -234,9 +236,13 @@ static void *postcalc_hash(void *userdata) if (nonce) { applog(LOG_DEBUG, "OCL NONCE %u", nonce); #ifdef USE_SCRYPT - if (opt_scrypt) - submit_nonce(thr, pcd->work, nonce); - else + if (opt_scrypt) { + struct work *work = pcd->work; + if (scrypt_scan_nonce(work, nonce)) + submit_nonce(thr, work, nonce); + else + applog(LOG_INFO, "Pool %d share 
below target", work->pool->pool_no); + } else #endif send_nonce(pcd, nonce); nonces++; diff --git a/scrypt.c b/scrypt.c index 4334bcf0..14378d6d 100644 --- a/scrypt.c +++ b/scrypt.c @@ -407,6 +407,20 @@ static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad) return PBKDF2_SHA256_80_128_32(input, X); } +bool scrypt_scan_nonce(struct work *work, uint32_t nonce) +{ + uint32_t Htarg, tmp_hash7, data[20]; + unsigned char *scratchbuf; + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(nonce); + + Htarg = ((const uint32_t *)work->target)[7]; + scratchbuf = alloca(131584); + tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); + return (tmp_hash7 <= Htarg); +} + bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsigned char *pdata, unsigned char *phash1, unsigned char *phash, const unsigned char *ptarget, @@ -423,7 +437,7 @@ bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsig be32enc_vect(data, (const uint32_t *)pdata, 19); - scratchbuf = malloc(131583); + scratchbuf = malloc(131584); if (unlikely(!scratchbuf)) { applog(LOG_ERR, "Failed to malloc scratchbuf in scanhash_scrypt"); return ret; From d9eba43a4234d17380b63f7a4626e33f110abd7d Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 20 Jul 2012 23:49:43 +1000 Subject: [PATCH 049/178] Fix build. --- findnonce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/findnonce.c b/findnonce.c index 92e48812..3864fba8 100644 --- a/findnonce.c +++ b/findnonce.c @@ -220,7 +220,7 @@ static void send_nonce(struct pc_data *pcd, cl_uint nonce) } } -extern bool scrypt_scan_nonce(unsigned char *pdata, uint32_t nonce); +extern bool scrypt_scan_nonce(struct work *work, uint32_t nonce); static void *postcalc_hash(void *userdata) { From d13a3f1d501f03e65d9c694e72f582075ed0ce63 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 02:47:27 +1000 Subject: [PATCH 050/178] Decrease lookup gap to 1. 
Does not seem to help in any way being 2. --- ocl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocl.c b/ocl.c index 1c6d98ee..13ac5e2e 100644 --- a/ocl.c +++ b/ocl.c @@ -487,7 +487,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) #ifdef USE_SCRYPT if (opt_scrypt) { - clState->lookup_gap = 2; + clState->lookup_gap = 1; clState->thread_concurrency = 6144; } #endif From 7d53fba1ad5cc84bf96764b48a92c1bf26b91b96 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 02:49:50 +1000 Subject: [PATCH 051/178] Reinstate GPU only opencl device detection. --- ocl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocl.c b/ocl.c index 13ac5e2e..8f70a395 100644 --- a/ocl.c +++ b/ocl.c @@ -115,7 +115,7 @@ int clDevicesNum(void) { status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform %d version: %s", i, pbuff); - status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); + status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); if (i < numPlatforms - 1) @@ -132,7 +132,7 @@ int clDevicesNum(void) { char pbuff[256]; cl_device_id *devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); - clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); for (j = 0; j < numDevices; j++) { clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); applog(LOG_INFO, "\t%i\t%s", j, pbuff); @@ -255,7 +255,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform version: %s", vbuff); - status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); + status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, 
&numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); return NULL; @@ -266,7 +266,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) /* Now, get the device list data */ - status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); + status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status); return NULL; @@ -303,7 +303,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; - clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_ALL, NULL, NULL, &status); + clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status); return NULL; From a22edd2a7f42ff61a37daf6f334277dc1f2b8348 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 10:25:33 +1000 Subject: [PATCH 052/178] Test the target in the actual scrypt kernel itself saving further calculations. 
--- driver-opencl.c | 3 +++ scrypt120713.cl | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 3ec60dd3..cd2c9ab3 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -998,8 +998,10 @@ static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_u char *midstate = blk->work->midstate; cl_kernel *kernel = &clState->kernel; unsigned int num = 0; + cl_uint le_target; cl_int status = 0; + le_target = ~swab32((uint32_t)blk->work->target[7]); clState->cldata = blk->work->data; status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); @@ -1008,6 +1010,7 @@ static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_u CL_SET_ARG(clState->padbuffer8); CL_SET_VARG(4, &midstate[0]); CL_SET_VARG(4, &midstate[16]); + CL_SET_ARG(le_target); return status; } diff --git a/scrypt120713.cl b/scrypt120713.cl index f7c1a6ce..95b006e9 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -689,7 +689,9 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup) #define NFLAG (0x7F) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search(__global const uint4 * restrict input, __global uint*restrict output, __global uint4*restrict padcache, const uint4 midstate0, const uint4 midstate16) +__kernel void search(__global const uint4 * restrict input, +__global uint*restrict output, __global uint4*restrict padcache, +const uint4 midstate0, const uint4 midstate16, const uint target) { uint gid = get_global_id(0); uint4 X[8]; @@ -722,7 +724,7 @@ __kernel void search(__global const uint4 * restrict input, __global uint*restri SHA256_fixed(&tmp0,&tmp1); SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U)); - if ((ostate1.w&0xFFFF) == 0) + if (!(ostate1.w&target)) output[FOUND] = output[NFLAG & gid] = gid; } From b9e5f8e55065df4f050ab45f66a435b1ac54efb1 Mon Sep 17 
00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 10:28:41 +1000 Subject: [PATCH 053/178] Revert "Use cpu scrypt code to check if an scrypt share is below target before submitting it." This reverts commit 808f403a14267263066b0c257c262a1be6ef47ca. Conflicts: findnonce.c --- Makefile.am | 8 ++++---- cgminer.c | 2 +- findnonce.c | 12 +++--------- scrypt.c | 16 +--------------- 4 files changed, 9 insertions(+), 29 deletions(-) diff --git a/Makefile.am b/Makefile.am index 1473375e..a784ef91 100644 --- a/Makefile.am +++ b/Makefile.am @@ -45,10 +45,6 @@ cgminer_SOURCES += ocl.c ocl.h findnonce.c findnonce.h cgminer_SOURCES += adl.c adl.h adl_functions.h cgminer_SOURCES += *.cl -if HAS_SCRYPT -cgminer_SOURCES += scrypt.c -endif - if HAS_CPUMINE # original CPU related sources, unchanged cgminer_SOURCES += \ @@ -60,6 +56,10 @@ cgminer_SOURCES += \ # the CPU portion extracted from original main.c cgminer_SOURCES += driver-cpu.h driver-cpu.c +if HAS_SCRYPT +cgminer_SOURCES += scrypt.c +endif + if HAS_YASM AM_CFLAGS = -DHAS_YASM if HAVE_x86_64 diff --git a/cgminer.c b/cgminer.c index 48010128..f2ea3826 100644 --- a/cgminer.c +++ b/cgminer.c @@ -4012,7 +4012,7 @@ bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce) /* Do one last check before attempting to submit the work */ /* Side effect: sets work->data for us */ if (!test_nonce(work, nonce)) { - applog(LOG_INFO, "Pool %d share below target", work->pool->pool_no); + applog(LOG_INFO, "Share below target"); return true; } return submit_work_sync(thr, work); diff --git a/findnonce.c b/findnonce.c index 3864fba8..a11333a1 100644 --- a/findnonce.c +++ b/findnonce.c @@ -220,8 +220,6 @@ static void send_nonce(struct pc_data *pcd, cl_uint nonce) } } -extern bool scrypt_scan_nonce(struct work *work, uint32_t nonce); - static void *postcalc_hash(void *userdata) { struct pc_data *pcd = (struct pc_data *)userdata; @@ -236,13 +234,9 @@ static void *postcalc_hash(void *userdata) if (nonce) { applog(LOG_DEBUG, 
"OCL NONCE %u", nonce); #ifdef USE_SCRYPT - if (opt_scrypt) { - struct work *work = pcd->work; - if (scrypt_scan_nonce(work, nonce)) - submit_nonce(thr, work, nonce); - else - applog(LOG_INFO, "Pool %d share below target", work->pool->pool_no); - } else + if (opt_scrypt) + submit_nonce(thr, pcd->work, nonce); + else #endif send_nonce(pcd, nonce); nonces++; diff --git a/scrypt.c b/scrypt.c index 14378d6d..4334bcf0 100644 --- a/scrypt.c +++ b/scrypt.c @@ -407,20 +407,6 @@ static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad) return PBKDF2_SHA256_80_128_32(input, X); } -bool scrypt_scan_nonce(struct work *work, uint32_t nonce) -{ - uint32_t Htarg, tmp_hash7, data[20]; - unsigned char *scratchbuf; - - be32enc_vect(data, (const uint32_t *)work->data, 19); - data[19] = htobe32(nonce); - - Htarg = ((const uint32_t *)work->target)[7]; - scratchbuf = alloca(131584); - tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); - return (tmp_hash7 <= Htarg); -} - bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsigned char *pdata, unsigned char *phash1, unsigned char *phash, const unsigned char *ptarget, @@ -437,7 +423,7 @@ bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsig be32enc_vect(data, (const uint32_t *)pdata, 19); - scratchbuf = malloc(131584); + scratchbuf = malloc(131583); if (unlikely(!scratchbuf)) { applog(LOG_ERR, "Failed to malloc scratchbuf in scanhash_scrypt"); return ret; From 1b5c676de7d9b64445758555465cc1b892452e45 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 11:00:36 +1000 Subject: [PATCH 054/178] Use 256 output slots for kernels to allow 1 for each worksize. 
--- diablo120328.cl | 4 ++-- diakgcn120427.cl | 4 ++-- findnonce.h | 6 +++--- phatk120223.cl | 4 ++-- poclbm120327.cl | 4 ++-- scrypt120713.cl | 4 ++-- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/diablo120328.cl b/diablo120328.cl index e9d2d87f..4b64c300 100644 --- a/diablo120328.cl +++ b/diablo120328.cl @@ -1242,8 +1242,8 @@ void search( ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]); -#define FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) #if defined(VECTORS4) bool result = any(ZA[924] == 0x136032EDU); diff --git a/diakgcn120427.cl b/diakgcn120427.cl index 37d51c51..7dd73fb9 100644 --- a/diakgcn120427.cl +++ b/diakgcn120427.cl @@ -571,8 +571,8 @@ __kernel V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]); -#define FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) #ifdef VECTORS4 if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU)) diff --git a/findnonce.h b/findnonce.h index 5b93c15c..ce69569e 100644 --- a/findnonce.h +++ b/findnonce.h @@ -4,10 +4,10 @@ #include "config.h" #define MAXTHREADS (0xFFFFFFFEULL) -#define MAXBUFFERS (0xFF) +#define MAXBUFFERS (0xFFF) #define BUFFERSIZE (sizeof(uint32_t) * MAXBUFFERS) -#define FOUND (0x80) -/* #define NFLAG (0x7F) Just for reference */ +#define FOUND (0x800) +/* #define NFLAG (0x7FF) Just for reference */ #ifdef HAVE_OPENCL extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); diff --git a/phatk120223.cl b/phatk120223.cl index 7d1c3200..0f604436 100644 --- a/phatk120223.cl +++ b/phatk120223.cl @@ -387,8 +387,8 @@ void search( const uint state0, const uint state1, const uint state2, const uint W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) - (-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64))); -#define 
FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) #ifdef VECTORS4 bool result = W[117].x & W[117].y & W[117].z & W[117].w; diff --git a/poclbm120327.cl b/poclbm120327.cl index 72491a26..3e8b9943 100644 --- a/poclbm120327.cl +++ b/poclbm120327.cl @@ -1311,8 +1311,8 @@ Vals[1]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); Vals[1]+=K[59]; Vals[1]+=Vals[5]; -#define FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) #if defined(VECTORS2) || defined(VECTORS4) Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); diff --git a/scrypt120713.cl b/scrypt120713.cl index 95b006e9..7aaaa198 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -685,8 +685,8 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup) unshittify(X); } -#define FOUND (0x80) -#define NFLAG (0x7F) +#define FOUND (0x800) +#define NFLAG (0x7FF) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global const uint4 * restrict input, From e45ebb62a90e860db96926e1055ab523cde1ee6f Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 13:30:32 +1000 Subject: [PATCH 055/178] Correct target value passed to scrypt kernel. 
--- driver-opencl.c | 2 +- scrypt120713.cl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index cd2c9ab3..1eaf758a 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1001,7 +1001,7 @@ static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_u cl_uint le_target; cl_int status = 0; - le_target = ~swab32((uint32_t)blk->work->target[7]); + le_target = ~swab32(*(cl_uint *)(blk->work->target + 28)); clState->cldata = blk->work->data; status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); diff --git a/scrypt120713.cl b/scrypt120713.cl index 7aaaa198..0f8db652 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -724,7 +724,7 @@ const uint4 midstate0, const uint4 midstate16, const uint target) SHA256_fixed(&tmp0,&tmp1); SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U)); - if (!(ostate1.w&target)) + if (!(ostate1.w & target)) output[FOUND] = output[NFLAG & gid] = gid; } From 04f55a0e66f169d57d950383e4ea5dfe7fddd51f Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 13:56:54 +1000 Subject: [PATCH 056/178] Change the scale of intensity for scrypt kernel and fix a build warning. 
--- driver-opencl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 1eaf758a..c578a98c 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -995,7 +995,7 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t #ifdef USE_SCRYPT static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) { - char *midstate = blk->work->midstate; + unsigned char *midstate = blk->work->midstate; cl_kernel *kernel = &clState->kernel; unsigned int num = 0; cl_uint le_target; @@ -1020,7 +1020,7 @@ static void set_threads_hashes(unsigned int vectors, unsigned int *threads, int64_t *hashes, size_t *globalThreads, unsigned int minthreads, int intensity) { - *threads = 1 << (15 + intensity); + *threads = 1 << ((opt_scrypt ? 0 : 15) + intensity); if (*threads < minthreads) *threads = minthreads; *globalThreads = *threads; From ffe1318f7080db83c6e53d3560ee876fe1f57409 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 16:05:50 +1000 Subject: [PATCH 057/178] Ignore negative intensities for scrypt. --- driver-opencl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/driver-opencl.c b/driver-opencl.c index c578a98c..e44faeff 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1020,7 +1020,12 @@ static void set_threads_hashes(unsigned int vectors, unsigned int *threads, int64_t *hashes, size_t *globalThreads, unsigned int minthreads, int intensity) { - *threads = 1 << ((opt_scrypt ? 
0 : 15) + intensity); + if (opt_scrypt) { + if (intensity < 0) + intensity = 0; + *threads = 1 << intensity; + } else + *threads = 1 << (15 + intensity); if (*threads < minthreads) *threads = minthreads; *globalThreads = *threads; From 39f7d2fa74567773549df6a04358b05b994176cc Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 17:31:06 +1000 Subject: [PATCH 058/178] Allow lookup gap and thread concurrency to be passed per device and store details in kernel binary filename. --- cgminer.c | 10 ++++++++++ driver-opencl.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ driver-opencl.h | 4 ++++ miner.h | 4 ++++ ocl.c | 40 +++++++++++++++++++++---------------- ocl.h | 2 -- 6 files changed, 93 insertions(+), 19 deletions(-) diff --git a/cgminer.c b/cgminer.c index f2ea3826..b37814dc 100644 --- a/cgminer.c +++ b/cgminer.c @@ -853,6 +853,11 @@ static struct opt_table opt_config_table[] = { OPT_WITH_ARG("--gpu-vddc", set_gpu_vddc, NULL, NULL, "Set the GPU voltage in Volts - one value for all or separate by commas for per card"), +#endif +#ifdef USE_SCRYPT + OPT_WITH_ARG("--lookup-gap", + set_lookup_gap, NULL, NULL, + "Set GPU lookup gap for scrypt mining, comma separated"), #endif OPT_WITH_ARG("--intensity|-I", set_intensity, NULL, NULL, @@ -999,6 +1004,11 @@ static struct opt_table opt_config_table[] = { opt_hidden #endif ), +#ifdef USE_SCRYPT + OPT_WITH_ARG("--thread-concurrency", + set_thread_concurrency, NULL, NULL, + "Set GPU thread concurrency for scrypt mining, comma separated"), +#endif OPT_WITH_ARG("--url|-o", set_url, NULL, NULL, "URL for bitcoin JSON-RPC server"), diff --git a/driver-opencl.c b/driver-opencl.c index e44faeff..059a7ece 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -127,6 +127,58 @@ char *set_worksize(char *arg) return NULL; } +#ifdef USE_SCRYPT +char *set_lookup_gap(char *arg) +{ + int i, val = 0, device = 0; + char *nextptr; + + nextptr = strtok(arg, ","); + if (nextptr == NULL) + return "Invalid parameters for set 
lookup gap"; + val = atoi(nextptr); + + gpus[device++].lookup_gap = val; + + while ((nextptr = strtok(NULL, ",")) != NULL) { + val = atoi(nextptr); + + gpus[device++].lookup_gap = val; + } + if (device == 1) { + for (i = device; i < MAX_GPUDEVICES; i++) + gpus[i].lookup_gap = gpus[0].lookup_gap; + } + + return NULL; +} + +char *set_thread_concurrency(char *arg) +{ + int i, val = 0, device = 0; + char *nextptr; + + nextptr = strtok(arg, ","); + if (nextptr == NULL) + return "Invalid parameters for set thread concurrency"; + val = atoi(nextptr); + + gpus[device++].thread_concurrency = val; + + while ((nextptr = strtok(NULL, ",")) != NULL) { + val = atoi(nextptr); + + gpus[device++].thread_concurrency = val; + } + if (device == 1) { + for (i = device; i < MAX_GPUDEVICES; i++) + gpus[i].thread_concurrency = gpus[0].thread_concurrency; + } + + return NULL; +} +#endif + static enum cl_kernels select_kernel(char *arg) { if (!strcmp(arg, "diablo")) diff --git a/driver-opencl.h b/driver-opencl.h index 600bd854..f09571b9 100644 --- a/driver-opencl.h +++ b/driver-opencl.h @@ -18,6 +18,10 @@ extern char *set_temp_target(char *arg); extern char *set_intensity(char *arg); extern char *set_vector(char *arg); extern char *set_worksize(char *arg); +#ifdef USE_SCRYPT +extern char *set_lookup_gap(char *arg); +extern char *set_thread_concurrency(char *arg); +#endif extern char *set_kernel(char *arg); void manage_gpu(void); extern void pause_dynamic_threads(int gpu); diff --git a/miner.h b/miner.h index 5cc683ce..65c8fa14 100644 --- a/miner.h +++ b/miner.h @@ -360,6 +360,10 @@ struct cgpu_info { size_t work_size; enum cl_kernels kernel; +#ifdef USE_SCRYPT + int lookup_gap; + int thread_concurrency; +#endif struct timeval tv_gpustart;; struct timeval tv_gpuend; double gpu_us_average; diff --git a/ocl.c b/ocl.c index 8f70a395..4f21b2bb 100644 --- a/ocl.c +++ b/ocl.c @@ -367,6 +367,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) * compiler to ensure we only load a 
binary that matches what would * have otherwise created. The filename is: * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin + * For scrypt the filename is: + * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin */ char binaryfilename[255]; char filename[255]; @@ -461,6 +463,15 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth; gpus[gpu].work_size = clState->wsize; +#ifdef USE_SCRYPT + if (opt_scrypt) { + if (!gpus[gpu].lookup_gap) + gpus[gpu].lookup_gap = 2; + if (!gpus[gpu].thread_concurrency) + gpus[gpu].thread_concurrency = 2048; + } +#endif + FILE *binaryfile; size_t *binary_sizes; char **binaries; @@ -485,24 +496,19 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) return NULL; } -#ifdef USE_SCRYPT - if (opt_scrypt) { - clState->lookup_gap = 1; - clState->thread_concurrency = 6144; - } -#endif - strcat(binaryfilename, name); if (clState->goffset) strcat(binaryfilename, "g"); - strcat(binaryfilename, "v"); - sprintf(numbuf, "%d", clState->vwidth); - strcat(binaryfilename, numbuf); - strcat(binaryfilename, "w"); - sprintf(numbuf, "%d", (int)clState->wsize); + if (opt_scrypt) { + sprintf(numbuf, "lg%dtc%d", gpus[gpu].lookup_gap, gpus[gpu].thread_concurrency); + strcat(binaryfilename, numbuf); + } else { + sprintf(numbuf, "v%d", clState->vwidth); + strcat(binaryfilename, numbuf); + } + sprintf(numbuf, "w%d", (int)clState->wsize); strcat(binaryfilename, numbuf); - strcat(binaryfilename, "l"); - sprintf(numbuf, "%d", (int)sizeof(long)); + sprintf(numbuf, "l%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); @@ -566,7 +572,7 @@ build: #ifdef USE_SCRYPT if (opt_scrypt) sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d", - (int)clState->lookup_gap, 
(int)clState->thread_concurrency, (int)clState->wsize); + gpus[gpu].lookup_gap, gpus[gpu].thread_concurrency, (int)clState->wsize); else #endif { @@ -753,8 +759,8 @@ built: #ifdef USE_SCRYPT if (opt_scrypt) { - size_t ipt = (1024 / clState->lookup_gap + (1024 % clState->lookup_gap > 0)); - size_t bufsize = 128 * ipt * clState->thread_concurrency; + size_t ipt = (1024 / gpus[gpu].lookup_gap + (1024 % gpus[gpu].lookup_gap > 0)); + size_t bufsize = 128 * ipt * gpus[gpu].thread_concurrency; clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 80, NULL, &status); clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); diff --git a/ocl.h b/ocl.h index 56fa9b0d..984e7d62 100644 --- a/ocl.h +++ b/ocl.h @@ -22,8 +22,6 @@ typedef struct { #ifdef USE_SCRYPT cl_mem CLbuffer0; cl_mem padbuffer8; - size_t lookup_gap; - size_t thread_concurrency; size_t padbufsize; void * cldata; #endif From c21e15a0c35677880703c2e5e452b143ff5d41ef Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 21 Jul 2012 17:37:23 +1000 Subject: [PATCH 059/178] Add name to scrypt kernel copyright. --- scrypt120713.cl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrypt120713.cl b/scrypt120713.cl index 0f8db652..a273f02c 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -729,7 +729,8 @@ const uint4 midstate0, const uint4 midstate16, const uint target) } /*- - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt, + * 2012 Con Kolivas. * All rights reserved. * * Redistribution and use in source and binary forms, with or without From 17dfe74f964bc739ca58ebe72e261e40422d2c97 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 22 Jul 2012 00:49:47 +1000 Subject: [PATCH 060/178] Allow intensities up to 20 if scrypt is compiled in. 
--- miner.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/miner.h b/miner.h index 65c8fa14..95affeb3 100644 --- a/miner.h +++ b/miner.h @@ -610,8 +610,13 @@ extern void add_pool_details(bool live, char *url, char *user, char *pass); #define MIN_INTENSITY -10 #define _MIN_INTENSITY_STR "-10" +#ifdef USE_SCRYPT +#define MAX_INTENSITY 20 +#define _MAX_INTENSITY_STR "20" +#else #define MAX_INTENSITY 14 #define _MAX_INTENSITY_STR "14" +#endif extern struct list_head scan_devices; extern int nDevs; From 1711b4eb77f161d9058094a9f82999a495dc427a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 22 Jul 2012 00:58:09 +1000 Subject: [PATCH 061/178] Display size of scrypt buffer used in debug. --- ocl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ocl.c b/ocl.c index 4f21b2bb..d6e12c52 100644 --- a/ocl.c +++ b/ocl.c @@ -761,6 +761,7 @@ built: if (opt_scrypt) { size_t ipt = (1024 / gpus[gpu].lookup_gap + (1024 % gpus[gpu].lookup_gap > 0)); size_t bufsize = 128 * ipt * gpus[gpu].thread_concurrency; + applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize); clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 80, NULL, &status); clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); From 94c94d659a3ab302038a6910191a13ee3e144919 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 23 Jul 2012 07:38:58 +1000 Subject: [PATCH 062/178] Nonce testing for btc got screwed up, leading to no accepted shares. Fix it. 
--- cgminer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cgminer.c b/cgminer.c index b37814dc..efe78946 100644 --- a/cgminer.c +++ b/cgminer.c @@ -4004,15 +4004,17 @@ bool hashtest(const struct work *work) bool test_nonce(struct work *work, uint32_t nonce) { - uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12); - if (opt_scrypt) { + uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12); + *work_nonce = nonce; return true; } - *work_nonce = htobe32(nonce); - + work->data[64 + 12 + 0] = (nonce >> 0) & 0xff; + work->data[64 + 12 + 1] = (nonce >> 8) & 0xff; + work->data[64 + 12 + 2] = (nonce >> 16) & 0xff; + work->data[64 + 12 + 3] = (nonce >> 24) & 0xff; return hashtest(work); } From 5087ff9069061247627f2cf49c4a5fc7b98c5c48 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 23 Jul 2012 16:37:13 +1000 Subject: [PATCH 063/178] Add debugging output if buffer allocation fails for scrypt and round up bufsize to a multiple of 256. --- ocl.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ocl.c b/ocl.c index d6e12c52..b7c8e5f0 100644 --- a/ocl.c +++ b/ocl.c @@ -761,11 +761,22 @@ built: if (opt_scrypt) { size_t ipt = (1024 / gpus[gpu].lookup_gap + (1024 % gpus[gpu].lookup_gap > 0)); size_t bufsize = 128 * ipt * gpus[gpu].thread_concurrency; - applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize); - clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 80, NULL, &status); - clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); + if (bufsize % 256) + bufsize += (256 - bufsize % 256); + applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize); clState->padbufsize = bufsize; + clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease CT or increase LG", status); + return NULL; + } + + 
clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status); + return NULL; + } } #endif clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); From 89eb1fa393730ac04edea6f6a7e3a8c9cf35f406 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 23 Jul 2012 17:41:31 +1000 Subject: [PATCH 064/178] Check the maximum allocable memory size per opencl device. --- miner.h | 2 ++ ocl.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/miner.h b/miner.h index 95affeb3..291574c9 100644 --- a/miner.h +++ b/miner.h @@ -356,9 +356,11 @@ struct cgpu_info { int virtual_adl; int intensity; bool dynamic; + cl_uint vwidth; size_t work_size; enum cl_kernels kernel; + cl_ulong max_alloc; #ifdef USE_SCRYPT int lookup_gap; diff --git a/ocl.c b/ocl.c index b7c8e5f0..880aaf7a 100644 --- a/ocl.c +++ b/ocl.c @@ -363,6 +363,13 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) } applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); + status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&gpus[gpu].max_alloc, NULL); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status); + return NULL; + } + applog(LOG_DEBUG, "Max mem alloc size is %u", gpus[gpu].max_alloc); + /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would * have otherwise created. The filename is: From d8f81c18eed045cd084f97464cdc8fa0be5833e2 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 23 Jul 2012 17:51:57 +1000 Subject: [PATCH 065/178] Use the detected maximum allocable memory on a GPU to determine the optimal scrypt settings when lookup_gap and thread_concurrency parameters are not given. 
--- ocl.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ocl.c b/ocl.c index 880aaf7a..bd8fd6d3 100644 --- a/ocl.c +++ b/ocl.c @@ -472,10 +472,14 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) #ifdef USE_SCRYPT if (opt_scrypt) { - if (!gpus[gpu].lookup_gap) + if (!gpus[gpu].lookup_gap) { + applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); gpus[gpu].lookup_gap = 2; - if (!gpus[gpu].thread_concurrency) - gpus[gpu].thread_concurrency = 2048; + } + if (!gpus[gpu].thread_concurrency) { + gpus[gpu].thread_concurrency = gpus[gpu].max_alloc / 32768 / gpus[gpu].lookup_gap; + applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, gpus[gpu].thread_concurrency); + } } #endif @@ -769,9 +773,11 @@ built: size_t ipt = (1024 / gpus[gpu].lookup_gap + (1024 % gpus[gpu].lookup_gap > 0)); size_t bufsize = 128 * ipt * gpus[gpu].thread_concurrency; - if (bufsize % 256) - bufsize += (256 - bufsize % 256); applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize); + if (bufsize > gpus[gpu].max_alloc) { + applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u, your scrypt settings come to %u", + gpu, gpus[gpu].max_alloc, bufsize); + } clState->padbufsize = bufsize; clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); if (status != CL_SUCCESS) { From 3a0d60cfe1e769feafae7719911c1039fd517414 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 23 Jul 2012 21:30:30 +1000 Subject: [PATCH 066/178] Always create the largest possible padbuffer for scrypt kernels even if not needed for thread_concurrency, giving us some headroom for intensity levels. 
--- ocl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ocl.c b/ocl.c index bd8fd6d3..b09291bb 100644 --- a/ocl.c +++ b/ocl.c @@ -773,11 +773,14 @@ built: size_t ipt = (1024 / gpus[gpu].lookup_gap + (1024 % gpus[gpu].lookup_gap > 0)); size_t bufsize = 128 * ipt * gpus[gpu].thread_concurrency; - applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize); + /* Always allocate the largest possible buffer allowed, even if we're not initially requiring it + * based on thread_concurrency, giving us some headroom for intensity levels. */ if (bufsize > gpus[gpu].max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u, your scrypt settings come to %u", gpu, gpus[gpu].max_alloc, bufsize); - } + } else + bufsize = gpus[gpu].max_alloc; + applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize); clState->padbufsize = bufsize; clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); if (status != CL_SUCCESS) { From a1cb0d06d5cd64c87e29463d25cc9b3a8e967ed4 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Mon, 23 Jul 2012 15:48:00 +0000 Subject: [PATCH 067/178] Bugfix: Copy argv[0] given to dirname() Per manpage, dirname can (and does on GNU/Linux!) 
modify its argument --- cgminer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cgminer.c b/cgminer.c index 20e1d2fc..08893575 100644 --- a/cgminer.c +++ b/cgminer.c @@ -4105,6 +4105,7 @@ int main (int argc, char *argv[]) bool pools_active = false; struct sigaction handler; struct thr_info *thr; + char *s; unsigned int k; int i, j; @@ -4135,7 +4136,9 @@ int main (int argc, char *argv[]) opt_kernel_path = alloca(PATH_MAX); strcpy(opt_kernel_path, CGMINER_PREFIX); cgminer_path = alloca(PATH_MAX); - strcpy(cgminer_path, dirname(argv[0])); + s = strdup(argv[0]); + strcpy(cgminer_path, dirname(s)); + free(s); strcat(cgminer_path, "/"); #ifdef WANT_CPUMINE // Hack to make cgminer silent when called recursively on WIN32 From c55830502a0df4ae1165e583de4e96a11af33c62 Mon Sep 17 00:00:00 2001 From: Kano Date: Tue, 24 Jul 2012 02:19:23 +1000 Subject: [PATCH 068/178] BFL force all code to timeout to avoid hanging --- driver-bitforce.c | 16 +++++++++------- fpgautils.c | 10 +++++----- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 31892297..ff537203 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -34,13 +34,15 @@ struct device_api bitforce_api; -#define BFopen(devpath) serial_open(devpath, 0, -1, true) +// Code must deal with a timeout +#define BFopen(devpath) serial_open(devpath, 0, 1, true) static void BFgets(char *buf, size_t bufLen, int fd) { - do + do { + buf[0] = '\0'; --bufLen; - while (likely(bufLen && read(fd, buf, 1) == 1 && (buf++)[0] != '\n')); + } while (likely(bufLen && read(fd, buf, 1) == 1 && (buf++)[0] != '\n')); buf[0] = '\0'; } @@ -72,7 +74,7 @@ static bool bitforce_detect_one(const char *devpath) BFwrite(fdDev, "ZGX", 3); BFgets(pdevbuf, sizeof(pdevbuf), fdDev); if (unlikely(!pdevbuf[0])) { - applog(LOG_ERR, "BFL: Error reading (ZGX)"); + applog(LOG_ERR, "BFL: Error reading/timeout (ZGX)"); return 0; } @@ -200,7 +202,7 @@ void bitforce_init(struct cgpu_info 
*bitforce) if (unlikely(!pdevbuf[0])) { mutex_unlock(&bitforce->device_mutex); - applog(LOG_ERR, "BFL%i: Error reading (ZGX)", bitforce->device_id); + applog(LOG_ERR, "BFL%i: Error reading/timeout (ZGX)", bitforce->device_id); return; } @@ -240,7 +242,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) mutex_unlock(&bitforce->device_mutex); if (unlikely(!pdevbuf[0])) { - applog(LOG_ERR, "BFL%i: Error: Get temp returned empty string", bitforce->device_id); + applog(LOG_ERR, "BFL%i: Error: Get temp returned empty string/timed out", bitforce->device_id); bitforce->temp = 0; return false; } @@ -328,7 +330,7 @@ re_send: } if (unlikely(!pdevbuf[0])) { - applog(LOG_ERR, "BFL%i: Error: Send block data returned empty string", bitforce->device_id); + applog(LOG_ERR, "BFL%i: Error: Send block data returned empty string/timed out", bitforce->device_id); return false; } diff --git a/fpgautils.c b/fpgautils.c index 0ebee7f6..07b3fe36 100644 --- a/fpgautils.c +++ b/fpgautils.c @@ -178,7 +178,8 @@ serial_open(const char*devpath, unsigned long baud, signed short timeout, bool p SetCommConfig(hSerial, &comCfg, sizeof(comCfg)); - const DWORD ctoms = (timeout == -1) ? 
30000 : (timeout * 100); + // Code must specify a valid timeout value (0 means don't timeout) + const DWORD ctoms = (timeout * 100); COMMTIMEOUTS cto = {ctoms, 0, ctoms, 0, ctoms}; SetCommTimeouts(hSerial, &cto); @@ -230,10 +231,9 @@ serial_open(const char*devpath, unsigned long baud, signed short timeout, bool p my_termios.c_oflag &= ~OPOST; my_termios.c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN); - if (timeout >= 0) { - my_termios.c_cc[VTIME] = (cc_t)timeout; - my_termios.c_cc[VMIN] = 0; - } + // Code must specify a valid timeout value (0 means don't timeout) + my_termios.c_cc[VTIME] = (cc_t)timeout; + my_termios.c_cc[VMIN] = 0; tcsetattr(fdDev, TCSANOW, &my_termios); if (purge) From 1097aefbad905066cc2628effb6065a1d08f05dc Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Mon, 23 Jul 2012 19:35:51 +0000 Subject: [PATCH 069/178] Add space to log output now that there is more screen real estate available. --- logging.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/logging.c b/logging.c index 7d8a4309..aba6f606 100644 --- a/logging.c +++ b/logging.c @@ -121,7 +121,7 @@ static void __maybe_unused log_generic(int prio, const char *fmt, va_list ap) len = 40 + strlen(fmt) + 22; f = alloca(len); - sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n", + sprintf(f, " [%d-%02d-%02d %02d:%02d:%02d] %s\n", tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, From d2195bd04e77d844fae39ef66645523ea25725ea Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Mon, 23 Jul 2012 19:41:41 +0000 Subject: [PATCH 070/178] Use log_generic for vapplog to cut down on code duplication --- logging.c | 45 +++++---------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/logging.c b/logging.c index aba6f606..fb3868fa 100644 --- a/logging.c +++ b/logging.c @@ -39,49 +39,14 @@ static void my_log_curses(__maybe_unused int prio, char *f, va_list ap) } } +static void log_generic(int prio, const char *fmt, va_list ap); + void vapplog(int prio, 
const char *fmt, va_list ap) { if (!opt_debug && prio == LOG_DEBUG) return; - -#ifdef HAVE_SYSLOG_H - if (use_syslog) { - vsyslog(prio, fmt, ap); - } -#else - if (0) {} -#endif - else if (opt_log_output || prio <= LOG_NOTICE) { - char *f; - int len; - struct timeval tv = {0, 0}; - struct tm *tm; - - gettimeofday(&tv, NULL); - - tm = localtime(&tv.tv_sec); - - len = 40 + strlen(fmt) + 22; - f = alloca(len); - sprintf(f, " [%d-%02d-%02d %02d:%02d:%02d] %s\n", - tm->tm_year + 1900, - tm->tm_mon + 1, - tm->tm_mday, - tm->tm_hour, - tm->tm_min, - tm->tm_sec, - fmt); - /* Only output to stderr if it's not going to the screen as well */ - if (!isatty(fileno((FILE *)stderr))) { - va_list apc; - - va_copy(apc, ap); - vfprintf(stderr, f, apc); /* atomic write to stderr */ - fflush(stderr); - } - - my_log_curses(prio, f, ap); - } + if (use_syslog || opt_log_output || prio <= LOG_NOTICE) + log_generic(prio, fmt, ap); } void applog(int prio, const char *fmt, ...) @@ -100,7 +65,7 @@ void applog(int prio, const char *fmt, ...) 
* generic log function used by priority specific ones * equals vapplog() without additional priority checks */ -static void __maybe_unused log_generic(int prio, const char *fmt, va_list ap) +static void log_generic(int prio, const char *fmt, va_list ap) { #ifdef HAVE_SYSLOG_H if (use_syslog) { From 80593c47fce8bfe3dbc9d7b3d258edad2ab0a7ae Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Mon, 23 Jul 2012 19:56:03 +0000 Subject: [PATCH 071/178] Move opt_quiet check to my_log_curses, so it works for curses-less builds --- cgminer.c | 5 +---- logging.c | 3 +++ miner.h | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cgminer.c b/cgminer.c index ea540815..6d95df92 100644 --- a/cgminer.c +++ b/cgminer.c @@ -87,7 +87,7 @@ static bool opt_benchmark; static bool have_longpoll; static bool want_per_device_stats; bool use_syslog; -static bool opt_quiet; +bool opt_quiet; static bool opt_realquiet; bool opt_loginput; const int opt_cutofftemp = 95; @@ -1566,9 +1566,6 @@ void log_curses(int prio, const char *f, va_list ap) { bool high_prio; - if (opt_quiet && prio != LOG_ERR) - return; - high_prio = (prio == LOG_WARNING || prio == LOG_ERR); if (curses_active_locked()) { diff --git a/logging.c b/logging.c index fb3868fa..db623748 100644 --- a/logging.c +++ b/logging.c @@ -20,6 +20,9 @@ int opt_log_level = LOG_NOTICE; static void my_log_curses(__maybe_unused int prio, char *f, va_list ap) { + if (opt_quiet && prio != LOG_ERR) + return; + #ifdef HAVE_CURSES extern bool use_curses; if (use_curses) diff --git a/miner.h b/miner.h index 120d7633..76689c03 100644 --- a/miner.h +++ b/miner.h @@ -614,6 +614,7 @@ extern int opt_n_threads; extern int num_processors; extern int hw_errors; extern bool use_syslog; +extern bool opt_quiet; extern struct thr_info *thr_info; extern struct cgpu_info gpus[MAX_GPUDEVICES]; extern int gpu_threads; From d98e561a0a0ad7c711fee2a5344b6bdda76bb54c Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Mon, 23 Jul 2012 20:15:45 +0000 Subject: 
[PATCH 072/178] Simplify code to a single vprintf path for curses-less printing --- cgminer.c | 9 ++++----- logging.c | 8 ++------ miner.h | 2 +- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/cgminer.c b/cgminer.c index 6d95df92..20623974 100644 --- a/cgminer.c +++ b/cgminer.c @@ -164,9 +164,7 @@ static int total_threads; static pthread_mutex_t hash_lock; static pthread_mutex_t qd_lock; static pthread_mutex_t *stgd_lock; -#ifdef HAVE_CURSES static pthread_mutex_t curses_lock; -#endif static pthread_mutex_t ch_lock; static pthread_rwlock_t blk_lock; @@ -1562,7 +1560,7 @@ void wlogprint(const char *f, ...) #endif #ifdef HAVE_CURSES -void log_curses(int prio, const char *f, va_list ap) +bool log_curses_only(int prio, const char *f, va_list ap) { bool high_prio; @@ -1577,8 +1575,9 @@ void log_curses(int prio, const char *f, va_list ap) } } unlock_curses(); - } else - vprintf(f, ap); + return true; + } + return false; } void clear_logwin(void) diff --git a/logging.c b/logging.c index db623748..26e01960 100644 --- a/logging.c +++ b/logging.c @@ -25,8 +25,8 @@ static void my_log_curses(__maybe_unused int prio, char *f, va_list ap) #ifdef HAVE_CURSES extern bool use_curses; - if (use_curses) - log_curses(prio, f, ap); + if (use_curses && log_curses_only(prio, f, ap)) + ; else #endif { @@ -34,11 +34,7 @@ static void my_log_curses(__maybe_unused int prio, char *f, va_list ap) strcpy(f + len - 1, " \n"); -#ifdef HAVE_CURSES - log_curses(prio, f, ap); -#else vprintf(f, ap); -#endif } } diff --git a/miner.h b/miner.h index 76689c03..6aaa314e 100644 --- a/miner.h +++ b/miner.h @@ -797,7 +797,7 @@ extern void switch_pools(struct pool *selected); extern void remove_pool(struct pool *pool); extern void write_config(FILE *fcfg); extern void default_save_file(char *filename); -extern void log_curses(int prio, const char *f, va_list ap); +extern bool log_curses_only(int prio, const char *f, va_list ap); extern void clear_logwin(void); extern bool pool_tclear(struct 
pool *pool, bool *var); extern struct thread_q *tq_new(void); From da4ff2bde1dbb2ebe0d7b1e81ff7e460411b13f5 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Mon, 23 Jul 2012 20:22:01 +0000 Subject: [PATCH 073/178] Bugfix: Use a mutex to control non-curses output Without this, there is no guarantee writes won't overlap (and it happens quite a bit on Windows with the bitforce driver) --- cgminer.c | 10 ++++------ logging.c | 2 ++ miner.h | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cgminer.c b/cgminer.c index 20623974..87e23f80 100644 --- a/cgminer.c +++ b/cgminer.c @@ -164,7 +164,7 @@ static int total_threads; static pthread_mutex_t hash_lock; static pthread_mutex_t qd_lock; static pthread_mutex_t *stgd_lock; -static pthread_mutex_t curses_lock; +pthread_mutex_t console_lock; static pthread_mutex_t ch_lock; static pthread_rwlock_t blk_lock; @@ -1323,12 +1323,12 @@ struct cgpu_info *cpus; #ifdef HAVE_CURSES static inline void unlock_curses(void) { - mutex_unlock(&curses_lock); + mutex_unlock(&console_lock); } static inline void lock_curses(void) { - mutex_lock(&curses_lock); + mutex_lock(&console_lock); } static bool curses_active_locked(void) @@ -5056,9 +5056,7 @@ int main(int argc, char *argv[]) mutex_init(&hash_lock); mutex_init(&qd_lock); -#ifdef HAVE_CURSES - mutex_init(&curses_lock); -#endif + mutex_init(&console_lock); mutex_init(&control_lock); mutex_init(&sharelog_lock); mutex_init(&ch_lock); diff --git a/logging.c b/logging.c index 26e01960..47d1970d 100644 --- a/logging.c +++ b/logging.c @@ -34,7 +34,9 @@ static void my_log_curses(__maybe_unused int prio, char *f, va_list ap) strcpy(f + len - 1, " \n"); + mutex_lock(&console_lock); vprintf(f, ap); + mutex_unlock(&console_lock); } } diff --git a/miner.h b/miner.h index 6aaa314e..3f55670f 100644 --- a/miner.h +++ b/miner.h @@ -576,6 +576,8 @@ extern bool fulltest(const unsigned char *hash, const unsigned char *target); extern int opt_scantime; +extern pthread_mutex_t console_lock; + 
extern pthread_mutex_t restart_lock; extern pthread_cond_t restart_cond; From f98774c35c2cc9eeb601cd367ee8bb7a2c07e835 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 24 Jul 2012 17:52:54 +1000 Subject: [PATCH 074/178] Fix target testing with scrypt kernel as it would have been missing shares below target. --- driver-opencl.c | 2 +- scrypt120713.cl | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 059a7ece..7b3f8b75 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1053,7 +1053,7 @@ static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_u cl_uint le_target; cl_int status = 0; - le_target = ~swab32(*(cl_uint *)(blk->work->target + 28)); + le_target = *(cl_uint *)(blk->work->target + 28); clState->cldata = blk->work->data; status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL); diff --git a/scrypt120713.cl b/scrypt120713.cl index a273f02c..d38f6a54 100644 --- a/scrypt120713.cl +++ b/scrypt120713.cl @@ -2,10 +2,7 @@ #define Ch(x,y,z) bitselect(z,y,x) #define Maj(x,y,z) Ch((x^z),y,z) -uint4 EndianSwap4(uint4 n) -{ - return rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U); -} +#define EndianSwap(n) (rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U)) #define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U)) #define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U)) @@ -552,7 +549,7 @@ void shittify(uint4 B[8]) #pragma unroll for(uint i=0; i<4; ++i) - B[i] = EndianSwap4(tmp[i]); + B[i] = EndianSwap(tmp[i]); tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w); tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w); @@ -561,7 +558,7 @@ void shittify(uint4 B[8]) #pragma unroll for(uint i=0; i<4; ++i) - B[i+4] = EndianSwap4(tmp[i]); + B[i+4] = EndianSwap(tmp[i]); } void unshittify(uint4 B[8]) @@ -574,7 +571,7 @@ void unshittify(uint4 B[8]) #pragma unroll for(uint i=0; i<4; ++i) - B[i] = EndianSwap4(tmp[i]); + B[i] = 
EndianSwap(tmp[i]); tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w); tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w); @@ -583,7 +580,7 @@ void unshittify(uint4 B[8]) #pragma unroll for(uint i=0; i<4; ++i) - B[i+4] = EndianSwap4(tmp[i]); + B[i+4] = EndianSwap(tmp[i]); } void salsa(uint4 B[8]) @@ -723,8 +720,9 @@ const uint4 midstate0, const uint4 midstate16, const uint target) SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]); SHA256_fixed(&tmp0,&tmp1); SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U)); - - if (!(ostate1.w & target)) + + bool found = (EndianSwap(ostate1.w) <= target); + if (found) output[FOUND] = output[NFLAG & gid] = gid; } From 9a6c082ad159935375915994372e8cc1944c2da2 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 24 Jul 2012 20:27:37 +1000 Subject: [PATCH 075/178] Make the thread concurrency and lookup gap options hidden on the command line and autotune parameters with a newly parsed --shaders option. --- cgminer.c | 7 +++++-- driver-opencl.c | 25 +++++++++++++++++++++++++ driver-opencl.h | 1 + miner.h | 1 + ocl.c | 3 +++ 5 files changed, 35 insertions(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index efe78946..6e9dd891 100644 --- a/cgminer.c +++ b/cgminer.c @@ -857,7 +857,7 @@ static struct opt_table opt_config_table[] = { #ifdef USE_SCRYPT OPT_WITH_ARG("--lookup-gap", set_lookup_gap, NULL, NULL, - "Set GPU lookup gap for scrypt mining, comma separated"), + opt_hidden), #endif OPT_WITH_ARG("--intensity|-I", set_intensity, NULL, NULL, @@ -965,6 +965,9 @@ static struct opt_table opt_config_table[] = { OPT_WITHOUT_ARG("--scrypt", opt_set_bool, &opt_scrypt, "Use the scrypt algorithm for mining (litecoin only)"), + OPT_WITH_ARG("--shaders", + set_shaders, NULL, NULL, + "GPU shaders per card for tuning scrypt, comma separated"), #endif OPT_WITH_ARG("--sharelog", set_sharelog, NULL, NULL, @@ -1007,7 +1010,7 @@ static struct opt_table opt_config_table[] = { #ifdef USE_SCRYPT 
OPT_WITH_ARG("--thread-concurrency", set_thread_concurrency, NULL, NULL, - "Set GPU thread concurrency for scrypt mining, comma separated"), + opt_hidden), #endif OPT_WITH_ARG("--url|-o", set_url, NULL, NULL, diff --git a/driver-opencl.c b/driver-opencl.c index 7b3f8b75..eafdd5d2 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -128,6 +128,31 @@ char *set_worksize(char *arg) } #ifdef USE_SCRYPT +char *set_shaders(char *arg) +{ + int i, val = 0, device = 0; + char *nextptr; + + nextptr = strtok(arg, ","); + if (nextptr == NULL) + return "Invalid parameters for set lookup gap"; + val = atoi(nextptr); + + gpus[device++].shaders = val; + + while ((nextptr = strtok(NULL, ",")) != NULL) { + val = atoi(nextptr); + + gpus[device++].shaders = val; + } + if (device == 1) { + for (i = device; i < MAX_GPUDEVICES; i++) + gpus[i].shaders = gpus[0].shaders; + } + + return NULL; +} + char *set_lookup_gap(char *arg) { int i, val = 0, device = 0; diff --git a/driver-opencl.h b/driver-opencl.h index f09571b9..c1d61822 100644 --- a/driver-opencl.h +++ b/driver-opencl.h @@ -19,6 +19,7 @@ extern char *set_intensity(char *arg); extern char *set_vector(char *arg); extern char *set_worksize(char *arg); #ifdef USE_SCRYPT +extern char *set_shaders(char *arg); extern char *set_lookup_gap(char *arg); extern char *set_thread_concurrency(char *arg); #endif diff --git a/miner.h b/miner.h index 291574c9..68c6e159 100644 --- a/miner.h +++ b/miner.h @@ -365,6 +365,7 @@ struct cgpu_info { #ifdef USE_SCRYPT int lookup_gap; int thread_concurrency; + int shaders; #endif struct timeval tv_gpustart;; struct timeval tv_gpuend; diff --git a/ocl.c b/ocl.c index b09291bb..f7264447 100644 --- a/ocl.c +++ b/ocl.c @@ -478,6 +478,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) } if (!gpus[gpu].thread_concurrency) { gpus[gpu].thread_concurrency = gpus[gpu].max_alloc / 32768 / gpus[gpu].lookup_gap; + if (gpus[gpu].shaders && gpus[gpu].thread_concurrency > gpus[gpu].shaders) + 
gpus[gpu].thread_concurrency -= gpus[gpu].thread_concurrency % gpus[gpu].shaders; + applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, gpus[gpu].thread_concurrency); } } From b3a41e40a82a69ba434bc5c518744da2c3275e9e Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 24 Jul 2012 20:33:04 +1000 Subject: [PATCH 076/178] Update kernel versions reflecting changes in the API. --- configure.ac | 10 +- diablo120328.cl | 1274 ------------------------------------------- diakgcn120427.cl | 587 -------------------- phatk120223.cl | 417 -------------- poclbm120327.cl | 1353 ---------------------------------------------- scrypt120713.cl | 757 -------------------------- 6 files changed, 5 insertions(+), 4393 deletions(-) delete mode 100644 diablo120328.cl delete mode 100644 diakgcn120427.cl delete mode 100644 phatk120223.cl delete mode 100644 poclbm120327.cl delete mode 100644 scrypt120713.cl diff --git a/configure.ac b/configure.ac index 75f680df..8cf840bf 100644 --- a/configure.ac +++ b/configure.ac @@ -389,11 +389,11 @@ fi AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to cgminer install]) -AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120223"], [Filename for phatk kernel]) -AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120327"], [Filename for poclbm kernel]) -AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120427"], [Filename for diakgcn kernel]) -AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120328"], [Filename for diablo kernel]) -AC_DEFINE_UNQUOTED([SCRYPT_KERNNAME], ["scrypt120713"], [Filename for scrypt kernel]) +AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120724"], [Filename for phatk kernel]) +AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120724"], [Filename for poclbm kernel]) +AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120724"], [Filename for diakgcn kernel]) +AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120724"], [Filename for diablo kernel]) +AC_DEFINE_UNQUOTED([SCRYPT_KERNNAME], ["scrypt120724"], [Filename for scrypt kernel]) 
AC_SUBST(OPENCL_LIBS) diff --git a/diablo120328.cl b/diablo120328.cl deleted file mode 100644 index 4b64c300..00000000 --- a/diablo120328.cl +++ /dev/null @@ -1,1274 +0,0 @@ -/* - * DiabloMiner - OpenCL miner for BitCoin - * Copyright (C) 2010, 2011, 2012 Patrick McFarland - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more detail). - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifdef VECTORS4 - typedef uint4 z; -#elif defined(VECTORS2) - typedef uint2 z; -#else - typedef uint z; -#endif - -#ifdef BITALIGN -#pragma OPENCL EXTENSION cl_amd_media_ops : enable -#define Zrotr(a, b) amd_bitalign((z)a, (z)a, (z)(32 - b)) -#else -#define Zrotr(a, b) rotate((z)a, (z)b) -#endif - -#ifdef BFI_INT -#define ZCh(a, b, c) amd_bytealign(a, b, c) -#define ZMa(a, b, c) amd_bytealign((c ^ a), (b), (a)) -#else -#define ZCh(a, b, c) bitselect((z)c, (z)b, (z)a) -#define ZMa(a, b, c) bitselect((z)a, (z)b, (z)c ^ (z)a) -#endif - -#define ZR25(n) ((Zrotr((n), 25) ^ Zrotr((n), 14) ^ ((n) >> 3U))) -#define ZR15(n) ((Zrotr((n), 15) ^ Zrotr((n), 13) ^ ((n) >> 10U))) -#define ZR26(n) ((Zrotr((n), 26) ^ Zrotr((n), 21) ^ Zrotr((n), 7))) -#define ZR30(n) ((Zrotr((n), 30) ^ Zrotr((n), 19) ^ Zrotr((n), 10))) - -__kernel -__attribute__((vec_type_hint(z))) -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void search( -#ifndef GOFFSET - const z base, -#endif - const uint PreVal4_state0, const uint PreVal4_state0_k7, - const uint PreVal4_T1, - const uint W18, const uint W19, 
- const uint W16, const uint W17, - const uint W16_plus_K16, const uint W17_plus_K17, - const uint W31, const uint W32, - const uint d1, const uint b1, const uint c1, - const uint h1, const uint f1, const uint g1, - const uint c1_plus_k5, const uint b1_plus_k6, - const uint state0, const uint state1, const uint state2, const uint state3, - const uint state4, const uint state5, const uint state6, const uint state7, - __global uint * output) -{ - - z ZA[930]; - -#ifdef GOFFSET - const z Znonce = (uint)(get_global_id(0)); -#else - const z Znonce = base + (uint)(get_global_id(0)); -#endif - - ZA[15] = Znonce + PreVal4_state0; - - ZA[16] = (ZCh(ZA[15], b1, c1) + d1) + ZR26(ZA[15]); - ZA[26] = Znonce + PreVal4_T1; - - ZA[27] = ZMa(f1, g1, ZA[26]) + ZR30(ZA[26]); - ZA[17] = ZA[16] + h1; - - ZA[19] = (ZCh(ZA[17], ZA[15], b1) + c1_plus_k5) + ZR26(ZA[17]); - ZA[28] = ZA[27] + ZA[16]; - - ZA[548] = ZMa(ZA[26], f1, ZA[28]) + ZR30(ZA[28]); - ZA[20] = ZA[19] + g1; - - ZA[22] = (ZCh(ZA[20], ZA[17], ZA[15]) + b1_plus_k6) + ZR26(ZA[20]); - ZA[29] = ZA[548] + ZA[19]; - - ZA[549] = ZMa(ZA[28], ZA[26], ZA[29]) + ZR30(ZA[29]); - ZA[23] = ZA[22] + f1; - - ZA[24] = ZCh(ZA[23], ZA[20], ZA[17]) + ZR26(ZA[23]); - ZA[180] = Znonce + PreVal4_state0_k7; - ZA[30] = ZA[549] + ZA[22]; - - ZA[31] = ZMa(ZA[29], ZA[28], ZA[30]) + ZR30(ZA[30]); - ZA[181] = ZA[180] + ZA[24]; - - ZA[182] = ZA[181] + ZA[26]; - ZA[183] = ZA[181] + ZA[31]; - ZA[18] = ZA[17] + 0xd807aa98U; - - ZA[186] = (ZCh(ZA[182], ZA[23], ZA[20]) + ZA[18]) + ZR26(ZA[182]); - ZA[184] = ZMa(ZA[30], ZA[29], ZA[183]) + ZR30(ZA[183]); - - ZA[187] = ZA[186] + ZA[28]; - ZA[188] = ZA[186] + ZA[184]; - ZA[21] = ZA[20] + 0x12835b01U; - - ZA[191] = (ZCh(ZA[187], ZA[182], ZA[23]) + ZA[21]) + ZR26(ZA[187]); - ZA[189] = ZMa(ZA[183], ZA[30], ZA[188]) + ZR30(ZA[188]); - - ZA[192] = ZA[191] + ZA[29]; - ZA[193] = ZA[191] + ZA[189]; - ZA[25] = ZA[23] + 0x243185beU; - - ZA[196] = (ZCh(ZA[192], ZA[187], ZA[182]) + ZA[25]) + ZR26(ZA[192]); - ZA[194] = 
ZMa(ZA[188], ZA[183], ZA[193]) + ZR30(ZA[193]); - - ZA[197] = ZA[196] + ZA[30]; - ZA[198] = ZA[196] + ZA[194]; - ZA[185] = ZA[182] + 0x550c7dc3U; - - ZA[201] = (ZCh(ZA[197], ZA[192], ZA[187]) + ZA[185]) + ZR26(ZA[197]); - ZA[199] = ZMa(ZA[193], ZA[188], ZA[198]) + ZR30(ZA[198]); - - ZA[202] = ZA[201] + ZA[183]; - ZA[203] = ZA[201] + ZA[199]; - ZA[190] = ZA[187] + 0x72be5d74U; - - ZA[206] = (ZCh(ZA[202], ZA[197], ZA[192]) + ZA[190]) + ZR26(ZA[202]); - ZA[204] = ZMa(ZA[198], ZA[193], ZA[203]) + ZR30(ZA[203]); - - ZA[207] = ZA[206] + ZA[188]; - ZA[208] = ZA[206] + ZA[204]; - ZA[195] = ZA[192] + 0x80deb1feU; - - ZA[211] = (ZCh(ZA[207], ZA[202], ZA[197]) + ZA[195]) + ZR26(ZA[207]); - ZA[209] = ZMa(ZA[203], ZA[198], ZA[208]) + ZR30(ZA[208]); - - ZA[212] = ZA[193] + ZA[211]; - ZA[213] = ZA[211] + ZA[209]; - ZA[200] = ZA[197] + 0x9bdc06a7U; - - ZA[216] = (ZCh(ZA[212], ZA[207], ZA[202]) + ZA[200]) + ZR26(ZA[212]); - ZA[214] = ZMa(ZA[208], ZA[203], ZA[213]) + ZR30(ZA[213]); - - ZA[217] = ZA[198] + ZA[216]; - ZA[218] = ZA[216] + ZA[214]; - ZA[205] = ZA[202] + 0xc19bf3f4U; - - ZA[220] = (ZCh(ZA[217], ZA[212], ZA[207]) + ZA[205]) + ZR26(ZA[217]); - ZA[219] = ZMa(ZA[213], ZA[208], ZA[218]) + ZR30(ZA[218]); - - ZA[222] = ZA[203] + ZA[220]; - ZA[223] = ZA[220] + ZA[219]; - ZA[210] = ZA[207] + W16_plus_K16; - - ZA[226] = (ZCh(ZA[222], ZA[217], ZA[212]) + ZA[210]) + ZR26(ZA[222]); - ZA[225] = ZMa(ZA[218], ZA[213], ZA[223]) + ZR30(ZA[223]); - - ZA[0] = ZR25(Znonce) + W18; - ZA[228] = ZA[226] + ZA[225]; - ZA[227] = ZA[208] + ZA[226]; - ZA[215] = ZA[212] + W17_plus_K17; - - ZA[231] = (ZCh(ZA[227], ZA[222], ZA[217]) + ZA[215]) + ZR26(ZA[227]); - ZA[229] = ZMa(ZA[223], ZA[218], ZA[228]) + ZR30(ZA[228]); - ZA[1] = ZA[0] + 0x0fc19dc6U; - - ZA[232] = ZA[213] + ZA[231]; - ZA[233] = ZA[231] + ZA[229]; - ZA[221] = ZA[217] + ZA[1]; - ZA[32] = Znonce + W19; - - ZA[236] = (ZCh(ZA[232], ZA[227], ZA[222]) + ZA[221]) + ZR26(ZA[232]); - ZA[234] = ZMa(ZA[228], ZA[223], ZA[233]) + ZR30(ZA[233]); - 
ZA[33] = ZA[32] + 0x240ca1ccU; - - ZA[3] = ZR15(ZA[0]) + 0x80000000U; - ZA[238] = ZA[236] + ZA[234]; - ZA[237] = ZA[218] + ZA[236]; - ZA[224] = ZA[222] + ZA[33]; - - ZA[241] = (ZCh(ZA[237], ZA[232], ZA[227]) + ZA[224]) + ZR26(ZA[237]); - ZA[239] = ZMa(ZA[233], ZA[228], ZA[238]) + ZR30(ZA[238]); - ZA[4] = ZA[3] + 0x2de92c6fU; - - ZA[35] = ZR15(ZA[32]); - ZA[243] = ZA[241] + ZA[239]; - ZA[242] = ZA[223] + ZA[241]; - ZA[230] = ZA[227] + ZA[4]; - - ZA[246] = (ZCh(ZA[242], ZA[237], ZA[232]) + ZA[230]) + ZR26(ZA[242]); - ZA[244] = ZMa(ZA[238], ZA[233], ZA[243]) + ZR30(ZA[243]); - ZA[36] = ZA[35] + 0x4a7484aaU; - - ZA[7] = ZR15(ZA[3]) + 0x00000280U; - ZA[248] = ZA[246] + ZA[244]; - ZA[247] = ZA[228] + ZA[246]; - ZA[235] = ZA[232] + ZA[36]; - - ZA[251] = (ZCh(ZA[247], ZA[242], ZA[237]) + ZA[235]) + ZR26(ZA[247]); - ZA[249] = ZMa(ZA[243], ZA[238], ZA[248]) + ZR30(ZA[248]); - ZA[8] = ZA[7] + 0x5cb0a9dcU; - - ZA[38] = ZR15(ZA[35]) + W16; - ZA[253] = ZA[251] + ZA[249]; - ZA[252] = ZA[233] + ZA[251]; - ZA[240] = ZA[237] + ZA[8]; - - ZA[256] = (ZCh(ZA[252], ZA[247], ZA[242]) + ZA[240]) + ZR26(ZA[252]); - ZA[254] = ZMa(ZA[248], ZA[243], ZA[253]) + ZR30(ZA[253]); - ZA[40] = ZA[38] + 0x76f988daU; - - ZA[10] = ZR15(ZA[7]) + W17; - ZA[258] = ZA[256] + ZA[254]; - ZA[257] = ZA[238] + ZA[256]; - ZA[245] = ZA[242] + ZA[40]; - - ZA[261] = (ZCh(ZA[257], ZA[252], ZA[247]) + ZA[245]) + ZR26(ZA[257]); - ZA[259] = ZMa(ZA[253], ZA[248], ZA[258]) + ZR30(ZA[258]); - ZA[13] = ZA[10] + 0x983e5152U; - - ZA[43] = ZR15(ZA[38]) + ZA[0]; - ZA[263] = ZA[261] + ZA[259]; - ZA[262] = ZA[243] + ZA[261]; - ZA[250] = ZA[247] + ZA[13]; - - ZA[266] = (ZCh(ZA[262], ZA[257], ZA[252]) + ZA[250]) + ZR26(ZA[262]); - ZA[264] = ZMa(ZA[258], ZA[253], ZA[263]) + ZR30(ZA[263]); - ZA[11] = ZR15(ZA[10]); - ZA[45] = ZA[43] + 0xa831c66dU; - - ZA[52] = ZA[11] + ZA[32]; - ZA[267] = ZA[248] + ZA[266]; - ZA[255] = ZA[252] + ZA[45]; - ZA[268] = ZA[266] + ZA[264]; - - ZA[271] = (ZCh(ZA[267], ZA[262], ZA[257]) + ZA[255]) + 
ZR26(ZA[267]); - ZA[269] = ZMa(ZA[263], ZA[258], ZA[268]) + ZR30(ZA[268]); - ZA[54] = ZA[52] + 0xb00327c8U; - - ZA[48] = ZR15(ZA[43]) + ZA[3]; - ZA[273] = ZA[271] + ZA[269]; - ZA[272] = ZA[253] + ZA[271]; - ZA[260] = ZA[257] + ZA[54]; - - ZA[276] = (ZCh(ZA[272], ZA[267], ZA[262]) + ZA[260]) + ZR26(ZA[272]); - ZA[274] = ZMa(ZA[268], ZA[263], ZA[273]) + ZR30(ZA[273]); - ZA[49] = ZA[48] + 0xbf597fc7U; - - ZA[61] = ZR15(ZA[52]) + ZA[35]; - ZA[278] = ZA[276] + ZA[274]; - ZA[277] = ZA[258] + ZA[276]; - ZA[265] = ZA[262] + ZA[49]; - - ZA[281] = (ZCh(ZA[277], ZA[272], ZA[267]) + ZA[265]) + ZR26(ZA[277]); - ZA[279] = ZMa(ZA[273], ZA[268], ZA[278]) + ZR30(ZA[278]); - ZA[62] = ZA[61] + 0xc6e00bf3U; - - ZA[53] = ZR15(ZA[48]) + ZA[7]; - ZA[283] = ZA[281] + ZA[279]; - ZA[282] = ZA[263] + ZA[281]; - ZA[270] = ZA[267] + ZA[62]; - - ZA[286] = (ZCh(ZA[282], ZA[277], ZA[272]) + ZA[270]) + ZR26(ZA[282]); - ZA[284] = ZMa(ZA[278], ZA[273], ZA[283]) + ZR30(ZA[283]); - ZA[39] = ZA[38] + 0x00A00055U; - ZA[55] = ZA[53] + 0xd5a79147U; - - ZA[66] = ZR15(ZA[61]) + ZA[39]; - ZA[288] = ZA[286] + ZA[284]; - ZA[287] = ZA[268] + ZA[286]; - ZA[275] = ZA[272] + ZA[55]; - - ZA[291] = (ZCh(ZA[287], ZA[282], ZA[277]) + ZA[275]) + ZR26(ZA[287]); - ZA[289] = ZMa(ZA[283], ZA[278], ZA[288]) + ZR30(ZA[288]); - ZA[12] = ZA[10] + W31; - ZA[68] = ZA[66] + 0x06ca6351U; - - ZA[67] = ZR15(ZA[53]) + ZA[12]; - ZA[293] = ZA[291] + ZA[289]; - ZA[292] = ZA[273] + ZA[291]; - ZA[280] = ZA[277] + ZA[68]; - - ZA[296] = (ZCh(ZA[292], ZA[287], ZA[282]) + ZA[280]) + ZR26(ZA[292]); - ZA[294] = ZMa(ZA[288], ZA[283], ZA[293]) + ZR30(ZA[293]); - ZA[2] = ZR25(ZA[0]); - ZA[69] = ZA[67] + 0x14292967U; - ZA[44] = ZA[43] + W32; - - ZA[75] = ZR15(ZA[66]) + ZA[44]; - ZA[298] = ZA[296] + ZA[294]; - ZA[297] = ZA[278] + ZA[296]; - ZA[285] = ZA[282] + ZA[69]; - ZA[5] = ZA[2] + W17; - - ZA[301] = (ZCh(ZA[297], ZA[292], ZA[287]) + ZA[285]) + ZR26(ZA[297]); - ZA[299] = ZMa(ZA[293], ZA[288], ZA[298]) + ZR30(ZA[298]); - ZA[56] = ZA[52] + ZA[5]; 
- ZA[76] = ZA[75] + 0x27b70a85U; - - ZA[34] = ZR25(ZA[32]) + ZA[0]; - ZA[70] = ZR15(ZA[67]) + ZA[56]; - ZA[302] = ZA[283] + ZA[301]; - ZA[303] = ZA[301] + ZA[299]; - ZA[290] = ZA[287] + ZA[76]; - - ZA[306] = (ZCh(ZA[302], ZA[297], ZA[292]) + ZA[290]) + ZR26(ZA[302]); - ZA[304] = ZMa(ZA[298], ZA[293], ZA[303]) + ZR30(ZA[303]); - ZA[6] = ZR25(ZA[3]); - ZA[77] = ZA[70] + 0x2e1b2138U; - ZA[50] = ZA[34] + ZA[48]; - - ZA[78] = ZR15(ZA[75]) + ZA[50]; - ZA[308] = ZA[306] + ZA[304]; - ZA[307] = ZA[288] + ZA[306]; - ZA[295] = ZA[292] + ZA[77]; - ZA[41] = ZA[32] + ZA[6]; - - ZA[311] = (ZCh(ZA[307], ZA[302], ZA[297]) + ZA[295]) + ZR26(ZA[307]); - ZA[309] = ZMa(ZA[303], ZA[298], ZA[308]) + ZR30(ZA[308]); - ZA[63] = ZA[41] + ZA[61]; - ZA[85] = ZA[78] + 0x4d2c6dfcU; - - ZA[37] = ZR25(ZA[35]) + ZA[3]; - ZA[79] = ZR15(ZA[70]) + ZA[63]; - ZA[312] = ZA[293] + ZA[311]; - ZA[313] = ZA[311] + ZA[309]; - ZA[300] = ZA[297] + ZA[85]; - - ZA[316] = (ZCh(ZA[312], ZA[307], ZA[302]) + ZA[300]) + ZR26(ZA[312]); - ZA[314] = ZMa(ZA[308], ZA[303], ZA[313]) + ZR30(ZA[313]); - ZA[9] = ZR25(ZA[7]); - ZA[86] = ZA[79] + 0x53380d13U; - ZA[57] = ZA[37] + ZA[53]; - - ZA[87] = ZR15(ZA[78]) + ZA[57]; - ZA[318] = ZA[316] + ZA[314]; - ZA[317] = ZA[298] + ZA[316]; - ZA[305] = ZA[302] + ZA[86]; - ZA[46] = ZA[35] + ZA[9]; - - ZA[321] = (ZCh(ZA[317], ZA[312], ZA[307]) + ZA[305]) + ZR26(ZA[317]); - ZA[319] = ZMa(ZA[313], ZA[308], ZA[318]) + ZR30(ZA[318]); - ZA[71] = ZA[46] + ZA[66]; - ZA[92] = ZA[87] + 0x650a7354U; - - ZA[42] = ZR25(ZA[38]) + ZA[7]; - ZA[88] = ZR15(ZA[79]) + ZA[71]; - ZA[322] = ZA[303] + ZA[321]; - ZA[323] = ZA[321] + ZA[319]; - ZA[310] = ZA[307] + ZA[92]; - - ZA[326] = (ZCh(ZA[322], ZA[317], ZA[312]) + ZA[310]) + ZR26(ZA[322]); - ZA[324] = ZMa(ZA[318], ZA[313], ZA[323]) + ZR30(ZA[323]); - ZA[14] = ZR25(ZA[10]); - ZA[93] = ZA[88] + 0x766a0abbU; - ZA[72] = ZA[42] + ZA[67]; - - ZA[94] = ZR15(ZA[87]) + ZA[72]; - ZA[328] = ZA[326] + ZA[324]; - ZA[327] = ZA[308] + ZA[326]; - ZA[315] = ZA[312] + ZA[93]; 
- ZA[51] = ZA[38] + ZA[14]; - - ZA[331] = (ZCh(ZA[327], ZA[322], ZA[317]) + ZA[315]) + ZR26(ZA[327]); - ZA[329] = ZMa(ZA[323], ZA[318], ZA[328]) + ZR30(ZA[328]); - ZA[80] = ZA[51] + ZA[75]; - ZA[100] = ZA[94] + 0x81c2c92eU; - - ZA[47] = ZR25(ZA[43]) + ZA[10]; - ZA[95] = ZR15(ZA[88]) + ZA[80]; - ZA[332] = ZA[313] + ZA[331]; - ZA[333] = ZA[331] + ZA[329]; - ZA[320] = ZA[317] + ZA[100]; - - ZA[336] = (ZCh(ZA[332], ZA[327], ZA[322]) + ZA[320]) + ZR26(ZA[332]); - ZA[334] = ZMa(ZA[328], ZA[323], ZA[333]) + ZR30(ZA[333]); - ZA[81] = ZA[47] + ZA[70]; - ZA[101] = ZA[95] + 0x92722c85U; - - ZA[58] = ZR25(ZA[52]) + ZA[43]; - ZA[102] = ZR15(ZA[94]) + ZA[81]; - ZA[337] = ZA[318] + ZA[336]; - ZA[338] = ZA[336] + ZA[334]; - ZA[325] = ZA[322] + ZA[101]; - - ZA[341] = (ZCh(ZA[337], ZA[332], ZA[327]) + ZA[325]) + ZR26(ZA[337]); - ZA[339] = ZMa(ZA[333], ZA[328], ZA[338]) + ZR30(ZA[338]); - ZA[89] = ZA[58] + ZA[78]; - ZA[108] = ZA[102] + 0xa2bfe8a1U; - - ZA[59] = ZR25(ZA[48]) + ZA[52]; - ZA[103] = ZR15(ZA[95]) + ZA[89]; - ZA[342] = ZA[323] + ZA[341]; - ZA[343] = ZA[341] + ZA[339]; - ZA[330] = ZA[327] + ZA[108]; - - ZA[346] = (ZCh(ZA[342], ZA[337], ZA[332]) + ZA[330]) + ZR26(ZA[342]); - ZA[344] = ZMa(ZA[338], ZA[333], ZA[343]) + ZR30(ZA[343]); - ZA[90] = ZA[59] + ZA[79]; - ZA[109] = ZA[103] + 0xa81a664bU; - - ZA[64] = ZR25(ZA[61]) + ZA[48]; - ZA[110] = ZR15(ZA[102]) + ZA[90]; - ZA[347] = ZA[328] + ZA[346]; - ZA[348] = ZA[346] + ZA[344]; - ZA[335] = ZA[332] + ZA[109]; - - ZA[351] = (ZCh(ZA[347], ZA[342], ZA[337]) + ZA[335]) + ZR26(ZA[347]); - ZA[349] = ZMa(ZA[343], ZA[338], ZA[348]) + ZR30(ZA[348]); - ZA[60] = ZR25(ZA[53]); - ZA[116] = ZA[110] + 0xc24b8b70U; - ZA[96] = ZA[87] + ZA[64]; - - ZA[111] = ZR15(ZA[103]) + ZA[96]; - ZA[353] = ZA[351] + ZA[349]; - ZA[352] = ZA[333] + ZA[351]; - ZA[340] = ZA[337] + ZA[116]; - ZA[65] = ZA[60] + ZA[61]; - - ZA[356] = (ZCh(ZA[352], ZA[347], ZA[342]) + ZA[340]) + ZR26(ZA[352]); - ZA[354] = ZMa(ZA[348], ZA[343], ZA[353]) + ZR30(ZA[353]); - ZA[97] = 
ZA[88] + ZA[65]; - ZA[117] = ZA[111] + 0xc76c51a3U; - - ZA[73] = ZR25(ZA[66]) + ZA[53]; - ZA[118] = ZR15(ZA[110]) + ZA[97]; - ZA[357] = ZA[338] + ZA[356]; - ZA[358] = ZA[356] + ZA[354]; - ZA[345] = ZA[342] + ZA[117]; - - ZA[361] = (ZCh(ZA[357], ZA[352], ZA[347]) + ZA[345]) + ZR26(ZA[357]); - ZA[359] = ZMa(ZA[353], ZA[348], ZA[358]) + ZR30(ZA[358]); - ZA[104] = ZA[73] + ZA[94]; - ZA[124] = ZA[118] + 0xd192e819U; - - ZA[74] = ZR25(ZA[67]) + ZA[66]; - ZA[119] = ZR15(ZA[111]) + ZA[104]; - ZA[362] = ZA[343] + ZA[361]; - ZA[363] = ZA[361] + ZA[359]; - ZA[350] = ZA[347] + ZA[124]; - - ZA[366] = (ZCh(ZA[362], ZA[357], ZA[352]) + ZA[350]) + ZR26(ZA[362]); - ZA[364] = ZMa(ZA[358], ZA[353], ZA[363]) + ZR30(ZA[363]); - ZA[105] = ZA[74] + ZA[95]; - ZA[125] = ZA[119] + 0xd6990624U; - - ZA[82] = ZR25(ZA[75]) + ZA[67]; - ZA[126] = ZR15(ZA[118]) + ZA[105]; - ZA[367] = ZA[348] + ZA[366]; - ZA[368] = ZA[366] + ZA[364]; - ZA[355] = ZA[352] + ZA[125]; - - ZA[371] = (ZCh(ZA[367], ZA[362], ZA[357]) + ZA[355]) + ZR26(ZA[367]); - ZA[369] = ZMa(ZA[363], ZA[358], ZA[368]) + ZR30(ZA[368]); - ZA[112] = ZA[102] + ZA[82]; - ZA[132] = ZA[126] + 0xf40e3585U; - - ZA[83] = ZR25(ZA[70]) + ZA[75]; - ZA[127] = ZR15(ZA[119]) + ZA[112]; - ZA[372] = ZA[353] + ZA[371]; - ZA[373] = ZA[371] + ZA[369]; - ZA[360] = ZA[357] + ZA[132]; - - ZA[376] = (ZCh(ZA[372], ZA[367], ZA[362]) + ZA[360]) + ZR26(ZA[372]); - ZA[374] = ZMa(ZA[368], ZA[363], ZA[373]) + ZR30(ZA[373]); - ZA[113] = ZA[103] + ZA[83]; - ZA[133] = ZA[127] + 0x106aa070U; - - ZA[84] = ZR25(ZA[78]) + ZA[70]; - ZA[134] = ZR15(ZA[126]) + ZA[113]; - ZA[377] = ZA[358] + ZA[376]; - ZA[378] = ZA[376] + ZA[374]; - ZA[365] = ZA[362] + ZA[133]; - - ZA[381] = (ZCh(ZA[377], ZA[372], ZA[367]) + ZA[365]) + ZR26(ZA[377]); - ZA[379] = ZMa(ZA[373], ZA[368], ZA[378]) + ZR30(ZA[378]); - ZA[120] = ZA[110] + ZA[84]; - ZA[140] = ZA[134] + 0x19a4c116U; - - ZA[91] = ZR25(ZA[79]) + ZA[78]; - ZA[135] = ZR15(ZA[127]) + ZA[120]; - ZA[382] = ZA[363] + ZA[381]; - ZA[383] = ZA[381] + 
ZA[379]; - ZA[370] = ZA[367] + ZA[140]; - - ZA[386] = (ZCh(ZA[382], ZA[377], ZA[372]) + ZA[370]) + ZR26(ZA[382]); - ZA[384] = ZMa(ZA[378], ZA[373], ZA[383]) + ZR30(ZA[383]); - ZA[121] = ZA[111] + ZA[91]; - ZA[141] = ZA[135] + 0x1e376c08U; - - ZA[98] = ZR25(ZA[87]) + ZA[79]; - ZA[142] = ZR15(ZA[134]) + ZA[121]; - ZA[387] = ZA[368] + ZA[386]; - ZA[388] = ZA[386] + ZA[384]; - ZA[375] = ZA[372] + ZA[141]; - - ZA[391] = (ZCh(ZA[387], ZA[382], ZA[377]) + ZA[375]) + ZR26(ZA[387]); - ZA[389] = ZMa(ZA[383], ZA[378], ZA[388]) + ZR30(ZA[388]); - ZA[128] = ZA[118] + ZA[98]; - ZA[147] = ZA[142] + 0x2748774cU; - - ZA[99] = ZR25(ZA[88]) + ZA[87]; - ZA[143] = ZR15(ZA[135]) + ZA[128]; - ZA[392] = ZA[373] + ZA[391]; - ZA[393] = ZA[391] + ZA[389]; - ZA[380] = ZA[377] + ZA[147]; - - ZA[396] = (ZCh(ZA[392], ZA[387], ZA[382]) + ZA[380]) + ZR26(ZA[392]); - ZA[394] = ZMa(ZA[388], ZA[383], ZA[393]) + ZR30(ZA[393]); - ZA[129] = ZA[119] + ZA[99]; - ZA[148] = ZA[143] + 0x34b0bcb5U; - - ZA[106] = ZR25(ZA[94]) + ZA[88]; - ZA[149] = ZR15(ZA[142]) + ZA[129]; - ZA[397] = ZA[378] + ZA[396]; - ZA[398] = ZA[396] + ZA[394]; - ZA[385] = ZA[382] + ZA[148]; - - ZA[401] = (ZCh(ZA[397], ZA[392], ZA[387]) + ZA[385]) + ZR26(ZA[397]); - ZA[399] = ZMa(ZA[393], ZA[388], ZA[398]) + ZR30(ZA[398]); - ZA[136] = ZA[126] + ZA[106]; - ZA[153] = ZA[149] + 0x391c0cb3U; - - ZA[107] = ZR25(ZA[95]) + ZA[94]; - ZA[150] = ZR15(ZA[143]) + ZA[136]; - ZA[402] = ZA[383] + ZA[401]; - ZA[403] = ZA[401] + ZA[399]; - ZA[390] = ZA[387] + ZA[153]; - - ZA[406] = (ZCh(ZA[402], ZA[397], ZA[392]) + ZA[390]) + ZR26(ZA[402]); - ZA[404] = ZMa(ZA[398], ZA[393], ZA[403]) + ZR30(ZA[403]); - ZA[137] = ZA[127] + ZA[107]; - ZA[154] = ZA[150] + 0x4ed8aa4aU; - - ZA[114] = ZR25(ZA[102]) + ZA[95]; - ZA[155] = ZR15(ZA[149]) + ZA[137]; - ZA[407] = ZA[388] + ZA[406]; - ZA[408] = ZA[406] + ZA[404]; - ZA[395] = ZA[392] + ZA[154]; - - ZA[411] = (ZCh(ZA[407], ZA[402], ZA[397]) + ZA[395]) + ZR26(ZA[407]); - ZA[409] = ZMa(ZA[403], ZA[398], ZA[408]) + 
ZR30(ZA[408]); - ZA[144] = ZA[134] + ZA[114]; - ZA[159] = ZA[155] + 0x5b9cca4fU; - - ZA[115] = ZR25(ZA[103]) + ZA[102]; - ZA[156] = ZR15(ZA[150]) + ZA[144]; - ZA[412] = ZA[393] + ZA[411]; - ZA[413] = ZA[411] + ZA[409]; - ZA[400] = ZA[397] + ZA[159]; - - ZA[416] = (ZCh(ZA[412], ZA[407], ZA[402]) + ZA[400]) + ZR26(ZA[412]); - ZA[414] = ZMa(ZA[408], ZA[403], ZA[413]) + ZR30(ZA[413]); - ZA[145] = ZA[135] + ZA[115]; - ZA[160] = ZA[156] + 0x682e6ff3U; - - ZA[122] = ZR25(ZA[110]) + ZA[103]; - ZA[161] = ZR15(ZA[155]) + ZA[145]; - ZA[417] = ZA[398] + ZA[416]; - ZA[418] = ZA[416] + ZA[414]; - ZA[405] = ZA[402] + ZA[160]; - - ZA[421] = (ZCh(ZA[417], ZA[412], ZA[407]) + ZA[405]) + ZR26(ZA[417]); - ZA[419] = ZMa(ZA[413], ZA[408], ZA[418]) + ZR30(ZA[418]); - ZA[151] = ZA[142] + ZA[122]; - ZA[165] = ZA[161] + 0x748f82eeU; - - ZA[123] = ZR25(ZA[111]) + ZA[110]; - ZA[162] = ZR15(ZA[156]) + ZA[151]; - ZA[422] = ZA[403] + ZA[421]; - ZA[423] = ZA[421] + ZA[419]; - ZA[410] = ZA[407] + ZA[165]; - - ZA[426] = (ZCh(ZA[422], ZA[417], ZA[412]) + ZA[410]) + ZR26(ZA[422]); - ZA[424] = ZMa(ZA[418], ZA[413], ZA[423]) + ZR30(ZA[423]); - ZA[152] = ZA[143] + ZA[123]; - ZA[166] = ZA[162] + 0x78a5636fU; - - ZA[130] = ZR25(ZA[118]) + ZA[111]; - ZA[167] = ZR15(ZA[161]) + ZA[152]; - ZA[427] = ZA[408] + ZA[426]; - ZA[428] = ZA[426] + ZA[424]; - ZA[415] = ZA[412] + ZA[166]; - - ZA[431] = (ZCh(ZA[427], ZA[422], ZA[417]) + ZA[415]) + ZR26(ZA[427]); - ZA[429] = ZMa(ZA[423], ZA[418], ZA[428]) + ZR30(ZA[428]); - ZA[157] = ZA[149] + ZA[130]; - ZA[170] = ZA[167] + 0x84c87814U; - - ZA[131] = ZR25(ZA[119]) + ZA[118]; - ZA[168] = ZR15(ZA[162]) + ZA[157]; - ZA[432] = ZA[413] + ZA[431]; - ZA[433] = ZA[431] + ZA[429]; - ZA[420] = ZA[417] + ZA[170]; - - ZA[436] = (ZCh(ZA[432], ZA[427], ZA[422]) + ZA[420]) + ZR26(ZA[432]); - ZA[434] = ZMa(ZA[428], ZA[423], ZA[433]) + ZR30(ZA[433]); - ZA[158] = ZA[150] + ZA[131]; - ZA[171] = ZA[168] + 0x8cc70208U; - - ZA[138] = ZR25(ZA[126]) + ZA[119]; - ZA[172] = ZR15(ZA[167]) + 
ZA[158]; - ZA[437] = ZA[418] + ZA[436]; - ZA[438] = ZA[436] + ZA[434]; - ZA[425] = ZA[422] + ZA[171]; - - ZA[441] = (ZCh(ZA[437], ZA[432], ZA[427]) + ZA[425]) + ZR26(ZA[437]); - ZA[439] = ZMa(ZA[433], ZA[428], ZA[438]) + ZR30(ZA[438]); - ZA[163] = ZA[155] + ZA[138]; - ZA[174] = ZA[172] + 0x90befffaU; - - ZA[139] = ZR25(ZA[127]) + ZA[126]; - ZA[173] = ZR15(ZA[168]) + ZA[163]; - ZA[442] = ZA[423] + ZA[441]; - ZA[443] = ZA[441] + ZA[439]; - ZA[430] = ZA[427] + ZA[174]; - - ZA[445] = (ZCh(ZA[442], ZA[437], ZA[432]) + ZA[430]) + ZR26(ZA[442]); - ZA[444] = ZMa(ZA[438], ZA[433], ZA[443]) + ZR30(ZA[443]); - ZA[164] = ZA[156] + ZA[139]; - ZA[175] = ZA[173] + 0xa4506cebU; - - ZA[146] = ZR25(ZA[134]) + ZA[127]; - ZA[176] = ZR15(ZA[172]) + ZA[164]; - ZA[446] = ZA[428] + ZA[445]; - ZA[447] = ZA[445] + ZA[444]; - ZA[435] = ZA[432] + ZA[175]; - - ZA[449] = (ZCh(ZA[446], ZA[442], ZA[437]) + ZA[435]) + ZR26(ZA[446]); - ZA[448] = ZMa(ZA[443], ZA[438], ZA[447]) + ZR30(ZA[447]); - ZA[169] = ZA[161] + ZA[146]; - ZA[178] = ZA[176] + 0xbef9a3f7U; - - ZA[177] = ZR15(ZA[173]) + ZA[169]; - ZA[451] = ZA[449] + ZA[448]; - ZA[450] = ZA[433] + ZA[449]; - ZA[440] = ZA[437] + ZA[178]; - - ZA[453] = (ZCh(ZA[450], ZA[446], ZA[442]) + ZA[440]) + ZR26(ZA[450]); - ZA[452] = ZMa(ZA[447], ZA[443], ZA[451]) + ZR30(ZA[451]); - ZA[179] = ZA[177] + 0xc67178f2U; - - ZA[454] = ZA[438] + ZA[453]; - ZA[494] = ZA[442] + ZA[179]; - ZA[455] = ZA[453] + ZA[452]; - - ZA[457] = (ZCh(ZA[454], ZA[450], ZA[446]) + ZA[494]) + ZR26(ZA[454]); - ZA[456] = ZMa(ZA[451], ZA[447], ZA[455]) + ZR30(ZA[455]); - - ZA[459] = ZA[457] + ZA[456]; - - ZA[461] = ZA[455] + state1; - ZA[460] = ZA[459] + state0; - - ZA[495] = ZA[460] + 0x98c7e2a2U; - ZA[469] = ZA[461] + 0x90bb1e3cU; - - ZA[498] = (ZCh(ZA[495], 0x510e527fU, 0x9b05688cU) + ZA[469]) + ZR26(ZA[495]); - ZA[462] = ZA[451] + state2; - - ZA[496] = ZA[460] + 0xfc08884dU; - ZA[506] = ZA[498] + 0x3c6ef372U; - ZA[470] = ZA[462] + 0x50c6645bU; - - ZA[507] = (ZCh(ZA[506], ZA[495], 
0x510e527fU) + ZA[470]) + ZR26(ZA[506]); - ZA[500] = ZMa(0x6a09e667U, 0xbb67ae85U, ZA[496]) + ZR30(ZA[496]); - ZA[463] = ZA[447] + state3; - - ZA[458] = ZA[443] + ZA[457]; - ZA[499] = ZA[498] + ZA[500]; - ZA[508] = ZA[507] + 0xbb67ae85U; - ZA[473] = ZA[463] + 0x3ac42e24U; - - ZA[510] = (ZCh(ZA[508], ZA[506], ZA[495]) + ZA[473]) + ZR26(ZA[508]); - ZA[928] = ZMa(ZA[496], 0x6a09e667U, ZA[499]) + ZR30(ZA[499]); - ZA[464] = ZA[458] + state4; - - ZA[476] = ZA[464] + ZA[460] + 0xd21ea4fdU; - ZA[511] = ZA[510] + 0x6a09e667U; - ZA[509] = ZA[928] + ZA[507]; - ZA[465] = ZA[454] + state5; - - ZA[514] = (ZCh(ZA[511], ZA[508], ZA[506]) + ZA[476]) + ZR26(ZA[511]); - ZA[512] = ZMa(ZA[499], ZA[496], ZA[509]) + ZR30(ZA[509]); - ZA[478] = ZA[465] + 0x59f111f1U; - - ZA[519] = ZA[506] + ZA[478]; - ZA[516] = ZA[496] + ZA[514]; - ZA[513] = ZA[510] + ZA[512]; - ZA[466] = ZA[450] + state6; - - ZA[520] = (ZCh(ZA[516], ZA[511], ZA[508]) + ZA[519]) + ZR26(ZA[516]); - ZA[515] = ZMa(ZA[509], ZA[499], ZA[513]) + ZR30(ZA[513]); - ZA[480] = ZA[466] + 0x923f82a4U; - - ZA[524] = ZA[508] + ZA[480]; - ZA[521] = ZA[499] + ZA[520]; - ZA[517] = ZA[514] + ZA[515]; - ZA[467] = ZA[446] + state7; - - ZA[525] = (ZCh(ZA[521], ZA[516], ZA[511]) + ZA[524]) + ZR26(ZA[521]); - ZA[522] = ZMa(ZA[513], ZA[509], ZA[517]) + ZR30(ZA[517]); - ZA[484] = ZA[467] + 0xab1c5ed5U; - - ZA[529] = ZA[511] + ZA[484]; - ZA[526] = ZA[509] + ZA[525]; - ZA[523] = ZA[520] + ZA[522]; - - ZA[530] = (ZCh(ZA[526], ZA[521], ZA[516]) + ZA[529]) + ZR26(ZA[526]); - ZA[550] = ZMa(ZA[517], ZA[513], ZA[523]) + ZR30(ZA[523]); - - ZA[531] = ZA[513] + ZA[530]; - ZA[533] = ZA[516] + 0x5807aa98U; - ZA[527] = ZA[550] + ZA[525]; - - ZA[534] = (ZCh(ZA[531], ZA[526], ZA[521]) + ZA[533]) + ZR26(ZA[531]); - ZA[551] = ZMa(ZA[523], ZA[517], ZA[527]) + ZR30(ZA[527]); - - ZA[535] = ZA[517] + ZA[534]; - ZA[538] = ZA[521] + 0x12835b01U; - ZA[532] = ZA[551] + ZA[530]; - - ZA[539] = (ZCh(ZA[535], ZA[531], ZA[526]) + ZA[538]) + ZR26(ZA[535]); - ZA[552] = 
ZMa(ZA[527], ZA[523], ZA[532]) + ZR30(ZA[532]); - - ZA[540] = ZA[523] + ZA[539]; - ZA[542] = ZA[526] + 0x243185beU; - ZA[536] = ZA[552] + ZA[534]; - - ZA[543] = (ZCh(ZA[540], ZA[535], ZA[531]) + ZA[542]) + ZR26(ZA[540]); - ZA[553] = ZMa(ZA[532], ZA[527], ZA[536]) + ZR30(ZA[536]); - - ZA[544] = ZA[527] + ZA[543]; - ZA[555] = ZA[531] + 0x550c7dc3U; - ZA[541] = ZA[553] + ZA[539]; - - ZA[558] = (ZCh(ZA[544], ZA[540], ZA[535]) + ZA[555]) + ZR26(ZA[544]); - ZA[547] = ZMa(ZA[536], ZA[532], ZA[541]) + ZR30(ZA[541]); - - ZA[559] = ZA[532] + ZA[558]; - ZA[556] = ZA[535] + 0x72be5d74U; - ZA[545] = ZA[547] + ZA[543]; - - ZA[562] = (ZCh(ZA[559], ZA[544], ZA[540]) + ZA[556]) + ZR26(ZA[559]); - ZA[561] = ZMa(ZA[541], ZA[536], ZA[545]) + ZR30(ZA[545]); - - ZA[563] = ZA[536] + ZA[562]; - ZA[560] = ZA[561] + ZA[558]; - ZA[557] = ZA[540] + 0x80deb1feU; - - ZA[568] = (ZCh(ZA[563], ZA[559], ZA[544]) + ZA[557]) + ZR26(ZA[563]); - ZA[564] = ZMa(ZA[545], ZA[541], ZA[560]) + ZR30(ZA[560]); - - ZA[569] = ZA[541] + ZA[568]; - ZA[572] = ZA[544] + 0x9bdc06a7U; - ZA[565] = ZA[562] + ZA[564]; - - ZA[574] = (ZCh(ZA[569], ZA[563], ZA[559]) + ZA[572]) + ZR26(ZA[569]); - ZA[570] = ZMa(ZA[560], ZA[545], ZA[565]) + ZR30(ZA[565]); - ZA[468] = ZR25(ZA[461]); - - ZA[497] = ZA[468] + ZA[460]; - ZA[575] = ZA[545] + ZA[574]; - ZA[571] = ZA[568] + ZA[570]; - ZA[573] = ZA[559] + 0xc19bf274U; - - ZA[578] = (ZCh(ZA[575], ZA[569], ZA[563]) + ZA[573]) + ZR26(ZA[575]); - ZA[576] = ZMa(ZA[565], ZA[560], ZA[571]) + ZR30(ZA[571]); - ZA[929] = ZR25(ZA[462]); - ZA[503] = ZA[497] + 0xe49b69c1U; - - ZA[471] = ZA[929] + ZA[461] + 0x00a00000U; - ZA[582] = ZA[563] + ZA[503]; - ZA[579] = ZA[560] + ZA[578]; - ZA[577] = ZA[574] + ZA[576]; - - ZA[583] = (ZCh(ZA[579], ZA[575], ZA[569]) + ZA[582]) + ZR26(ZA[579]); - ZA[580] = ZMa(ZA[571], ZA[565], ZA[577]) + ZR30(ZA[577]); - ZA[488] = ZA[471] + 0xefbe4786U; - - ZA[472] = ZR25(ZA[463]) + ZA[462]; - ZA[587] = ZA[569] + ZA[488]; - ZA[584] = ZA[565] + ZA[583]; - ZA[581] = ZA[578] + 
ZA[580]; - - ZA[588] = (ZCh(ZA[584], ZA[579], ZA[575]) + ZA[587]) + ZR26(ZA[584]); - ZA[586] = ZMa(ZA[577], ZA[571], ZA[581]) + ZR30(ZA[581]); - ZA[501] = ZR15(ZA[497]) + ZA[472]; - ZA[475] = ZR15(ZA[471]); - ZA[926] = ZA[575] + 0x0fc19dc6U; - - ZA[474] = ZA[475] + ZA[463] + ZR25(ZA[464]); - ZA[927] = ZA[926] + ZA[501]; - ZA[589] = ZA[571] + ZA[588]; - ZA[585] = ZA[583] + ZA[586]; - - ZA[592] = (ZCh(ZA[589], ZA[584], ZA[579]) + ZA[927]) + ZR26(ZA[589]); - ZA[590] = ZMa(ZA[581], ZA[577], ZA[585]) + ZR30(ZA[585]); - ZA[477] = ZR25(ZA[465]) + ZA[464]; - ZA[489] = ZA[474] + 0x240ca1ccU; - - ZA[518] = ZR15(ZA[501]) + ZA[477]; - ZA[479] = ZR25(ZA[466]); - ZA[596] = ZA[579] + ZA[489]; - ZA[593] = ZA[577] + ZA[592]; - ZA[591] = ZA[588] + ZA[590]; - - ZA[597] = (ZCh(ZA[593], ZA[589], ZA[584]) + ZA[596]) + ZR26(ZA[593]); - ZA[594] = ZMa(ZA[585], ZA[581], ZA[591]) + ZR30(ZA[591]); - ZA[481] = ZA[479] + ZA[465]; - ZA[601] = ZA[518] + 0x2de92c6fU; - - ZA[482] = ZR15(ZA[474]) + ZA[481]; - ZA[602] = ZA[584] + ZA[601]; - ZA[598] = ZA[581] + ZA[597]; - ZA[595] = ZA[592] + ZA[594]; - - ZA[632] = (ZCh(ZA[598], ZA[593], ZA[589]) + ZA[602]) + ZR26(ZA[598]); - ZA[599] = ZMa(ZA[591], ZA[585], ZA[595]) + ZR30(ZA[595]); - ZA[483] = ZA[466] + 0x00000100U + ZR25(ZA[467]); - ZA[490] = ZA[482] + 0x4a7484aaU; - - ZA[528] = ZR15(ZA[518]) + ZA[483]; - ZA[736] = ZA[585] + ZA[632]; - ZA[605] = ZA[589] + ZA[490]; - ZA[600] = ZA[597] + ZA[599]; - ZA[485] = ZA[467] + 0x11002000U; - - ZA[738] = (ZCh(ZA[736], ZA[598], ZA[593]) + ZA[605]) + ZR26(ZA[736]); - ZA[744] = ZMa(ZA[595], ZA[591], ZA[600]) + ZR30(ZA[600]); - ZA[487] = ZR15(ZA[482]) + ZA[485]; - ZA[603] = ZA[528] + 0x5cb0a9dcU; - - ZA[502] = ZA[497] + ZA[487]; - ZA[739] = ZA[591] + ZA[738]; - ZA[604] = ZA[593] + ZA[603]; - ZA[737] = ZA[744] + ZA[632]; - - ZA[741] = (ZCh(ZA[739], ZA[736], ZA[598]) + ZA[604]) + ZR26(ZA[739]); - ZA[745] = ZMa(ZA[600], ZA[595], ZA[737]) + ZR30(ZA[737]); - ZA[486] = ZA[471] + 0x80000000U; - ZA[606] = ZA[502] + 
0x76f988daU; - - ZA[537] = ZR15(ZA[528]) + ZA[486]; - ZA[742] = ZA[595] + ZA[741]; - ZA[613] = ZA[598] + ZA[606]; - ZA[740] = ZA[745] + ZA[738]; - - ZA[747] = (ZCh(ZA[742], ZA[739], ZA[736]) + ZA[613]) + ZR26(ZA[742]); - ZA[746] = ZMa(ZA[737], ZA[600], ZA[740]) + ZR30(ZA[740]); - ZA[607] = ZA[537] + 0x983e5152U; - - ZA[546] = ZR15(ZA[502]) + ZA[501]; - ZA[751] = ZA[736] + ZA[607]; - ZA[748] = ZA[600] + ZA[747]; - ZA[743] = ZA[746] + ZA[741]; - - ZA[752] = (ZCh(ZA[748], ZA[742], ZA[739]) + ZA[751]) + ZR26(ZA[748]); - ZA[749] = ZMa(ZA[740], ZA[737], ZA[743]) + ZR30(ZA[743]); - ZA[608] = ZA[546] + 0xa831c66dU; - - ZA[554] = ZR15(ZA[537]) + ZA[474]; - ZA[756] = ZA[739] + ZA[608]; - ZA[753] = ZA[737] + ZA[752]; - ZA[750] = ZA[747] + ZA[749]; - - ZA[757] = (ZCh(ZA[753], ZA[748], ZA[742]) + ZA[756]) + ZR26(ZA[753]); - ZA[754] = ZMa(ZA[743], ZA[740], ZA[750]) + ZR30(ZA[750]); - ZA[609] = ZA[554] + 0xb00327c8U; - - ZA[566] = ZR15(ZA[546]) + ZA[518]; - ZA[761] = ZA[742] + ZA[609]; - ZA[758] = ZA[740] + ZA[757]; - ZA[755] = ZA[752] + ZA[754]; - - ZA[762] = (ZCh(ZA[758], ZA[753], ZA[748]) + ZA[761]) + ZR26(ZA[758]); - ZA[759] = ZMa(ZA[750], ZA[743], ZA[755]) + ZR30(ZA[755]); - ZA[610] = ZA[566] + 0xbf597fc7U; - - ZA[567] = ZR15(ZA[554]) + ZA[482]; - ZA[766] = ZA[748] + ZA[610]; - ZA[763] = ZA[743] + ZA[762]; - ZA[760] = ZA[757] + ZA[759]; - - ZA[767] = (ZCh(ZA[763], ZA[758], ZA[753]) + ZA[766]) + ZR26(ZA[763]); - ZA[764] = ZMa(ZA[755], ZA[750], ZA[760]) + ZR30(ZA[760]); - ZA[611] = ZA[567] + 0xc6e00bf3U; - - ZA[614] = ZR15(ZA[566]) + ZA[528]; - ZA[771] = ZA[753] + ZA[611]; - ZA[768] = ZA[750] + ZA[767]; - ZA[765] = ZA[762] + ZA[764]; - - ZA[772] = (ZCh(ZA[768], ZA[763], ZA[758]) + ZA[771]) + ZR26(ZA[768]); - ZA[769] = ZMa(ZA[760], ZA[755], ZA[765]) + ZR30(ZA[765]); - ZA[612] = ZA[502] + 0x00400022U; - ZA[615] = ZA[614] + 0xd5a79147U; - - ZA[616] = ZR15(ZA[567]) + ZA[612]; - ZA[504] = ZR25(ZA[497]) + 0x00000100U; - ZA[776] = ZA[758] + ZA[615]; - ZA[773] = ZA[755] + ZA[772]; - 
ZA[770] = ZA[767] + ZA[769]; - - ZA[777] = (ZCh(ZA[773], ZA[768], ZA[763]) + ZA[776]) + ZR26(ZA[773]); - ZA[774] = ZMa(ZA[765], ZA[760], ZA[770]) + ZR30(ZA[770]); - ZA[492] = ZR25(ZA[471]); - ZA[618] = ZA[537] + ZA[504]; - ZA[617] = ZA[616] + 0x06ca6351U; - - ZA[619] = ZR15(ZA[614]) + ZA[618]; - ZA[781] = ZA[763] + ZA[617]; - ZA[778] = ZA[760] + ZA[777]; - ZA[775] = ZA[772] + ZA[774]; - ZA[505] = ZA[492] + ZA[497]; - - ZA[782] = (ZCh(ZA[778], ZA[773], ZA[768]) + ZA[781]) + ZR26(ZA[778]); - ZA[779] = ZMa(ZA[770], ZA[765], ZA[775]) + ZR30(ZA[775]); - ZA[621] = ZA[505] + ZA[546]; - ZA[620] = ZA[619] + 0x14292967U; - - ZA[622] = ZR15(ZA[616]) + ZA[621]; - ZA[625] = ZR25(ZA[501]); - ZA[786] = ZA[768] + ZA[620]; - ZA[783] = ZA[765] + ZA[782]; - ZA[624] = ZA[554] + ZA[471]; - ZA[780] = ZA[777] + ZA[779]; - - ZA[787] = (ZCh(ZA[783], ZA[778], ZA[773]) + ZA[786]) + ZR26(ZA[783]); - ZA[784] = ZMa(ZA[775], ZA[770], ZA[780]) + ZR30(ZA[780]); - ZA[493] = ZR25(ZA[474]); - ZA[626] = ZA[625] + ZA[624]; - ZA[623] = ZA[622] + 0x27b70a85U; - - ZA[627] = ZR15(ZA[619]) + ZA[626]; - ZA[791] = ZA[773] + ZA[623]; - ZA[788] = ZA[770] + ZA[787]; - ZA[785] = ZA[782] + ZA[784]; - ZA[629] = ZA[493] + ZA[501]; - - ZA[792] = (ZCh(ZA[788], ZA[783], ZA[778]) + ZA[791]) + ZR26(ZA[788]); - ZA[789] = ZMa(ZA[780], ZA[775], ZA[785]) + ZR30(ZA[785]); - ZA[630] = ZA[566] + ZA[629]; - ZA[628] = ZA[627] + 0x2e1b2138U; - - ZA[634] = ZR25(ZA[518]) + ZA[474]; - ZA[631] = ZR15(ZA[622]) + ZA[630]; - ZA[796] = ZA[778] + ZA[628]; - ZA[793] = ZA[775] + ZA[792]; - ZA[790] = ZA[787] + ZA[789]; - - ZA[797] = (ZCh(ZA[793], ZA[788], ZA[783]) + ZA[796]) + ZR26(ZA[793]); - ZA[794] = ZMa(ZA[785], ZA[780], ZA[790]) + ZR30(ZA[790]); - ZA[491] = ZR25(ZA[482]); - ZA[635] = ZA[567] + ZA[634]; - ZA[633] = ZA[631] + 0x4d2c6dfcU; - - ZA[636] = ZR15(ZA[627]) + ZA[635]; - ZA[801] = ZA[783] + ZA[633]; - ZA[798] = ZA[780] + ZA[797]; - ZA[795] = ZA[792] + ZA[794]; - ZA[638] = ZA[491] + ZA[518]; - - ZA[802] = (ZCh(ZA[798], ZA[793], 
ZA[788]) + ZA[801]) + ZR26(ZA[798]); - ZA[799] = ZMa(ZA[790], ZA[785], ZA[795]) + ZR30(ZA[795]); - ZA[639] = ZA[638] + ZA[614]; - ZA[637] = ZA[636] + 0x53380d13U; - - ZA[642] = ZR25(ZA[528]) + ZA[482]; - ZA[640] = ZR15(ZA[631]) + ZA[639]; - ZA[806] = ZA[788] + ZA[637]; - ZA[803] = ZA[785] + ZA[802]; - ZA[800] = ZA[797] + ZA[799]; - - ZA[807] = (ZCh(ZA[803], ZA[798], ZA[793]) + ZA[806]) + ZR26(ZA[803]); - ZA[804] = ZMa(ZA[795], ZA[790], ZA[800]) + ZR30(ZA[800]); - ZA[643] = ZA[616] + ZA[642]; - ZA[641] = ZA[640] + 0x650a7354U; - - ZA[646] = ZR25(ZA[502]) + ZA[528]; - ZA[644] = ZR15(ZA[636]) + ZA[643]; - ZA[811] = ZA[793] + ZA[641]; - ZA[808] = ZA[790] + ZA[807]; - ZA[805] = ZA[802] + ZA[804]; - - ZA[812] = (ZCh(ZA[808], ZA[803], ZA[798]) + ZA[811]) + ZR26(ZA[808]); - ZA[809] = ZMa(ZA[800], ZA[795], ZA[805]) + ZR30(ZA[805]); - ZA[647] = ZA[619] + ZA[646]; - ZA[645] = ZA[644] + 0x766a0abbU; - - ZA[650] = ZR25(ZA[537]) + ZA[502]; - ZA[648] = ZR15(ZA[640]) + ZA[647]; - ZA[816] = ZA[798] + ZA[645]; - ZA[813] = ZA[795] + ZA[812]; - ZA[810] = ZA[807] + ZA[809]; - - ZA[817] = (ZCh(ZA[813], ZA[808], ZA[803]) + ZA[816]) + ZR26(ZA[813]); - ZA[814] = ZMa(ZA[805], ZA[800], ZA[810]) + ZR30(ZA[810]); - ZA[925] = ZA[622] + ZA[650]; - ZA[649] = ZA[648] + 0x81c2c92eU; - - ZA[653] = ZR25(ZA[546]) + ZA[537]; - ZA[651] = ZR15(ZA[644]) + ZA[925]; - ZA[821] = ZA[803] + ZA[649]; - ZA[818] = ZA[800] + ZA[817]; - ZA[815] = ZA[812] + ZA[814]; - - ZA[822] = (ZCh(ZA[818], ZA[813], ZA[808]) + ZA[821]) + ZR26(ZA[818]); - ZA[819] = ZMa(ZA[810], ZA[805], ZA[815]) + ZR30(ZA[815]); - ZA[654] = ZA[627] + ZA[653]; - ZA[652] = ZA[651] + 0x92722c85U; - - ZA[657] = ZR25(ZA[554]) + ZA[546]; - ZA[655] = ZR15(ZA[648]) + ZA[654]; - ZA[826] = ZA[808] + ZA[652]; - ZA[823] = ZA[805] + ZA[822]; - ZA[820] = ZA[817] + ZA[819]; - - ZA[827] = (ZCh(ZA[823], ZA[818], ZA[813]) + ZA[826]) + ZR26(ZA[823]); - ZA[824] = ZMa(ZA[815], ZA[810], ZA[820]) + ZR30(ZA[820]); - ZA[658] = ZA[631] + ZA[657]; - ZA[656] = ZA[655] + 
0xa2bfe8a1U; - - ZA[661] = ZR25(ZA[566]) + ZA[554]; - ZA[659] = ZR15(ZA[651]) + ZA[658]; - ZA[831] = ZA[813] + ZA[656]; - ZA[828] = ZA[810] + ZA[827]; - ZA[825] = ZA[822] + ZA[824]; - - ZA[832] = (ZCh(ZA[828], ZA[823], ZA[818]) + ZA[831]) + ZR26(ZA[828]); - ZA[829] = ZMa(ZA[820], ZA[815], ZA[825]) + ZR30(ZA[825]); - ZA[662] = ZA[636] + ZA[661]; - ZA[660] = ZA[659] + 0xa81a664bU; - - ZA[665] = ZR25(ZA[567]) + ZA[566]; - ZA[663] = ZR15(ZA[655]) + ZA[662]; - ZA[836] = ZA[818] + ZA[660]; - ZA[833] = ZA[815] + ZA[832]; - ZA[830] = ZA[827] + ZA[829]; - - ZA[837] = (ZCh(ZA[833], ZA[828], ZA[823]) + ZA[836]) + ZR26(ZA[833]); - ZA[834] = ZMa(ZA[825], ZA[820], ZA[830]) + ZR30(ZA[830]); - ZA[666] = ZA[640] + ZA[665]; - ZA[664] = ZA[663] + 0xc24b8b70U; - - ZA[669] = ZR25(ZA[614]) + ZA[567]; - ZA[667] = ZR15(ZA[659]) + ZA[666]; - ZA[841] = ZA[823] + ZA[664]; - ZA[838] = ZA[820] + ZA[837]; - ZA[835] = ZA[832] + ZA[834]; - - ZA[842] = (ZCh(ZA[838], ZA[833], ZA[828]) + ZA[841]) + ZR26(ZA[838]); - ZA[839] = ZMa(ZA[830], ZA[825], ZA[835]) + ZR30(ZA[835]); - ZA[670] = ZA[644] + ZA[669]; - ZA[668] = ZA[667] + 0xc76c51a3U; - - ZA[677] = ZR25(ZA[616]) + ZA[614]; - ZA[671] = ZR15(ZA[663]) + ZA[670]; - ZA[846] = ZA[828] + ZA[668]; - ZA[843] = ZA[825] + ZA[842]; - ZA[840] = ZA[837] + ZA[839]; - - ZA[847] = (ZCh(ZA[843], ZA[838], ZA[833]) + ZA[846]) + ZR26(ZA[843]); - ZA[844] = ZMa(ZA[835], ZA[830], ZA[840]) + ZR30(ZA[840]); - ZA[678] = ZA[648] + ZA[677]; - ZA[676] = ZA[671] + 0xd192e819U; - - ZA[682] = ZR25(ZA[619]) + ZA[616]; - ZA[679] = ZR15(ZA[667]) + ZA[678]; - ZA[851] = ZA[833] + ZA[676]; - ZA[848] = ZA[830] + ZA[847]; - ZA[845] = ZA[842] + ZA[844]; - - ZA[852] = (ZCh(ZA[848], ZA[843], ZA[838]) + ZA[851]) + ZR26(ZA[848]); - ZA[849] = ZMa(ZA[840], ZA[835], ZA[845]) + ZR30(ZA[845]); - ZA[683] = ZA[651] + ZA[682]; - ZA[680] = ZA[679] + 0xd6990624U; - - ZA[686] = ZR25(ZA[622]) + ZA[619]; - ZA[684] = ZR15(ZA[671]) + ZA[683]; - ZA[856] = ZA[838] + ZA[680]; - ZA[853] = ZA[835] + ZA[852]; - 
ZA[850] = ZA[847] + ZA[849]; - - ZA[857] = (ZCh(ZA[853], ZA[848], ZA[843]) + ZA[856]) + ZR26(ZA[853]); - ZA[854] = ZMa(ZA[845], ZA[840], ZA[850]) + ZR30(ZA[850]); - ZA[687] = ZA[655] + ZA[686]; - ZA[685] = ZA[684] + 0xf40e3585U; - - ZA[690] = ZR25(ZA[627]) + ZA[622]; - ZA[688] = ZR15(ZA[679]) + ZA[687]; - ZA[861] = ZA[843] + ZA[685]; - ZA[858] = ZA[840] + ZA[857]; - ZA[855] = ZA[852] + ZA[854]; - - ZA[862] = (ZCh(ZA[858], ZA[853], ZA[848]) + ZA[861]) + ZR26(ZA[858]); - ZA[859] = ZMa(ZA[850], ZA[845], ZA[855]) + ZR30(ZA[855]); - ZA[691] = ZA[659] + ZA[690]; - ZA[689] = ZA[688] + 0x106aa070U; - - ZA[694] = ZR25(ZA[631]) + ZA[627]; - ZA[692] = ZR15(ZA[684]) + ZA[691]; - ZA[866] = ZA[848] + ZA[689]; - ZA[863] = ZA[845] + ZA[862]; - ZA[860] = ZA[857] + ZA[859]; - - ZA[867] = (ZCh(ZA[863], ZA[858], ZA[853]) + ZA[866]) + ZR26(ZA[863]); - ZA[864] = ZMa(ZA[855], ZA[850], ZA[860]) + ZR30(ZA[860]); - ZA[695] = ZA[663] + ZA[694]; - ZA[693] = ZA[692] + 0x19a4c116U; - - ZA[698] = ZR25(ZA[636]) + ZA[631]; - ZA[696] = ZR15(ZA[688]) + ZA[695]; - ZA[871] = ZA[853] + ZA[693]; - ZA[868] = ZA[850] + ZA[867]; - ZA[865] = ZA[862] + ZA[864]; - - ZA[873] = (ZCh(ZA[868], ZA[863], ZA[858]) + ZA[871]) + ZR26(ZA[868]); - ZA[869] = ZMa(ZA[860], ZA[855], ZA[865]) + ZR30(ZA[865]); - ZA[699] = ZA[667] + ZA[698]; - ZA[697] = ZA[696] + 0x1e376c08U; - - ZA[702] = ZR25(ZA[640]) + ZA[636]; - ZA[700] = ZR15(ZA[692]) + ZA[699]; - ZA[877] = ZA[858] + ZA[697]; - ZA[874] = ZA[855] + ZA[873]; - ZA[870] = ZA[867] + ZA[869]; - - ZA[878] = (ZCh(ZA[874], ZA[868], ZA[863]) + ZA[877]) + ZR26(ZA[874]); - ZA[875] = ZMa(ZA[865], ZA[860], ZA[870]) + ZR30(ZA[870]); - ZA[703] = ZA[671] + ZA[702]; - ZA[701] = ZA[700] + 0x2748774cU; - - ZA[706] = ZR25(ZA[644]) + ZA[640]; - ZA[704] = ZR15(ZA[696]) + ZA[703]; - ZA[882] = ZA[863] + ZA[701]; - ZA[879] = ZA[860] + ZA[878]; - ZA[876] = ZA[873] + ZA[875]; - - ZA[883] = (ZCh(ZA[879], ZA[874], ZA[868]) + ZA[882]) + ZR26(ZA[879]); - ZA[880] = ZMa(ZA[870], ZA[865], ZA[876]) + 
ZR30(ZA[876]); - ZA[707] = ZA[679] + ZA[706]; - ZA[705] = ZA[704] + 0x34b0bcb5U; - - ZA[710] = ZR25(ZA[648]) + ZA[644]; - ZA[708] = ZR15(ZA[700]) + ZA[707]; - ZA[887] = ZA[868] + ZA[705]; - ZA[884] = ZA[865] + ZA[883]; - ZA[881] = ZA[878] + ZA[880]; - - ZA[888] = (ZCh(ZA[884], ZA[879], ZA[874]) + ZA[887]) + ZR26(ZA[884]); - ZA[885] = ZMa(ZA[876], ZA[870], ZA[881]) + ZR30(ZA[881]); - ZA[711] = ZA[684] + ZA[710]; - ZA[709] = ZA[708] + 0x391c0cb3U; - - ZA[714] = ZR25(ZA[651]) + ZA[648]; - ZA[712] = ZR15(ZA[704]) + ZA[711]; - ZA[892] = ZA[874] + ZA[709]; - ZA[889] = ZA[870] + ZA[888]; - ZA[886] = ZA[883] + ZA[885]; - - ZA[893] = (ZCh(ZA[889], ZA[884], ZA[879]) + ZA[892]) + ZR26(ZA[889]); - ZA[890] = ZMa(ZA[881], ZA[876], ZA[886]) + ZR30(ZA[886]); - ZA[715] = ZA[688] + ZA[714]; - ZA[713] = ZA[712] + 0x4ed8aa4aU; - - ZA[718] = ZR25(ZA[655]) + ZA[651]; - ZA[716] = ZR15(ZA[708]) + ZA[715]; - ZA[897] = ZA[879] + ZA[713]; - ZA[894] = ZA[876] + ZA[893]; - ZA[891] = ZA[888] + ZA[890]; - - ZA[898] = (ZCh(ZA[894], ZA[889], ZA[884]) + ZA[897]) + ZR26(ZA[894]); - ZA[895] = ZMa(ZA[886], ZA[881], ZA[891]) + ZR30(ZA[891]); - ZA[719] = ZA[692] + ZA[718]; - ZA[717] = ZA[716] + 0x5b9cca4fU; - - ZA[722] = ZR25(ZA[659]) + ZA[655]; - ZA[720] = ZR15(ZA[712]) + ZA[719]; - ZA[902] = ZA[884] + ZA[717]; - ZA[899] = ZA[881] + ZA[898]; - ZA[896] = ZA[893] + ZA[895]; - - ZA[903] = (ZCh(ZA[899], ZA[894], ZA[889]) + ZA[902]) + ZR26(ZA[899]); - ZA[900] = ZMa(ZA[891], ZA[886], ZA[896]) + ZR30(ZA[896]); - ZA[723] = ZA[696] + ZA[722]; - ZA[721] = ZA[720] + 0x682e6ff3U; - - ZA[672] = ZR25(ZA[663]) + ZA[659]; - ZA[724] = ZR15(ZA[716]) + ZA[723]; - ZA[907] = ZA[889] + ZA[721]; - ZA[904] = ZA[886] + ZA[903]; - ZA[901] = ZA[898] + ZA[900]; - - ZA[908] = (ZCh(ZA[904], ZA[899], ZA[894]) + ZA[907]) + ZR26(ZA[904]); - ZA[905] = ZMa(ZA[896], ZA[891], ZA[901]) + ZR30(ZA[901]); - ZA[673] = ZR25(ZA[667]) + ZA[663]; - ZA[726] = ZA[700] + ZA[672]; - ZA[725] = ZA[724] + 0x748f82eeU; - - ZA[727] = ZR15(ZA[720]) + 
ZA[726]; - ZA[912] = ZA[894] + ZA[725]; - ZA[909] = ZA[891] + ZA[908]; - ZA[906] = ZA[903] + ZA[905]; - ZA[675] = ZA[667] + 0x8cc70208U; - ZA[729] = ZA[704] + ZA[673]; - - ZA[913] = (ZCh(ZA[909], ZA[904], ZA[899]) + ZA[912]) + ZR26(ZA[909]); - ZA[910] = ZMa(ZA[901], ZA[896], ZA[906]) + ZR30(ZA[906]); - ZA[674] = ZR25(ZA[671]) + ZA[675]; - ZA[730] = ZR15(ZA[724]) + ZA[729]; - ZA[728] = ZA[727] + 0x78a5636fU; - - ZA[681] = ZR25(ZA[679]) + ZA[671]; - ZA[917] = ZA[899] + ZA[901] + ZA[728]; - ZA[914] = ZA[896] + ZA[913]; - ZA[911] = ZA[908] + ZA[910]; - ZA[732] = ZA[708] + ZA[674]; - ZA[731] = ZA[730] + 0x84c87814U; - - ZA[918] = (ZCh(ZA[914], ZA[909], ZA[904]) + ZA[917]) + ZR26(ZA[914]); - ZA[915] = ZMa(ZA[906], ZA[901], ZA[911]) + ZR30(ZA[911]); - ZA[733] = ZR15(ZA[727]) + ZA[732]; - ZA[919] = ZA[906] + ZA[904] + ZA[731]; - ZA[734] = ZA[712] + ZA[681]; - - ZA[920] = (ZCh(ZA[918], ZA[914], ZA[909]) + ZA[919]) + ZR26(ZA[918]); - ZA[735] = ZR15(ZA[730]) + ZA[734]; - ZA[921] = ZA[911] + ZA[909] + ZA[733]; - ZA[916] = ZA[913] + ZA[915]; - - ZA[922] = (ZCh(ZA[920], ZA[918], ZA[914]) + ZA[921]) + ZR26(ZA[920]); - ZA[923] = ZA[916] + ZA[914] + ZA[735]; - - ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]); - -#define FOUND (0x800) -#define NFLAG (0x7FF) - -#if defined(VECTORS4) - bool result = any(ZA[924] == 0x136032EDU); - - if (result) { - if (ZA[924].x == 0x136032EDU) - output[FOUND] = output[NFLAG & Znonce.x] = Znonce.x; - if (ZA[924].y == 0x136032EDU) - output[FOUND] = output[NFLAG & Znonce.y] = Znonce.y; - if (ZA[924].z == 0x136032EDU) - output[FOUND] = output[NFLAG & Znonce.z] = Znonce.z; - if (ZA[924].w == 0x136032EDU) - output[FOUND] = output[NFLAG & Znonce.w] = Znonce.w; - } -#elif defined(VECTORS2) - bool result = any(ZA[924] == 0x136032EDU); - - if (result) { - if (ZA[924].x == 0x136032EDU) - output[FOUND] = output[NFLAG & Znonce.x] = Znonce.x; - if (ZA[924].y == 0x136032EDU) - output[FOUND] = output[NFLAG & Znonce.y] = Znonce.y; - } -#else - if 
(ZA[924] == 0x136032EDU) - output[FOUND] = output[NFLAG & Znonce] = Znonce; -#endif -} diff --git a/diakgcn120427.cl b/diakgcn120427.cl deleted file mode 100644 index 7dd73fb9..00000000 --- a/diakgcn120427.cl +++ /dev/null @@ -1,587 +0,0 @@ -// DiaKGCN 27-04-2012 - OpenCL kernel by Diapolo -// -// Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3. -// The kernel was rewritten by me (Diapolo) and is still public-domain! - -#ifdef VECTORS4 - typedef uint4 u; -#elif defined VECTORS2 - typedef uint2 u; -#else - typedef uint u; -#endif - -#ifdef BITALIGN - #pragma OPENCL EXTENSION cl_amd_media_ops : enable - #ifdef BFI_INT - #define ch(x, y, z) amd_bytealign(x, y, z) - #define ma(x, y, z) amd_bytealign(z ^ x, y, x) - #else - #define ch(x, y, z) bitselect(z, y, x) - #define ma(z, x, y) bitselect(z, y, z ^ x) - #endif -#else - #define ch(x, y, z) (z ^ (x & (y ^ z))) - #define ma(x, y, z) ((x & z) | (y & (x | z))) -#endif - -#define rotr15(n) (rotate(n, 15U) ^ rotate(n, 13U) ^ (n >> 10U)) -#define rotr25(n) (rotate(n, 25U) ^ rotate(n, 14U) ^ (n >> 3U)) -#define rotr26(n) (rotate(n, 26U) ^ rotate(n, 21U) ^ rotate(n, 7U)) -#define rotr30(n) (rotate(n, 30U) ^ rotate(n, 19U) ^ rotate(n, 10U)) - -__kernel - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) - void search( -#ifndef GOFFSET - const u base, -#endif - const uint PreVal0, const uint PreVal4, - const uint H1, const uint D1A, const uint B1, const uint C1, - const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7, - const uint W16addK16, const uint W17addK17, - const uint PreW18, const uint PreW19, - const uint W16, const uint W17, - const uint PreW31, const uint PreW32, - const uint state0, const uint state1, const uint state2, const uint state3, - const uint state4, const uint state5, const uint state6, const uint state7, - const uint state0A, const uint state0B, - const uint 
state1A, const uint state2A, const uint state3A, const uint state4A, - const uint state5A, const uint state6A, const uint state7A, - __global uint * output) -{ - u V[8]; - u W[16]; - -#ifdef VECTORS4 - const u nonce = (uint)(get_local_id(0)) * 4U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base; -#elif defined VECTORS2 - const u nonce = (uint)(get_local_id(0)) * 2U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base; -#else - #ifdef GOFFSET - const u nonce = (uint)(get_global_id(0)); - #else - const u nonce = (uint)(get_local_id(0)) + (uint)(get_group_id(0)) * (uint)(WORKSIZE) + base; - #endif -#endif - - V[0] = PreVal0 + nonce; - V[1] = B1; - V[2] = C1; - V[3] = D1A; - V[4] = PreVal4 + nonce; - V[5] = F1; - V[6] = G1; - V[7] = H1; - - V[7] += V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x550c7dc3U + V[4] + 
ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - -//---------------------------------------------------------------------------------- - -#ifdef VECTORS4 - W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U, rotr25(nonce.x) ^ 0x4008000U, rotr25(nonce.x) ^ 0x600c000U); -#elif defined VECTORS2 - W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U); -#else - W[0] = PreW18 + rotr25(nonce); -#endif - W[1] = PreW19 + nonce; - W[2] = 0x80000000U + rotr15(W[0]); - W[3] = rotr15(W[1]); - W[4] = 0x00000280U + rotr15(W[2]); - W[5] = W16 + rotr15(W[3]); - W[6] = W17 + rotr15(W[4]); - W[7] = W[0] + rotr15(W[5]); - W[8] = W[1] + rotr15(W[6]); - W[9] = W[2] + rotr15(W[7]); - W[10] = W[3] + rotr15(W[8]); - W[11] = W[4] + rotr15(W[9]); - W[12] = W[5] + 0x00a00055U + rotr15(W[10]); - W[13] = W[6] + PreW31 + rotr15(W[11]); - W[14] = W[7] + PreW32 + rotr15(W[12]); - W[15] = W[8] + W17 + rotr15(W[13]) + 
rotr25(W[0]); - - V[1] += 0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0]; - V[5] = 0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0] + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 
0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - -//---------------------------------------------------------------------------------- - - W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]); - W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]); - W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]); - W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]); - W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]); - W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]); - W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]); - W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]); - W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]); - W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]); - W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]); - W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]); - W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]); - W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]); - W[14] = W[14] + W[7] + rotr15(W[12]) + rotr25(W[15]); - W[15] = W[15] + W[8] + 
rotr15(W[13]) + rotr25( W[0]); - - V[1] += 0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]); 
- V[3] = 0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - -//---------------------------------------------------------------------------------- - - W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]); - W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]); - W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]); - W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]); - W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]); - W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]); - W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]); - W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]); - W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]); - W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]); - W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]); - W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]); - W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]); - W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]); - - V[1] += 0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + 
rotr26(V[2]); - V[5] = 0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + 
ma(V[5], V[6], V[4]); - - V[6] += 0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - -//---------------------------------------------------------------------------------- - - W[0] = state0 + V[0] + rotr25(state1 + V[1]); - W[1] = state1 + V[1] + 0x00a00000U + rotr25(state2 + V[2]); - W[2] = state2 + V[2] + rotr15(W[0]) + rotr25(state3 + V[3]); - W[3] = state3 + V[3] + rotr15(W[1]) + rotr25(state4 + V[4]); - W[4] = state4 + V[4] + rotr15(W[2]) + rotr25(state5 + V[5]); - W[5] = state5 + V[5] + rotr15(W[3]) + rotr25(state6 + V[6]); - W[6] = state6 + V[6] + 0x00000100U + rotr15(W[4]) + rotr25(state7 + V[7]); - W[7] = state7 + V[7] + W[0] + 0x11002000U + rotr15(W[5]); - W[8] = W[1] + 0x80000000U + rotr15(W[6]); - W[9] = W[2] + rotr15(W[7]); - W[10] = W[3] + rotr15(W[8]); - W[11] = W[4] + rotr15(W[9]); - W[12] = W[5] + rotr15(W[10]); - W[13] = W[6] + rotr15(W[11]); - W[14] = W[7] + 0x00400022U + rotr15(W[12]); - W[15] = W[8] + 0x00000100U + rotr15(W[13]) + rotr25(W[0]); - - // 0x71374491U + 0x1f83d9abU + state1 - const u state1AaddV1 = state1A + V[1]; - // 0xb5c0fbcfU + 0x9b05688cU + state2 - const u state2AaddV2 = state2A + V[2]; - // 0x510e527fU + 0xe9b5dba5U + state3 - const u state3AaddV3 = state3A + V[3]; - // 0x3956c25bU + state4 - const u state4AaddV4 = state4A + V[4]; - // 0x59f111f1U + state5 - const u state5AaddV5 = state5A + V[5]; - // 0x923f82a4U + state6 - const u state6AaddV6 = state6A + V[6]; - // 0xab1c5ed5U + state7 - const u state7AaddV7 = state7A + 
V[7]; - - // 0x98c7e2a2U + state0 - V[3] = state0A + V[0]; - // 0xfc08884dU + state0 - V[7] = state0B + V[0]; - V[0] = 0x6a09e667U; - V[1] = 0xbb67ae85U; - V[2] = 0x3c6ef372U; - V[4] = 0x510e527fU; - V[5] = 0x9b05688cU; - V[6] = 0x1f83d9abU; - - V[2] += state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x550c7dc3U + V[4] + 
ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x5cb0a9dcU 
+ V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - -//---------------------------------------------------------------------------------- - - W[0] = W[0] + W[9] + 
rotr15(W[14]) + rotr25( W[1]); - W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]); - W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]); - W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]); - W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]); - W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]); - W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]); - W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]); - W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]); - W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]); - W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]); - W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]); - W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]); - W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]); - W[14] = W[14] + W[7] + rotr15(W[12]) + rotr25(W[15]); - W[15] = W[15] + W[8] + rotr15(W[13]) + rotr25( W[0]); - - V[3] += 0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - 
V[5] += 0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - -//---------------------------------------------------------------------------------- - - W[0] = 
W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]); - W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]); - W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]); - W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]); - W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]); - W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]); - W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]); - W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]); - W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]); - W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]); - W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]); - W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]); - W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]); - - V[3] += 0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - V[6] = 0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); - - V[1] += 0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - V[5] = 0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); - - V[0] += 0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - V[4] = 0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); - - V[7] += 0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - V[3] = 0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); - - V[6] += 0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]); - V[2] = 0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); - - V[5] += 0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]); - V[1] = 0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + 
rotr30(V[2]) + ma(V[3], V[4], V[2]); - - V[4] += 0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]); - V[0] = 0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); - - V[3] += 0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]); - V[7] = 0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); - - V[2] += 0x78a5636fU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]); - - V[1] += 0x84c87814U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]); - - V[0] += 0x8cc70208U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]); - - V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]); - -#define FOUND (0x800) -#define NFLAG (0x7FF) - -#ifdef VECTORS4 - if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU)) - output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : ((V[7].y == 0x136032edU) ? nonce.y : ((V[7].z == 0x136032edU) ? nonce.z : nonce.w)); -#elif defined VECTORS2 - if ((V[7].x == 0x136032edU) + (V[7].y == 0x136032edU)) - output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : nonce.y; -#else - if (V[7] == 0x136032edU) - output[FOUND] = output[NFLAG & nonce] = nonce; -#endif -} diff --git a/phatk120223.cl b/phatk120223.cl deleted file mode 100644 index 0f604436..00000000 --- a/phatk120223.cl +++ /dev/null @@ -1,417 +0,0 @@ -// This file is taken and modified from the public-domain poclbm project, and -// I have therefore decided to keep it public-domain. 
-// Modified version copyright 2011-2012 Con Kolivas - -#ifdef VECTORS4 - typedef uint4 u; -#elif defined VECTORS2 - typedef uint2 u; -#else - typedef uint u; -#endif - -__constant uint K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -__constant uint ConstW[128] = { -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U, 
-0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, -0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 -}; - -__constant uint H[8] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 -}; - - -#ifdef BITALIGN - #pragma OPENCL EXTENSION cl_amd_media_ops : enable - #define rot(x, y) amd_bitalign(x, x, (uint)(32 - y)) - -// This part is not from the stock poclbm kernel. It's part of an optimization -// added in the Phoenix Miner. - -// Some AMD devices have Vals[0] BFI_INT opcode, which behaves exactly like the -// SHA-256 Ch function, but provides it in exactly one instruction. If -// detected, use it for Ch. Otherwise, construct Ch out of simpler logical -// primitives. - - #ifdef BFI_INT - // Well, slight problem... It turns out BFI_INT isn't actually exposed to - // OpenCL (or CAL IL for that matter) in any way. However, there is - // a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via - // amd_bytealign, takes the same inputs, and provides the same output. - // We can use that as a placeholder for BFI_INT and have the application - // patch it after compilation. - - // This is the BFI_INT function - #define Ch(x, y, z) amd_bytealign(x,y,z) - // Ma can also be implemented in terms of BFI_INT... 
- #define Ma(z, x, y) amd_bytealign(z^x,y,x) - #else // BFI_INT - // Later SDKs optimise this to BFI INT without patching and GCN - // actually fails if manually patched with BFI_INT - - #define Ch(x, y, z) bitselect((u)z, (u)y, (u)x) - #define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) - #define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y) - #endif -#else // BITALIGN - #define Ch(x, y, z) (z ^ (x & (y ^ z))) - #define Ma(x, y, z) ((x & z) | (y & (x | z))) - #define rot(x, y) rotate((u)x, (u)y) - #define rotr(x, y) rotate((u)x, (u)(32-y)) -#endif - - - -//Various intermediate calculations for each SHA round -#define s0(n) (S0(Vals[(0 + 128 - (n)) % 8])) -#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u)) - -#define s1(n) (S1(Vals[(4 + 128 - (n)) % 8])) -#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u)) - -#define ch(n) Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8]) -#define maj(n) Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8]) - -//t1 calc when W is already calculated -#define t1(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] + W[(n)] + s1(n) + ch(n) - -//t1 calc which calculates W -#define t1W(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] + W(n) + s1(n) + ch(n) - -//Used for constant W Values (the compiler optimizes out zeros) -#define t1C(n) (K[(n) % 64]+ ConstW[(n)]) + Vals[(7 + 128 - (n)) % 8] + s1(n) + ch(n) - -//t2 Calc -#define t2(n) maj(n) + s0(n) - -#define rotC(x,n) (x<<n | x >> (32-n)) - -//W calculation used for SHA round -#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n)) - - - -//Partial W calculations (used for the begining where only some values are nonzero) -#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U))) -#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U))) - - -#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U))) -#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U))) - - -#define P3(n) W[n-7] -#define P4(n) W[n-16] - - -//Partial Calcs for constant W
values -#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U))) -#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U))) -#define P3C(x) ConstW[x-7] -#define P4C(x) ConstW[x-16] - -//SHA round with built in W calc -#define sharoundW(n) Barrier1(n); Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n); - -//SHA round without W calc -#define sharound(n) Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n); - -//SHA round for constant W values -#define sharoundC(n) Barrier3(n); Vals[(3 + 128 - (n)) % 8] += t1C(n); Vals[(7 + 128 - (n)) % 8] = t1C(n) + t2(n); - -//The compiler is stupid... I put this in there only to stop the compiler from (de)optimizing the order -#define Barrier1(n) t1 = t1C((n+1)) -#define Barrier2(n) t1 = t1C((n)) -#define Barrier3(n) t1 = t1C((n)) - -//#define WORKSIZE 256 -#define MAXBUFFERS (4095) - -__kernel - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void search( const uint state0, const uint state1, const uint state2, const uint state3, - const uint state4, const uint state5, const uint state6, const uint state7, - const uint B1, const uint C1, const uint D1, - const uint F1, const uint G1, const uint H1, - const u base, - const uint W16, const uint W17, - const uint PreVal4, const uint PreVal0, - const uint PreW18, const uint PreW19, - const uint PreW31, const uint PreW32, - - __global uint * output) -{ - - - u W[124]; - u Vals[8]; - -//Dummy Variable to prevent compiler from reordering between rounds - u t1; - - //Vals[0]=state0; - Vals[1]=B1; - Vals[2]=C1; - Vals[3]=D1; - //Vals[4]=PreVal4; - Vals[5]=F1; - Vals[6]=G1; - Vals[7]=H1; - - W[16] = W16; - W[17] = W17; - -#ifdef VECTORS4 - //Less dependencies to get both the local id and group id and then add them - W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); - uint r = 
rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U); - //Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3 - W[18] = PreW18 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U}; -#elif defined VECTORS2 - W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); - uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U); - W[18] = PreW18 + (u){r, r ^ 0x2004000U}; -#else - W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); - u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U); - W[18] = PreW18 + r; -#endif - //the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions - - - - Vals[4] = PreVal4 + W[3]; - Vals[0] = PreVal0 + W[3]; - - sharoundC(4); - W[19] = PreW19 + W[3]; - sharoundC(5); - W[20] = P4C(20) + P1(20); - sharoundC(6); - W[21] = P1(21); - sharoundC(7); - W[22] = P3C(22) + P1(22); - sharoundC(8); - W[23] = W[16] + P1(23); - sharoundC(9); - W[24] = W[17] + P1(24); - sharoundC(10); - W[25] = P1(25) + P3(25); - W[26] = P1(26) + P3(26); - sharoundC(11); - W[27] = P1(27) + P3(27); - W[28] = P1(28) + P3(28); - sharoundC(12); - W[29] = P1(29) + P3(29); - sharoundC(13); - W[30] = P1(30) + P2C(30) + P3(30); - W[31] = PreW31 + (P1(31) + P3(31)); - sharoundC(14); - W[32] = PreW32 + (P1(32) + P3(32)); - sharoundC(15); - sharound(16); - sharound(17); - sharound(18); - sharound(19); - sharound(20); - sharound(21); - sharound(22); - sharound(23); - sharound(24); - sharound(25); - sharound(26); - sharound(27); - sharound(28); - sharound(29); - sharound(30); - sharound(31); - sharound(32); - sharoundW(33); - sharoundW(34); - sharoundW(35); - sharoundW(36); - sharoundW(37); - sharoundW(38); - sharoundW(39); - sharoundW(40); - sharoundW(41); - sharoundW(42); - sharoundW(43); - sharoundW(44); - sharoundW(45); - sharoundW(46); - sharoundW(47); - sharoundW(48); - sharoundW(49); - sharoundW(50); - 
sharoundW(51); - sharoundW(52); - sharoundW(53); - sharoundW(54); - sharoundW(55); - sharoundW(56); - sharoundW(57); - sharoundW(58); - sharoundW(59); - sharoundW(60); - sharoundW(61); - sharoundW(62); - sharoundW(63); - - W[64]=state0+Vals[0]; - W[65]=state1+Vals[1]; - W[66]=state2+Vals[2]; - W[67]=state3+Vals[3]; - W[68]=state4+Vals[4]; - W[69]=state5+Vals[5]; - W[70]=state6+Vals[6]; - W[71]=state7+Vals[7]; - - Vals[0]=H[0]; - Vals[1]=H[1]; - Vals[2]=H[2]; - Vals[3]=H[3]; - Vals[4]=H[4]; - Vals[5]=H[5]; - Vals[6]=H[6]; - Vals[7]=H[7]; - - //sharound(64 + 0); - const u Temp = (0xb0edbdd0U + K[0]) + W[64]; - Vals[7] = Temp + 0x08909ae5U; - Vals[3] = 0xa54ff53aU + Temp; - -#define P124(n) P2(n) + P1(n) + P4(n) - - - W[64 + 16] = + P2(64 + 16) + P4(64 + 16); - sharound(64 + 1); - W[64 + 17] = P1C(64 + 17) + P2(64 + 17) + P4(64 + 17); - sharound(64 + 2); - W[64 + 18] = P124(64 + 18); - sharound(64 + 3); - W[64 + 19] = P124(64 + 19); - sharound(64 + 4); - W[64 + 20] = P124(64 + 20); - sharound(64 + 5); - W[64 + 21] = P124(64 + 21); - sharound(64 + 6); - W[64 + 22] = P4(64 + 22) + P3C(64 + 22) + P2(64 + 22) + P1(64 + 22); - sharound(64 + 7); - W[64 + 23] = P4(64 + 23) + P3(64 + 23) + P2C(64 + 23) + P1(64 + 23); - sharoundC(64 + 8); - W[64 + 24] = P1(64 + 24) + P4C(64 + 24) + P3(64 + 24); - sharoundC(64 + 9); - W[64 + 25] = P3(64 + 25) + P1(64 + 25); - sharoundC(64 + 10); - W[64 + 26] = P3(64 + 26) + P1(64 + 26); - sharoundC(64 + 11); - W[64 + 27] = P3(64 + 27) + P1(64 + 27); - sharoundC(64 + 12); - W[64 + 28] = P3(64 + 28) + P1(64 + 28); - sharoundC(64 + 13); - W[64 + 29] = P1(64 + 29) + P3(64 + 29); - W[64 + 30] = P3(64 + 30) + P2C(64 + 30) + P1(64 + 30); - sharoundC(64 + 14); - W[64 + 31] = P4C(64 + 31) + P3(64 + 31) + P2(64 + 31) + P1(64 + 31); - sharoundC(64 + 15); - sharound(64 + 16); - sharound(64 + 17); - sharound(64 + 18); - sharound(64 + 19); - sharound(64 + 20); - sharound(64 + 21); - sharound(64 + 22); - sharound(64 + 23); - sharound(64 + 24); - sharound(64 + 
25); - sharound(64 + 26); - sharound(64 + 27); - sharound(64 + 28); - sharound(64 + 29); - sharound(64 + 30); - sharound(64 + 31); - sharoundW(64 + 32); - sharoundW(64 + 33); - sharoundW(64 + 34); - sharoundW(64 + 35); - sharoundW(64 + 36); - sharoundW(64 + 37); - sharoundW(64 + 38); - sharoundW(64 + 39); - sharoundW(64 + 40); - sharoundW(64 + 41); - sharoundW(64 + 42); - sharoundW(64 + 43); - sharoundW(64 + 44); - sharoundW(64 + 45); - sharoundW(64 + 46); - sharoundW(64 + 47); - sharoundW(64 + 48); - sharoundW(64 + 49); - sharoundW(64 + 50); - sharoundW(64 + 51); - sharoundW(64 + 52); - sharoundW(64 + 53); - sharoundW(64 + 54); - sharoundW(64 + 55); - sharoundW(64 + 56); - sharoundW(64 + 57); - sharoundW(64 + 58); - - W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) - - (-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64))); - -#define FOUND (0x800) -#define NFLAG (0x7FF) - -#ifdef VECTORS4 - bool result = W[117].x & W[117].y & W[117].z & W[117].w; - if (!result) { - if (!W[117].x) - output[FOUND] = output[NFLAG & W[3].x] = W[3].x; - if (!W[117].y) - output[FOUND] = output[NFLAG & W[3].y] = W[3].y; - if (!W[117].z) - output[FOUND] = output[NFLAG & W[3].z] = W[3].z; - if (!W[117].w) - output[FOUND] = output[NFLAG & W[3].w] = W[3].w; - } -#elif defined VECTORS2 - bool result = W[117].x & W[117].y; - if (!result) { - if (!W[117].x) - output[FOUND] = output[NFLAG & W[3].x] = W[3].x; - if (!W[117].y) - output[FOUND] = output[NFLAG & W[3].y] = W[3].y; - } -#else - if (!W[117]) - output[FOUND] = output[NFLAG & W[3]] = W[3]; -#endif -} diff --git a/poclbm120327.cl b/poclbm120327.cl deleted file mode 100644 index 3e8b9943..00000000 --- a/poclbm120327.cl +++ /dev/null @@ -1,1353 +0,0 @@ -// -ck modified kernel taken from Phoenix taken from poclbm, with aspects of -// phatk and others. 
-// Modified version copyright 2011-2012 Con Kolivas - -// This file is taken and modified from the public-domain poclbm project, and -// we have therefore decided to keep it public-domain in Phoenix. - -#ifdef VECTORS4 - typedef uint4 u; -#elif defined VECTORS2 - typedef uint2 u; -#else - typedef uint u; -#endif - -__constant uint K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - - -// This part is not from the stock poclbm kernel. It's part of an optimization -// added in the Phoenix Miner. - -// Some AMD devices have a BFI_INT opcode, which behaves exactly like the -// SHA-256 ch function, but provides it in exactly one instruction. If -// detected, use it for ch. Otherwise, construct ch out of simpler logical -// primitives. - -#ifdef BITALIGN - #pragma OPENCL EXTENSION cl_amd_media_ops : enable - #define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y) -#else - #define rotr(x, y) rotate((u)x, (u)(32 - y)) -#endif -#ifdef BFI_INT - // Well, slight problem... It turns out BFI_INT isn't actually exposed to - // OpenCL (or CAL IL for that matter) in any way. However, there is - // a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via - // amd_bytealign, takes the same inputs, and provides the same output. 
- // We can use that as a placeholder for BFI_INT and have the application - // patch it after compilation. - - // This is the BFI_INT function - #define ch(x, y, z) amd_bytealign(x, y, z) - - // Ma can also be implemented in terms of BFI_INT... - #define Ma(x, y, z) amd_bytealign( (z^x), (y), (x) ) - - // AMD's KernelAnalyzer throws errors compiling the kernel if we use - // amd_bytealign on constants with vectors enabled, so we use this to avoid - // problems. (this is used 4 times, and likely optimized out by the compiler.) - #define Ma2(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) -#else // BFI_INT - //GCN actually fails if manually patched with BFI_INT - - #define ch(x, y, z) bitselect((u)z, (u)y, (u)x) - #define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) - #define Ma2(x, y, z) Ma(x, y, z) -#endif - - -__kernel -__attribute__((vec_type_hint(u))) -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -void search(const uint state0, const uint state1, const uint state2, const uint state3, - const uint state4, const uint state5, const uint state6, const uint state7, - const uint b1, const uint c1, - const uint f1, const uint g1, const uint h1, -#ifndef GOFFSET - const u base, -#endif - const uint fw0, const uint fw1, const uint fw2, const uint fw3, const uint fw15, const uint fw01r, - const uint D1A, const uint C1addK5, const uint B1addK6, - const uint W16addK16, const uint W17addK17, - const uint PreVal4addT1, const uint Preval0, - __global uint * output) -{ - u Vals[24]; - u *W = &Vals[8]; - -#ifdef GOFFSET - const u nonce = (uint)(get_global_id(0)); -#else - const u nonce = base + (uint)(get_global_id(0)); -#endif - -Vals[5]=Preval0; -Vals[5]+=nonce; - -Vals[0]=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],b1,c1); -Vals[0]+=D1A; - -Vals[2]=Vals[0]; -Vals[2]+=h1; - -Vals[1]=PreVal4addT1; -Vals[1]+=nonce; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); - -Vals[6]=C1addK5; 
-Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],b1); - -Vals[3]=Vals[6]; -Vals[3]+=g1; -Vals[0]+=Ma2(g1,Vals[1],f1); -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma2(f1,Vals[0],Vals[1]); - -Vals[7]=B1addK6; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); - -Vals[4]=Vals[7]; -Vals[4]+=f1; - -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[7]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[8]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[9]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[10]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[11]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[12]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - 
-Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[13]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[14]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=0xC19BF3F4U; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=W16addK16; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=W17addK17; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[2]=(rotr(nonce,7)^rotr(nonce,18)^(nonce>>3U)); -W[2]+=fw2; -Vals[4]+=W[2]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[18]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[3]=nonce; -W[3]+=fw3; -Vals[1]+=W[3]; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[19]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); -W[4]+=0x80000000U; -Vals[0]+=W[4]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); 
-Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[20]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[5]=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); -Vals[6]+=W[5]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[21]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -W[6]=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); -W[6]+=0x00000280U; -Vals[7]+=W[6]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[22]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[7]=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); -W[7]+=fw0; -Vals[5]+=W[7]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[23]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[8]=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); -W[8]+=fw1; -Vals[2]+=W[8]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[24]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[9]=W[2]; -W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); -Vals[3]+=W[9]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[25]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[10]=W[3]; -W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); -Vals[4]+=W[10]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[26]; -Vals[7]+=Vals[4]; 
-Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[11]=W[4]; -W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); -Vals[1]+=W[11]; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[27]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[12]=W[5]; -W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); -Vals[0]+=W[12]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[28]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[13]=W[6]; -W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); -Vals[6]+=W[13]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[29]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -W[14]=0x00a00055U; -W[14]+=W[7]; -W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); -Vals[7]+=W[14]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[30]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[15]=fw15; -W[15]+=W[8]; -W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); -Vals[5]+=W[15]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[31]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[0]=fw01r; -W[0]+=W[9]; -W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U)); -Vals[2]+=W[0]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[32]; -Vals[0]+=Vals[2]; 
-Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[1]=fw1; -W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); -W[1]+=W[10]; -W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U)); -Vals[3]+=W[1]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[33]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); -W[2]+=W[11]; -W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); -Vals[4]+=W[2]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[34]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); -W[3]+=W[12]; -W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); -Vals[1]+=W[3]; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[35]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); -W[4]+=W[13]; -W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); -Vals[0]+=W[4]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[36]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); -W[5]+=W[14]; -W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); -Vals[6]+=W[5]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[37]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - 
-W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); -W[6]+=W[15]; -W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); -Vals[7]+=W[6]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[38]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U)); -W[7]+=W[0]; -W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); -Vals[5]+=W[7]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[39]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U)); -W[8]+=W[1]; -W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); -Vals[2]+=W[8]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[40]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U)); -W[9]+=W[2]; -W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); -Vals[3]+=W[9]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[41]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U)); -W[10]+=W[3]; -W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); -Vals[4]+=W[10]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[42]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U)); -W[11]+=W[4]; -W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); -Vals[1]+=W[11]; 
-Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[43]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U)); -W[12]+=W[5]; -W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); -Vals[0]+=W[12]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[44]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U)); -W[13]+=W[6]; -W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); -Vals[6]+=W[13]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[45]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -W[14]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U)); -W[14]+=W[7]; -W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); -Vals[7]+=W[14]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[46]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); -W[15]+=W[8]; -W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); -Vals[5]+=W[15]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[47]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U)); -W[0]+=W[9]; -W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U)); -Vals[2]+=W[0]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); 
-Vals[2]+=K[48]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); -W[1]+=W[10]; -W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U)); -Vals[3]+=W[1]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[49]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); -W[2]+=W[11]; -W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); -Vals[4]+=W[2]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[50]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); -W[3]+=W[12]; -W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); -Vals[1]+=W[3]; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[51]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); -W[4]+=W[13]; -W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); -Vals[0]+=W[4]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[52]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); -W[5]+=W[14]; -W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); -Vals[6]+=W[5]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[53]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - 
-W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); -W[6]+=W[15]; -W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); -Vals[7]+=W[6]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[54]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U)); -W[7]+=W[0]; -W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); -Vals[5]+=W[7]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[55]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U)); -W[8]+=W[1]; -W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); -Vals[2]+=W[8]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[56]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U)); -W[9]+=W[2]; -W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); -Vals[3]+=W[9]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[57]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U)); -W[10]+=W[3]; -W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); -Vals[4]+=W[10]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[58]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U)); -W[11]+=W[4]; -W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); -Vals[1]+=W[11]; 
-Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[59]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U)); -W[12]+=W[5]; -W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); -Vals[0]+=W[12]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[60]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U)); -W[13]+=W[6]; -W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); -Vals[6]+=W[13]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[61]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -Vals[7]+=W[14]; -Vals[7]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U)); -Vals[7]+=W[7]; -Vals[7]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[62]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -Vals[5]+=W[15]; -Vals[5]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); -Vals[5]+=W[8]; -Vals[5]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[63]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -Vals[5]+=state0; - -W[7]=state7; -W[7]+=Vals[2]; - -Vals[2]=0xF377ED68U; -Vals[2]+=Vals[5]; - -W[3]=state3; -W[3]+=Vals[0]; - -Vals[0]=0xa54ff53aU; -Vals[0]+=Vals[2]; -Vals[2]+=0x08909ae5U; - -W[6]=state6; -W[6]+=Vals[3]; - 
-Vals[3]=0x90BB1E3CU; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=(0x9b05688cU^(Vals[0]&0xca0b3af3U)); - -Vals[7]+=state1; -Vals[3]+=Vals[7]; - -W[2]=state2; -W[2]+=Vals[6]; - -Vals[6]=0x3c6ef372U; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma2(0xbb67ae85U,Vals[2],0x6a09e667U); - -W[5]=state5; -W[5]+=Vals[4]; - -Vals[4]=0x50C6645BU; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],0x510e527fU); -Vals[4]+=W[2]; - -W[1]=Vals[7]; -Vals[7]=0xbb67ae85U; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma2(0x6a09e667U,Vals[3],Vals[2]); - -W[4]=state4; -W[4]+=Vals[1]; - -Vals[1]=0x3AC42E24U; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=W[3]; - -W[0]=Vals[5]; - -Vals[5]=Vals[1]; -Vals[5]+=0x6a09e667U; - -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[4]; -Vals[0]+=W[4]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[5]; -Vals[6]+=W[5]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[6]; -Vals[7]+=W[6]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[7]; -Vals[5]+=W[7]; -Vals[1]+=Vals[5]; 
-Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=0x5807AA98U; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[9]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[10]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[11]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[12]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[13]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[14]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=0xC19BF274U; -Vals[1]+=Vals[5]; 
-Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U)); -Vals[2]+=W[0]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[16]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); -W[1]+=0x00a00000U; -Vals[3]+=W[1]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[17]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); -W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); -Vals[4]+=W[2]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[18]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); -W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); -Vals[1]+=W[3]; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[19]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); -W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); -Vals[0]+=W[4]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[20]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); -W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); -Vals[6]+=W[5]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); 
-Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[21]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); -W[6]+=0x00000100U; -W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); -Vals[7]+=W[6]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[22]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[7]+=0x11002000U; -W[7]+=W[0]; -W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); -Vals[5]+=W[7]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[23]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[8]=0x80000000U; -W[8]+=W[1]; -W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); -Vals[2]+=W[8]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[24]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[9]=W[2]; -W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); -Vals[3]+=W[9]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[25]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[10]=W[3]; -W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); -Vals[4]+=W[10]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[26]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[11]=W[4]; -W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); -Vals[1]+=W[11]; 
-Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[27]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[12]=W[5]; -W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); -Vals[0]+=W[12]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[28]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[13]=W[6]; -W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); -Vals[6]+=W[13]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[29]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -W[14]=0x00400022U; -W[14]+=W[7]; -W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); -Vals[7]+=W[14]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[30]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[15]=0x00000100U; -W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); -W[15]+=W[8]; -W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); -Vals[5]+=W[15]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[31]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U)); -W[0]+=W[9]; -W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U)); -Vals[2]+=W[0]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[32]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); 
-Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); -W[1]+=W[10]; -W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U)); -Vals[3]+=W[1]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[33]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); -W[2]+=W[11]; -W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); -Vals[4]+=W[2]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[34]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); -W[3]+=W[12]; -W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); -Vals[1]+=W[3]; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[35]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); -W[4]+=W[13]; -W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); -Vals[0]+=W[4]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[36]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); -W[5]+=W[14]; -W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); -Vals[6]+=W[5]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[37]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); -W[6]+=W[15]; 
-W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); -Vals[7]+=W[6]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[38]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U)); -W[7]+=W[0]; -W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); -Vals[5]+=W[7]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[39]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U)); -W[8]+=W[1]; -W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); -Vals[2]+=W[8]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[40]; -Vals[0]+=Vals[2]; -Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U)); -W[9]+=W[2]; -W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); -Vals[3]+=W[9]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[41]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U)); -W[10]+=W[3]; -W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); -Vals[4]+=W[10]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[42]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U)); -W[11]+=W[4]; -W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); -Vals[1]+=W[11]; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); 
-Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[43]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U)); -W[12]+=W[5]; -W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); -Vals[0]+=W[12]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[44]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U)); -W[13]+=W[6]; -W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); -Vals[6]+=W[13]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[45]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - -W[14]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U)); -W[14]+=W[7]; -W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); -Vals[7]+=W[14]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[46]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); -W[15]+=W[8]; -W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); -Vals[5]+=W[15]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[47]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U)); -W[0]+=W[9]; -W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U)); -Vals[2]+=W[0]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[48]; -Vals[0]+=Vals[2]; 
-Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); -Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - -W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); -W[1]+=W[10]; -W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U)); -Vals[3]+=W[1]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[49]; -Vals[6]+=Vals[3]; -Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); -Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); - -W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); -W[2]+=W[11]; -W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); -Vals[4]+=W[2]; -Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); -Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); -Vals[4]+=K[50]; -Vals[7]+=Vals[4]; -Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); -Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); - -W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); -W[3]+=W[12]; -W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); -Vals[1]+=W[3]; -Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); -Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); -Vals[1]+=K[51]; -Vals[5]+=Vals[1]; -Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); -Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); - -W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); -W[4]+=W[13]; -W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); -Vals[0]+=W[4]; -Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); -Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); -Vals[0]+=K[52]; -Vals[2]+=Vals[0]; -Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); - -W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); -W[5]+=W[14]; -W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); -Vals[6]+=W[5]; -Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); -Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); -Vals[6]+=K[53]; -Vals[3]+=Vals[6]; -Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); -Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); - 
-W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); -W[6]+=W[15]; -W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); -Vals[7]+=W[6]; -Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); -Vals[7]+=K[54]; -Vals[4]+=Vals[7]; -Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); -Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); - -W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U)); -W[7]+=W[0]; -W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); -Vals[5]+=W[7]; -Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); -Vals[5]+=K[55]; -Vals[1]+=Vals[5]; -Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); -Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); - -W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U)); -W[8]+=W[1]; -W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); -Vals[2]+=W[8]; -Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); -Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); -Vals[2]+=K[56]; -Vals[0]+=Vals[2]; - -W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U)); -W[9]+=W[2]; -W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); -Vals[3]+=W[9]; -Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); -Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); -Vals[3]+=K[57]; -Vals[3]+=Vals[6]; - -W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U)); -W[10]+=W[3]; -W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); -Vals[4]+=W[10]; -Vals[4]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); -Vals[4]+=ch(Vals[3],Vals[0],Vals[1]); -Vals[4]+=K[58]; -Vals[4]+=Vals[7]; -Vals[1]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); -Vals[1]+=ch(Vals[4],Vals[3],Vals[0]); -Vals[1]+=W[11]; -Vals[1]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U)); -Vals[1]+=W[4]; -Vals[1]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); -Vals[1]+=K[59]; -Vals[1]+=Vals[5]; - -#define FOUND (0x800) -#define NFLAG (0x7FF) - -#if defined(VECTORS2) || defined(VECTORS4) - Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); - 
Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); - Vals[2]+=W[12]; - Vals[2]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U)); - Vals[2]+=W[5]; - Vals[2]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); - Vals[2]+=Vals[0]; - Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); - Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); - - if (any(Vals[2] == 0x136032edU)) { - if (Vals[2].x == 0x136032edU) - output[FOUND] = output[NFLAG & nonce.x] = nonce.x; - if (Vals[2].y == 0x136032edU) - output[FOUND] = output[NFLAG & nonce.y] = nonce.y; -#if defined(VECTORS4) - if (Vals[2].z == 0x136032edU) - output[FOUND] = output[NFLAG & nonce.z] = nonce.z; - if (Vals[2].w == 0x136032edU) - output[FOUND] = output[NFLAG & nonce.w] = nonce.w; -#endif - } -#else - if ((Vals[2]+ - Ma(Vals[6],Vals[5],Vals[7])+ - (rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22))+ - W[12]+ - (rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U))+ - W[5]+ - (rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U))+ - Vals[0]+ - (rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25))+ - ch(Vals[1],Vals[4],Vals[3])) == 0x136032edU) - output[FOUND] = output[NFLAG & nonce] = nonce; -#endif -} diff --git a/scrypt120713.cl b/scrypt120713.cl deleted file mode 100644 index d38f6a54..00000000 --- a/scrypt120713.cl +++ /dev/null @@ -1,757 +0,0 @@ -#define rotl(x,y) rotate(x,y) -#define Ch(x,y,z) bitselect(z,y,x) -#define Maj(x,y,z) Ch((x^z),y,z) - -#define EndianSwap(n) (rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U)) - -#define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U)) -#define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U)) -#define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U)) -#define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U)) - -#define RND(a, b, c, d, e, f, g, h, k) \ - h += Tr1(e) + Ch(e, f, g) + k; \ - d += h; \ - h += Tr2(a) + Maj(a, b, c); - -void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) -{ - uint4 S0 = *state0; - uint4 S1 = 
*state1; - -#define A S0.x -#define B S0.y -#define C S0.z -#define D S0.w -#define E S1.x -#define F S1.y -#define G S1.z -#define H S1.w - - uint4 W[4]; - - W[ 0].x = block0.x; - RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U); - W[ 0].y = block0.y; - RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U); - W[ 0].z = block0.z; - RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU); - W[ 0].w = block0.w; - RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U); - - W[ 1].x = block1.x; - RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); - W[ 1].y = block1.y; - RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); - W[ 1].z = block1.z; - RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); - W[ 1].w = block1.w; - RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); - - W[ 2].x = block2.x; - RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); - W[ 2].y = block2.y; - RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); - W[ 2].z = block2.z; - RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); - W[ 2].w = block2.w; - RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); - - W[ 3].x = block3.x; - RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); - W[ 3].y = block3.y; - RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); - W[ 3].z = block3.z; - RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); - W[ 3].w = block3.w; - RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); - - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); - - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); - - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); - - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); - - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); - - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); - - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); - - W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); - RND(B,C,D,E,F,G,H,A, 
W[1].w+0x76f988daU); - - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); - - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); - - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); - - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); - - W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); - - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); - - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); - - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); - - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); - - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); - - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); - - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); - - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); - - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); - - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); - - W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); - - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); - - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); - - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); - - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); - - W[ 3].x += 
Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); - - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); - - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); - - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); - - W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); - - W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); - - W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); - - W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); - - W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); - - W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); - - W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); - - W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); - - W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); - - W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); - - W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); - - W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); - - W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); - - W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); - - W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); - - W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); - -#undef A -#undef B -#undef C -#undef D -#undef E 
-#undef F -#undef G -#undef H - - *state0 += S0; - *state1 += S1; -} - -void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) -{ -#define A (*state0).x -#define B (*state0).y -#define C (*state0).z -#define D (*state0).w -#define E (*state1).x -#define F (*state1).y -#define G (*state1).z -#define H (*state1).w - - uint4 W[4]; - - W[0].x = block0.x; - D=0x98c7e2a2U+W[0].x; - H=0xfc08884dU+W[0].x; - - W[0].y = block0.y; - C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y; - G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U); - - W[0].z = block0.z; - B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z; - F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U); - - W[0].w = block0.w; - A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; - E=0x95F61999U+A+Tr2(F)+Maj(F,G,H); - - W[1].x = block1.x; - RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); - W[1].y = block1.y; - RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); - W[1].z = block1.z; - RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); - W[1].w = block1.w; - RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); - - W[2].x = block2.x; - RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); - W[2].y = block2.y; - RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); - W[2].z = block2.z; - RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); - W[2].w = block2.w; - RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); - - W[3].x = block3.x; - RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); - W[3].y = block3.y; - RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); - W[3].z = block3.z; - RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); - W[3].w = block3.w; - RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); - - W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); - - W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); - - W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); - - W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); - RND(F,G,H,A,B,C,D,E, 
W[0].w+0x240ca1ccU); - - W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); - - W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); - - W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); - - W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); - - W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); - - W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); - - W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); - - W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); - - W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); - - W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); - - W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); - - W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); - - W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); - - W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); - - W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); - - W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); - - W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); - - W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); - - W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); - - W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); - - W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); - - 
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); - - W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); - - W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); - - W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); - - W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); - - W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); - - W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); - - W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); - RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); - - W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); - RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); - - W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); - RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); - - W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); - RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); - - W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); - RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); - - W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); - RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); - - W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); - RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); - - W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); - RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); - - W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); - RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); - - W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); - RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); - - W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); - RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); - - W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); - RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); - - W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); - RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); - - W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); - RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); - - W[3].z += Wr1(W[3].x) + 
W[1].w + Wr2(W[3].w); - RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); - - W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); - RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); - -#undef A -#undef B -#undef C -#undef D -#undef E -#undef F -#undef G -#undef H - - *state0 += (uint4)(0x6A09E667U,0xBB67AE85U,0x3C6EF372U,0xA54FF53AU); - *state1 += (uint4)(0x510E527FU,0x9B05688CU,0x1F83D9ABU,0x5BE0CD19U); -} - -/* fixedW: 64 per-round words consumed by SHA256_fixed, one per RND call. NOTE(review): entries 0..15 look like the standard SHA-256 round constants with a fixed message block already added in (entry 0 = 0x428a2f98 + 1, entry 1 = 0x71374491 + 0x80000000, entry 15 = 0xc19bf174 + 0x620), so the table is presumably K[i]+W[i] for one constant block; confirm against whatever generated it. */ __constant uint fixedW[64] = -{ - 0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, - 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794, - 0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f, - 0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c, - 0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa, - 0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012, - 0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4, - 0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848, -}; - -/* SHA256_fixed: runs 64 RND rounds over a local copy (S0,S1) of the eight state words, with fixedW[i] as the only per-round input, then adds the result back into the caller's state0 and state1. A..H alias the lanes of S0 and S1; the argument order rotates by one position each round instead of moving values between registers. RND is defined outside this section. */ void SHA256_fixed(uint4*restrict state0,uint4*restrict state1) -{ - uint4 S0 = *state0; - uint4 S1 = *state1; - -#define A S0.x -#define B S0.y -#define C S0.z -#define D S0.w -#define E S1.x -#define F S1.y -#define G S1.z -#define H S1.w - - RND(A,B,C,D,E,F,G,H, fixedW[0]); - RND(H,A,B,C,D,E,F,G, fixedW[1]); - RND(G,H,A,B,C,D,E,F, fixedW[2]); - RND(F,G,H,A,B,C,D,E, fixedW[3]); - RND(E,F,G,H,A,B,C,D, fixedW[4]); - RND(D,E,F,G,H,A,B,C, fixedW[5]); - RND(C,D,E,F,G,H,A,B, fixedW[6]); - RND(B,C,D,E,F,G,H,A, fixedW[7]); - RND(A,B,C,D,E,F,G,H, fixedW[8]); - RND(H,A,B,C,D,E,F,G, fixedW[9]); - RND(G,H,A,B,C,D,E,F, fixedW[10]); - RND(F,G,H,A,B,C,D,E, fixedW[11]); - RND(E,F,G,H,A,B,C,D, fixedW[12]); - RND(D,E,F,G,H,A,B,C, fixedW[13]); - RND(C,D,E,F,G,H,A,B, fixedW[14]); - RND(B,C,D,E,F,G,H,A, fixedW[15]); - RND(A,B,C,D,E,F,G,H, 
fixedW[16]); - RND(H,A,B,C,D,E,F,G, fixedW[17]); - RND(G,H,A,B,C,D,E,F, fixedW[18]); - RND(F,G,H,A,B,C,D,E, fixedW[19]); - RND(E,F,G,H,A,B,C,D, fixedW[20]); - RND(D,E,F,G,H,A,B,C, fixedW[21]); - RND(C,D,E,F,G,H,A,B, fixedW[22]); - RND(B,C,D,E,F,G,H,A, fixedW[23]); - RND(A,B,C,D,E,F,G,H, fixedW[24]); - RND(H,A,B,C,D,E,F,G, fixedW[25]); - RND(G,H,A,B,C,D,E,F, fixedW[26]); - RND(F,G,H,A,B,C,D,E, fixedW[27]); - RND(E,F,G,H,A,B,C,D, fixedW[28]); - RND(D,E,F,G,H,A,B,C, fixedW[29]); - RND(C,D,E,F,G,H,A,B, fixedW[30]); - RND(B,C,D,E,F,G,H,A, fixedW[31]); - RND(A,B,C,D,E,F,G,H, fixedW[32]); - RND(H,A,B,C,D,E,F,G, fixedW[33]); - RND(G,H,A,B,C,D,E,F, fixedW[34]); - RND(F,G,H,A,B,C,D,E, fixedW[35]); - RND(E,F,G,H,A,B,C,D, fixedW[36]); - RND(D,E,F,G,H,A,B,C, fixedW[37]); - RND(C,D,E,F,G,H,A,B, fixedW[38]); - RND(B,C,D,E,F,G,H,A, fixedW[39]); - RND(A,B,C,D,E,F,G,H, fixedW[40]); - RND(H,A,B,C,D,E,F,G, fixedW[41]); - RND(G,H,A,B,C,D,E,F, fixedW[42]); - RND(F,G,H,A,B,C,D,E, fixedW[43]); - RND(E,F,G,H,A,B,C,D, fixedW[44]); - RND(D,E,F,G,H,A,B,C, fixedW[45]); - RND(C,D,E,F,G,H,A,B, fixedW[46]); - RND(B,C,D,E,F,G,H,A, fixedW[47]); - RND(A,B,C,D,E,F,G,H, fixedW[48]); - RND(H,A,B,C,D,E,F,G, fixedW[49]); - RND(G,H,A,B,C,D,E,F, fixedW[50]); - RND(F,G,H,A,B,C,D,E, fixedW[51]); - RND(E,F,G,H,A,B,C,D, fixedW[52]); - RND(D,E,F,G,H,A,B,C, fixedW[53]); - RND(C,D,E,F,G,H,A,B, fixedW[54]); - RND(B,C,D,E,F,G,H,A, fixedW[55]); - RND(A,B,C,D,E,F,G,H, fixedW[56]); - RND(H,A,B,C,D,E,F,G, fixedW[57]); - RND(G,H,A,B,C,D,E,F, fixedW[58]); - RND(F,G,H,A,B,C,D,E, fixedW[59]); - RND(E,F,G,H,A,B,C,D, fixedW[60]); - RND(D,E,F,G,H,A,B,C, fixedW[61]); - RND(C,D,E,F,G,H,A,B, fixedW[62]); - RND(B,C,D,E,F,G,H,A, fixedW[63]); - -#undef A -#undef B -#undef C -#undef D -#undef E -#undef F -#undef G -#undef H - /* add the working copy back into the caller's state */ *state0 += S0; - *state1 += S1; -} - -/* shittify: permutes each half of B (B[0..3], then B[4..7]) independently. Within a half, tmp[k] gathers .x from vector k+1, .y from k+2, .z from k+3 and .w from k (indices mod 4); tmp is filled completely before B is overwritten, and every vector is passed through EndianSwap on the way back. scrypt_core calls this on its input first. EndianSwap is defined outside this section. */ void shittify(uint4 B[8]) -{ - uint4 tmp[4]; - tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w); - tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w); - tmp[2] = 
(uint4)(B[3].x,B[0].y,B[1].z,B[2].w); - tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w); - -#pragma unroll - for(uint i=0; i<4; ++i) - B[i] = EndianSwap(tmp[i]); - - tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w); - tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w); - tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w); - tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w); - -#pragma unroll - for(uint i=0; i<4; ++i) - B[i+4] = EndianSwap(tmp[i]); -} - -/* unshittify: the opposite gather to shittify, again per half: tmp[k] takes .x from vector k+3, .y from k+2, .z from k+1 and .w from k (indices mod 4), followed by EndianSwap. The lane permutation is the exact inverse of the one in shittify; the pair only round-trips the data if EndianSwap is its own inverse -- presumably it is, confirm at its definition. */ void unshittify(uint4 B[8]) -{ - uint4 tmp[4]; - tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w); - tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w); - tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w); - tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w); - -#pragma unroll - for(uint i=0; i<4; ++i) - B[i] = EndianSwap(tmp[i]); - - tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w); - tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w); - tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w); - tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w); - -#pragma unroll - for(uint i=0; i<4; ++i) - B[i+4] = EndianSwap(tmp[i]); -} - -/* salsa: mixes the eight uint4 words of B in place, in two passes. Pass 1: B[0..3] ^= B[4..7], the result is copied to w, w goes through four iterations of the eight-step add/rotate/xor sequence (rotations 7, 9, 13, 18; the last four steps read lane-rotated views through the .wxyz and .zwxy swizzles), and w is then added into B[0..3]. Pass 2 repeats this starting from B[4..7] ^= the updated B[0..3] and finishes by adding w into B[4..7]. NOTE(review): this has the shape of scrypt's BlockMix with r=1 around a Salsa20/8 core, working on the lane layout produced by shittify -- confirm against a reference implementation. rotl is defined outside this section. */ void salsa(uint4 B[8]) -{ - uint4 w[4]; - -#pragma unroll - for(uint i=0; i<4; ++i) - w[i] = (B[i]^=B[i+4]); - -#pragma unroll - for(uint i=0; i<4; ++i) - { - w[0] ^= rotl(w[3] +w[2] , 7U); - w[1] ^= rotl(w[0] +w[3] , 9U); - w[2] ^= rotl(w[1] +w[0] ,13U); - w[3] ^= rotl(w[2] +w[1] ,18U); - w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); - w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); - w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); - w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); - } - -#pragma unroll - for(uint i=0; i<4; ++i) - /* fold the mixed w into B[i], xor that into the second half, and restart w from it */ w[i] = (B[i+4]^=(B[i]+=w[i])); - -#pragma unroll - for(uint i=0; i<4; ++i) - { - w[0] ^= rotl(w[3] +w[2] , 7U); - w[1] ^= rotl(w[0] +w[3] , 9U); - w[2] ^= rotl(w[1] +w[0] ,13U); - w[3] ^= rotl(w[2] +w[1] ,18U); - w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); - w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); - w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); - w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); - } - -#pragma unroll - for(uint i=0; i<4; ++i) - B[i+4] += w[i]; -} - -#define 
Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE) -#define CO Coord(z,x,y) - -void scrypt_core(uint4 X[8], __global uint4*restrict lookup) -{ - shittify(X); - const uint zSIZE = 8; - const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0)); - const uint xSIZE = CONCURRENT_THREADS; - uint x = get_global_id(0)%xSIZE; - - for(uint y=0; y<1024/LOOKUP_GAP; ++y) - { -#pragma unroll - for(uint z=0; z Date: Tue, 24 Jul 2012 20:53:05 +1000 Subject: [PATCH 077/178] Don't make opt_scrypt mandatory blocking with opencl code. --- driver-opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver-opencl.c b/driver-opencl.c index eafdd5d2..b22017c1 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1484,7 +1484,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, unsigned int threads; int64_t hashes; - if (gpu->dynamic || opt_scrypt) + if (gpu->dynamic) blocking = CL_TRUE; else blocking = CL_FALSE; From bff58c3bed937bd027e46907acd1eab7327e838b Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 24 Jul 2012 20:55:34 +1000 Subject: [PATCH 078/178] Add back in new cl files. 
--- diablo120724.cl | 1274 +++++++++++++++++++++++++++++++++++++++++++ diakgcn120724.cl | 587 ++++++++++++++++++++ phatk120724.cl | 417 ++++++++++++++ poclbm120724.cl | 1353 ++++++++++++++++++++++++++++++++++++++++++++++ scrypt120724.cl | 757 ++++++++++++++++++++++++++ 5 files changed, 4388 insertions(+) create mode 100644 diablo120724.cl create mode 100644 diakgcn120724.cl create mode 100644 phatk120724.cl create mode 100644 poclbm120724.cl create mode 100644 scrypt120724.cl diff --git a/diablo120724.cl b/diablo120724.cl new file mode 100644 index 00000000..4b64c300 --- /dev/null +++ b/diablo120724.cl @@ -0,0 +1,1274 @@ +/* + * DiabloMiner - OpenCL miner for BitCoin + * Copyright (C) 2010, 2011, 2012 Patrick McFarland + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
 + */ + +/* z is the working word type: uint4, uint2 or scalar uint, chosen by VECTORS4 or VECTORS2. */ #ifdef VECTORS4 + typedef uint4 z; +#elif defined(VECTORS2) + typedef uint2 z; +#else + typedef uint z; +#endif + +/* Zrotr(a, b): despite the name, both variants rotate a LEFT by b bits. rotate() is a left rotate, and amd_bitalign(a, a, 32 - b) is a right rotate by 32 - b, which is the same thing. */ #ifdef BITALIGN +#pragma OPENCL EXTENSION cl_amd_media_ops : enable +#define Zrotr(a, b) amd_bitalign((z)a, (z)a, (z)(32 - b)) +#else +#define Zrotr(a, b) rotate((z)a, (z)b) +#endif + +/* ZCh and ZMa: SHA-256 choose and majority. The bitselect forms give ZCh(a,b,c) = b where a is set, c elsewhere, and ZMa(a,b,c) = b where a and c differ, a elsewhere. NOTE(review): the BFI_INT forms call amd_bytealign with the operands in bit-field-insert order; presumably the host patches that instruction to BFI_INT in the compiled binary, because bytealign itself computes something different -- confirm in the host code. */ #ifdef BFI_INT +#define ZCh(a, b, c) amd_bytealign(a, b, c) +#define ZMa(a, b, c) amd_bytealign((c ^ a), (b), (a)) +#else +#define ZCh(a, b, c) bitselect((z)c, (z)b, (z)a) +#define ZMa(a, b, c) bitselect((z)a, (z)b, (z)c ^ (z)a) +#endif + +/* Because Zrotr rotates left, these are the four SHA-256 sigma functions: ZR25 = rotr7 ^ rotr18 ^ shr3 (small sigma0), ZR15 = rotr17 ^ rotr19 ^ shr10 (small sigma1), ZR26 = rotr6 ^ rotr11 ^ rotr25 (big Sigma1), ZR30 = rotr2 ^ rotr13 ^ rotr22 (big Sigma0). */ #define ZR25(n) ((Zrotr((n), 25) ^ Zrotr((n), 14) ^ ((n) >> 3U))) +#define ZR15(n) ((Zrotr((n), 15) ^ Zrotr((n), 13) ^ ((n) >> 10U))) +#define ZR26(n) ((Zrotr((n), 26) ^ Zrotr((n), 21) ^ Zrotr((n), 7))) +#define ZR30(n) ((Zrotr((n), 30) ^ Zrotr((n), 19) ^ Zrotr((n), 10))) + +__kernel +__attribute__((vec_type_hint(z))) +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void search( +#ifndef GOFFSET + const z base, +#endif + const uint PreVal4_state0, const uint PreVal4_state0_k7, + const uint PreVal4_T1, + const uint W18, const uint W19, + const uint W16, const uint W17, + const uint W16_plus_K16, const uint W17_plus_K17, + const uint W31, const uint W32, + const uint d1, const uint b1, const uint c1, + const uint h1, const uint f1, const uint g1, + const uint c1_plus_k5, const uint b1_plus_k6, + const uint state0, const uint state1, const uint state2, const uint state3, + const uint state4, const uint state5, const uint state6, const uint state7, + __global uint * output) +{ + + z ZA[930]; + +#ifdef GOFFSET + const z Znonce = (uint)(get_global_id(0)); +#else + const z Znonce = base + (uint)(get_global_id(0)); +#endif + + ZA[15] = Znonce + PreVal4_state0; + + ZA[16] = (ZCh(ZA[15], b1, c1) + d1) + ZR26(ZA[15]); + ZA[26] = Znonce + PreVal4_T1; + + ZA[27] = ZMa(f1, g1, ZA[26]) + ZR30(ZA[26]); + ZA[17] = ZA[16] + h1; + + ZA[19] = (ZCh(ZA[17], ZA[15], b1) + c1_plus_k5) + ZR26(ZA[17]); + ZA[28] = ZA[27] + ZA[16]; + + ZA[548] = 
ZMa(ZA[26], f1, ZA[28]) + ZR30(ZA[28]); + ZA[20] = ZA[19] + g1; + + ZA[22] = (ZCh(ZA[20], ZA[17], ZA[15]) + b1_plus_k6) + ZR26(ZA[20]); + ZA[29] = ZA[548] + ZA[19]; + + ZA[549] = ZMa(ZA[28], ZA[26], ZA[29]) + ZR30(ZA[29]); + ZA[23] = ZA[22] + f1; + + ZA[24] = ZCh(ZA[23], ZA[20], ZA[17]) + ZR26(ZA[23]); + ZA[180] = Znonce + PreVal4_state0_k7; + ZA[30] = ZA[549] + ZA[22]; + + ZA[31] = ZMa(ZA[29], ZA[28], ZA[30]) + ZR30(ZA[30]); + ZA[181] = ZA[180] + ZA[24]; + + ZA[182] = ZA[181] + ZA[26]; + ZA[183] = ZA[181] + ZA[31]; + ZA[18] = ZA[17] + 0xd807aa98U; + + ZA[186] = (ZCh(ZA[182], ZA[23], ZA[20]) + ZA[18]) + ZR26(ZA[182]); + ZA[184] = ZMa(ZA[30], ZA[29], ZA[183]) + ZR30(ZA[183]); + + ZA[187] = ZA[186] + ZA[28]; + ZA[188] = ZA[186] + ZA[184]; + ZA[21] = ZA[20] + 0x12835b01U; + + ZA[191] = (ZCh(ZA[187], ZA[182], ZA[23]) + ZA[21]) + ZR26(ZA[187]); + ZA[189] = ZMa(ZA[183], ZA[30], ZA[188]) + ZR30(ZA[188]); + + ZA[192] = ZA[191] + ZA[29]; + ZA[193] = ZA[191] + ZA[189]; + ZA[25] = ZA[23] + 0x243185beU; + + ZA[196] = (ZCh(ZA[192], ZA[187], ZA[182]) + ZA[25]) + ZR26(ZA[192]); + ZA[194] = ZMa(ZA[188], ZA[183], ZA[193]) + ZR30(ZA[193]); + + ZA[197] = ZA[196] + ZA[30]; + ZA[198] = ZA[196] + ZA[194]; + ZA[185] = ZA[182] + 0x550c7dc3U; + + ZA[201] = (ZCh(ZA[197], ZA[192], ZA[187]) + ZA[185]) + ZR26(ZA[197]); + ZA[199] = ZMa(ZA[193], ZA[188], ZA[198]) + ZR30(ZA[198]); + + ZA[202] = ZA[201] + ZA[183]; + ZA[203] = ZA[201] + ZA[199]; + ZA[190] = ZA[187] + 0x72be5d74U; + + ZA[206] = (ZCh(ZA[202], ZA[197], ZA[192]) + ZA[190]) + ZR26(ZA[202]); + ZA[204] = ZMa(ZA[198], ZA[193], ZA[203]) + ZR30(ZA[203]); + + ZA[207] = ZA[206] + ZA[188]; + ZA[208] = ZA[206] + ZA[204]; + ZA[195] = ZA[192] + 0x80deb1feU; + + ZA[211] = (ZCh(ZA[207], ZA[202], ZA[197]) + ZA[195]) + ZR26(ZA[207]); + ZA[209] = ZMa(ZA[203], ZA[198], ZA[208]) + ZR30(ZA[208]); + + ZA[212] = ZA[193] + ZA[211]; + ZA[213] = ZA[211] + ZA[209]; + ZA[200] = ZA[197] + 0x9bdc06a7U; + + ZA[216] = (ZCh(ZA[212], ZA[207], ZA[202]) + ZA[200]) + 
ZR26(ZA[212]); + ZA[214] = ZMa(ZA[208], ZA[203], ZA[213]) + ZR30(ZA[213]); + + ZA[217] = ZA[198] + ZA[216]; + ZA[218] = ZA[216] + ZA[214]; + ZA[205] = ZA[202] + 0xc19bf3f4U; + + ZA[220] = (ZCh(ZA[217], ZA[212], ZA[207]) + ZA[205]) + ZR26(ZA[217]); + ZA[219] = ZMa(ZA[213], ZA[208], ZA[218]) + ZR30(ZA[218]); + + ZA[222] = ZA[203] + ZA[220]; + ZA[223] = ZA[220] + ZA[219]; + ZA[210] = ZA[207] + W16_plus_K16; + + ZA[226] = (ZCh(ZA[222], ZA[217], ZA[212]) + ZA[210]) + ZR26(ZA[222]); + ZA[225] = ZMa(ZA[218], ZA[213], ZA[223]) + ZR30(ZA[223]); + + ZA[0] = ZR25(Znonce) + W18; + ZA[228] = ZA[226] + ZA[225]; + ZA[227] = ZA[208] + ZA[226]; + ZA[215] = ZA[212] + W17_plus_K17; + + ZA[231] = (ZCh(ZA[227], ZA[222], ZA[217]) + ZA[215]) + ZR26(ZA[227]); + ZA[229] = ZMa(ZA[223], ZA[218], ZA[228]) + ZR30(ZA[228]); + ZA[1] = ZA[0] + 0x0fc19dc6U; + + ZA[232] = ZA[213] + ZA[231]; + ZA[233] = ZA[231] + ZA[229]; + ZA[221] = ZA[217] + ZA[1]; + ZA[32] = Znonce + W19; + + ZA[236] = (ZCh(ZA[232], ZA[227], ZA[222]) + ZA[221]) + ZR26(ZA[232]); + ZA[234] = ZMa(ZA[228], ZA[223], ZA[233]) + ZR30(ZA[233]); + ZA[33] = ZA[32] + 0x240ca1ccU; + + ZA[3] = ZR15(ZA[0]) + 0x80000000U; + ZA[238] = ZA[236] + ZA[234]; + ZA[237] = ZA[218] + ZA[236]; + ZA[224] = ZA[222] + ZA[33]; + + ZA[241] = (ZCh(ZA[237], ZA[232], ZA[227]) + ZA[224]) + ZR26(ZA[237]); + ZA[239] = ZMa(ZA[233], ZA[228], ZA[238]) + ZR30(ZA[238]); + ZA[4] = ZA[3] + 0x2de92c6fU; + + ZA[35] = ZR15(ZA[32]); + ZA[243] = ZA[241] + ZA[239]; + ZA[242] = ZA[223] + ZA[241]; + ZA[230] = ZA[227] + ZA[4]; + + ZA[246] = (ZCh(ZA[242], ZA[237], ZA[232]) + ZA[230]) + ZR26(ZA[242]); + ZA[244] = ZMa(ZA[238], ZA[233], ZA[243]) + ZR30(ZA[243]); + ZA[36] = ZA[35] + 0x4a7484aaU; + + ZA[7] = ZR15(ZA[3]) + 0x00000280U; + ZA[248] = ZA[246] + ZA[244]; + ZA[247] = ZA[228] + ZA[246]; + ZA[235] = ZA[232] + ZA[36]; + + ZA[251] = (ZCh(ZA[247], ZA[242], ZA[237]) + ZA[235]) + ZR26(ZA[247]); + ZA[249] = ZMa(ZA[243], ZA[238], ZA[248]) + ZR30(ZA[248]); + ZA[8] = ZA[7] + 0x5cb0a9dcU; 
+ + ZA[38] = ZR15(ZA[35]) + W16; + ZA[253] = ZA[251] + ZA[249]; + ZA[252] = ZA[233] + ZA[251]; + ZA[240] = ZA[237] + ZA[8]; + + ZA[256] = (ZCh(ZA[252], ZA[247], ZA[242]) + ZA[240]) + ZR26(ZA[252]); + ZA[254] = ZMa(ZA[248], ZA[243], ZA[253]) + ZR30(ZA[253]); + ZA[40] = ZA[38] + 0x76f988daU; + + ZA[10] = ZR15(ZA[7]) + W17; + ZA[258] = ZA[256] + ZA[254]; + ZA[257] = ZA[238] + ZA[256]; + ZA[245] = ZA[242] + ZA[40]; + + ZA[261] = (ZCh(ZA[257], ZA[252], ZA[247]) + ZA[245]) + ZR26(ZA[257]); + ZA[259] = ZMa(ZA[253], ZA[248], ZA[258]) + ZR30(ZA[258]); + ZA[13] = ZA[10] + 0x983e5152U; + + ZA[43] = ZR15(ZA[38]) + ZA[0]; + ZA[263] = ZA[261] + ZA[259]; + ZA[262] = ZA[243] + ZA[261]; + ZA[250] = ZA[247] + ZA[13]; + + ZA[266] = (ZCh(ZA[262], ZA[257], ZA[252]) + ZA[250]) + ZR26(ZA[262]); + ZA[264] = ZMa(ZA[258], ZA[253], ZA[263]) + ZR30(ZA[263]); + ZA[11] = ZR15(ZA[10]); + ZA[45] = ZA[43] + 0xa831c66dU; + + ZA[52] = ZA[11] + ZA[32]; + ZA[267] = ZA[248] + ZA[266]; + ZA[255] = ZA[252] + ZA[45]; + ZA[268] = ZA[266] + ZA[264]; + + ZA[271] = (ZCh(ZA[267], ZA[262], ZA[257]) + ZA[255]) + ZR26(ZA[267]); + ZA[269] = ZMa(ZA[263], ZA[258], ZA[268]) + ZR30(ZA[268]); + ZA[54] = ZA[52] + 0xb00327c8U; + + ZA[48] = ZR15(ZA[43]) + ZA[3]; + ZA[273] = ZA[271] + ZA[269]; + ZA[272] = ZA[253] + ZA[271]; + ZA[260] = ZA[257] + ZA[54]; + + ZA[276] = (ZCh(ZA[272], ZA[267], ZA[262]) + ZA[260]) + ZR26(ZA[272]); + ZA[274] = ZMa(ZA[268], ZA[263], ZA[273]) + ZR30(ZA[273]); + ZA[49] = ZA[48] + 0xbf597fc7U; + + ZA[61] = ZR15(ZA[52]) + ZA[35]; + ZA[278] = ZA[276] + ZA[274]; + ZA[277] = ZA[258] + ZA[276]; + ZA[265] = ZA[262] + ZA[49]; + + ZA[281] = (ZCh(ZA[277], ZA[272], ZA[267]) + ZA[265]) + ZR26(ZA[277]); + ZA[279] = ZMa(ZA[273], ZA[268], ZA[278]) + ZR30(ZA[278]); + ZA[62] = ZA[61] + 0xc6e00bf3U; + + ZA[53] = ZR15(ZA[48]) + ZA[7]; + ZA[283] = ZA[281] + ZA[279]; + ZA[282] = ZA[263] + ZA[281]; + ZA[270] = ZA[267] + ZA[62]; + + ZA[286] = (ZCh(ZA[282], ZA[277], ZA[272]) + ZA[270]) + ZR26(ZA[282]); + ZA[284] = 
ZMa(ZA[278], ZA[273], ZA[283]) + ZR30(ZA[283]); + ZA[39] = ZA[38] + 0x00A00055U; + ZA[55] = ZA[53] + 0xd5a79147U; + + ZA[66] = ZR15(ZA[61]) + ZA[39]; + ZA[288] = ZA[286] + ZA[284]; + ZA[287] = ZA[268] + ZA[286]; + ZA[275] = ZA[272] + ZA[55]; + + ZA[291] = (ZCh(ZA[287], ZA[282], ZA[277]) + ZA[275]) + ZR26(ZA[287]); + ZA[289] = ZMa(ZA[283], ZA[278], ZA[288]) + ZR30(ZA[288]); + ZA[12] = ZA[10] + W31; + ZA[68] = ZA[66] + 0x06ca6351U; + + ZA[67] = ZR15(ZA[53]) + ZA[12]; + ZA[293] = ZA[291] + ZA[289]; + ZA[292] = ZA[273] + ZA[291]; + ZA[280] = ZA[277] + ZA[68]; + + ZA[296] = (ZCh(ZA[292], ZA[287], ZA[282]) + ZA[280]) + ZR26(ZA[292]); + ZA[294] = ZMa(ZA[288], ZA[283], ZA[293]) + ZR30(ZA[293]); + ZA[2] = ZR25(ZA[0]); + ZA[69] = ZA[67] + 0x14292967U; + ZA[44] = ZA[43] + W32; + + ZA[75] = ZR15(ZA[66]) + ZA[44]; + ZA[298] = ZA[296] + ZA[294]; + ZA[297] = ZA[278] + ZA[296]; + ZA[285] = ZA[282] + ZA[69]; + ZA[5] = ZA[2] + W17; + + ZA[301] = (ZCh(ZA[297], ZA[292], ZA[287]) + ZA[285]) + ZR26(ZA[297]); + ZA[299] = ZMa(ZA[293], ZA[288], ZA[298]) + ZR30(ZA[298]); + ZA[56] = ZA[52] + ZA[5]; + ZA[76] = ZA[75] + 0x27b70a85U; + + ZA[34] = ZR25(ZA[32]) + ZA[0]; + ZA[70] = ZR15(ZA[67]) + ZA[56]; + ZA[302] = ZA[283] + ZA[301]; + ZA[303] = ZA[301] + ZA[299]; + ZA[290] = ZA[287] + ZA[76]; + + ZA[306] = (ZCh(ZA[302], ZA[297], ZA[292]) + ZA[290]) + ZR26(ZA[302]); + ZA[304] = ZMa(ZA[298], ZA[293], ZA[303]) + ZR30(ZA[303]); + ZA[6] = ZR25(ZA[3]); + ZA[77] = ZA[70] + 0x2e1b2138U; + ZA[50] = ZA[34] + ZA[48]; + + ZA[78] = ZR15(ZA[75]) + ZA[50]; + ZA[308] = ZA[306] + ZA[304]; + ZA[307] = ZA[288] + ZA[306]; + ZA[295] = ZA[292] + ZA[77]; + ZA[41] = ZA[32] + ZA[6]; + + ZA[311] = (ZCh(ZA[307], ZA[302], ZA[297]) + ZA[295]) + ZR26(ZA[307]); + ZA[309] = ZMa(ZA[303], ZA[298], ZA[308]) + ZR30(ZA[308]); + ZA[63] = ZA[41] + ZA[61]; + ZA[85] = ZA[78] + 0x4d2c6dfcU; + + ZA[37] = ZR25(ZA[35]) + ZA[3]; + ZA[79] = ZR15(ZA[70]) + ZA[63]; + ZA[312] = ZA[293] + ZA[311]; + ZA[313] = ZA[311] + ZA[309]; + ZA[300] = 
ZA[297] + ZA[85]; + + ZA[316] = (ZCh(ZA[312], ZA[307], ZA[302]) + ZA[300]) + ZR26(ZA[312]); + ZA[314] = ZMa(ZA[308], ZA[303], ZA[313]) + ZR30(ZA[313]); + ZA[9] = ZR25(ZA[7]); + ZA[86] = ZA[79] + 0x53380d13U; + ZA[57] = ZA[37] + ZA[53]; + + ZA[87] = ZR15(ZA[78]) + ZA[57]; + ZA[318] = ZA[316] + ZA[314]; + ZA[317] = ZA[298] + ZA[316]; + ZA[305] = ZA[302] + ZA[86]; + ZA[46] = ZA[35] + ZA[9]; + + ZA[321] = (ZCh(ZA[317], ZA[312], ZA[307]) + ZA[305]) + ZR26(ZA[317]); + ZA[319] = ZMa(ZA[313], ZA[308], ZA[318]) + ZR30(ZA[318]); + ZA[71] = ZA[46] + ZA[66]; + ZA[92] = ZA[87] + 0x650a7354U; + + ZA[42] = ZR25(ZA[38]) + ZA[7]; + ZA[88] = ZR15(ZA[79]) + ZA[71]; + ZA[322] = ZA[303] + ZA[321]; + ZA[323] = ZA[321] + ZA[319]; + ZA[310] = ZA[307] + ZA[92]; + + ZA[326] = (ZCh(ZA[322], ZA[317], ZA[312]) + ZA[310]) + ZR26(ZA[322]); + ZA[324] = ZMa(ZA[318], ZA[313], ZA[323]) + ZR30(ZA[323]); + ZA[14] = ZR25(ZA[10]); + ZA[93] = ZA[88] + 0x766a0abbU; + ZA[72] = ZA[42] + ZA[67]; + + ZA[94] = ZR15(ZA[87]) + ZA[72]; + ZA[328] = ZA[326] + ZA[324]; + ZA[327] = ZA[308] + ZA[326]; + ZA[315] = ZA[312] + ZA[93]; + ZA[51] = ZA[38] + ZA[14]; + + ZA[331] = (ZCh(ZA[327], ZA[322], ZA[317]) + ZA[315]) + ZR26(ZA[327]); + ZA[329] = ZMa(ZA[323], ZA[318], ZA[328]) + ZR30(ZA[328]); + ZA[80] = ZA[51] + ZA[75]; + ZA[100] = ZA[94] + 0x81c2c92eU; + + ZA[47] = ZR25(ZA[43]) + ZA[10]; + ZA[95] = ZR15(ZA[88]) + ZA[80]; + ZA[332] = ZA[313] + ZA[331]; + ZA[333] = ZA[331] + ZA[329]; + ZA[320] = ZA[317] + ZA[100]; + + ZA[336] = (ZCh(ZA[332], ZA[327], ZA[322]) + ZA[320]) + ZR26(ZA[332]); + ZA[334] = ZMa(ZA[328], ZA[323], ZA[333]) + ZR30(ZA[333]); + ZA[81] = ZA[47] + ZA[70]; + ZA[101] = ZA[95] + 0x92722c85U; + + ZA[58] = ZR25(ZA[52]) + ZA[43]; + ZA[102] = ZR15(ZA[94]) + ZA[81]; + ZA[337] = ZA[318] + ZA[336]; + ZA[338] = ZA[336] + ZA[334]; + ZA[325] = ZA[322] + ZA[101]; + + ZA[341] = (ZCh(ZA[337], ZA[332], ZA[327]) + ZA[325]) + ZR26(ZA[337]); + ZA[339] = ZMa(ZA[333], ZA[328], ZA[338]) + ZR30(ZA[338]); + ZA[89] = ZA[58] + 
ZA[78]; + ZA[108] = ZA[102] + 0xa2bfe8a1U; + + ZA[59] = ZR25(ZA[48]) + ZA[52]; + ZA[103] = ZR15(ZA[95]) + ZA[89]; + ZA[342] = ZA[323] + ZA[341]; + ZA[343] = ZA[341] + ZA[339]; + ZA[330] = ZA[327] + ZA[108]; + + ZA[346] = (ZCh(ZA[342], ZA[337], ZA[332]) + ZA[330]) + ZR26(ZA[342]); + ZA[344] = ZMa(ZA[338], ZA[333], ZA[343]) + ZR30(ZA[343]); + ZA[90] = ZA[59] + ZA[79]; + ZA[109] = ZA[103] + 0xa81a664bU; + + ZA[64] = ZR25(ZA[61]) + ZA[48]; + ZA[110] = ZR15(ZA[102]) + ZA[90]; + ZA[347] = ZA[328] + ZA[346]; + ZA[348] = ZA[346] + ZA[344]; + ZA[335] = ZA[332] + ZA[109]; + + ZA[351] = (ZCh(ZA[347], ZA[342], ZA[337]) + ZA[335]) + ZR26(ZA[347]); + ZA[349] = ZMa(ZA[343], ZA[338], ZA[348]) + ZR30(ZA[348]); + ZA[60] = ZR25(ZA[53]); + ZA[116] = ZA[110] + 0xc24b8b70U; + ZA[96] = ZA[87] + ZA[64]; + + ZA[111] = ZR15(ZA[103]) + ZA[96]; + ZA[353] = ZA[351] + ZA[349]; + ZA[352] = ZA[333] + ZA[351]; + ZA[340] = ZA[337] + ZA[116]; + ZA[65] = ZA[60] + ZA[61]; + + ZA[356] = (ZCh(ZA[352], ZA[347], ZA[342]) + ZA[340]) + ZR26(ZA[352]); + ZA[354] = ZMa(ZA[348], ZA[343], ZA[353]) + ZR30(ZA[353]); + ZA[97] = ZA[88] + ZA[65]; + ZA[117] = ZA[111] + 0xc76c51a3U; + + ZA[73] = ZR25(ZA[66]) + ZA[53]; + ZA[118] = ZR15(ZA[110]) + ZA[97]; + ZA[357] = ZA[338] + ZA[356]; + ZA[358] = ZA[356] + ZA[354]; + ZA[345] = ZA[342] + ZA[117]; + + ZA[361] = (ZCh(ZA[357], ZA[352], ZA[347]) + ZA[345]) + ZR26(ZA[357]); + ZA[359] = ZMa(ZA[353], ZA[348], ZA[358]) + ZR30(ZA[358]); + ZA[104] = ZA[73] + ZA[94]; + ZA[124] = ZA[118] + 0xd192e819U; + + ZA[74] = ZR25(ZA[67]) + ZA[66]; + ZA[119] = ZR15(ZA[111]) + ZA[104]; + ZA[362] = ZA[343] + ZA[361]; + ZA[363] = ZA[361] + ZA[359]; + ZA[350] = ZA[347] + ZA[124]; + + ZA[366] = (ZCh(ZA[362], ZA[357], ZA[352]) + ZA[350]) + ZR26(ZA[362]); + ZA[364] = ZMa(ZA[358], ZA[353], ZA[363]) + ZR30(ZA[363]); + ZA[105] = ZA[74] + ZA[95]; + ZA[125] = ZA[119] + 0xd6990624U; + + ZA[82] = ZR25(ZA[75]) + ZA[67]; + ZA[126] = ZR15(ZA[118]) + ZA[105]; + ZA[367] = ZA[348] + ZA[366]; + ZA[368] = ZA[366] + 
ZA[364]; + ZA[355] = ZA[352] + ZA[125]; + + ZA[371] = (ZCh(ZA[367], ZA[362], ZA[357]) + ZA[355]) + ZR26(ZA[367]); + ZA[369] = ZMa(ZA[363], ZA[358], ZA[368]) + ZR30(ZA[368]); + ZA[112] = ZA[102] + ZA[82]; + ZA[132] = ZA[126] + 0xf40e3585U; + + ZA[83] = ZR25(ZA[70]) + ZA[75]; + ZA[127] = ZR15(ZA[119]) + ZA[112]; + ZA[372] = ZA[353] + ZA[371]; + ZA[373] = ZA[371] + ZA[369]; + ZA[360] = ZA[357] + ZA[132]; + + ZA[376] = (ZCh(ZA[372], ZA[367], ZA[362]) + ZA[360]) + ZR26(ZA[372]); + ZA[374] = ZMa(ZA[368], ZA[363], ZA[373]) + ZR30(ZA[373]); + ZA[113] = ZA[103] + ZA[83]; + ZA[133] = ZA[127] + 0x106aa070U; + + ZA[84] = ZR25(ZA[78]) + ZA[70]; + ZA[134] = ZR15(ZA[126]) + ZA[113]; + ZA[377] = ZA[358] + ZA[376]; + ZA[378] = ZA[376] + ZA[374]; + ZA[365] = ZA[362] + ZA[133]; + + ZA[381] = (ZCh(ZA[377], ZA[372], ZA[367]) + ZA[365]) + ZR26(ZA[377]); + ZA[379] = ZMa(ZA[373], ZA[368], ZA[378]) + ZR30(ZA[378]); + ZA[120] = ZA[110] + ZA[84]; + ZA[140] = ZA[134] + 0x19a4c116U; + + ZA[91] = ZR25(ZA[79]) + ZA[78]; + ZA[135] = ZR15(ZA[127]) + ZA[120]; + ZA[382] = ZA[363] + ZA[381]; + ZA[383] = ZA[381] + ZA[379]; + ZA[370] = ZA[367] + ZA[140]; + + ZA[386] = (ZCh(ZA[382], ZA[377], ZA[372]) + ZA[370]) + ZR26(ZA[382]); + ZA[384] = ZMa(ZA[378], ZA[373], ZA[383]) + ZR30(ZA[383]); + ZA[121] = ZA[111] + ZA[91]; + ZA[141] = ZA[135] + 0x1e376c08U; + + ZA[98] = ZR25(ZA[87]) + ZA[79]; + ZA[142] = ZR15(ZA[134]) + ZA[121]; + ZA[387] = ZA[368] + ZA[386]; + ZA[388] = ZA[386] + ZA[384]; + ZA[375] = ZA[372] + ZA[141]; + + ZA[391] = (ZCh(ZA[387], ZA[382], ZA[377]) + ZA[375]) + ZR26(ZA[387]); + ZA[389] = ZMa(ZA[383], ZA[378], ZA[388]) + ZR30(ZA[388]); + ZA[128] = ZA[118] + ZA[98]; + ZA[147] = ZA[142] + 0x2748774cU; + + ZA[99] = ZR25(ZA[88]) + ZA[87]; + ZA[143] = ZR15(ZA[135]) + ZA[128]; + ZA[392] = ZA[373] + ZA[391]; + ZA[393] = ZA[391] + ZA[389]; + ZA[380] = ZA[377] + ZA[147]; + + ZA[396] = (ZCh(ZA[392], ZA[387], ZA[382]) + ZA[380]) + ZR26(ZA[392]); + ZA[394] = ZMa(ZA[388], ZA[383], ZA[393]) + ZR30(ZA[393]); 
+ ZA[129] = ZA[119] + ZA[99]; + ZA[148] = ZA[143] + 0x34b0bcb5U; + + ZA[106] = ZR25(ZA[94]) + ZA[88]; + ZA[149] = ZR15(ZA[142]) + ZA[129]; + ZA[397] = ZA[378] + ZA[396]; + ZA[398] = ZA[396] + ZA[394]; + ZA[385] = ZA[382] + ZA[148]; + + ZA[401] = (ZCh(ZA[397], ZA[392], ZA[387]) + ZA[385]) + ZR26(ZA[397]); + ZA[399] = ZMa(ZA[393], ZA[388], ZA[398]) + ZR30(ZA[398]); + ZA[136] = ZA[126] + ZA[106]; + ZA[153] = ZA[149] + 0x391c0cb3U; + + ZA[107] = ZR25(ZA[95]) + ZA[94]; + ZA[150] = ZR15(ZA[143]) + ZA[136]; + ZA[402] = ZA[383] + ZA[401]; + ZA[403] = ZA[401] + ZA[399]; + ZA[390] = ZA[387] + ZA[153]; + + ZA[406] = (ZCh(ZA[402], ZA[397], ZA[392]) + ZA[390]) + ZR26(ZA[402]); + ZA[404] = ZMa(ZA[398], ZA[393], ZA[403]) + ZR30(ZA[403]); + ZA[137] = ZA[127] + ZA[107]; + ZA[154] = ZA[150] + 0x4ed8aa4aU; + + ZA[114] = ZR25(ZA[102]) + ZA[95]; + ZA[155] = ZR15(ZA[149]) + ZA[137]; + ZA[407] = ZA[388] + ZA[406]; + ZA[408] = ZA[406] + ZA[404]; + ZA[395] = ZA[392] + ZA[154]; + + ZA[411] = (ZCh(ZA[407], ZA[402], ZA[397]) + ZA[395]) + ZR26(ZA[407]); + ZA[409] = ZMa(ZA[403], ZA[398], ZA[408]) + ZR30(ZA[408]); + ZA[144] = ZA[134] + ZA[114]; + ZA[159] = ZA[155] + 0x5b9cca4fU; + + ZA[115] = ZR25(ZA[103]) + ZA[102]; + ZA[156] = ZR15(ZA[150]) + ZA[144]; + ZA[412] = ZA[393] + ZA[411]; + ZA[413] = ZA[411] + ZA[409]; + ZA[400] = ZA[397] + ZA[159]; + + ZA[416] = (ZCh(ZA[412], ZA[407], ZA[402]) + ZA[400]) + ZR26(ZA[412]); + ZA[414] = ZMa(ZA[408], ZA[403], ZA[413]) + ZR30(ZA[413]); + ZA[145] = ZA[135] + ZA[115]; + ZA[160] = ZA[156] + 0x682e6ff3U; + + ZA[122] = ZR25(ZA[110]) + ZA[103]; + ZA[161] = ZR15(ZA[155]) + ZA[145]; + ZA[417] = ZA[398] + ZA[416]; + ZA[418] = ZA[416] + ZA[414]; + ZA[405] = ZA[402] + ZA[160]; + + ZA[421] = (ZCh(ZA[417], ZA[412], ZA[407]) + ZA[405]) + ZR26(ZA[417]); + ZA[419] = ZMa(ZA[413], ZA[408], ZA[418]) + ZR30(ZA[418]); + ZA[151] = ZA[142] + ZA[122]; + ZA[165] = ZA[161] + 0x748f82eeU; + + ZA[123] = ZR25(ZA[111]) + ZA[110]; + ZA[162] = ZR15(ZA[156]) + ZA[151]; + ZA[422] = 
ZA[403] + ZA[421]; + ZA[423] = ZA[421] + ZA[419]; + ZA[410] = ZA[407] + ZA[165]; + + ZA[426] = (ZCh(ZA[422], ZA[417], ZA[412]) + ZA[410]) + ZR26(ZA[422]); + ZA[424] = ZMa(ZA[418], ZA[413], ZA[423]) + ZR30(ZA[423]); + ZA[152] = ZA[143] + ZA[123]; + ZA[166] = ZA[162] + 0x78a5636fU; + + ZA[130] = ZR25(ZA[118]) + ZA[111]; + ZA[167] = ZR15(ZA[161]) + ZA[152]; + ZA[427] = ZA[408] + ZA[426]; + ZA[428] = ZA[426] + ZA[424]; + ZA[415] = ZA[412] + ZA[166]; + + ZA[431] = (ZCh(ZA[427], ZA[422], ZA[417]) + ZA[415]) + ZR26(ZA[427]); + ZA[429] = ZMa(ZA[423], ZA[418], ZA[428]) + ZR30(ZA[428]); + ZA[157] = ZA[149] + ZA[130]; + ZA[170] = ZA[167] + 0x84c87814U; + + ZA[131] = ZR25(ZA[119]) + ZA[118]; + ZA[168] = ZR15(ZA[162]) + ZA[157]; + ZA[432] = ZA[413] + ZA[431]; + ZA[433] = ZA[431] + ZA[429]; + ZA[420] = ZA[417] + ZA[170]; + + ZA[436] = (ZCh(ZA[432], ZA[427], ZA[422]) + ZA[420]) + ZR26(ZA[432]); + ZA[434] = ZMa(ZA[428], ZA[423], ZA[433]) + ZR30(ZA[433]); + ZA[158] = ZA[150] + ZA[131]; + ZA[171] = ZA[168] + 0x8cc70208U; + + ZA[138] = ZR25(ZA[126]) + ZA[119]; + ZA[172] = ZR15(ZA[167]) + ZA[158]; + ZA[437] = ZA[418] + ZA[436]; + ZA[438] = ZA[436] + ZA[434]; + ZA[425] = ZA[422] + ZA[171]; + + ZA[441] = (ZCh(ZA[437], ZA[432], ZA[427]) + ZA[425]) + ZR26(ZA[437]); + ZA[439] = ZMa(ZA[433], ZA[428], ZA[438]) + ZR30(ZA[438]); + ZA[163] = ZA[155] + ZA[138]; + ZA[174] = ZA[172] + 0x90befffaU; + + ZA[139] = ZR25(ZA[127]) + ZA[126]; + ZA[173] = ZR15(ZA[168]) + ZA[163]; + ZA[442] = ZA[423] + ZA[441]; + ZA[443] = ZA[441] + ZA[439]; + ZA[430] = ZA[427] + ZA[174]; + + ZA[445] = (ZCh(ZA[442], ZA[437], ZA[432]) + ZA[430]) + ZR26(ZA[442]); + ZA[444] = ZMa(ZA[438], ZA[433], ZA[443]) + ZR30(ZA[443]); + ZA[164] = ZA[156] + ZA[139]; + ZA[175] = ZA[173] + 0xa4506cebU; + + ZA[146] = ZR25(ZA[134]) + ZA[127]; + ZA[176] = ZR15(ZA[172]) + ZA[164]; + ZA[446] = ZA[428] + ZA[445]; + ZA[447] = ZA[445] + ZA[444]; + ZA[435] = ZA[432] + ZA[175]; + + ZA[449] = (ZCh(ZA[446], ZA[442], ZA[437]) + ZA[435]) + ZR26(ZA[446]); 
+ ZA[448] = ZMa(ZA[443], ZA[438], ZA[447]) + ZR30(ZA[447]); + ZA[169] = ZA[161] + ZA[146]; + ZA[178] = ZA[176] + 0xbef9a3f7U; + + ZA[177] = ZR15(ZA[173]) + ZA[169]; + ZA[451] = ZA[449] + ZA[448]; + ZA[450] = ZA[433] + ZA[449]; + ZA[440] = ZA[437] + ZA[178]; + + ZA[453] = (ZCh(ZA[450], ZA[446], ZA[442]) + ZA[440]) + ZR26(ZA[450]); + ZA[452] = ZMa(ZA[447], ZA[443], ZA[451]) + ZR30(ZA[451]); + ZA[179] = ZA[177] + 0xc67178f2U; + + ZA[454] = ZA[438] + ZA[453]; + ZA[494] = ZA[442] + ZA[179]; + ZA[455] = ZA[453] + ZA[452]; + + ZA[457] = (ZCh(ZA[454], ZA[450], ZA[446]) + ZA[494]) + ZR26(ZA[454]); + ZA[456] = ZMa(ZA[451], ZA[447], ZA[455]) + ZR30(ZA[455]); + + ZA[459] = ZA[457] + ZA[456]; + + ZA[461] = ZA[455] + state1; + ZA[460] = ZA[459] + state0; + + ZA[495] = ZA[460] + 0x98c7e2a2U; + ZA[469] = ZA[461] + 0x90bb1e3cU; + + ZA[498] = (ZCh(ZA[495], 0x510e527fU, 0x9b05688cU) + ZA[469]) + ZR26(ZA[495]); + ZA[462] = ZA[451] + state2; + + ZA[496] = ZA[460] + 0xfc08884dU; + ZA[506] = ZA[498] + 0x3c6ef372U; + ZA[470] = ZA[462] + 0x50c6645bU; + + ZA[507] = (ZCh(ZA[506], ZA[495], 0x510e527fU) + ZA[470]) + ZR26(ZA[506]); + ZA[500] = ZMa(0x6a09e667U, 0xbb67ae85U, ZA[496]) + ZR30(ZA[496]); + ZA[463] = ZA[447] + state3; + + ZA[458] = ZA[443] + ZA[457]; + ZA[499] = ZA[498] + ZA[500]; + ZA[508] = ZA[507] + 0xbb67ae85U; + ZA[473] = ZA[463] + 0x3ac42e24U; + + ZA[510] = (ZCh(ZA[508], ZA[506], ZA[495]) + ZA[473]) + ZR26(ZA[508]); + ZA[928] = ZMa(ZA[496], 0x6a09e667U, ZA[499]) + ZR30(ZA[499]); + ZA[464] = ZA[458] + state4; + + ZA[476] = ZA[464] + ZA[460] + 0xd21ea4fdU; + ZA[511] = ZA[510] + 0x6a09e667U; + ZA[509] = ZA[928] + ZA[507]; + ZA[465] = ZA[454] + state5; + + ZA[514] = (ZCh(ZA[511], ZA[508], ZA[506]) + ZA[476]) + ZR26(ZA[511]); + ZA[512] = ZMa(ZA[499], ZA[496], ZA[509]) + ZR30(ZA[509]); + ZA[478] = ZA[465] + 0x59f111f1U; + + ZA[519] = ZA[506] + ZA[478]; + ZA[516] = ZA[496] + ZA[514]; + ZA[513] = ZA[510] + ZA[512]; + ZA[466] = ZA[450] + state6; + + ZA[520] = (ZCh(ZA[516], ZA[511], 
ZA[508]) + ZA[519]) + ZR26(ZA[516]); + ZA[515] = ZMa(ZA[509], ZA[499], ZA[513]) + ZR30(ZA[513]); + ZA[480] = ZA[466] + 0x923f82a4U; + + ZA[524] = ZA[508] + ZA[480]; + ZA[521] = ZA[499] + ZA[520]; + ZA[517] = ZA[514] + ZA[515]; + ZA[467] = ZA[446] + state7; + + ZA[525] = (ZCh(ZA[521], ZA[516], ZA[511]) + ZA[524]) + ZR26(ZA[521]); + ZA[522] = ZMa(ZA[513], ZA[509], ZA[517]) + ZR30(ZA[517]); + ZA[484] = ZA[467] + 0xab1c5ed5U; + + ZA[529] = ZA[511] + ZA[484]; + ZA[526] = ZA[509] + ZA[525]; + ZA[523] = ZA[520] + ZA[522]; + + ZA[530] = (ZCh(ZA[526], ZA[521], ZA[516]) + ZA[529]) + ZR26(ZA[526]); + ZA[550] = ZMa(ZA[517], ZA[513], ZA[523]) + ZR30(ZA[523]); + + ZA[531] = ZA[513] + ZA[530]; + ZA[533] = ZA[516] + 0x5807aa98U; + ZA[527] = ZA[550] + ZA[525]; + + ZA[534] = (ZCh(ZA[531], ZA[526], ZA[521]) + ZA[533]) + ZR26(ZA[531]); + ZA[551] = ZMa(ZA[523], ZA[517], ZA[527]) + ZR30(ZA[527]); + + ZA[535] = ZA[517] + ZA[534]; + ZA[538] = ZA[521] + 0x12835b01U; + ZA[532] = ZA[551] + ZA[530]; + + ZA[539] = (ZCh(ZA[535], ZA[531], ZA[526]) + ZA[538]) + ZR26(ZA[535]); + ZA[552] = ZMa(ZA[527], ZA[523], ZA[532]) + ZR30(ZA[532]); + + ZA[540] = ZA[523] + ZA[539]; + ZA[542] = ZA[526] + 0x243185beU; + ZA[536] = ZA[552] + ZA[534]; + + ZA[543] = (ZCh(ZA[540], ZA[535], ZA[531]) + ZA[542]) + ZR26(ZA[540]); + ZA[553] = ZMa(ZA[532], ZA[527], ZA[536]) + ZR30(ZA[536]); + + ZA[544] = ZA[527] + ZA[543]; + ZA[555] = ZA[531] + 0x550c7dc3U; + ZA[541] = ZA[553] + ZA[539]; + + ZA[558] = (ZCh(ZA[544], ZA[540], ZA[535]) + ZA[555]) + ZR26(ZA[544]); + ZA[547] = ZMa(ZA[536], ZA[532], ZA[541]) + ZR30(ZA[541]); + + ZA[559] = ZA[532] + ZA[558]; + ZA[556] = ZA[535] + 0x72be5d74U; + ZA[545] = ZA[547] + ZA[543]; + + ZA[562] = (ZCh(ZA[559], ZA[544], ZA[540]) + ZA[556]) + ZR26(ZA[559]); + ZA[561] = ZMa(ZA[541], ZA[536], ZA[545]) + ZR30(ZA[545]); + + ZA[563] = ZA[536] + ZA[562]; + ZA[560] = ZA[561] + ZA[558]; + ZA[557] = ZA[540] + 0x80deb1feU; + + ZA[568] = (ZCh(ZA[563], ZA[559], ZA[544]) + ZA[557]) + ZR26(ZA[563]); + 
ZA[564] = ZMa(ZA[545], ZA[541], ZA[560]) + ZR30(ZA[560]); + + ZA[569] = ZA[541] + ZA[568]; + ZA[572] = ZA[544] + 0x9bdc06a7U; + ZA[565] = ZA[562] + ZA[564]; + + ZA[574] = (ZCh(ZA[569], ZA[563], ZA[559]) + ZA[572]) + ZR26(ZA[569]); + ZA[570] = ZMa(ZA[560], ZA[545], ZA[565]) + ZR30(ZA[565]); + ZA[468] = ZR25(ZA[461]); + + ZA[497] = ZA[468] + ZA[460]; + ZA[575] = ZA[545] + ZA[574]; + ZA[571] = ZA[568] + ZA[570]; + ZA[573] = ZA[559] + 0xc19bf274U; + + ZA[578] = (ZCh(ZA[575], ZA[569], ZA[563]) + ZA[573]) + ZR26(ZA[575]); + ZA[576] = ZMa(ZA[565], ZA[560], ZA[571]) + ZR30(ZA[571]); + ZA[929] = ZR25(ZA[462]); + ZA[503] = ZA[497] + 0xe49b69c1U; + + ZA[471] = ZA[929] + ZA[461] + 0x00a00000U; + ZA[582] = ZA[563] + ZA[503]; + ZA[579] = ZA[560] + ZA[578]; + ZA[577] = ZA[574] + ZA[576]; + + ZA[583] = (ZCh(ZA[579], ZA[575], ZA[569]) + ZA[582]) + ZR26(ZA[579]); + ZA[580] = ZMa(ZA[571], ZA[565], ZA[577]) + ZR30(ZA[577]); + ZA[488] = ZA[471] + 0xefbe4786U; + + ZA[472] = ZR25(ZA[463]) + ZA[462]; + ZA[587] = ZA[569] + ZA[488]; + ZA[584] = ZA[565] + ZA[583]; + ZA[581] = ZA[578] + ZA[580]; + + ZA[588] = (ZCh(ZA[584], ZA[579], ZA[575]) + ZA[587]) + ZR26(ZA[584]); + ZA[586] = ZMa(ZA[577], ZA[571], ZA[581]) + ZR30(ZA[581]); + ZA[501] = ZR15(ZA[497]) + ZA[472]; + ZA[475] = ZR15(ZA[471]); + ZA[926] = ZA[575] + 0x0fc19dc6U; + + ZA[474] = ZA[475] + ZA[463] + ZR25(ZA[464]); + ZA[927] = ZA[926] + ZA[501]; + ZA[589] = ZA[571] + ZA[588]; + ZA[585] = ZA[583] + ZA[586]; + + ZA[592] = (ZCh(ZA[589], ZA[584], ZA[579]) + ZA[927]) + ZR26(ZA[589]); + ZA[590] = ZMa(ZA[581], ZA[577], ZA[585]) + ZR30(ZA[585]); + ZA[477] = ZR25(ZA[465]) + ZA[464]; + ZA[489] = ZA[474] + 0x240ca1ccU; + + ZA[518] = ZR15(ZA[501]) + ZA[477]; + ZA[479] = ZR25(ZA[466]); + ZA[596] = ZA[579] + ZA[489]; + ZA[593] = ZA[577] + ZA[592]; + ZA[591] = ZA[588] + ZA[590]; + + ZA[597] = (ZCh(ZA[593], ZA[589], ZA[584]) + ZA[596]) + ZR26(ZA[593]); + ZA[594] = ZMa(ZA[585], ZA[581], ZA[591]) + ZR30(ZA[591]); + ZA[481] = ZA[479] + ZA[465]; + ZA[601] 
= ZA[518] + 0x2de92c6fU; + + ZA[482] = ZR15(ZA[474]) + ZA[481]; + ZA[602] = ZA[584] + ZA[601]; + ZA[598] = ZA[581] + ZA[597]; + ZA[595] = ZA[592] + ZA[594]; + + ZA[632] = (ZCh(ZA[598], ZA[593], ZA[589]) + ZA[602]) + ZR26(ZA[598]); + ZA[599] = ZMa(ZA[591], ZA[585], ZA[595]) + ZR30(ZA[595]); + ZA[483] = ZA[466] + 0x00000100U + ZR25(ZA[467]); + ZA[490] = ZA[482] + 0x4a7484aaU; + + ZA[528] = ZR15(ZA[518]) + ZA[483]; + ZA[736] = ZA[585] + ZA[632]; + ZA[605] = ZA[589] + ZA[490]; + ZA[600] = ZA[597] + ZA[599]; + ZA[485] = ZA[467] + 0x11002000U; + + ZA[738] = (ZCh(ZA[736], ZA[598], ZA[593]) + ZA[605]) + ZR26(ZA[736]); + ZA[744] = ZMa(ZA[595], ZA[591], ZA[600]) + ZR30(ZA[600]); + ZA[487] = ZR15(ZA[482]) + ZA[485]; + ZA[603] = ZA[528] + 0x5cb0a9dcU; + + ZA[502] = ZA[497] + ZA[487]; + ZA[739] = ZA[591] + ZA[738]; + ZA[604] = ZA[593] + ZA[603]; + ZA[737] = ZA[744] + ZA[632]; + + ZA[741] = (ZCh(ZA[739], ZA[736], ZA[598]) + ZA[604]) + ZR26(ZA[739]); + ZA[745] = ZMa(ZA[600], ZA[595], ZA[737]) + ZR30(ZA[737]); + ZA[486] = ZA[471] + 0x80000000U; + ZA[606] = ZA[502] + 0x76f988daU; + + ZA[537] = ZR15(ZA[528]) + ZA[486]; + ZA[742] = ZA[595] + ZA[741]; + ZA[613] = ZA[598] + ZA[606]; + ZA[740] = ZA[745] + ZA[738]; + + ZA[747] = (ZCh(ZA[742], ZA[739], ZA[736]) + ZA[613]) + ZR26(ZA[742]); + ZA[746] = ZMa(ZA[737], ZA[600], ZA[740]) + ZR30(ZA[740]); + ZA[607] = ZA[537] + 0x983e5152U; + + ZA[546] = ZR15(ZA[502]) + ZA[501]; + ZA[751] = ZA[736] + ZA[607]; + ZA[748] = ZA[600] + ZA[747]; + ZA[743] = ZA[746] + ZA[741]; + + ZA[752] = (ZCh(ZA[748], ZA[742], ZA[739]) + ZA[751]) + ZR26(ZA[748]); + ZA[749] = ZMa(ZA[740], ZA[737], ZA[743]) + ZR30(ZA[743]); + ZA[608] = ZA[546] + 0xa831c66dU; + + ZA[554] = ZR15(ZA[537]) + ZA[474]; + ZA[756] = ZA[739] + ZA[608]; + ZA[753] = ZA[737] + ZA[752]; + ZA[750] = ZA[747] + ZA[749]; + + ZA[757] = (ZCh(ZA[753], ZA[748], ZA[742]) + ZA[756]) + ZR26(ZA[753]); + ZA[754] = ZMa(ZA[743], ZA[740], ZA[750]) + ZR30(ZA[750]); + ZA[609] = ZA[554] + 0xb00327c8U; + + ZA[566] = 
ZR15(ZA[546]) + ZA[518]; + ZA[761] = ZA[742] + ZA[609]; + ZA[758] = ZA[740] + ZA[757]; + ZA[755] = ZA[752] + ZA[754]; + + ZA[762] = (ZCh(ZA[758], ZA[753], ZA[748]) + ZA[761]) + ZR26(ZA[758]); + ZA[759] = ZMa(ZA[750], ZA[743], ZA[755]) + ZR30(ZA[755]); + ZA[610] = ZA[566] + 0xbf597fc7U; + + ZA[567] = ZR15(ZA[554]) + ZA[482]; + ZA[766] = ZA[748] + ZA[610]; + ZA[763] = ZA[743] + ZA[762]; + ZA[760] = ZA[757] + ZA[759]; + + ZA[767] = (ZCh(ZA[763], ZA[758], ZA[753]) + ZA[766]) + ZR26(ZA[763]); + ZA[764] = ZMa(ZA[755], ZA[750], ZA[760]) + ZR30(ZA[760]); + ZA[611] = ZA[567] + 0xc6e00bf3U; + + ZA[614] = ZR15(ZA[566]) + ZA[528]; + ZA[771] = ZA[753] + ZA[611]; + ZA[768] = ZA[750] + ZA[767]; + ZA[765] = ZA[762] + ZA[764]; + + ZA[772] = (ZCh(ZA[768], ZA[763], ZA[758]) + ZA[771]) + ZR26(ZA[768]); + ZA[769] = ZMa(ZA[760], ZA[755], ZA[765]) + ZR30(ZA[765]); + ZA[612] = ZA[502] + 0x00400022U; + ZA[615] = ZA[614] + 0xd5a79147U; + + ZA[616] = ZR15(ZA[567]) + ZA[612]; + ZA[504] = ZR25(ZA[497]) + 0x00000100U; + ZA[776] = ZA[758] + ZA[615]; + ZA[773] = ZA[755] + ZA[772]; + ZA[770] = ZA[767] + ZA[769]; + + ZA[777] = (ZCh(ZA[773], ZA[768], ZA[763]) + ZA[776]) + ZR26(ZA[773]); + ZA[774] = ZMa(ZA[765], ZA[760], ZA[770]) + ZR30(ZA[770]); + ZA[492] = ZR25(ZA[471]); + ZA[618] = ZA[537] + ZA[504]; + ZA[617] = ZA[616] + 0x06ca6351U; + + ZA[619] = ZR15(ZA[614]) + ZA[618]; + ZA[781] = ZA[763] + ZA[617]; + ZA[778] = ZA[760] + ZA[777]; + ZA[775] = ZA[772] + ZA[774]; + ZA[505] = ZA[492] + ZA[497]; + + ZA[782] = (ZCh(ZA[778], ZA[773], ZA[768]) + ZA[781]) + ZR26(ZA[778]); + ZA[779] = ZMa(ZA[770], ZA[765], ZA[775]) + ZR30(ZA[775]); + ZA[621] = ZA[505] + ZA[546]; + ZA[620] = ZA[619] + 0x14292967U; + + ZA[622] = ZR15(ZA[616]) + ZA[621]; + ZA[625] = ZR25(ZA[501]); + ZA[786] = ZA[768] + ZA[620]; + ZA[783] = ZA[765] + ZA[782]; + ZA[624] = ZA[554] + ZA[471]; + ZA[780] = ZA[777] + ZA[779]; + + ZA[787] = (ZCh(ZA[783], ZA[778], ZA[773]) + ZA[786]) + ZR26(ZA[783]); + ZA[784] = ZMa(ZA[775], ZA[770], ZA[780]) + 
ZR30(ZA[780]); + ZA[493] = ZR25(ZA[474]); + ZA[626] = ZA[625] + ZA[624]; + ZA[623] = ZA[622] + 0x27b70a85U; + + ZA[627] = ZR15(ZA[619]) + ZA[626]; + ZA[791] = ZA[773] + ZA[623]; + ZA[788] = ZA[770] + ZA[787]; + ZA[785] = ZA[782] + ZA[784]; + ZA[629] = ZA[493] + ZA[501]; + + ZA[792] = (ZCh(ZA[788], ZA[783], ZA[778]) + ZA[791]) + ZR26(ZA[788]); + ZA[789] = ZMa(ZA[780], ZA[775], ZA[785]) + ZR30(ZA[785]); + ZA[630] = ZA[566] + ZA[629]; + ZA[628] = ZA[627] + 0x2e1b2138U; + + ZA[634] = ZR25(ZA[518]) + ZA[474]; + ZA[631] = ZR15(ZA[622]) + ZA[630]; + ZA[796] = ZA[778] + ZA[628]; + ZA[793] = ZA[775] + ZA[792]; + ZA[790] = ZA[787] + ZA[789]; + + ZA[797] = (ZCh(ZA[793], ZA[788], ZA[783]) + ZA[796]) + ZR26(ZA[793]); + ZA[794] = ZMa(ZA[785], ZA[780], ZA[790]) + ZR30(ZA[790]); + ZA[491] = ZR25(ZA[482]); + ZA[635] = ZA[567] + ZA[634]; + ZA[633] = ZA[631] + 0x4d2c6dfcU; + + ZA[636] = ZR15(ZA[627]) + ZA[635]; + ZA[801] = ZA[783] + ZA[633]; + ZA[798] = ZA[780] + ZA[797]; + ZA[795] = ZA[792] + ZA[794]; + ZA[638] = ZA[491] + ZA[518]; + + ZA[802] = (ZCh(ZA[798], ZA[793], ZA[788]) + ZA[801]) + ZR26(ZA[798]); + ZA[799] = ZMa(ZA[790], ZA[785], ZA[795]) + ZR30(ZA[795]); + ZA[639] = ZA[638] + ZA[614]; + ZA[637] = ZA[636] + 0x53380d13U; + + ZA[642] = ZR25(ZA[528]) + ZA[482]; + ZA[640] = ZR15(ZA[631]) + ZA[639]; + ZA[806] = ZA[788] + ZA[637]; + ZA[803] = ZA[785] + ZA[802]; + ZA[800] = ZA[797] + ZA[799]; + + ZA[807] = (ZCh(ZA[803], ZA[798], ZA[793]) + ZA[806]) + ZR26(ZA[803]); + ZA[804] = ZMa(ZA[795], ZA[790], ZA[800]) + ZR30(ZA[800]); + ZA[643] = ZA[616] + ZA[642]; + ZA[641] = ZA[640] + 0x650a7354U; + + ZA[646] = ZR25(ZA[502]) + ZA[528]; + ZA[644] = ZR15(ZA[636]) + ZA[643]; + ZA[811] = ZA[793] + ZA[641]; + ZA[808] = ZA[790] + ZA[807]; + ZA[805] = ZA[802] + ZA[804]; + + ZA[812] = (ZCh(ZA[808], ZA[803], ZA[798]) + ZA[811]) + ZR26(ZA[808]); + ZA[809] = ZMa(ZA[800], ZA[795], ZA[805]) + ZR30(ZA[805]); + ZA[647] = ZA[619] + ZA[646]; + ZA[645] = ZA[644] + 0x766a0abbU; + + ZA[650] = ZR25(ZA[537]) + 
ZA[502]; + ZA[648] = ZR15(ZA[640]) + ZA[647]; + ZA[816] = ZA[798] + ZA[645]; + ZA[813] = ZA[795] + ZA[812]; + ZA[810] = ZA[807] + ZA[809]; + + ZA[817] = (ZCh(ZA[813], ZA[808], ZA[803]) + ZA[816]) + ZR26(ZA[813]); + ZA[814] = ZMa(ZA[805], ZA[800], ZA[810]) + ZR30(ZA[810]); + ZA[925] = ZA[622] + ZA[650]; + ZA[649] = ZA[648] + 0x81c2c92eU; + + ZA[653] = ZR25(ZA[546]) + ZA[537]; + ZA[651] = ZR15(ZA[644]) + ZA[925]; + ZA[821] = ZA[803] + ZA[649]; + ZA[818] = ZA[800] + ZA[817]; + ZA[815] = ZA[812] + ZA[814]; + + ZA[822] = (ZCh(ZA[818], ZA[813], ZA[808]) + ZA[821]) + ZR26(ZA[818]); + ZA[819] = ZMa(ZA[810], ZA[805], ZA[815]) + ZR30(ZA[815]); + ZA[654] = ZA[627] + ZA[653]; + ZA[652] = ZA[651] + 0x92722c85U; + + ZA[657] = ZR25(ZA[554]) + ZA[546]; + ZA[655] = ZR15(ZA[648]) + ZA[654]; + ZA[826] = ZA[808] + ZA[652]; + ZA[823] = ZA[805] + ZA[822]; + ZA[820] = ZA[817] + ZA[819]; + + ZA[827] = (ZCh(ZA[823], ZA[818], ZA[813]) + ZA[826]) + ZR26(ZA[823]); + ZA[824] = ZMa(ZA[815], ZA[810], ZA[820]) + ZR30(ZA[820]); + ZA[658] = ZA[631] + ZA[657]; + ZA[656] = ZA[655] + 0xa2bfe8a1U; + + ZA[661] = ZR25(ZA[566]) + ZA[554]; + ZA[659] = ZR15(ZA[651]) + ZA[658]; + ZA[831] = ZA[813] + ZA[656]; + ZA[828] = ZA[810] + ZA[827]; + ZA[825] = ZA[822] + ZA[824]; + + ZA[832] = (ZCh(ZA[828], ZA[823], ZA[818]) + ZA[831]) + ZR26(ZA[828]); + ZA[829] = ZMa(ZA[820], ZA[815], ZA[825]) + ZR30(ZA[825]); + ZA[662] = ZA[636] + ZA[661]; + ZA[660] = ZA[659] + 0xa81a664bU; + + ZA[665] = ZR25(ZA[567]) + ZA[566]; + ZA[663] = ZR15(ZA[655]) + ZA[662]; + ZA[836] = ZA[818] + ZA[660]; + ZA[833] = ZA[815] + ZA[832]; + ZA[830] = ZA[827] + ZA[829]; + + ZA[837] = (ZCh(ZA[833], ZA[828], ZA[823]) + ZA[836]) + ZR26(ZA[833]); + ZA[834] = ZMa(ZA[825], ZA[820], ZA[830]) + ZR30(ZA[830]); + ZA[666] = ZA[640] + ZA[665]; + ZA[664] = ZA[663] + 0xc24b8b70U; + + ZA[669] = ZR25(ZA[614]) + ZA[567]; + ZA[667] = ZR15(ZA[659]) + ZA[666]; + ZA[841] = ZA[823] + ZA[664]; + ZA[838] = ZA[820] + ZA[837]; + ZA[835] = ZA[832] + ZA[834]; + + ZA[842] = 
(ZCh(ZA[838], ZA[833], ZA[828]) + ZA[841]) + ZR26(ZA[838]); + ZA[839] = ZMa(ZA[830], ZA[825], ZA[835]) + ZR30(ZA[835]); + ZA[670] = ZA[644] + ZA[669]; + ZA[668] = ZA[667] + 0xc76c51a3U; + + ZA[677] = ZR25(ZA[616]) + ZA[614]; + ZA[671] = ZR15(ZA[663]) + ZA[670]; + ZA[846] = ZA[828] + ZA[668]; + ZA[843] = ZA[825] + ZA[842]; + ZA[840] = ZA[837] + ZA[839]; + + ZA[847] = (ZCh(ZA[843], ZA[838], ZA[833]) + ZA[846]) + ZR26(ZA[843]); + ZA[844] = ZMa(ZA[835], ZA[830], ZA[840]) + ZR30(ZA[840]); + ZA[678] = ZA[648] + ZA[677]; + ZA[676] = ZA[671] + 0xd192e819U; + + ZA[682] = ZR25(ZA[619]) + ZA[616]; + ZA[679] = ZR15(ZA[667]) + ZA[678]; + ZA[851] = ZA[833] + ZA[676]; + ZA[848] = ZA[830] + ZA[847]; + ZA[845] = ZA[842] + ZA[844]; + + ZA[852] = (ZCh(ZA[848], ZA[843], ZA[838]) + ZA[851]) + ZR26(ZA[848]); + ZA[849] = ZMa(ZA[840], ZA[835], ZA[845]) + ZR30(ZA[845]); + ZA[683] = ZA[651] + ZA[682]; + ZA[680] = ZA[679] + 0xd6990624U; + + ZA[686] = ZR25(ZA[622]) + ZA[619]; + ZA[684] = ZR15(ZA[671]) + ZA[683]; + ZA[856] = ZA[838] + ZA[680]; + ZA[853] = ZA[835] + ZA[852]; + ZA[850] = ZA[847] + ZA[849]; + + ZA[857] = (ZCh(ZA[853], ZA[848], ZA[843]) + ZA[856]) + ZR26(ZA[853]); + ZA[854] = ZMa(ZA[845], ZA[840], ZA[850]) + ZR30(ZA[850]); + ZA[687] = ZA[655] + ZA[686]; + ZA[685] = ZA[684] + 0xf40e3585U; + + ZA[690] = ZR25(ZA[627]) + ZA[622]; + ZA[688] = ZR15(ZA[679]) + ZA[687]; + ZA[861] = ZA[843] + ZA[685]; + ZA[858] = ZA[840] + ZA[857]; + ZA[855] = ZA[852] + ZA[854]; + + ZA[862] = (ZCh(ZA[858], ZA[853], ZA[848]) + ZA[861]) + ZR26(ZA[858]); + ZA[859] = ZMa(ZA[850], ZA[845], ZA[855]) + ZR30(ZA[855]); + ZA[691] = ZA[659] + ZA[690]; + ZA[689] = ZA[688] + 0x106aa070U; + + ZA[694] = ZR25(ZA[631]) + ZA[627]; + ZA[692] = ZR15(ZA[684]) + ZA[691]; + ZA[866] = ZA[848] + ZA[689]; + ZA[863] = ZA[845] + ZA[862]; + ZA[860] = ZA[857] + ZA[859]; + + ZA[867] = (ZCh(ZA[863], ZA[858], ZA[853]) + ZA[866]) + ZR26(ZA[863]); + ZA[864] = ZMa(ZA[855], ZA[850], ZA[860]) + ZR30(ZA[860]); + ZA[695] = ZA[663] + ZA[694]; + 
ZA[693] = ZA[692] + 0x19a4c116U; + + ZA[698] = ZR25(ZA[636]) + ZA[631]; + ZA[696] = ZR15(ZA[688]) + ZA[695]; + ZA[871] = ZA[853] + ZA[693]; + ZA[868] = ZA[850] + ZA[867]; + ZA[865] = ZA[862] + ZA[864]; + + ZA[873] = (ZCh(ZA[868], ZA[863], ZA[858]) + ZA[871]) + ZR26(ZA[868]); + ZA[869] = ZMa(ZA[860], ZA[855], ZA[865]) + ZR30(ZA[865]); + ZA[699] = ZA[667] + ZA[698]; + ZA[697] = ZA[696] + 0x1e376c08U; + + ZA[702] = ZR25(ZA[640]) + ZA[636]; + ZA[700] = ZR15(ZA[692]) + ZA[699]; + ZA[877] = ZA[858] + ZA[697]; + ZA[874] = ZA[855] + ZA[873]; + ZA[870] = ZA[867] + ZA[869]; + + ZA[878] = (ZCh(ZA[874], ZA[868], ZA[863]) + ZA[877]) + ZR26(ZA[874]); + ZA[875] = ZMa(ZA[865], ZA[860], ZA[870]) + ZR30(ZA[870]); + ZA[703] = ZA[671] + ZA[702]; + ZA[701] = ZA[700] + 0x2748774cU; + + ZA[706] = ZR25(ZA[644]) + ZA[640]; + ZA[704] = ZR15(ZA[696]) + ZA[703]; + ZA[882] = ZA[863] + ZA[701]; + ZA[879] = ZA[860] + ZA[878]; + ZA[876] = ZA[873] + ZA[875]; + + ZA[883] = (ZCh(ZA[879], ZA[874], ZA[868]) + ZA[882]) + ZR26(ZA[879]); + ZA[880] = ZMa(ZA[870], ZA[865], ZA[876]) + ZR30(ZA[876]); + ZA[707] = ZA[679] + ZA[706]; + ZA[705] = ZA[704] + 0x34b0bcb5U; + + ZA[710] = ZR25(ZA[648]) + ZA[644]; + ZA[708] = ZR15(ZA[700]) + ZA[707]; + ZA[887] = ZA[868] + ZA[705]; + ZA[884] = ZA[865] + ZA[883]; + ZA[881] = ZA[878] + ZA[880]; + + ZA[888] = (ZCh(ZA[884], ZA[879], ZA[874]) + ZA[887]) + ZR26(ZA[884]); + ZA[885] = ZMa(ZA[876], ZA[870], ZA[881]) + ZR30(ZA[881]); + ZA[711] = ZA[684] + ZA[710]; + ZA[709] = ZA[708] + 0x391c0cb3U; + + ZA[714] = ZR25(ZA[651]) + ZA[648]; + ZA[712] = ZR15(ZA[704]) + ZA[711]; + ZA[892] = ZA[874] + ZA[709]; + ZA[889] = ZA[870] + ZA[888]; + ZA[886] = ZA[883] + ZA[885]; + + ZA[893] = (ZCh(ZA[889], ZA[884], ZA[879]) + ZA[892]) + ZR26(ZA[889]); + ZA[890] = ZMa(ZA[881], ZA[876], ZA[886]) + ZR30(ZA[886]); + ZA[715] = ZA[688] + ZA[714]; + ZA[713] = ZA[712] + 0x4ed8aa4aU; + + ZA[718] = ZR25(ZA[655]) + ZA[651]; + ZA[716] = ZR15(ZA[708]) + ZA[715]; + ZA[897] = ZA[879] + ZA[713]; + ZA[894] = 
ZA[876] + ZA[893]; + ZA[891] = ZA[888] + ZA[890]; + + ZA[898] = (ZCh(ZA[894], ZA[889], ZA[884]) + ZA[897]) + ZR26(ZA[894]); + ZA[895] = ZMa(ZA[886], ZA[881], ZA[891]) + ZR30(ZA[891]); + ZA[719] = ZA[692] + ZA[718]; + ZA[717] = ZA[716] + 0x5b9cca4fU; + + ZA[722] = ZR25(ZA[659]) + ZA[655]; + ZA[720] = ZR15(ZA[712]) + ZA[719]; + ZA[902] = ZA[884] + ZA[717]; + ZA[899] = ZA[881] + ZA[898]; + ZA[896] = ZA[893] + ZA[895]; + + ZA[903] = (ZCh(ZA[899], ZA[894], ZA[889]) + ZA[902]) + ZR26(ZA[899]); + ZA[900] = ZMa(ZA[891], ZA[886], ZA[896]) + ZR30(ZA[896]); + ZA[723] = ZA[696] + ZA[722]; + ZA[721] = ZA[720] + 0x682e6ff3U; + + ZA[672] = ZR25(ZA[663]) + ZA[659]; + ZA[724] = ZR15(ZA[716]) + ZA[723]; + ZA[907] = ZA[889] + ZA[721]; + ZA[904] = ZA[886] + ZA[903]; + ZA[901] = ZA[898] + ZA[900]; + + ZA[908] = (ZCh(ZA[904], ZA[899], ZA[894]) + ZA[907]) + ZR26(ZA[904]); + ZA[905] = ZMa(ZA[896], ZA[891], ZA[901]) + ZR30(ZA[901]); + ZA[673] = ZR25(ZA[667]) + ZA[663]; + ZA[726] = ZA[700] + ZA[672]; + ZA[725] = ZA[724] + 0x748f82eeU; + + ZA[727] = ZR15(ZA[720]) + ZA[726]; + ZA[912] = ZA[894] + ZA[725]; + ZA[909] = ZA[891] + ZA[908]; + ZA[906] = ZA[903] + ZA[905]; + ZA[675] = ZA[667] + 0x8cc70208U; + ZA[729] = ZA[704] + ZA[673]; + + ZA[913] = (ZCh(ZA[909], ZA[904], ZA[899]) + ZA[912]) + ZR26(ZA[909]); + ZA[910] = ZMa(ZA[901], ZA[896], ZA[906]) + ZR30(ZA[906]); + ZA[674] = ZR25(ZA[671]) + ZA[675]; + ZA[730] = ZR15(ZA[724]) + ZA[729]; + ZA[728] = ZA[727] + 0x78a5636fU; + + ZA[681] = ZR25(ZA[679]) + ZA[671]; + ZA[917] = ZA[899] + ZA[901] + ZA[728]; + ZA[914] = ZA[896] + ZA[913]; + ZA[911] = ZA[908] + ZA[910]; + ZA[732] = ZA[708] + ZA[674]; + ZA[731] = ZA[730] + 0x84c87814U; + + ZA[918] = (ZCh(ZA[914], ZA[909], ZA[904]) + ZA[917]) + ZR26(ZA[914]); + ZA[915] = ZMa(ZA[906], ZA[901], ZA[911]) + ZR30(ZA[911]); + ZA[733] = ZR15(ZA[727]) + ZA[732]; + ZA[919] = ZA[906] + ZA[904] + ZA[731]; + ZA[734] = ZA[712] + ZA[681]; + + ZA[920] = (ZCh(ZA[918], ZA[914], ZA[909]) + ZA[919]) + ZR26(ZA[918]); + 
ZA[735] = ZR15(ZA[730]) + ZA[734]; + ZA[921] = ZA[911] + ZA[909] + ZA[733]; + ZA[916] = ZA[913] + ZA[915]; + + ZA[922] = (ZCh(ZA[920], ZA[918], ZA[914]) + ZA[921]) + ZR26(ZA[920]); + ZA[923] = ZA[916] + ZA[914] + ZA[735]; + + ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]); + +#define FOUND (0x800) +#define NFLAG (0x7FF) + +#if defined(VECTORS4) + bool result = any(ZA[924] == 0x136032EDU); + + if (result) { + if (ZA[924].x == 0x136032EDU) + output[FOUND] = output[NFLAG & Znonce.x] = Znonce.x; + if (ZA[924].y == 0x136032EDU) + output[FOUND] = output[NFLAG & Znonce.y] = Znonce.y; + if (ZA[924].z == 0x136032EDU) + output[FOUND] = output[NFLAG & Znonce.z] = Znonce.z; + if (ZA[924].w == 0x136032EDU) + output[FOUND] = output[NFLAG & Znonce.w] = Znonce.w; + } +#elif defined(VECTORS2) + bool result = any(ZA[924] == 0x136032EDU); + + if (result) { + if (ZA[924].x == 0x136032EDU) + output[FOUND] = output[NFLAG & Znonce.x] = Znonce.x; + if (ZA[924].y == 0x136032EDU) + output[FOUND] = output[NFLAG & Znonce.y] = Znonce.y; + } +#else + if (ZA[924] == 0x136032EDU) + output[FOUND] = output[NFLAG & Znonce] = Znonce; +#endif +} diff --git a/diakgcn120724.cl b/diakgcn120724.cl new file mode 100644 index 00000000..7dd73fb9 --- /dev/null +++ b/diakgcn120724.cl @@ -0,0 +1,587 @@ +// DiaKGCN 27-04-2012 - OpenCL kernel by Diapolo +// +// Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3. +// The kernel was rewritten by me (Diapolo) and is still public-domain! 
+ +#ifdef VECTORS4 /* u is the per-work-item word type used by the kernel below: uint4, uint2 or scalar uint, selected by VECTORS4 / VECTORS2 */ + typedef uint4 u; +#elif defined VECTORS2 + typedef uint2 u; +#else + typedef uint u; +#endif + +#ifdef BITALIGN /* three alternative spellings of ch (bitwise choose) and ma (bitwise majority) follow, selected by BITALIGN and BFI_INT */ + #pragma OPENCL EXTENSION cl_amd_media_ops : enable + #ifdef BFI_INT + #define ch(x, y, z) amd_bytealign(x, y, z) /* NOTE(review): amd_bytealign is a byte-alignment operation, not a bit-select; presumably the host rewrites the compiled code when BFI_INT is defined. Confirm against the host-side build code. */ + #define ma(x, y, z) amd_bytealign(z ^ x, y, x) /* NOTE(review): same caveat as ch in this branch */ + #else + #define ch(x, y, z) bitselect(z, y, x) /* per bit: takes y where x is 1 and z where x is 0 */ + #define ma(z, x, y) bitselect(z, y, z ^ x) /* per bit: takes y where z and x differ, otherwise z, i.e. the majority of the three inputs. Parameter names are permuted relative to the other variants; harmless because majority is symmetric in its arguments. */ + #endif +#else + #define ch(x, y, z) (z ^ (x & (y ^ z))) /* portable form of the same choose: y where x is 1, z where x is 0 */ + #define ma(x, y, z) ((x & z) | (y & (x | z))) /* portable form of the bitwise majority of x, y and z */ +#endif + +#define rotr15(n) (rotate(n, 15U) ^ rotate(n, 13U) ^ (n >> 10U)) /* OpenCL rotate() rotates left, so on 32-bit words this is rotate-right 17 xor rotate-right 19 xor shift-right 10 (the SHA-256 small sigma1). NOTE: n is not parenthesised in the shift term; a sum argument still groups as intended because addition binds tighter than the shift, but an argument built with a lower-precedence operator would not. */ +#define rotr25(n) (rotate(n, 25U) ^ rotate(n, 14U) ^ (n >> 3U)) /* rotate-right 7 xor rotate-right 18 xor shift-right 3 (the SHA-256 small sigma0); same parenthesisation caveat as rotr15 */ +#define rotr26(n) (rotate(n, 26U) ^ rotate(n, 21U) ^ rotate(n, 7U)) /* rotate-right 6, 11 and 25 xored together (the SHA-256 big Sigma1) */ +#define rotr30(n) (rotate(n, 30U) ^ rotate(n, 19U) ^ rotate(n, 10U)) /* rotate-right 2, 13 and 22 xored together (the SHA-256 big Sigma0) */ + +__kernel + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) + void search( +#ifndef GOFFSET + const u base, +#endif + const uint PreVal0, const uint PreVal4, + const uint H1, const uint D1A, const uint B1, const uint C1, + const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7, + const uint W16addK16, const uint W17addK17, + const uint PreW18, const uint PreW19, + const uint W16, const uint W17, + const uint PreW31, const uint PreW32, + const uint state0, const uint state1, const uint state2, const uint state3, + const uint state4, const uint state5, const uint state6, const uint state7, + const uint state0A, const uint state0B, + const uint state1A, const uint state2A, const uint state3A, const uint state4A, + const uint state5A, const uint state6A, const uint state7A, + __global uint * output) +{ + u V[8]; + u W[16]; + +#ifdef VECTORS4 + const u nonce = (uint)(get_local_id(0)) * 4U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base; +#elif defined VECTORS2 + const u nonce = (uint)(get_local_id(0)) * 2U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base; +#else + #ifdef GOFFSET + const u nonce = (uint)(get_global_id(0)); + #else + const u nonce = 
(uint)(get_local_id(0)) + (uint)(get_group_id(0)) * (uint)(WORKSIZE) + base; + #endif +#endif + + V[0] = PreVal0 + nonce; + V[1] = B1; + V[2] = C1; + V[3] = D1A; + V[4] = PreVal4 + nonce; + V[5] = F1; + V[6] = G1; + V[7] = H1; + + V[7] += V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x9bdc06a7U 
+ V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + +//---------------------------------------------------------------------------------- + +#ifdef VECTORS4 + W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U, rotr25(nonce.x) ^ 0x4008000U, rotr25(nonce.x) ^ 0x600c000U); +#elif defined VECTORS2 + W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U); +#else + W[0] = PreW18 + rotr25(nonce); +#endif + W[1] = PreW19 + nonce; + W[2] = 0x80000000U + rotr15(W[0]); + W[3] = rotr15(W[1]); + W[4] = 0x00000280U + rotr15(W[2]); + W[5] = W16 + rotr15(W[3]); + W[6] = W17 + rotr15(W[4]); + W[7] = W[0] + rotr15(W[5]); + W[8] = W[1] + rotr15(W[6]); + W[9] = W[2] + rotr15(W[7]); + W[10] = W[3] + rotr15(W[8]); + W[11] = W[4] + rotr15(W[9]); + W[12] = W[5] + 0x00a00055U + rotr15(W[10]); + W[13] = W[6] + PreW31 + rotr15(W[11]); + W[14] = W[7] + PreW32 + rotr15(W[12]); + W[15] = W[8] + W17 + rotr15(W[13]) + rotr25(W[0]); + + V[1] += 0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0]; + V[5] = 0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0] + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], 
V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x14292967U + V[0] + W[13] + 
ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + +//---------------------------------------------------------------------------------- + + W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]); + W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]); + W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]); + W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]); + W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]); + W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]); + W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]); + W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]); + W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]); + W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]); + W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]); + W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]); + W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]); + W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]); + W[14] = W[14] + W[7] + rotr15(W[12]) + rotr25(W[15]); + W[15] = W[15] + W[8] + rotr15(W[13]) + rotr25( W[0]); + + V[1] += 0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x650a7354U + V[3] + W[2] + 
ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x106aa070U + 
V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + +//---------------------------------------------------------------------------------- + + W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]); + W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]); + W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]); + W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]); + W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]); + W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]); + W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]); + W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]); + W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]); + W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]); + W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]); + W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]); + W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]); + W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]); + + V[1] += 0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0x4ed8aa4aU + 
V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0xc67178f2U + V[0] + W[13] + ch(V[5], 
V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + +//---------------------------------------------------------------------------------- + + W[0] = state0 + V[0] + rotr25(state1 + V[1]); + W[1] = state1 + V[1] + 0x00a00000U + rotr25(state2 + V[2]); + W[2] = state2 + V[2] + rotr15(W[0]) + rotr25(state3 + V[3]); + W[3] = state3 + V[3] + rotr15(W[1]) + rotr25(state4 + V[4]); + W[4] = state4 + V[4] + rotr15(W[2]) + rotr25(state5 + V[5]); + W[5] = state5 + V[5] + rotr15(W[3]) + rotr25(state6 + V[6]); + W[6] = state6 + V[6] + 0x00000100U + rotr15(W[4]) + rotr25(state7 + V[7]); + W[7] = state7 + V[7] + W[0] + 0x11002000U + rotr15(W[5]); + W[8] = W[1] + 0x80000000U + rotr15(W[6]); + W[9] = W[2] + rotr15(W[7]); + W[10] = W[3] + rotr15(W[8]); + W[11] = W[4] + rotr15(W[9]); + W[12] = W[5] + rotr15(W[10]); + W[13] = W[6] + rotr15(W[11]); + W[14] = W[7] + 0x00400022U + rotr15(W[12]); + W[15] = W[8] + 0x00000100U + rotr15(W[13]) + rotr25(W[0]); + + // 0x71374491U + 0x1f83d9abU + state1 + const u state1AaddV1 = state1A + V[1]; + // 0xb5c0fbcfU + 0x9b05688cU + state2 + const u state2AaddV2 = state2A + V[2]; + // 0x510e527fU + 0xe9b5dba5U + state3 + const u state3AaddV3 = state3A + V[3]; + // 0x3956c25bU + state4 + const u state4AaddV4 = state4A + V[4]; + // 0x59f111f1U + state5 + const u state5AaddV5 = state5A + V[5]; + // 0x923f82a4U + state6 + const u state6AaddV6 = state6A + V[6]; + // 0xab1c5ed5U + state7 + const u state7AaddV7 = state7A + V[7]; + + // 0x98c7e2a2U + state0 + V[3] = state0A + V[0]; + // 0xfc08884dU + state0 + V[7] = state0B + V[0]; + V[0] = 0x6a09e667U; + V[1] = 0xbb67ae85U; + V[2] = 0x3c6ef372U; + V[4] = 0x510e527fU; + V[5] = 0x9b05688cU; + V[6] = 0x1f83d9abU; + + V[2] += state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]) + 
rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x9bdc06a7U + V[1] + ch(V[6], 
V[7], V[0]) + rotr26(V[6]); + V[1] = 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], 
V[2], V[0]); + + V[2] += 0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + +//---------------------------------------------------------------------------------- + + W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]); + W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]); + W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]); + W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]); + W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]); + W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]); + W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]); + W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]); + W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]); + W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]); + W[10] = W[10] 
+ W[3] + rotr15( W[8]) + rotr25(W[11]); + W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]); + W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]); + W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]); + W[14] = W[14] + W[7] + rotr15(W[12]) + rotr25(W[15]); + W[15] = W[15] + W[8] + rotr15(W[13]) + rotr25( W[0]); + + V[3] += 0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + 
ma(V[1], V[2], V[0]); + + V[2] += 0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + +//---------------------------------------------------------------------------------- + + W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]); + W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]); + W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]); + W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]); + W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]); + W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]); + W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]); + W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]); + W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]); + W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]); + 
W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]); + W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]); + W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]); + + V[3] += 0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + V[6] = 0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]); + + V[1] += 0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]); + V[5] = 0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]); + + V[0] += 0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + V[4] = 0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]); + + V[7] += 0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + V[3] = 0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]); + + V[6] += 0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]); + V[2] = 0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]); + + V[5] += 0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]); + V[1] = 0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]); + + V[4] += 0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]); + V[0] = 0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]); + + V[3] += 0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]); + V[7] = 0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]); + + V[2] += 0x78a5636fU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]); + + V[1] += 0x84c87814U + V[5] + W[10] + ch(V[2], V[3], 
V[4]) + rotr26(V[2]); + + V[0] += 0x8cc70208U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]); + + V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]); + +#define FOUND (0x800) +#define NFLAG (0x7FF) + +#ifdef VECTORS4 + if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU)) + output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : ((V[7].y == 0x136032edU) ? nonce.y : ((V[7].z == 0x136032edU) ? nonce.z : nonce.w)); +#elif defined VECTORS2 + if ((V[7].x == 0x136032edU) + (V[7].y == 0x136032edU)) + output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : nonce.y; +#else + if (V[7] == 0x136032edU) + output[FOUND] = output[NFLAG & nonce] = nonce; +#endif +} diff --git a/phatk120724.cl b/phatk120724.cl new file mode 100644 index 00000000..0f604436 --- /dev/null +++ b/phatk120724.cl @@ -0,0 +1,417 @@ +// This file is taken and modified from the public-domain poclbm project, and +// I have therefore decided to keep it public-domain. 
+// Modified version copyright 2011-2012 Con Kolivas + +#ifdef VECTORS4 + typedef uint4 u; +#elif defined VECTORS2 + typedef uint2 u; +#else + typedef uint u; +#endif + +__constant uint K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +__constant uint ConstW[128] = { +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 +}; + +__constant uint H[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 +}; + + +#ifdef BITALIGN + #pragma OPENCL EXTENSION cl_amd_media_ops : enable + #define rot(x, y) amd_bitalign(x, x, (uint)(32 - y)) + +// This part is not from the stock poclbm kernel. It's part of an optimization +// added in the Phoenix Miner. + +// Some AMD devices have a BFI_INT opcode, which behaves exactly like the +// SHA-256 Ch function, but provides it in exactly one instruction. If +// detected, use it for Ch. Otherwise, construct Ch out of simpler logical +// primitives. + + #ifdef BFI_INT + // Well, slight problem... It turns out BFI_INT isn't actually exposed to + // OpenCL (or CAL IL for that matter) in any way. However, there is + // a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via + // amd_bytealign, takes the same inputs, and provides the same output. + // We can use that as a placeholder for BFI_INT and have the application + // patch it after compilation. + + // This is the BFI_INT function + #define Ch(x, y, z) amd_bytealign(x,y,z) + // Ma can also be implemented in terms of BFI_INT...
+ #define Ma(z, x, y) amd_bytealign(z^x,y,x) + #else // BFI_INT + // Later SDKs optimise this to BFI INT without patching and GCN + // actually fails if manually patched with BFI_INT + + #define Ch(x, y, z) bitselect((u)z, (u)y, (u)x) + #define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) + #define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y) + #endif +#else // BITALIGN + #define Ch(x, y, z) (z ^ (x & (y ^ z))) + #define Ma(x, y, z) ((x & z) | (y & (x | z))) + #define rot(x, y) rotate((u)x, (u)y) + #define rotr(x, y) rotate((u)x, (u)(32-y)) +#endif + + + +//Various intermediate calculations for each SHA round +#define s0(n) (S0(Vals[(0 + 128 - (n)) % 8])) +#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u)) + +#define s1(n) (S1(Vals[(4 + 128 - (n)) % 8])) +#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u)) + +#define ch(n) Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8]) +#define maj(n) Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8]) + +//t1 calc when W is already calculated +#define t1(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] + W[(n)] + s1(n) + ch(n) + +//t1 calc which calculates W +#define t1W(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] + W(n) + s1(n) + ch(n) + +//Used for constant W Values (the compiler optimizes out zeros) +#define t1C(n) (K[(n) % 64]+ ConstW[(n)]) + Vals[(7 + 128 - (n)) % 8] + s1(n) + ch(n) + +//t2 Calc +#define t2(n) maj(n) + s0(n) + +#define rotC(x,n) (x<<n | x >> (32-n)) //rotate left by n using plain shifts; applied to the ConstW entries in P1C/P2C + +//W calculation used for SHA round +#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n)) + + + +//Partial W calculations (used for the beginning where only some values are nonzero) +#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U))) +#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U))) + + +#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U))) +#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U))) + + +#define P3(n) W[n-7] +#define P4(n) W[n-16] + + +//Partial Calcs for constant W
values +#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U))) +#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U))) +#define P3C(x) ConstW[x-7] +#define P4C(x) ConstW[x-16] + +//SHA round with built in W calc: t1W(n) fills in W[n] through the W(n) macro before using it +#define sharoundW(n) Barrier1(n); Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n); + +//SHA round without W calc: t1(n) reads a W[n] that was stored earlier +#define sharound(n) Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n); + +//SHA round for constant W values: t1C(n) takes ConstW[n] in place of W[n] +#define sharoundC(n) Barrier3(n); Vals[(3 + 128 - (n)) % 8] += t1C(n); Vals[(7 + 128 - (n)) % 8] = t1C(n) + t2(n); + +//The compiler is stupid... I put this in there only to stop the compiler from (de)optimizing the order (each Barrier writes the dummy t1 declared in search) +#define Barrier1(n) t1 = t1C((n+1)) +#define Barrier2(n) t1 = t1C((n)) +#define Barrier3(n) t1 = t1C((n)) + +//#define WORKSIZE 256 +#define MAXBUFFERS (4095) + +__kernel + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void search( const uint state0, const uint state1, const uint state2, const uint state3, + const uint state4, const uint state5, const uint state6, const uint state7, + const uint B1, const uint C1, const uint D1, + const uint F1, const uint G1, const uint H1, + const u base, + const uint W16, const uint W17, + const uint PreVal4, const uint PreVal0, + const uint PreW18, const uint PreW19, + const uint PreW31, const uint PreW32, + + __global uint * output) +{ + + + u W[124]; + u Vals[8]; + +//Dummy variable written by the Barrier1/2/3 macros to prevent the compiler from reordering between rounds + u t1; + + //Vals[0]=state0; (unused: Vals[0] is assigned from the sum of PreVal0 and W[3] further down) + Vals[1]=B1; + Vals[2]=C1; + Vals[3]=D1; + //Vals[4]=PreVal4; (unused: Vals[4] is assigned from the sum of PreVal4 and W[3] further down) + Vals[5]=F1; + Vals[6]=G1; + Vals[7]=H1; + + W[16] = W16; + W[17] = W17; + +#ifdef VECTORS4 + //Less dependencies to get both the local id and group id and then add them + W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); + uint r =
rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U); + //Since only the 2 LSBs differ between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3 + W[18] = PreW18 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U}; +#elif defined VECTORS2 + W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); + uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U); + W[18] = PreW18 + (u){r, r ^ 0x2004000U}; +#else + W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); + u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U); + W[18] = PreW18 + r; +#endif + //The order of the W calcs and rounds is like this because the compiler needs help finding how to order the instructions + + + + Vals[4] = PreVal4 + W[3]; + Vals[0] = PreVal0 + W[3]; + + sharoundC(4); + W[19] = PreW19 + W[3]; + sharoundC(5); + W[20] = P4C(20) + P1(20); + sharoundC(6); + W[21] = P1(21); + sharoundC(7); + W[22] = P3C(22) + P1(22); + sharoundC(8); + W[23] = W[16] + P1(23); + sharoundC(9); + W[24] = W[17] + P1(24); + sharoundC(10); + W[25] = P1(25) + P3(25); + W[26] = P1(26) + P3(26); + sharoundC(11); + W[27] = P1(27) + P3(27); + W[28] = P1(28) + P3(28); + sharoundC(12); + W[29] = P1(29) + P3(29); + sharoundC(13); + W[30] = P1(30) + P2C(30) + P3(30); + W[31] = PreW31 + (P1(31) + P3(31)); + sharoundC(14); + W[32] = PreW32 + (P1(32) + P3(32)); + sharoundC(15); + sharound(16); + sharound(17); + sharound(18); + sharound(19); + sharound(20); + sharound(21); + sharound(22); + sharound(23); + sharound(24); + sharound(25); + sharound(26); + sharound(27); + sharound(28); + sharound(29); + sharound(30); + sharound(31); + sharound(32); + sharoundW(33); + sharoundW(34); + sharoundW(35); + sharoundW(36); + sharoundW(37); + sharoundW(38); + sharoundW(39); + sharoundW(40); + sharoundW(41); + sharoundW(42); + sharoundW(43); + sharoundW(44); + sharoundW(45); + sharoundW(46); + sharoundW(47); + sharoundW(48); + sharoundW(49); + sharoundW(50); +
sharoundW(51); + sharoundW(52); + sharoundW(53); + sharoundW(54); + sharoundW(55); + sharoundW(56); + sharoundW(57); + sharoundW(58); + sharoundW(59); + sharoundW(60); + sharoundW(61); + sharoundW(62); + sharoundW(63); + + W[64]=state0+Vals[0]; + W[65]=state1+Vals[1]; + W[66]=state2+Vals[2]; + W[67]=state3+Vals[3]; + W[68]=state4+Vals[4]; + W[69]=state5+Vals[5]; + W[70]=state6+Vals[6]; + W[71]=state7+Vals[7]; + + Vals[0]=H[0]; + Vals[1]=H[1]; + Vals[2]=H[2]; + Vals[3]=H[3]; + Vals[4]=H[4]; + Vals[5]=H[5]; + Vals[6]=H[6]; + Vals[7]=H[7]; + + //sharound(64 + 0); is hand-expanded in the next three statements with its constant terms folded into literals + const u Temp = (0xb0edbdd0U + K[0]) + W[64]; + Vals[7] = Temp + 0x08909ae5U; + Vals[3] = 0xa54ff53aU + Temp; + +#define P124(n) P2(n) + P1(n) + P4(n) + + + W[64 + 16] = + P2(64 + 16) + P4(64 + 16); + sharound(64 + 1); + W[64 + 17] = P1C(64 + 17) + P2(64 + 17) + P4(64 + 17); + sharound(64 + 2); + W[64 + 18] = P124(64 + 18); + sharound(64 + 3); + W[64 + 19] = P124(64 + 19); + sharound(64 + 4); + W[64 + 20] = P124(64 + 20); + sharound(64 + 5); + W[64 + 21] = P124(64 + 21); + sharound(64 + 6); + W[64 + 22] = P4(64 + 22) + P3C(64 + 22) + P2(64 + 22) + P1(64 + 22); + sharound(64 + 7); + W[64 + 23] = P4(64 + 23) + P3(64 + 23) + P2C(64 + 23) + P1(64 + 23); + sharoundC(64 + 8); + W[64 + 24] = P1(64 + 24) + P4C(64 + 24) + P3(64 + 24); + sharoundC(64 + 9); + W[64 + 25] = P3(64 + 25) + P1(64 + 25); + sharoundC(64 + 10); + W[64 + 26] = P3(64 + 26) + P1(64 + 26); + sharoundC(64 + 11); + W[64 + 27] = P3(64 + 27) + P1(64 + 27); + sharoundC(64 + 12); + W[64 + 28] = P3(64 + 28) + P1(64 + 28); + sharoundC(64 + 13); + W[64 + 29] = P1(64 + 29) + P3(64 + 29); + W[64 + 30] = P3(64 + 30) + P2C(64 + 30) + P1(64 + 30); + sharoundC(64 + 14); + W[64 + 31] = P4C(64 + 31) + P3(64 + 31) + P2(64 + 31) + P1(64 + 31); + sharoundC(64 + 15); + sharound(64 + 16); + sharound(64 + 17); + sharound(64 + 18); + sharound(64 + 19); + sharound(64 + 20); + sharound(64 + 21); + sharound(64 + 22); + sharound(64 + 23); + sharound(64 + 24); + sharound(64 +
25); + sharound(64 + 26); + sharound(64 + 27); + sharound(64 + 28); + sharound(64 + 29); + sharound(64 + 30); + sharound(64 + 31); + sharoundW(64 + 32); + sharoundW(64 + 33); + sharoundW(64 + 34); + sharoundW(64 + 35); + sharoundW(64 + 36); + sharoundW(64 + 37); + sharoundW(64 + 38); + sharoundW(64 + 39); + sharoundW(64 + 40); + sharoundW(64 + 41); + sharoundW(64 + 42); + sharoundW(64 + 43); + sharoundW(64 + 44); + sharoundW(64 + 45); + sharoundW(64 + 46); + sharoundW(64 + 47); + sharoundW(64 + 48); + sharoundW(64 + 49); + sharoundW(64 + 50); + sharoundW(64 + 51); + sharoundW(64 + 52); + sharoundW(64 + 53); + sharoundW(64 + 54); + sharoundW(64 + 55); + sharoundW(64 + 56); + sharoundW(64 + 57); + sharoundW(64 + 58); + + W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) - + (-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64))); + +#define FOUND (0x800) +#define NFLAG (0x7FF) + +#ifdef VECTORS4 + bool result = W[117].x & W[117].y & W[117].z & W[117].w; + if (!result) { + if (!W[117].x) + output[FOUND] = output[NFLAG & W[3].x] = W[3].x; + if (!W[117].y) + output[FOUND] = output[NFLAG & W[3].y] = W[3].y; + if (!W[117].z) + output[FOUND] = output[NFLAG & W[3].z] = W[3].z; + if (!W[117].w) + output[FOUND] = output[NFLAG & W[3].w] = W[3].w; + } +#elif defined VECTORS2 + bool result = W[117].x & W[117].y; + if (!result) { + if (!W[117].x) + output[FOUND] = output[NFLAG & W[3].x] = W[3].x; + if (!W[117].y) + output[FOUND] = output[NFLAG & W[3].y] = W[3].y; + } +#else + if (!W[117]) + output[FOUND] = output[NFLAG & W[3]] = W[3]; +#endif +} diff --git a/poclbm120724.cl b/poclbm120724.cl new file mode 100644 index 00000000..3e8b9943 --- /dev/null +++ b/poclbm120724.cl @@ -0,0 +1,1353 @@ +// -ck modified kernel taken from Phoenix taken from poclbm, with aspects of +// phatk and others.
+// Modified version copyright 2011-2012 Con Kolivas + +// This file is taken and modified from the public-domain poclbm project, and +// we have therefore decided to keep it public-domain in Phoenix. + +#ifdef VECTORS4 + typedef uint4 u; +#elif defined VECTORS2 + typedef uint2 u; +#else + typedef uint u; +#endif + +__constant uint K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + + +// This part is not from the stock poclbm kernel. It's part of an optimization +// added in the Phoenix Miner. + +// Some AMD devices have a BFI_INT opcode, which behaves exactly like the +// SHA-256 ch function, but provides it in exactly one instruction. If +// detected, use it for ch. Otherwise, construct ch out of simpler logical +// primitives. + +#ifdef BITALIGN + #pragma OPENCL EXTENSION cl_amd_media_ops : enable + #define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y) +#else + #define rotr(x, y) rotate((u)x, (u)(32 - y)) +#endif +#ifdef BFI_INT + // Well, slight problem... It turns out BFI_INT isn't actually exposed to + // OpenCL (or CAL IL for that matter) in any way. However, there is + // a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via + // amd_bytealign, takes the same inputs, and provides the same output. 
+ // We can use that as a placeholder for BFI_INT and have the application + // patch it after compilation. + + // This is the BFI_INT function + #define ch(x, y, z) amd_bytealign(x, y, z) + + // Ma can also be implemented in terms of BFI_INT... + #define Ma(x, y, z) amd_bytealign( (z^x), (y), (x) ) + + // AMD's KernelAnalyzer throws errors compiling the kernel if we use + // amd_bytealign on constants with vectors enabled, so we use this to avoid + // problems. (this is used 4 times, and likely optimized out by the compiler.) + #define Ma2(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) +#else // BFI_INT + //GCN actually fails if manually patched with BFI_INT + + #define ch(x, y, z) bitselect((u)z, (u)y, (u)x) + #define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x) + #define Ma2(x, y, z) Ma(x, y, z) +#endif + + +__kernel +__attribute__((vec_type_hint(u))) +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +void search(const uint state0, const uint state1, const uint state2, const uint state3, + const uint state4, const uint state5, const uint state6, const uint state7, + const uint b1, const uint c1, + const uint f1, const uint g1, const uint h1, +#ifndef GOFFSET + const u base, +#endif + const uint fw0, const uint fw1, const uint fw2, const uint fw3, const uint fw15, const uint fw01r, + const uint D1A, const uint C1addK5, const uint B1addK6, + const uint W16addK16, const uint W17addK17, + const uint PreVal4addT1, const uint Preval0, + __global uint * output) +{ + u Vals[24]; + u *W = &Vals[8]; + +#ifdef GOFFSET + const u nonce = (uint)(get_global_id(0)); +#else + const u nonce = base + (uint)(get_global_id(0)); +#endif + +Vals[5]=Preval0; +Vals[5]+=nonce; + +Vals[0]=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],b1,c1); +Vals[0]+=D1A; + +Vals[2]=Vals[0]; +Vals[2]+=h1; + +Vals[1]=PreVal4addT1; +Vals[1]+=nonce; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); + +Vals[6]=C1addK5; 
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],b1); + +Vals[3]=Vals[6]; +Vals[3]+=g1; +Vals[0]+=Ma2(g1,Vals[1],f1); +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma2(f1,Vals[0],Vals[1]); + +Vals[7]=B1addK6; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); + +Vals[4]=Vals[7]; +Vals[4]+=f1; + +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[7]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[8]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[9]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[10]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[11]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[12]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + 
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[13]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[14]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=0xC19BF3F4U; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=W16addK16; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=W17addK17; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[2]=(rotr(nonce,7)^rotr(nonce,18)^(nonce>>3U)); +W[2]+=fw2; +Vals[4]+=W[2]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[18]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[3]=nonce; +W[3]+=fw3; +Vals[1]+=W[3]; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[19]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); +W[4]+=0x80000000U; +Vals[0]+=W[4]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); 
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[20]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[5]=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); +Vals[6]+=W[5]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[21]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +W[6]=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); +W[6]+=0x00000280U; +Vals[7]+=W[6]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[22]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[7]=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); +W[7]+=fw0; +Vals[5]+=W[7]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[23]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[8]=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); +W[8]+=fw1; +Vals[2]+=W[8]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[24]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[9]=W[2]; +W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); +Vals[3]+=W[9]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[25]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[10]=W[3]; +W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); +Vals[4]+=W[10]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[26]; +Vals[7]+=Vals[4]; 
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[11]=W[4]; +W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); +Vals[1]+=W[11]; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[27]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[12]=W[5]; +W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); +Vals[0]+=W[12]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[28]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[13]=W[6]; +W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); +Vals[6]+=W[13]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[29]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +W[14]=0x00a00055U; +W[14]+=W[7]; +W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); +Vals[7]+=W[14]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[30]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[15]=fw15; +W[15]+=W[8]; +W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); +Vals[5]+=W[15]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[31]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[0]=fw01r; +W[0]+=W[9]; +W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U)); +Vals[2]+=W[0]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[32]; +Vals[0]+=Vals[2]; 
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[1]=fw1; +W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); +W[1]+=W[10]; +W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U)); +Vals[3]+=W[1]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[33]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); +W[2]+=W[11]; +W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); +Vals[4]+=W[2]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[34]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); +W[3]+=W[12]; +W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); +Vals[1]+=W[3]; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[35]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); +W[4]+=W[13]; +W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); +Vals[0]+=W[4]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[36]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); +W[5]+=W[14]; +W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); +Vals[6]+=W[5]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[37]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + 
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); +W[6]+=W[15]; +W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); +Vals[7]+=W[6]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[38]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U)); +W[7]+=W[0]; +W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); +Vals[5]+=W[7]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[39]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U)); +W[8]+=W[1]; +W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); +Vals[2]+=W[8]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[40]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U)); +W[9]+=W[2]; +W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); +Vals[3]+=W[9]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[41]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U)); +W[10]+=W[3]; +W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); +Vals[4]+=W[10]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[42]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U)); +W[11]+=W[4]; +W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); +Vals[1]+=W[11]; 
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[43]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U)); +W[12]+=W[5]; +W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); +Vals[0]+=W[12]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[44]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U)); +W[13]+=W[6]; +W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); +Vals[6]+=W[13]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[45]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +W[14]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U)); +W[14]+=W[7]; +W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); +Vals[7]+=W[14]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[46]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); +W[15]+=W[8]; +W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); +Vals[5]+=W[15]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[47]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U)); +W[0]+=W[9]; +W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U)); +Vals[2]+=W[0]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); 
+Vals[2]+=K[48]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); +W[1]+=W[10]; +W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U)); +Vals[3]+=W[1]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[49]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); +W[2]+=W[11]; +W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); +Vals[4]+=W[2]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[50]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); +W[3]+=W[12]; +W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); +Vals[1]+=W[3]; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[51]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); +W[4]+=W[13]; +W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); +Vals[0]+=W[4]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[52]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); +W[5]+=W[14]; +W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); +Vals[6]+=W[5]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[53]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + 
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); +W[6]+=W[15]; +W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); +Vals[7]+=W[6]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[54]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U)); +W[7]+=W[0]; +W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); +Vals[5]+=W[7]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[55]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U)); +W[8]+=W[1]; +W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); +Vals[2]+=W[8]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[56]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U)); +W[9]+=W[2]; +W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); +Vals[3]+=W[9]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[57]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U)); +W[10]+=W[3]; +W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); +Vals[4]+=W[10]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[58]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U)); +W[11]+=W[4]; +W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); +Vals[1]+=W[11]; 
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[59]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U)); +W[12]+=W[5]; +W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); +Vals[0]+=W[12]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[60]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U)); +W[13]+=W[6]; +W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); +Vals[6]+=W[13]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[61]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +Vals[7]+=W[14]; +Vals[7]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U)); +Vals[7]+=W[7]; +Vals[7]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[62]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +Vals[5]+=W[15]; +Vals[5]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); +Vals[5]+=W[8]; +Vals[5]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[63]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +Vals[5]+=state0; + +W[7]=state7; +W[7]+=Vals[2]; + +Vals[2]=0xF377ED68U; +Vals[2]+=Vals[5]; + +W[3]=state3; +W[3]+=Vals[0]; + +Vals[0]=0xa54ff53aU; +Vals[0]+=Vals[2]; +Vals[2]+=0x08909ae5U; + +W[6]=state6; +W[6]+=Vals[3]; + 
+Vals[3]=0x90BB1E3CU; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=(0x9b05688cU^(Vals[0]&0xca0b3af3U)); + +Vals[7]+=state1; +Vals[3]+=Vals[7]; + +W[2]=state2; +W[2]+=Vals[6]; + +Vals[6]=0x3c6ef372U; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma2(0xbb67ae85U,Vals[2],0x6a09e667U); + +W[5]=state5; +W[5]+=Vals[4]; + +Vals[4]=0x50C6645BU; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],0x510e527fU); +Vals[4]+=W[2]; + +W[1]=Vals[7]; +Vals[7]=0xbb67ae85U; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma2(0x6a09e667U,Vals[3],Vals[2]); + +W[4]=state4; +W[4]+=Vals[1]; + +Vals[1]=0x3AC42E24U; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=W[3]; + +W[0]=Vals[5]; + +Vals[5]=Vals[1]; +Vals[5]+=0x6a09e667U; + +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[4]; +Vals[0]+=W[4]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[5]; +Vals[6]+=W[5]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[6]; +Vals[7]+=W[6]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[7]; +Vals[5]+=W[7]; +Vals[1]+=Vals[5]; 
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=0x5807AA98U; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[9]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[10]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[11]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[12]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[13]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[14]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=0xC19BF274U; +Vals[1]+=Vals[5]; 
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U)); +Vals[2]+=W[0]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[16]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); +W[1]+=0x00a00000U; +Vals[3]+=W[1]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[17]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); +W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); +Vals[4]+=W[2]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[18]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); +W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); +Vals[1]+=W[3]; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[19]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); +W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); +Vals[0]+=W[4]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[20]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); +W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); +Vals[6]+=W[5]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); 
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[21]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); +W[6]+=0x00000100U; +W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); +Vals[7]+=W[6]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[22]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[7]+=0x11002000U; +W[7]+=W[0]; +W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); +Vals[5]+=W[7]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[23]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[8]=0x80000000U; +W[8]+=W[1]; +W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); +Vals[2]+=W[8]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[24]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[9]=W[2]; +W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); +Vals[3]+=W[9]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[25]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[10]=W[3]; +W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); +Vals[4]+=W[10]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[26]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[11]=W[4]; +W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); +Vals[1]+=W[11]; 
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[27]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[12]=W[5]; +W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); +Vals[0]+=W[12]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[28]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[13]=W[6]; +W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); +Vals[6]+=W[13]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[29]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +W[14]=0x00400022U; +W[14]+=W[7]; +W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); +Vals[7]+=W[14]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[30]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[15]=0x00000100U; +W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); +W[15]+=W[8]; +W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); +Vals[5]+=W[15]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[31]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U)); +W[0]+=W[9]; +W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U)); +Vals[2]+=W[0]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[32]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); 
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); +W[1]+=W[10]; +W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U)); +Vals[3]+=W[1]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[33]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); +W[2]+=W[11]; +W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); +Vals[4]+=W[2]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[34]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); +W[3]+=W[12]; +W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); +Vals[1]+=W[3]; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[35]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); +W[4]+=W[13]; +W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); +Vals[0]+=W[4]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[36]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); +W[5]+=W[14]; +W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); +Vals[6]+=W[5]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[37]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); +W[6]+=W[15]; 
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); +Vals[7]+=W[6]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[38]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U)); +W[7]+=W[0]; +W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); +Vals[5]+=W[7]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[39]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U)); +W[8]+=W[1]; +W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); +Vals[2]+=W[8]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[40]; +Vals[0]+=Vals[2]; +Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U)); +W[9]+=W[2]; +W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); +Vals[3]+=W[9]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[41]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U)); +W[10]+=W[3]; +W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); +Vals[4]+=W[10]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[42]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U)); +W[11]+=W[4]; +W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); +Vals[1]+=W[11]; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); 
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[43]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U)); +W[12]+=W[5]; +W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); +Vals[0]+=W[12]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[44]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U)); +W[13]+=W[6]; +W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U)); +Vals[6]+=W[13]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[45]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + +W[14]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U)); +W[14]+=W[7]; +W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U)); +Vals[7]+=W[14]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[46]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U)); +W[15]+=W[8]; +W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U)); +Vals[5]+=W[15]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[47]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U)); +W[0]+=W[9]; +W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U)); +Vals[2]+=W[0]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[48]; +Vals[0]+=Vals[2]; 
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); +Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + +W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U)); +W[1]+=W[10]; +W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U)); +Vals[3]+=W[1]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[49]; +Vals[6]+=Vals[3]; +Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); +Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]); + +W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U)); +W[2]+=W[11]; +W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U)); +Vals[4]+=W[2]; +Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25)); +Vals[4]+=ch(Vals[6],Vals[0],Vals[1]); +Vals[4]+=K[50]; +Vals[7]+=Vals[4]; +Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22)); +Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]); + +W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U)); +W[3]+=W[12]; +W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U)); +Vals[1]+=W[3]; +Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25)); +Vals[1]+=ch(Vals[7],Vals[6],Vals[0]); +Vals[1]+=K[51]; +Vals[5]+=Vals[1]; +Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22)); +Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]); + +W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U)); +W[4]+=W[13]; +W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); +Vals[0]+=W[4]; +Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); +Vals[0]+=ch(Vals[5],Vals[7],Vals[6]); +Vals[0]+=K[52]; +Vals[2]+=Vals[0]; +Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); +Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]); + +W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U)); +W[5]+=W[14]; +W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U)); +Vals[6]+=W[5]; +Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); +Vals[6]+=ch(Vals[2],Vals[5],Vals[7]); +Vals[6]+=K[53]; +Vals[3]+=Vals[6]; +Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); +Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]); + 
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U)); +W[6]+=W[15]; +W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U)); +Vals[7]+=W[6]; +Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[7]+=ch(Vals[3],Vals[2],Vals[5]); +Vals[7]+=K[54]; +Vals[4]+=Vals[7]; +Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); +Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]); + +W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U)); +W[7]+=W[0]; +W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U)); +Vals[5]+=W[7]; +Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[5]+=ch(Vals[4],Vals[3],Vals[2]); +Vals[5]+=K[55]; +Vals[1]+=Vals[5]; +Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); +Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]); + +W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U)); +W[8]+=W[1]; +W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); +Vals[2]+=W[8]; +Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); +Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); +Vals[2]+=K[56]; +Vals[0]+=Vals[2]; + +W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U)); +W[9]+=W[2]; +W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U)); +Vals[3]+=W[9]; +Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); +Vals[3]+=ch(Vals[0],Vals[1],Vals[4]); +Vals[3]+=K[57]; +Vals[3]+=Vals[6]; + +W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U)); +W[10]+=W[3]; +W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U)); +Vals[4]+=W[10]; +Vals[4]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); +Vals[4]+=ch(Vals[3],Vals[0],Vals[1]); +Vals[4]+=K[58]; +Vals[4]+=Vals[7]; +Vals[1]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); +Vals[1]+=ch(Vals[4],Vals[3],Vals[0]); +Vals[1]+=W[11]; +Vals[1]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U)); +Vals[1]+=W[4]; +Vals[1]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U)); +Vals[1]+=K[59]; +Vals[1]+=Vals[5]; + +#define FOUND (0x800) +#define NFLAG (0x7FF) + +#if defined(VECTORS2) || defined(VECTORS4) + Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]); + 
Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); + Vals[2]+=W[12]; + Vals[2]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U)); + Vals[2]+=W[5]; + Vals[2]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U)); + Vals[2]+=Vals[0]; + Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); + Vals[2]+=ch(Vals[1],Vals[4],Vals[3]); + + if (any(Vals[2] == 0x136032edU)) { + if (Vals[2].x == 0x136032edU) + output[FOUND] = output[NFLAG & nonce.x] = nonce.x; + if (Vals[2].y == 0x136032edU) + output[FOUND] = output[NFLAG & nonce.y] = nonce.y; +#if defined(VECTORS4) + if (Vals[2].z == 0x136032edU) + output[FOUND] = output[NFLAG & nonce.z] = nonce.z; + if (Vals[2].w == 0x136032edU) + output[FOUND] = output[NFLAG & nonce.w] = nonce.w; +#endif + } +#else + if ((Vals[2]+ + Ma(Vals[6],Vals[5],Vals[7])+ + (rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22))+ + W[12]+ + (rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U))+ + W[5]+ + (rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U))+ + Vals[0]+ + (rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25))+ + ch(Vals[1],Vals[4],Vals[3])) == 0x136032edU) + output[FOUND] = output[NFLAG & nonce] = nonce; +#endif +} diff --git a/scrypt120724.cl b/scrypt120724.cl new file mode 100644 index 00000000..d38f6a54 --- /dev/null +++ b/scrypt120724.cl @@ -0,0 +1,757 @@ +#define rotl(x,y) rotate(x,y) +#define Ch(x,y,z) bitselect(z,y,x) +#define Maj(x,y,z) Ch((x^z),y,z) + +#define EndianSwap(n) (rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U)) + +#define Tr2(x) (rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U)) +#define Tr1(x) (rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U)) +#define Wr2(x) (rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U)) +#define Wr1(x) (rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U)) + +#define RND(a, b, c, d, e, f, g, h, k) \ + h += Tr1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += Tr2(a) + Maj(a, b, c); + +void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) +{ + uint4 S0 = *state0; + uint4 S1 = 
*state1; + +#define A S0.x +#define B S0.y +#define C S0.z +#define D S0.w +#define E S1.x +#define F S1.y +#define G S1.z +#define H S1.w + + uint4 W[4]; + + W[ 0].x = block0.x; + RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U); + W[ 0].y = block0.y; + RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U); + W[ 0].z = block0.z; + RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU); + W[ 0].w = block0.w; + RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U); + + W[ 1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); + W[ 1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); + W[ 1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); + W[ 1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); + + W[ 2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); + W[ 2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); + W[ 2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); + W[ 2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); + + W[ 3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); + W[ 3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); + W[ 3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); + W[ 3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, 
W[1].w+0x76f988daU); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); + + W[ 3].x += 
Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); + + W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); + + W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); + + W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); + + W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); + + W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); + + W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); + + W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); + + W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); + + W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); + + W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); + + W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); + + W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); + + W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); + + W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); + + W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); + + W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); + +#undef A +#undef B +#undef C +#undef D +#undef E 
+#undef F +#undef G +#undef H + + *state0 += S0; + *state1 += S1; +} + +void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3) +{ +#define A (*state0).x +#define B (*state0).y +#define C (*state0).z +#define D (*state0).w +#define E (*state1).x +#define F (*state1).y +#define G (*state1).z +#define H (*state1).w + + uint4 W[4]; + + W[0].x = block0.x; + D=0x98c7e2a2U+W[0].x; + H=0xfc08884dU+W[0].x; + + W[0].y = block0.y; + C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y; + G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U); + + W[0].z = block0.z; + B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z; + F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U); + + W[0].w = block0.w; + A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; + E=0x95F61999U+A+Tr2(F)+Maj(F,G,H); + + W[1].x = block1.x; + RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU); + W[1].y = block1.y; + RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U); + W[1].z = block1.z; + RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U); + W[1].w = block1.w; + RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U); + + W[2].x = block2.x; + RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U); + W[2].y = block2.y; + RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U); + W[2].z = block2.z; + RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU); + W[2].w = block2.w; + RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U); + + W[3].x = block3.x; + RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U); + W[3].y = block3.y; + RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU); + W[3].z = block3.z; + RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U); + W[3].w = block3.w; + RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U); + + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, 
W[0].w+0x240ca1ccU); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U); + + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U); + + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U); + + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U); + + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U); + + 
W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U); + + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U); + + W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U); + + W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y); + RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U); + + W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z); + RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U); + + W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w); + RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU); + + W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x); + RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U); + + W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y); + RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U); + + W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z); + RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU); + + W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w); + RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU); + + W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x); + RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U); + + W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y); + RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU); + + W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z); + RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU); + + W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w); + RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U); + + W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x); + RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U); + + W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y); + RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU); + + W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z); + RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU); + + W[3].z += Wr1(W[3].x) + 
W[1].w + Wr2(W[3].w); + RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U); + + W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x); + RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + + *state0 += (uint4)(0x6A09E667U,0xBB67AE85U,0x3C6EF372U,0xA54FF53AU); + *state1 += (uint4)(0x510E527FU,0x9B05688CU,0x1F83D9ABU,0x5BE0CD19U); +} + +__constant uint fixedW[64] = +{ + 0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, + 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794, + 0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f, + 0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c, + 0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa, + 0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012, + 0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4, + 0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848, +}; + +void SHA256_fixed(uint4*restrict state0,uint4*restrict state1) +{ + uint4 S0 = *state0; + uint4 S1 = *state1; + +#define A S0.x +#define B S0.y +#define C S0.z +#define D S0.w +#define E S1.x +#define F S1.y +#define G S1.z +#define H S1.w + + RND(A,B,C,D,E,F,G,H, fixedW[0]); + RND(H,A,B,C,D,E,F,G, fixedW[1]); + RND(G,H,A,B,C,D,E,F, fixedW[2]); + RND(F,G,H,A,B,C,D,E, fixedW[3]); + RND(E,F,G,H,A,B,C,D, fixedW[4]); + RND(D,E,F,G,H,A,B,C, fixedW[5]); + RND(C,D,E,F,G,H,A,B, fixedW[6]); + RND(B,C,D,E,F,G,H,A, fixedW[7]); + RND(A,B,C,D,E,F,G,H, fixedW[8]); + RND(H,A,B,C,D,E,F,G, fixedW[9]); + RND(G,H,A,B,C,D,E,F, fixedW[10]); + RND(F,G,H,A,B,C,D,E, fixedW[11]); + RND(E,F,G,H,A,B,C,D, fixedW[12]); + RND(D,E,F,G,H,A,B,C, fixedW[13]); + RND(C,D,E,F,G,H,A,B, fixedW[14]); + RND(B,C,D,E,F,G,H,A, fixedW[15]); + RND(A,B,C,D,E,F,G,H, 
fixedW[16]); + RND(H,A,B,C,D,E,F,G, fixedW[17]); + RND(G,H,A,B,C,D,E,F, fixedW[18]); + RND(F,G,H,A,B,C,D,E, fixedW[19]); + RND(E,F,G,H,A,B,C,D, fixedW[20]); + RND(D,E,F,G,H,A,B,C, fixedW[21]); + RND(C,D,E,F,G,H,A,B, fixedW[22]); + RND(B,C,D,E,F,G,H,A, fixedW[23]); + RND(A,B,C,D,E,F,G,H, fixedW[24]); + RND(H,A,B,C,D,E,F,G, fixedW[25]); + RND(G,H,A,B,C,D,E,F, fixedW[26]); + RND(F,G,H,A,B,C,D,E, fixedW[27]); + RND(E,F,G,H,A,B,C,D, fixedW[28]); + RND(D,E,F,G,H,A,B,C, fixedW[29]); + RND(C,D,E,F,G,H,A,B, fixedW[30]); + RND(B,C,D,E,F,G,H,A, fixedW[31]); + RND(A,B,C,D,E,F,G,H, fixedW[32]); + RND(H,A,B,C,D,E,F,G, fixedW[33]); + RND(G,H,A,B,C,D,E,F, fixedW[34]); + RND(F,G,H,A,B,C,D,E, fixedW[35]); + RND(E,F,G,H,A,B,C,D, fixedW[36]); + RND(D,E,F,G,H,A,B,C, fixedW[37]); + RND(C,D,E,F,G,H,A,B, fixedW[38]); + RND(B,C,D,E,F,G,H,A, fixedW[39]); + RND(A,B,C,D,E,F,G,H, fixedW[40]); + RND(H,A,B,C,D,E,F,G, fixedW[41]); + RND(G,H,A,B,C,D,E,F, fixedW[42]); + RND(F,G,H,A,B,C,D,E, fixedW[43]); + RND(E,F,G,H,A,B,C,D, fixedW[44]); + RND(D,E,F,G,H,A,B,C, fixedW[45]); + RND(C,D,E,F,G,H,A,B, fixedW[46]); + RND(B,C,D,E,F,G,H,A, fixedW[47]); + RND(A,B,C,D,E,F,G,H, fixedW[48]); + RND(H,A,B,C,D,E,F,G, fixedW[49]); + RND(G,H,A,B,C,D,E,F, fixedW[50]); + RND(F,G,H,A,B,C,D,E, fixedW[51]); + RND(E,F,G,H,A,B,C,D, fixedW[52]); + RND(D,E,F,G,H,A,B,C, fixedW[53]); + RND(C,D,E,F,G,H,A,B, fixedW[54]); + RND(B,C,D,E,F,G,H,A, fixedW[55]); + RND(A,B,C,D,E,F,G,H, fixedW[56]); + RND(H,A,B,C,D,E,F,G, fixedW[57]); + RND(G,H,A,B,C,D,E,F, fixedW[58]); + RND(F,G,H,A,B,C,D,E, fixedW[59]); + RND(E,F,G,H,A,B,C,D, fixedW[60]); + RND(D,E,F,G,H,A,B,C, fixedW[61]); + RND(C,D,E,F,G,H,A,B, fixedW[62]); + RND(B,C,D,E,F,G,H,A, fixedW[63]); + +#undef A +#undef B +#undef C +#undef D +#undef E +#undef F +#undef G +#undef H + *state0 += S0; + *state1 += S1; +} + +void shittify(uint4 B[8]) +{ + uint4 tmp[4]; + tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w); + tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w); + tmp[2] = 
(uint4)(B[3].x,B[0].y,B[1].z,B[2].w); + tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i] = EndianSwap(tmp[i]); + + tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w); + tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w); + tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w); + tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] = EndianSwap(tmp[i]); +} + +void unshittify(uint4 B[8]) +{ + uint4 tmp[4]; + tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w); + tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w); + tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w); + tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i] = EndianSwap(tmp[i]); + + tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w); + tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w); + tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w); + tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w); + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] = EndianSwap(tmp[i]); +} + +void salsa(uint4 B[8]) +{ + uint4 w[4]; + +#pragma unroll + for(uint i=0; i<4; ++i) + w[i] = (B[i]^=B[i+4]); + +#pragma unroll + for(uint i=0; i<4; ++i) + { + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + } + +#pragma unroll + for(uint i=0; i<4; ++i) + w[i] = (B[i+4]^=(B[i]+=w[i])); + +#pragma unroll + for(uint i=0; i<4; ++i) + { + w[0] ^= rotl(w[3] +w[2] , 7U); + w[1] ^= rotl(w[0] +w[3] , 9U); + w[2] ^= rotl(w[1] +w[0] ,13U); + w[3] ^= rotl(w[2] +w[1] ,18U); + w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U); + w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U); + w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U); + w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U); + } + +#pragma unroll + for(uint i=0; i<4; ++i) + B[i+4] += w[i]; +} + +#define 
Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE) +#define CO Coord(z,x,y) + +void scrypt_core(uint4 X[8], __global uint4*restrict lookup) +{ + shittify(X); + const uint zSIZE = 8; + const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0)); + const uint xSIZE = CONCURRENT_THREADS; + uint x = get_global_id(0)%xSIZE; + + for(uint y=0; y<1024/LOOKUP_GAP; ++y) + { +#pragma unroll + for(uint z=0; z Date: Wed, 25 Jul 2012 22:02:14 +1000 Subject: [PATCH 079/178] Find the nearest power of 2 maximum alloc size for the scrypt buffer that can successfully be allocated and is large enough to accommodate the thread concurrency chosen, thus mapping it to an intensity. --- ocl.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/ocl.c b/ocl.c index f7264447..ba8cde2f 100644 --- a/ocl.c +++ b/ocl.c @@ -472,17 +472,35 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) #ifdef USE_SCRYPT if (opt_scrypt) { + cl_ulong ma = gpus[gpu].max_alloc, mt; + int pow2 = 0; + if (!gpus[gpu].lookup_gap) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); gpus[gpu].lookup_gap = 2; } if (!gpus[gpu].thread_concurrency) { - gpus[gpu].thread_concurrency = gpus[gpu].max_alloc / 32768 / gpus[gpu].lookup_gap; + gpus[gpu].thread_concurrency = ma / 32768 / gpus[gpu].lookup_gap; if (gpus[gpu].shaders && gpus[gpu].thread_concurrency > gpus[gpu].shaders) gpus[gpu].thread_concurrency -= gpus[gpu].thread_concurrency % gpus[gpu].shaders; applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, gpus[gpu].thread_concurrency); } + + /* If we have memory to spare, try to find a power of 2 value + * >= required amount to map nicely to an intensity */ + mt = gpus[gpu].thread_concurrency * 32768 * gpus[gpu].lookup_gap; + if (ma > mt) { + while (ma >>= 1) + pow2++; + ma = 1; + while (--pow2 && ma < mt) + ma <<= 1; + if (ma >= mt) { + gpus[gpu].max_alloc = ma; + applog(LOG_DEBUG, "Max alloc decreased to %lu", gpus[gpu].max_alloc); + } + } }
#endif @@ -776,8 +794,8 @@ built: size_t ipt = (1024 / gpus[gpu].lookup_gap + (1024 % gpus[gpu].lookup_gap > 0)); size_t bufsize = 128 * ipt * gpus[gpu].thread_concurrency; - /* Always allocate the largest possible buffer allowed, even if we're not initially requiring it - * based on thread_concurrency, giving us some headroom for intensity levels. */ + /* Use the max alloc value which has been rounded to a power of + * 2 greater >= required amount earlier */ if (bufsize > gpus[gpu].max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u, your scrypt settings come to %u", gpu, gpus[gpu].max_alloc, bufsize); From 5148502c1e8f859e8ca0a9934a2bfa1ffcebffdc Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 25 Jul 2012 22:30:37 +1000 Subject: [PATCH 080/178] Reinstate help information for lookup gap and thread concurrency since tc will often need to be manually set for optimal scrypt performance. --- cgminer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index 6e9dd891..3d8be562 100644 --- a/cgminer.c +++ b/cgminer.c @@ -857,7 +857,7 @@ static struct opt_table opt_config_table[] = { #ifdef USE_SCRYPT OPT_WITH_ARG("--lookup-gap", set_lookup_gap, NULL, NULL, - opt_hidden), + "Set GPU lookup gap for scrypt mining, comma separated"), #endif OPT_WITH_ARG("--intensity|-I", set_intensity, NULL, NULL, @@ -1010,7 +1010,7 @@ static struct opt_table opt_config_table[] = { #ifdef USE_SCRYPT OPT_WITH_ARG("--thread-concurrency", set_thread_concurrency, NULL, NULL, - opt_hidden), + "Set GPU thread concurrency for scrypt mining, comma separated"), #endif OPT_WITH_ARG("--url|-o", set_url, NULL, NULL, From da1b996a396d82f328e82818a49cdb6e56b38e57 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Thu, 26 Jul 2012 16:10:21 +1000 Subject: [PATCH 081/178] Simplify repeated use of gpus[gpu]. 
in ocl.c --- ocl.c | 61 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/ocl.c b/ocl.c index ba8cde2f..71b69ac0 100644 --- a/ocl.c +++ b/ocl.c @@ -208,6 +208,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) { _clState *clState = calloc(1, sizeof(_clState)); bool patchbfi = false, prog_built = false; + struct cgpu_info *cgpu = &gpus[gpu]; cl_platform_id platform = NULL; char pbuff[256], vbuff[255]; cl_platform_id* platforms; @@ -363,12 +364,12 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) } applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); - status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&gpus[gpu].max_alloc, NULL); + status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status); return NULL; } - applog(LOG_DEBUG, "Max mem alloc size is %u", gpus[gpu].max_alloc); + applog(LOG_DEBUG, "Max mem alloc size is %u", cgpu->max_alloc); /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would @@ -381,7 +382,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) char filename[255]; char numbuf[10]; - if (gpus[gpu].kernel == KL_NONE) { + if (cgpu->kernel == KL_NONE) { if (opt_scrypt) { applog(LOG_INFO, "Selecting scrypt kernel"); clState->chosen_kernel = KL_SCRYPT; @@ -403,9 +404,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) applog(LOG_INFO, "Selecting phatk kernel"); clState->chosen_kernel = KL_PHATK; } - gpus[gpu].kernel = clState->chosen_kernel; + cgpu->kernel = clState->chosen_kernel; } else { - clState->chosen_kernel = gpus[gpu].kernel; + clState->chosen_kernel = cgpu->kernel; if 
(clState->chosen_kernel == KL_PHATK && (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") || strstr(vbuff, "831.4") || strstr(vbuff, "898.1") || @@ -442,7 +443,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) strcpy(filename, SCRYPT_KERNNAME".cl"); strcpy(binaryfilename, SCRYPT_KERNNAME); /* Scrypt only supports vector 1 */ - gpus[gpu].vwidth = 1; + cgpu->vwidth = 1; break; case KL_NONE: /* Shouldn't happen */ case KL_DIABLO: @@ -451,45 +452,45 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) break; } - if (gpus[gpu].vwidth) - clState->vwidth = gpus[gpu].vwidth; + if (cgpu->vwidth) + clState->vwidth = cgpu->vwidth; else { clState->vwidth = preferred_vwidth; - gpus[gpu].vwidth = preferred_vwidth; + cgpu->vwidth = preferred_vwidth; } if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) && clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt) clState->goffset = true; - if (gpus[gpu].work_size && gpus[gpu].work_size <= clState->max_work_size) - clState->wsize = gpus[gpu].work_size; + if (cgpu->work_size && cgpu->work_size <= clState->max_work_size) + clState->wsize = cgpu->work_size; else if (strstr(name, "Tahiti")) clState->wsize = 64; else clState->wsize = (clState->max_work_size <= 256 ? 
clState->max_work_size : 256) / clState->vwidth; - gpus[gpu].work_size = clState->wsize; + cgpu->work_size = clState->wsize; #ifdef USE_SCRYPT if (opt_scrypt) { - cl_ulong ma = gpus[gpu].max_alloc, mt; + cl_ulong ma = cgpu->max_alloc, mt; int pow2 = 0; - if (!gpus[gpu].lookup_gap) { + if (!cgpu->lookup_gap) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); - gpus[gpu].lookup_gap = 2; + cgpu->lookup_gap = 2; } - if (!gpus[gpu].thread_concurrency) { - gpus[gpu].thread_concurrency = ma / 32768 / gpus[gpu].lookup_gap; - if (gpus[gpu].shaders && gpus[gpu].thread_concurrency > gpus[gpu].shaders) - gpus[gpu].thread_concurrency -= gpus[gpu].thread_concurrency % gpus[gpu].shaders; + if (!cgpu->thread_concurrency) { + cgpu->thread_concurrency = ma / 32768 / cgpu->lookup_gap; + if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) + cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; - applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, gpus[gpu].thread_concurrency); + applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, cgpu->thread_concurrency); } /* If we have memory to spare, try to find a power of 2 value * >= required amount to map nicely to an intensity */ - mt = gpus[gpu].thread_concurrency * 32768 * gpus[gpu].lookup_gap; + mt = cgpu->thread_concurrency * 32768 * cgpu->lookup_gap; if (ma > mt) { while (ma >>= 1) pow2++; @@ -497,8 +498,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) while (--pow2 && ma < mt) ma <<= 1; if (ma >= mt) { - gpus[gpu].max_alloc = ma; - applog(LOG_DEBUG, "Max alloc decreased to %lu", gpus[gpu].max_alloc); + cgpu->max_alloc = ma; + applog(LOG_DEBUG, "Max alloc decreased to %lu", cgpu->max_alloc); } } } @@ -532,7 +533,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) if (clState->goffset) strcat(binaryfilename, "g"); if (opt_scrypt) { - sprintf(numbuf, "lg%dtc%d", gpus[gpu].lookup_gap, gpus[gpu].thread_concurrency); + sprintf(numbuf, 
"lg%dtc%d", cgpu->lookup_gap, cgpu->thread_concurrency); strcat(binaryfilename, numbuf); } else { sprintf(numbuf, "v%d", clState->vwidth); @@ -604,7 +605,7 @@ build: #ifdef USE_SCRYPT if (opt_scrypt) sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d", - gpus[gpu].lookup_gap, gpus[gpu].thread_concurrency, (int)clState->wsize); + cgpu->lookup_gap, cgpu->thread_concurrency, (int)clState->wsize); else #endif { @@ -791,16 +792,16 @@ built: #ifdef USE_SCRYPT if (opt_scrypt) { - size_t ipt = (1024 / gpus[gpu].lookup_gap + (1024 % gpus[gpu].lookup_gap > 0)); - size_t bufsize = 128 * ipt * gpus[gpu].thread_concurrency; + size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0)); + size_t bufsize = 128 * ipt * cgpu->thread_concurrency; /* Use the max alloc value which has been rounded to a power of * 2 greater >= required amount earlier */ - if (bufsize > gpus[gpu].max_alloc) { + if (bufsize > cgpu->max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u, your scrypt settings come to %u", - gpu, gpus[gpu].max_alloc, bufsize); + gpu, cgpu->max_alloc, bufsize); } else - bufsize = gpus[gpu].max_alloc; + bufsize = cgpu->max_alloc; applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize); clState->padbufsize = bufsize; clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); From 43752ee58c4b26216dee509274d0e44c76a7181f Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Thu, 26 Jul 2012 16:12:45 +1000 Subject: [PATCH 082/178] Limit thread concurrency for scrypt to 5xshaders if shaders is specified. 
--- ocl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ocl.c b/ocl.c index 71b69ac0..5fe7b1b4 100644 --- a/ocl.c +++ b/ocl.c @@ -482,8 +482,11 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) } if (!cgpu->thread_concurrency) { cgpu->thread_concurrency = ma / 32768 / cgpu->lookup_gap; - if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) + if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; + if (cgpu->thread_concurrency > cgpu->shaders * 5) + cgpu->thread_concurrency = cgpu->shaders * 5; + } applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, cgpu->thread_concurrency); } From 8a35b4ac7aace4cf580bc27a4383cd55aca26dae Mon Sep 17 00:00:00 2001 From: Kano Date: Fri, 27 Jul 2012 00:55:31 +1000 Subject: [PATCH 083/178] miner.php add a socket RCV timeout for if cgminer is hung and the API thread is still running --- miner.php | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/miner.php b/miner.php index 6f750be9..be420aa6 100644 --- a/miner.php +++ b/miner.php @@ -1,7 +1,8 @@ = SND +# Feel free to increase SND if your network is very slow +# or decrease RCV if that happens often to you # Also, on some windows PHP, apparently the $usec is ignored -$socktimeoutsec = 10; +$socksndtimeoutsec = 10; +$sockrcvtimeoutsec = 40; # # List of fields NOT to be displayed # You can use this to hide data you don't want to see or don't want @@ -260,7 +265,7 @@ $error = null; # function getsock($addr, $port) { - global $haderror, $error, $socktimeoutsec; + global $haderror, $error, $socksndtimeoutsec, $sockrcvtimeoutsec; $error = null; $socket = null; @@ -277,7 +282,8 @@ function getsock($addr, $port) // Ignore if this fails since the socket connect may work anyway // and nothing is gained by aborting if the option cannot be set // since we don't know in advance if it can connect - 
socket_set_option($socket, SOL_SOCKET, SO_SNDTIMEO, array('sec' => $socktimeoutsec, 'usec' => 0)); + socket_set_option($socket, SOL_SOCKET, SO_SNDTIMEO, array('sec' => $socksndtimeoutsec, 'usec' => 0)); + socket_set_option($socket, SOL_SOCKET, SO_RCVTIMEO, array('sec' => $sockrcvtimeoutsec, 'usec' => 0)); $res = socket_connect($socket, $addr, $port); if ($res === false) From e8fb2bc1ba4db543ed5fe6c42259d6f8b308858c Mon Sep 17 00:00:00 2001 From: Zefir Kurtisi Date: Thu, 26 Jul 2012 22:54:39 +0200 Subject: [PATCH 084/178] fpgautils: add support for 57.6 kBd serial --- fpgautils.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fpgautils.c b/fpgautils.c index 07b3fe36..5f488740 100644 --- a/fpgautils.c +++ b/fpgautils.c @@ -211,6 +211,10 @@ serial_open(const char*devpath, unsigned long baud, signed short timeout, bool p switch (baud) { case 0: break; + case 57600: + cfsetispeed( &my_termios, B57600 ); + cfsetospeed( &my_termios, B57600 ); + break; case 115200: cfsetispeed( &my_termios, B115200 ); cfsetospeed( &my_termios, B115200 ); From 7418f5e211cb73d0104791e1759effee9b9f2eb3 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Fri, 27 Jul 2012 21:15:19 +0000 Subject: [PATCH 085/178] strtok_ts: Thread-safe strtok that work on POSIX or Windows --- compat.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/compat.h b/compat.h index c6e38d8a..19a14f0d 100644 --- a/compat.h +++ b/compat.h @@ -9,6 +9,10 @@ #include +// NOTE: Windows strtok uses a thread-local static buffer, so this is safe +#define SETUP_STRTOK_TS /*nothing needed*/ +#define strtok_ts strtok + #include "miner.h" // for timersub static inline int nanosleep(const struct timespec *req, struct timespec *rem) @@ -72,8 +76,13 @@ typedef long suseconds_t; #endif #define PTH(thr) ((thr)->pth.p) -#else +#else /* ! 
WIN32 */ + #define PTH(thr) ((thr)->pth) + +#define SETUP_STRTOK_TS char*_strtok_ts_saveptr +#define strtok_ts(str, delim) strtok_r(str, delim, &_strtok_ts_saveptr) + #endif /* WIN32 */ #endif /* __COMPAT_H__ */ From 8326d2dcaf3b9d4e4b1a52998f68a6b565642d23 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Fri, 27 Jul 2012 20:03:25 +0000 Subject: [PATCH 086/178] RPC: New "poolpriority" command to set the order of pool priorities --- API-README | 4 ++++ api.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/API-README b/API-README index df905ef3..a00a0dcc 100644 --- a/API-README +++ b/API-README @@ -174,6 +174,10 @@ The list of requests - a (*) means it requires privileged access - and replies a Use '\\' to get a '\' and '\,' to include a comma inside URL, USR or PASS + poolpriority|N,... (*) + none There is no reply section just the STATUS section + stating the results of changing pool priorities + disablepool|N (*) none There is no reply section just the STATUS section stating the results of disabling pool N diff --git a/api.c b/api.c index d2f01487..86de3c80 100644 --- a/api.c +++ b/api.c @@ -339,6 +339,7 @@ static const char *JSON_PARAMETER = "parameter"; #define MSG_ACCDENY 45 #define MSG_ACCOK 46 #define MSG_ENAPOOL 47 +#define MSG_POOLPRIO 73 #define MSG_DISPOOL 48 #define MSG_ALRENAP 49 #define MSG_ALRDISP 50 @@ -501,6 +502,7 @@ struct CODES { { SEVERITY_ERR, MSG_ACCDENY, PARAM_STR, "Access denied to '%s' command" }, { SEVERITY_SUCC, MSG_ACCOK, PARAM_NONE, "Privileged access OK" }, { SEVERITY_SUCC, MSG_ENAPOOL, PARAM_POOL, "Enabling pool %d:'%s'" }, + { SEVERITY_SUCC, MSG_POOLPRIO,PARAM_NONE, "Changed pool priorities" }, { SEVERITY_SUCC, MSG_DISPOOL, PARAM_POOL, "Disabling pool %d:'%s'" }, { SEVERITY_INFO, MSG_ALRENAP, PARAM_POOL, "Pool %d:'%s' already enabled" }, { SEVERITY_INFO, MSG_ALRDISP, PARAM_POOL, "Pool %d:'%s' already disabled" }, @@ -2132,6 +2134,39 @@ static void enablepool(__maybe_unused SOCKETTYPE c, char 
*param, bool isjson, __ strcpy(io_buffer, message(MSG_ENAPOOL, id, NULL, isjson)); } +static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, __maybe_unused char group) +{ + SETUP_STRTOK_TS; + int total_pools_ = total_pools; // Keep a local copy, to be more threadsafe + char *a; + int i, prio = 0; + + if (total_pools_ == 0) { + strcpy(io_buffer, message(MSG_NOPOOL, 0, NULL, isjson)); + return; + } + + bool pools_changed[total_pools_]; + for (i = 0; i < total_pools_; ++i) + pools_changed[i] = false; + + a = strtok_ts(param, ","); + do { + i = atoi(a); + pools[i]->prio = prio++; + pools_changed[i] = true; + } while ( (a = strtok_ts(NULL, ",")) ); + + for (i = 0; i < total_pools_; ++i) + if (!pools_changed[i]) + pools[i]->prio = prio++; + + if (current_pool()->prio) + switch_pools(NULL); + + strcpy(io_buffer, message(MSG_POOLPRIO, 0, NULL, isjson)); +} + static void disablepool(__maybe_unused SOCKETTYPE c, char *param, bool isjson, __maybe_unused char group) { struct pool *pool; @@ -2663,6 +2698,7 @@ struct CMDS { { "cpucount", cpucount, false }, { "switchpool", switchpool, true }, { "addpool", addpool, true }, + { "poolpriority", poolpriority, true }, { "enablepool", enablepool, true }, { "disablepool", disablepool, true }, { "removepool", removepool, true }, From 5ef9c139240f17270d5124101b9972b955d070c9 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Fri, 27 Jul 2012 20:53:59 +0000 Subject: [PATCH 087/178] Bugfix: API: Report errors from poolpriority command --- api.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/api.c b/api.c index 86de3c80..4e1da9a7 100644 --- a/api.c +++ b/api.c @@ -2139,7 +2139,7 @@ static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, SETUP_STRTOK_TS; int total_pools_ = total_pools; // Keep a local copy, to be more threadsafe char *a; - int i, prio = 0; + int i, prio = 0, e = -1; if (total_pools_ == 0) { strcpy(io_buffer, message(MSG_NOPOOL, 0, NULL, 
isjson)); @@ -2152,7 +2152,11 @@ static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, a = strtok_ts(param, ","); do { - i = atoi(a); + i = strtol(a, &a, 10); + if (unlikely(*a > 0x20 || i < 0 || i >= total_pools)) { + e = (*a > 0x20) ? -2 : i; + continue; + } pools[i]->prio = prio++; pools_changed[i] = true; } while ( (a = strtok_ts(NULL, ",")) ); @@ -2164,6 +2168,14 @@ static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, if (current_pool()->prio) switch_pools(NULL); + if (e != -1) { + if (e == -2) + strcpy(io_buffer, message(MSG_MISPID, 0, NULL, isjson)); + else + strcpy(io_buffer, message(MSG_INVPID, e, NULL, isjson)); + return; + } + strcpy(io_buffer, message(MSG_POOLPRIO, 0, NULL, isjson)); } From 0c985b24110112dcfcbe1c543562260458a07202 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Fri, 27 Jul 2012 23:42:05 +0000 Subject: [PATCH 088/178] RPC: Writeup on poolpriority command usage --- API-README | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/API-README b/API-README index a00a0dcc..6ea5a539 100644 --- a/API-README +++ b/API-README @@ -177,6 +177,7 @@ The list of requests - a (*) means it requires privileged access - and replies a poolpriority|N,... (*) none There is no reply section just the STATUS section stating the results of changing pool priorities + See usage below disablepool|N (*) none There is no reply section just the STATUS section @@ -274,8 +275,15 @@ The list of requests - a (*) means it requires privileged access - and replies a When you enable, disable or restart a GPU or PGA, you will also get Thread messages in the cgminer status window -When you switch to a different pool to the current one, you will get a -'Switching to URL' message in the cgminer status windows +The 'poolpriority' command can be used to reset the priority order of pools. +Each pool should be listed by id number in order of preference (first = most +preferred). 
Any pools not listed will be prioritized after the ones that are, +in an undefined order. If the priority change affects the miner's preference +for mining, it may switch immediately. + +When you switch to a different pool to the current one (including by priority +change), you will get a 'Switching to URL' message in the cgminer status +windows Obviously, the JSON format is simply just the names as given before the '=' with the values after the '=' From e87ff7c177ca1fb327c96acf09da0c77238d6a12 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 13:15:39 +1000 Subject: [PATCH 089/178] Keep a counter of enabled pools and use that instead of iterating over the pool list. Use that value to ensure we don't set the last remaining active pool to the rejecting state. --- api.c | 2 +- cgminer.c | 58 ++++++++++++++++++++++++++++++++----------------------- miner.h | 2 +- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/api.c b/api.c index 943c0d5d..342c3710 100644 --- a/api.c +++ b/api.c @@ -2155,7 +2155,7 @@ static void disablepool(__maybe_unused SOCKETTYPE c, char *param, bool isjson, _ return; } - if (active_pools() <= 1) { + if (enabled_pools <= 1) { strcpy(io_buffer, message(MSG_DISLASTP, id, NULL, isjson)); return; } diff --git a/cgminer.c b/cgminer.c index 3d8be562..28a1b104 100644 --- a/cgminer.c +++ b/cgminer.c @@ -200,7 +200,7 @@ unsigned int total_go, total_ro; struct pool **pools; static struct pool *currentpool = NULL; -int total_pools; +int total_pools, enabled_pools; enum pool_strategy pool_strategy = POOL_FAILOVER; int opt_rotate_period; static int total_urls, total_users, total_passes, total_userpasses; @@ -1671,6 +1671,28 @@ bool regeneratehash(const struct work *work) return false; } +static void enable_pool(struct pool *pool) +{ + if (pool->enabled != POOL_ENABLED) { + enabled_pools++; + pool->enabled = POOL_ENABLED; + } +} + +static void disable_pool(struct pool *pool) +{ + if (pool->enabled == POOL_ENABLED) + 
enabled_pools--; + pool->enabled = POOL_DISABLED; +} + +static void reject_pool(struct pool *pool) +{ + if (pool->enabled == POOL_ENABLED) + enabled_pools--; + pool->enabled = POOL_REJECTING; +} + static bool submit_upstream_work(const struct work *work, CURL *curl) { char *hexstr = NULL; @@ -1766,7 +1788,7 @@ static bool submit_upstream_work(const struct work *work, CURL *curl) * longpoll */ if (unlikely(pool->enabled == POOL_REJECTING)) { applog(LOG_WARNING, "Rejecting pool %d now accepting shares, re-enabling!", pool->pool_no); - pool->enabled = POOL_ENABLED; + enable_pool(pool); switch_pools(NULL); } } else { @@ -1811,13 +1833,13 @@ static bool submit_upstream_work(const struct work *work, CURL *curl) * ensued. Do not do this if we know the share just happened to * be stale due to networking delays. */ - if (pool->seq_rejects > 10 && !work->stale && opt_disable_pool && total_pools > 1) { + if (pool->seq_rejects > 10 && !work->stale && opt_disable_pool && enabled_pools > 1) { double utility = total_accepted / ( total_secs ? 
total_secs : 1 ) * 60; if (pool->seq_rejects > utility * 3) { applog(LOG_WARNING, "Pool %d rejected %d sequential shares, disabling!", pool->pool_no, pool->seq_rejects); - pool->enabled = POOL_REJECTING; + reject_pool(pool); if (pool == current_pool()) switch_pools(NULL); pool->seq_rejects = 0; @@ -2759,18 +2781,6 @@ int curses_int(const char *query) static bool input_pool(bool live); #endif -int active_pools(void) -{ - int ret = 0; - int i; - - for (i = 0; i < total_pools; i++) { - if ((pools[i])->enabled == POOL_ENABLED) - ret++; - } - return ret; -} - #ifdef HAVE_CURSES static void display_pool_summary(struct pool *pool) { @@ -3047,7 +3057,7 @@ retry: wlogprint("Unable to remove pool due to activity\n"); goto retry; } - pool->enabled = POOL_DISABLED; + disable_pool(pool); remove_pool(pool); goto updated; } else if (!strncasecmp(&input, "s", 1)) { @@ -3057,11 +3067,11 @@ retry: goto retry; } pool = pools[selected]; - pool->enabled = POOL_ENABLED; + enable_pool(pool); switch_pools(pool); goto updated; } else if (!strncasecmp(&input, "d", 1)) { - if (active_pools() <= 1) { + if (enabled_pools <= 1) { wlogprint("Cannot disable last pool"); goto retry; } @@ -3071,7 +3081,7 @@ retry: goto retry; } pool = pools[selected]; - pool->enabled = POOL_DISABLED; + disable_pool(pool); if (pool == current_pool()) switch_pools(NULL); goto updated; @@ -3082,7 +3092,7 @@ retry: goto retry; } pool = pools[selected]; - pool->enabled = POOL_ENABLED; + enable_pool(pool); if (pool->prio < current_pool()->prio) switch_pools(pool); goto updated; @@ -4869,7 +4879,7 @@ void add_pool_details(bool live, char *url, char *user, char *pass) /* Test the pool is not idle if we're live running, otherwise * it will be tested separately */ - pool->enabled = POOL_ENABLED; + enable_pool(pool); if (live && !pool_active(pool, false)) pool->idle = true; } @@ -5197,7 +5207,7 @@ int main(int argc, char *argv[]) strcpy(pool->rpc_url, "Benchmark"); pool->rpc_user = pool->rpc_url; pool->rpc_pass = 
pool->rpc_url; - pool->enabled = POOL_ENABLED; + enable_pool(pool); pool->idle = false; successful_connect = true; } @@ -5440,7 +5450,7 @@ int main(int argc, char *argv[]) for (i = 0; i < total_pools; i++) { struct pool *pool = pools[i]; - pool->enabled = POOL_ENABLED; + enable_pool(pool); pool->idle = true; } diff --git a/miner.h b/miner.h index 68c6e159..11dcb5fb 100644 --- a/miner.h +++ b/miner.h @@ -606,7 +606,7 @@ extern int set_memoryclock(int gpu, int iMemoryClock); extern void api(int thr_id); extern struct pool *current_pool(void); -extern int active_pools(void); +extern int enabled_pools; extern void add_pool_details(bool live, char *url, char *user, char *pass); #define MAX_GPUDEVICES 16 From cf36331d815e7b87131d547b92b9ceaa218d114d Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Sun, 29 Jul 2012 06:26:23 +0000 Subject: [PATCH 090/178] bitforce: Skip out of sending work if work restart requested --- driver-bitforce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 21569486..d187f933 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -271,7 +271,8 @@ re_send: BFgets(pdevbuf, sizeof(pdevbuf), fdDev); if (!pdevbuf[0] || !strncasecmp(pdevbuf, "B", 1)) { mutex_unlock(&bitforce->device_mutex); - nmsleep(WORK_CHECK_INTERVAL_MS); + if (!restart_wait(WORK_CHECK_INTERVAL_MS)) + return false; goto re_send; } else if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { mutex_unlock(&bitforce->device_mutex); From b40c8b848f370fbec4ef2b77b1369253129f0b89 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 19:10:40 +1000 Subject: [PATCH 091/178] Limit total number of curls recruited per pool to the number of mining threads to prevent blasting the network when we only have one pool to talk to. 
--- cgminer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index 28a1b104..4c5b5da5 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2140,16 +2140,19 @@ static void recruit_curl(struct pool *pool) /* Grab an available curl if there is one. If not, then recruit extra curls * unless we are in a submit_fail situation, or we have opt_delaynet enabled - * and there are already 5 curls in circulation */ + * and there are already 5 curls in circulation. Limit total number to the + * number of mining threads per pool as well to prevent blasting a pool during + * network delays/outages. */ static struct curl_ent *pop_curl_entry(struct pool *pool) { + int curl_limit = opt_delaynet ? 5 : mining_threads; struct curl_ent *ce; mutex_lock(&pool->pool_lock); if (!pool->curls) recruit_curl(pool); else if (list_empty(&pool->curlring)) { - if ((pool->submit_fail || opt_delaynet) && pool->curls > 4) + if (pool->curls >= curl_limit) pthread_cond_wait(&pool->cr_cond, &pool->pool_lock); else recruit_curl(pool); From 97aa6ea4922deff9731a85a38891a1b5c4342085 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 19:13:45 +1000 Subject: [PATCH 092/178] Fix build error without scrypt enabled. --- ocl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocl.c b/ocl.c index 5fe7b1b4..e71b9cc8 100644 --- a/ocl.c +++ b/ocl.c @@ -536,8 +536,10 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) if (clState->goffset) strcat(binaryfilename, "g"); if (opt_scrypt) { +#ifdef USE_SCRYPT sprintf(numbuf, "lg%dtc%d", cgpu->lookup_gap, cgpu->thread_concurrency); strcat(binaryfilename, numbuf); +#endif } else { sprintf(numbuf, "v%d", clState->vwidth); strcat(binaryfilename, numbuf); From 6332c4268b589985018a7253889b88d07269c321 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 20:13:22 +1000 Subject: [PATCH 093/178] Add scrypt documentation in the form of a separate readme. 
--- README | 9 ++++ SCRYPT-README | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 SCRYPT-README diff --git a/README b/README index bc1c59d6..ab508e5d 100644 --- a/README +++ b/README @@ -153,6 +153,7 @@ Options for both config file and command line: --scan-time|-s Upper bound on time spent scanning current work, in seconds (default: 60) --sched-start Set a time of day in HH:MM to start mining (a once off without a stop time) --sched-stop Set a time of day in HH:MM to stop mining (will quit without a start time) +--scrypt Use the scrypt algorithm for mining (litecoin only) --sharelog Append share log to file --shares Quit after mining N shares (default: unlimited) --socks-proxy Set socks4 proxy (host:port) @@ -197,6 +198,14 @@ GPU only options: --worksize|-w Override detected optimal worksize - one value or comma separated list +SCRYPT only options: + +--lookup-gap Set GPU lookup gap for scrypt mining, comma separated +--thread-concurrency Set GPU thread concurrency for scrypt mining, comma separated + +See SCRYPT-README for more information regarding litecoin mining. + + FPGA mining boards(BitForce, Icarus, ModMiner, Ztex) only options: --scan-serial|-S Serial port to probe for FPGA mining device diff --git a/SCRYPT-README b/SCRYPT-README new file mode 100644 index 00000000..62a3351b --- /dev/null +++ b/SCRYPT-README @@ -0,0 +1,136 @@ +If you wish to donate to the author, Con Kolivas, in LTC, please submit your +donations to: + +Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH + +Otherwise, please donate in BTC as per the main README. + +--- + + +Scrypt mining, AKA litecoin mining, for GPU is completely different to sha256 +used for bitcoin mining. The algorithm was originally developed in a manner +that it was anticipated would make it suitable for mining on CPU but NOT GPU. +Thanks to some innovative work by Artforz and mtrlt, this was proven to be +wrong. 
However, it has very different requirements to bitcoin mining and is a +lot more complicated to get working well. Note that it is a ram dependent +workload, and requires you to have enough system ram as well as fast enough +GPU ram. + +There are 5 main parameters to tuning scrypt, 2 of which you MUST set, and +the others are optional for further fine tuning. When you start scrypt mining +with the --scrypt option, cgminer will fail IN RANDOM WAYS. They are all due +to parameters being outside what the GPU can cope with. Unless you give cgminer a +hint as to your GPU type, it will hardly ever perform well. + + +Step 1 on linux: +export GPU_MAX_ALLOC_PERCENT=100 +If you do not do this, you may find it impossible to scrypt mine. You may find +a value of 40 is enough and increasing this further has little effect. + +export GPU_USE_SYNC_OBJECTS=1 +may help CPU usage a little as well. + +--shaders XXX + +is a new option where you tell cgminer how many shaders your GPU has. This +helps cgminer try to choose some meaningful baseline parameters. Use this table +below to determine how many shaders your GPU has, and note that there are some +variants of these cards, and nvidia shaders are much much lower and virtually +pointless trying to mine on. + +GPU Shaders +7750 512 +7770 640 +7850 1024 +7870 1280 +7950 1792 +7970 2048 + +6850 960 +6870 1120 +6950 1408 +6970 1536 +6990 (6970x2) + +6570 480 +6670 480 +6790 800 + +6450 160 + +5670 400 +5750 720 +5770 800 +5830 1120 +5850 1440 +5870 1600 +5970 (5870x2) + +These are only used as a rough guide for cgminer, and it is rare that this is +all you will need to set. + + +--intensity XX + +Just like in bitcoin mining, scrypt mining takes an intensity, however the +scale goes from 0 to 20 to mimic the "Aggression" used in mtrlt's reaper. The +reason this is crucial is that too high an intensity can actually be +disastrous with scrypt because it CAN run out of ram. 
Intensities over 13 +start writing over the same ram and it is highly dependent on the GPU, but they +can start actually DECREASING your hashrate, or even worse, start producing +garbage with rejects skyrocketing. + + +Optional parameters to tune: +-g, --thread-concurrency, --lookup-gap + +-g: +Once you have found the optimal shaders and intensity, you can start increasing +the -g value till cgminer fails to start. Rarely will you be able to go over +about -g 4 and each increase in -g only increases hashrate slightly. + +--thread-concurrency: +This tunes the optimal size of work that scrypt can do. It is internally tuned +by cgminer to be the highest reasonable multiple of shaders that it can +allocate on your GPU. Ideally it should be a multiple of your shader count. +VLIW5 architecture (R5xxx) would be best at 5x shaders, while VLIW4 (R6xxx and +R7xxx) are best at 4x. Setting thread concurrency overrides anything you put +into --shaders. + +--lookup-gap +This tunes a compromise between ram usage and performance. Performance peaks +at a gap of 2; increasing the gap can save you some GPU ram, but almost +always at the cost of significant loss of hashrate. Setting lookup gap +overrides the default of 2, but cgminer will use the --shaders value to choose +a thread-concurrency if you haven't chosen one. + + +Overclocking for scrypt mining: +First of all, do not underclock your memory initially. Scrypt mining requires +memory speed and on most, but not all, GPUs, lowering memory speed lowers +mining performance. + +Second, absolute engine clock speeds do NOT correlate with hashrate. The ratio +of engine clock speed to memory matters, so if you set your memory to the +default value, and then start overclocking as you are running it, you should +find a sweet spot where the hashrate peaks and then it might actually drop if +you increase the engine clock speed further. 
Unless you wish to run with a +dynamic intensity, do not go over 13 without testing it while it's running to +see that it increases hashrate AND utility WITHOUT increasing your rejects. + + +Suggested values for 7970 for example: +export GPU_MAX_ALLOC_PERCENT=100 +--shaders 2048 -g 5 --gpu-engine 1135 --gpu-memclock 1375 + + +--- + +If you wish to donate to the author, Con Kolivas, in LTC, please submit your +donations to: + +Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH + +Otherwise, please donate in BTC as per the main README. From 7e55a41209a115da4f2235b7683387e282c1401a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 20:17:15 +1000 Subject: [PATCH 094/178] Sleep only the extra amount of time we overran the dynamic interval in dynamic mode. --- driver-opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver-opencl.c b/driver-opencl.c index b22017c1..0bfd805c 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1510,7 +1510,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, if (gpu->intensity > MIN_INTENSITY) --gpu->intensity; else - nmsleep(opt_dynamic_interval / 2 ? : 1); + nmsleep(gpu->gpu_us_average - dynamic_us); } else if (gpu->gpu_us_average < dynamic_us / 2) { if (gpu->intensity < MAX_INTENSITY) ++gpu->intensity; From 6b80592cde18df19f69ffc7579220f810f051ac3 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 20:27:37 +1000 Subject: [PATCH 095/178] Sleeping on intensity decrease is broken, remove it. 
--- driver-opencl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 0bfd805c..8df0b101 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1509,8 +1509,6 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, if (gpu->gpu_us_average > dynamic_us) { if (gpu->intensity > MIN_INTENSITY) --gpu->intensity; - else - nmsleep(gpu->gpu_us_average - dynamic_us); } else if (gpu->gpu_us_average < dynamic_us / 2) { if (gpu->intensity < MAX_INTENSITY) ++gpu->intensity; From b5517af02498b1b09380410200b2173034cfcc90 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 21:07:24 +1000 Subject: [PATCH 096/178] Don't try and print curses output for devices that won't fit on the screen. --- cgminer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cgminer.c b/cgminer.c index edfe4e39..cfcef1b5 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1458,11 +1458,12 @@ static void curses_print_devstatus(int thr_id) struct cgpu_info *cgpu = thr_info[thr_id].cgpu; char logline[255]; + if (devcursor + cgpu->cgminer_id > LINES - 2) + return; + cgpu->utility = cgpu->accepted / ( total_secs ? total_secs : 1 ) * 60; - /* Check this isn't out of the window size */ - if (wmove(statuswin,devcursor + cgpu->cgminer_id, 0) == ERR) - return; + wmove(statuswin,devcursor + cgpu->cgminer_id, 0); wprintw(statuswin, " %s %*d: ", cgpu->api->name, dev_width, cgpu->device_id); if (cgpu->api->get_statline_before) { logline[0] = '\0'; From d8ec6d3e23d3b8a0bfba8c12f71ad4477c593439 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 21:35:24 +1000 Subject: [PATCH 097/178] Display kilohash when suitable, but store the global mhash value still truly in megahashes to not break the API output. 
--- cgminer.c | 62 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/cgminer.c b/cgminer.c index cfcef1b5..9fb1f1b9 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1381,6 +1381,16 @@ void tailsprintf(char *f, const char *fmt, ...) static void get_statline(char *buf, struct cgpu_info *cgpu) { + double displayed_hashes, displayed_rolling = cgpu->rolling; + bool mhash_base = true; + + displayed_hashes = cgpu->total_mhashes / total_secs; + if (displayed_hashes < 1) { + displayed_hashes *= 1000; + displayed_rolling *= 1000; + mhash_base = false; + } + sprintf(buf, "%s%d ", cgpu->api->name, cgpu->device_id); if (cgpu->api->get_statline_before) cgpu->api->get_statline_before(buf, cgpu); @@ -1388,9 +1398,9 @@ static void get_statline(char *buf, struct cgpu_info *cgpu) tailsprintf(buf, " | "); tailsprintf(buf, "(%ds):%.1f (avg):%.1f %sh/s | A:%d R:%d HW:%d U:%.1f/m", opt_log_interval, - cgpu->rolling, - cgpu->total_mhashes / total_secs, - opt_scrypt ? "K" : "M", + displayed_rolling, + displayed_hashes, + mhash_base ? 
"M" : "K", cgpu->accepted, cgpu->rejected, cgpu->hw_errors, @@ -1456,6 +1466,8 @@ static void curses_print_devstatus(int thr_id) { static int awidth = 1, rwidth = 1, hwwidth = 1, uwidth = 1; struct cgpu_info *cgpu = thr_info[thr_id].cgpu; + double displayed_hashes, displayed_rolling; + bool mhash_base = true; char logline[255]; if (devcursor + cgpu->cgminer_id > LINES - 2) @@ -1473,6 +1485,14 @@ static void curses_print_devstatus(int thr_id) else wprintw(statuswin, " | "); + displayed_hashes = cgpu->total_mhashes / total_secs; + displayed_rolling = cgpu->rolling; + if (displayed_hashes < 1) { + displayed_hashes *= 1000; + displayed_rolling *= 1000; + mhash_base = false; + } + if (cgpu->status == LIFE_DEAD) wprintw(statuswin, "DEAD "); else if (cgpu->status == LIFE_SICK) @@ -1482,14 +1502,15 @@ static void curses_print_devstatus(int thr_id) else if (cgpu->deven == DEV_RECOVER) wprintw(statuswin, "REST "); else - wprintw(statuswin, "%5.1f", cgpu->rolling); + wprintw(statuswin, "%5.1f", displayed_rolling); adj_width(cgpu->accepted, &awidth); adj_width(cgpu->rejected, &rwidth); adj_width(cgpu->hw_errors, &hwwidth); adj_width(cgpu->utility, &uwidth); + wprintw(statuswin, "/%5.1f%sh/s | A:%*d R:%*d HW:%*d U:%*.2f/m", - cgpu->total_mhashes / total_secs, - opt_scrypt ? "K" : "M", + displayed_hashes, + mhash_base ? 
"M" : "K", awidth, cgpu->accepted, rwidth, cgpu->rejected, hwwidth, cgpu->hw_errors, @@ -3433,13 +3454,11 @@ static void hashmeter(int thr_id, struct timeval *diff, double utility, efficiency = 0.0; static double local_mhashes_done = 0; static double rolling = 0; - double local_mhashes; + double local_mhashes, displayed_hashes, displayed_rolling; + bool mhash_base = true; bool showlog = false; - if (opt_scrypt) - local_mhashes = (double)hashes_done / 1000.0; - else - local_mhashes = (double)hashes_done / 1000000.0; + local_mhashes = (double)hashes_done / 1000000.0; /* Update the last time this thread reported in */ if (thr_id >= 0) { gettimeofday(&thr_info[thr_id].last, NULL); @@ -3515,9 +3534,17 @@ static void hashmeter(int thr_id, struct timeval *diff, utility = total_accepted / ( total_secs ? total_secs : 1 ) * 60; efficiency = total_getworks ? total_accepted * 100.0 / total_getworks : 0.0; + displayed_hashes = total_mhashes_done / total_secs; + displayed_rolling = rolling; + if (displayed_hashes < 1) { + displayed_hashes *= 1000; + displayed_rolling *= 1000; + mhash_base = false; + } + sprintf(statusline, "%s(%ds):%.1f (avg):%.1f %sh/s | Q:%d A:%d R:%d HW:%d E:%.0f%% U:%.1f/m", want_per_device_stats ? "ALL " : "", - opt_log_interval, rolling, total_mhashes_done / total_secs, opt_scrypt ? "K" : "M", + opt_log_interval, displayed_rolling, displayed_hashes, mhash_base ? 
"M" : "K", total_getworks, total_accepted, total_rejected, hw_errors, efficiency, utility); @@ -4726,7 +4753,8 @@ static void print_summary(void) { struct timeval diff; int hours, mins, secs, i; - double utility, efficiency = 0.0; + double utility, efficiency = 0.0, displayed_hashes; + bool mhash_base = true; timersub(&total_tv_end, &total_tv_start, &diff); hours = diff.tv_sec / 3600; @@ -4745,8 +4773,14 @@ static void print_summary(void) applog(LOG_WARNING, "CPU hasher algorithm used: %s", algo_names[opt_algo]); #endif applog(LOG_WARNING, "Runtime: %d hrs : %d mins : %d secs", hours, mins, secs); + displayed_hashes = total_mhashes_done / total_secs; + if (displayed_hashes < 1) { + displayed_hashes *= 1000; + mhash_base = false; + } + if (total_secs) - applog(LOG_WARNING, "Average hashrate: %.1f %shash/s", total_mhashes_done / total_secs, opt_scrypt? "Kilo" : "Mega"); + applog(LOG_WARNING, "Average hashrate: %.1f %shash/s", displayed_hashes, mhash_base? "Mega" : "Kilo"); applog(LOG_WARNING, "Solved blocks: %d", found_blocks); applog(LOG_WARNING, "Queued work requests: %d", total_getworks); applog(LOG_WARNING, "Share submissions: %d", total_accepted + total_rejected); From d140427a908b7d47759b42bccac6545a4d5020b9 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 21:47:46 +1000 Subject: [PATCH 098/178] Update NEWS. --- NEWS | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/NEWS b/NEWS index 06370090..80770bf8 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,156 @@ +Version 2.6.0 - July 29, 2012 + +- Display kilohash when suitable, but store the global mhash value still truly +in megahashes to not break the API output. +- Don't try and print curses output for devices that won't fit on the screen. +- Add scrypt documentation in the form of a separate readme. +- Fix build error without scrypt enabled. 
+- Limit total number of curls recruited per pool to the number of mining threads +to prevent blasting the network when we only have one pool to talk to. +- bitforce: Skip out of sending work if work restart requested +- Keep a counter of enabled pools and use that instead of iterating over the +pool list. Use that value to ensure we don't set the last remaining active pool +to the rejecting state. +- fpgautils: add support for 57.6 kBd serial +- miner.php add a socket RCV timeout for if cgminer is hung and the API thread +is still running +- Limit thread concurrency for scrypt to 5xshaders if shaders is specified. +- Simplify repeated use of gpus[gpu]. in ocl.c +- Find the nearest power of 2 maximum alloc size for the scrypt buffer that can +successfully be allocated and is large enough to accommodate the thread +concurrency chosen, thus mapping it to an intensity. +- Don't make opt_scrypt mandatory blocking with opencl code. +- Update kernel versions reflecting changes in the API. +- Make the thread concurrency and lookup gap options hidden on the command line +and autotune parameters with a newly parsed --shaders option. +- Fix target testing with scrypt kernel as it would have been missing shares +below target. +- Bugfix: Use a mutex to control non-curses output +- Simplify code to a single vprintf path for curses-less printing +- Move opt_quiet check to my_log_curses, so it works for curses-less builds +- Use log_generic for vapplog to cut down on code duplication +- Add space to log output now that there is more screen real estate available. +- BFL force all code to timeout to avoid hanging +- Bugfix: Copy argv[0] given to dirname() +- Always create the largest possible padbuffer for scrypt kernels even if not +needed for thread_concurrency, giving us some headroom for intensity levels. +- Use the detected maximum allocable memory on a GPU to determine the optimal +scrypt settings when lookup_gap and thread_concurrency parameters are not given.
+- Check the maximum allocable memory size per opencl device. +- Add debugging output if buffer allocation fails for scrypt and round up +bufsize to a multiple of 256. +- Nonce testing for btc got screwed up, leading to no accepted shares. Fix it. +- Display size of scrypt buffer used in debug. +- Allow intensities up to 20 if scrypt is compiled in. +- Add name to scrypt kernel copyright. +- Allow lookup gap and thread concurrency to be passed per device and store +details in kernel binary filename. +- Ignore negative intensities for scrypt. +- Change the scale of intensity for scrypt kernel and fix a build warning. +- Correct target value passed to scrypt kernel. +- Use 256 output slots for kernels to allow 1 for each worksize. +- Test the target in the actual scrypt kernel itself saving further +calculations. +- Reinstate GPU only opencl device detection. +- Decrease lookup gap to 1. Does not seem to help in any way being 2. +- Fix build. +- Make pad0 and pad1 local variable in scrypt kernel. +- Constify input variable in scrypt kernel. +- Send correct values to scrypt kernel to get it finally working. +- Create command queue before compiling program in opencl. +- Detach pthread from within the api thread in case it is terminated due to not +being instantiated before pthread_cancel is called from main, leading to a +segfault. +- Debug output per thread hashrate is out by a factor of 1000. +- Initialise mdplatform. +- Find the gpu platform with the most devices and use that if no platform option +is passed. +- Allow more platforms to be probed if first does not return GPUs. +- Fix external scrypt algo missing. +- Limit scrypt to 1 vector. +- Handle KL_SCRYPT in config write. +- Get rid of stuff. +- Don't enqueuewrite buffer at all for pad8 and pass work details around for +scrypt in dev_blk. +- Set the correct data for cldata and prepare for pad8 fixes. 
+- Bugfix: Fix build without curses but with OpenCL +- Find the gpu platform with the most devices and use that if no platform option +is passed. +- Allow more platforms to be probed if first does not return GPUs. +- Get rid of spaces in arrays in scrypt kernel. +- Start with smaller amount of hashes in cpu mining to enable scrypt to return +today sometime. +- Show Khash hashrates when scrypt is in use. +- Free the scratchbuf memory allocated in scrypt and don't check if CPUs are +sick since they can't be. Prepare for khash hash rates in display. +- Add cpumining capability for scrypt. +- Set scrypt settings and buffer size in ocl.c code to be future modifiable. +- Cope with when we cannot set intensity low enough to meet dynamic interval by +inducing a forced sleep. +- Make dynamic and scrypt opencl calls blocking. +- Calculate midstate in separate function and remove likely/unlikely macros +since they're dependent on pools, not code design. +- bitforce: Use "full work" vs "nonce range" for kernel name +- Display in debug mode when we're making the midstate locally. +- Fix nonce submission code for scrypt. +- Make sure goffset is set for scrypt and drop padbuffer8 to something +manageable for now. +- Set up buffer8 for scrypt. +- Build fix for opt scrypt. +- Don't check postcalc nonce with sha256 in scrypt. +- Don't test nonce with sha and various fixes for scrypt. +- Make scrypt buffers and midstate compatible with cgminer. +- Use cgminer specific output array entries in scrypt kernel. +- Provide initial support for the scrypt kernel to compile with and mine scrypt +with the --scrypt option. +- Enable completely compiling scrypt out. +- Begin import of scrypt opencl kernel from reaper. +- bitforce_get_result returns -1 on error now. 
+- Check return value of read in BFgets +- Bugfix: Make our Windows nanosleep/sleep replacements standards-compliant +(which fixes nmsleep) and include compat.h for bitforce (for sleep) +- rpc: Use a single switch statement for both stringifications of cgpu->status +- Fix whitespace mangling. +- miner.php fix rig # when miners fail +- Only try to shut down work cleanly if we've successfully connected and started +mining. +- Use switch statement for cgpu->status and fix spelling. +- Abbrv. correction +- Bugfix: Don't declare devices SICK if they're just busy initialising +- Bugfix: Calculate nsec in nmsleep correctly +- Bugfix: Adapt OpenCL scanhash errors to driver API change (errors are now -1, +not 0) +- Remove superfluous ave_wait +- Put kname change for broken nonce-range back in +- Add average wait time to api stats +- Change BFL driver thread initialising to a constant 100ms delay between +devices instead of a random arrangement. +- Spelling typo. +- Time opencl work from start of queueing a kernel till it's flushed when +calculating dynamic intensity. +- Modify the scanhash API to use an int64_t and return -1 on error, allowing zero +to be a valid return value. +- Check for work restart after the hashmeter is invoked for we lose the hashes +otherwise contributed in the count. +- Remove disabled: label from mining thread function, using a separate +mt_disable function. +- Style changes. +- Missed one nonce-range disabling. +- Add average return time to api stats +- miner.php allow rig names in number buttons +- Remove bitforce_thread_init The delay thing does nothing useful... when long +poll comes around, all threads restart at the same time anyway. +- Change timeouts to time-vals for accuracy. +- fix API support for big endian machines +- Cope with signals interrupting the nanosleep of nmsleep. +- Use standard cfsetispeed/cfsetospeed to set baud rate on *nix +- miner.php split() flagged deprecated in PHP 5.3.0 +- More BFL tweaks.
Add delay between closing and reopening port. Remove buffer +clear in re-init Add kernel type (mini-rig or single) +- Make long timeout 10seconds on bitforce for when usleep or nanosleep just +can't be accurate... + + Version 2.5.0 - July 6, 2012 - Fix --benchmark not working since the dynamic addition of pools and pool From 0aa0a0667f22a1993c9724fec6cdd32f7d419e59 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 21:54:23 +1000 Subject: [PATCH 099/178] Smarter autogen.sh script. --- autogen.sh | 25 +++++++++++++++---------- mkinstalldirs | 4 ++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/autogen.sh b/autogen.sh index bf564eb1..e922cfc9 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1,12 +1,17 @@ #!/bin/sh +cwd="$PWD" +bs_dir="$(dirname $(readlink -f $0))" +rm -rf "${bs_dir}"/autom4te.cache +rm -f "${bs_dir}"/aclocal.m4 "${bs_dir}"/ltmain.sh -# You need autoconf 2.5x, preferably 2.57 or later -# You need automake 1.7 or later. 1.6 might work. - -set -e - -aclocal -I m4 -autoheader -automake --add-missing --copy -autoconf - +echo 'Running autoreconf -if...' +autoreconf -if || exit 1 +if test -z "$NOCONFIGURE" ; then + echo 'Configuring...' + cd "${bs_dir}" &> /dev/null + test "$?" = "0" || e=1 + test "$cwd" != "$bs_dir" && cd "$bs_dir" &> /dev/null + ./configure $@ + test "$e" = "1" && exit 1 + cd "$cwd" +fi diff --git a/mkinstalldirs b/mkinstalldirs index 4191a45d..55d537f8 100755 --- a/mkinstalldirs +++ b/mkinstalldirs @@ -81,9 +81,9 @@ case $dirmode in echo "mkdir -p -- $*" exec mkdir -p -- "$@" else - # On NextStep and OpenStep, the `mkdir' command does not + # On NextStep and OpenStep, the 'mkdir' command does not # recognize any option. It will interpret all options as - # directories to create, and then abort because `.' already + # directories to create, and then abort because '.' already # exists. 
test -d ./-p && rmdir ./-p test -d ./--version && rmdir ./--version From ac6315b735eb15a6e2749602a2794e29d40347df Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 29 Jul 2012 22:15:42 +1000 Subject: [PATCH 100/178] Bump version 2.6.0, adding SCRYPT README to makefile. --- Makefile.am | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index a784ef91..013fd340 100644 --- a/Makefile.am +++ b/Makefile.am @@ -10,7 +10,7 @@ endif EXTRA_DIST = example.conf m4/gnulib-cache.m4 linux-usb-cgminer \ ADL_SDK/readme.txt api-example.php miner.php \ API.class API.java api-example.c windows-build.txt \ - bitstreams/* API-README FPGA-README + bitstreams/* API-README FPGA-README SCRYPT-README SUBDIRS = lib compat ccan diff --git a/configure.ac b/configure.ac index 8cf840bf..332e79e0 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [2]) -m4_define([v_min], [5]) +m4_define([v_min], [6]) m4_define([v_mic], [0]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_ver], [v_maj.v_min.v_mic]) From 854a80ff037112dd16a0f2ad4209f672b498905a Mon Sep 17 00:00:00 2001 From: Kano Date: Sun, 29 Jul 2012 23:15:20 +1000 Subject: [PATCH 101/178] api.c poolpriority changes --- API-README | 9 ++++++- api.c | 74 ++++++++++++++++++++++++++++++++++-------------------- compat.h | 11 +------- 3 files changed, 56 insertions(+), 38 deletions(-) diff --git a/API-README b/API-README index 6ea5a539..6c43929a 100644 --- a/API-README +++ b/API-README @@ -321,7 +321,14 @@ miner.php - an example web page to access the API Feature Changelog for external applications using the API: -API V1.14 +API V1.15 + +Added API commands: + 'poolpriority' + +---------- + +API V1.14 (cgminer v2.5.0) Modified API commands: 'stats' - more icarus timing stats added diff --git 
a/api.c b/api.c index bfa3fcac..b9f0f535 100644 --- a/api.c +++ b/api.c @@ -166,7 +166,7 @@ static const char SEPARATOR = '|'; #define SEPSTR "|" static const char GPUSEP = ','; -static const char *APIVERSION = "1.14"; +static const char *APIVERSION = "1.15"; static const char *DEAD = "Dead"; static const char *SICK = "Sick"; static const char *NOSTART = "NoStart"; @@ -339,7 +339,6 @@ static const char *JSON_PARAMETER = "parameter"; #define MSG_ACCDENY 45 #define MSG_ACCOK 46 #define MSG_ENAPOOL 47 -#define MSG_POOLPRIO 73 #define MSG_DISPOOL 48 #define MSG_ALRENAP 49 #define MSG_ALRDISP 50 @@ -373,6 +372,8 @@ static const char *JSON_PARAMETER = "parameter"; #define MSG_MINESTATS 70 #define MSG_MISCHK 71 #define MSG_CHECK 72 +#define MSG_POOLPRIO 73 +#define MSG_DUPPID 74 enum code_severity { SEVERITY_ERR, @@ -386,6 +387,7 @@ enum code_parameters { PARAM_GPU, PARAM_PGA, PARAM_CPU, + PARAM_PID, PARAM_GPUMAX, PARAM_PGAMAX, PARAM_CPUMAX, @@ -503,6 +505,7 @@ struct CODES { { SEVERITY_SUCC, MSG_ACCOK, PARAM_NONE, "Privileged access OK" }, { SEVERITY_SUCC, MSG_ENAPOOL, PARAM_POOL, "Enabling pool %d:'%s'" }, { SEVERITY_SUCC, MSG_POOLPRIO,PARAM_NONE, "Changed pool priorities" }, + { SEVERITY_ERR, MSG_DUPPID, PARAM_PID, "Duplicate pool specified %d" }, { SEVERITY_SUCC, MSG_DISPOOL, PARAM_POOL, "Disabling pool %d:'%s'" }, { SEVERITY_INFO, MSG_ALRENAP, PARAM_POOL, "Pool %d:'%s' already enabled" }, { SEVERITY_INFO, MSG_ALRDISP, PARAM_POOL, "Pool %d:'%s' already disabled" }, @@ -1064,6 +1067,7 @@ static char *message(int messageid, int paramid, char *param2, bool isjson) case PARAM_GPU: case PARAM_PGA: case PARAM_CPU: + case PARAM_PID: sprintf(buf, codes[i].description, paramid); break; case PARAM_POOL: @@ -2132,46 +2136,62 @@ static void enablepool(__maybe_unused SOCKETTYPE c, char *param, bool isjson, __ static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, __maybe_unused char group) { - SETUP_STRTOK_TS; - int total_pools_ = total_pools; // Keep a 
local copy, to be more threadsafe - char *a; - int i, prio = 0, e = -1; + char *ptr, *next; + int i, pr, prio = 0; - if (total_pools_ == 0) { + // TODO: all cgminer code needs a mutex added everywhere for change + // access to total_pools and also parts of the pools[] array, + // just copying total_pools here wont solve that + + if (total_pools == 0) { strcpy(io_buffer, message(MSG_NOPOOL, 0, NULL, isjson)); return; } - bool pools_changed[total_pools_]; - for (i = 0; i < total_pools_; ++i) + if (param == NULL || *param == '\0') { + strcpy(io_buffer, message(MSG_MISPID, 0, NULL, isjson)); + return; + } + + bool pools_changed[total_pools]; + for (i = 0; i < total_pools; ++i) pools_changed[i] = false; - a = strtok_ts(param, ","); - do { - i = strtol(a, &a, 10); - if (unlikely(*a > 0x20 || i < 0 || i >= total_pools)) { - e = (*a > 0x20) ? -2 : i; - continue; + next = param; + while (next && *next) { + ptr = next; + next = strchr(ptr, ','); + if (next) + *(next++) = '\0'; + + i = atoi(ptr); + if (i < 0 || i >= total_pools) { + strcpy(io_buffer, message(MSG_INVPID, i, NULL, isjson)); + return; } + + if (pools_changed[i]) { + strcpy(io_buffer, message(MSG_DUPPID, i, NULL, isjson)); + return; + } + pools[i]->prio = prio++; pools_changed[i] = true; - } while ( (a = strtok_ts(NULL, ",")) ); + } - for (i = 0; i < total_pools_; ++i) - if (!pools_changed[i]) - pools[i]->prio = prio++; + // In priority order, cycle through the unchanged pools and append them + for (pr = 0; pr < total_pools; pr++) + for (i = 0; i < total_pools; i++) { + if (!pools_changed[i] && pools[i]->prio == pr) { + pools[i]->prio = prio++; + pools_changed[i] = true; + break; + } + } if (current_pool()->prio) switch_pools(NULL); - if (e != -1) { - if (e == -2) - strcpy(io_buffer, message(MSG_MISPID, 0, NULL, isjson)); - else - strcpy(io_buffer, message(MSG_INVPID, e, NULL, isjson)); - return; - } - strcpy(io_buffer, message(MSG_POOLPRIO, 0, NULL, isjson)); } diff --git a/compat.h b/compat.h index 
19a14f0d..c6e38d8a 100644 --- a/compat.h +++ b/compat.h @@ -9,10 +9,6 @@ #include -// NOTE: Windows strtok uses a thread-local static buffer, so this is safe -#define SETUP_STRTOK_TS /*nothing needed*/ -#define strtok_ts strtok - #include "miner.h" // for timersub static inline int nanosleep(const struct timespec *req, struct timespec *rem) @@ -76,13 +72,8 @@ typedef long suseconds_t; #endif #define PTH(thr) ((thr)->pth.p) -#else /* ! WIN32 */ - +#else #define PTH(thr) ((thr)->pth) - -#define SETUP_STRTOK_TS char*_strtok_ts_saveptr -#define strtok_ts(str, delim) strtok_r(str, delim, &_strtok_ts_saveptr) - #endif /* WIN32 */ #endif /* __COMPAT_H__ */ From 51940ec7193159ce25658729046e5b1c6672f2d6 Mon Sep 17 00:00:00 2001 From: Kano Date: Sun, 29 Jul 2012 23:40:01 +1000 Subject: [PATCH 102/178] api.c verify poolpriority parameters before changing pools --- api.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/api.c b/api.c index b9f0f535..7c856a7e 100644 --- a/api.c +++ b/api.c @@ -2154,8 +2154,10 @@ static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, } bool pools_changed[total_pools]; - for (i = 0; i < total_pools; ++i) + int new_prio[total_pools]; + for (i = 0; i < total_pools; ++i) { pools_changed[i] = false; + } next = param; while (next && *next) { @@ -2175,8 +2177,14 @@ static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, return; } - pools[i]->prio = prio++; pools_changed[i] = true; + new_prio[i] = prio++; + } + + // Only change them if no errors + for (i = 0; i < total_pools; i++) { + if (pools_changed[i]) + pools[i]->prio = new_prio[i]; } // In priority order, cycle through the unchanged pools and append them From 68b041be6e308e350d24bd7ab8cfcd5a7e36fb6b Mon Sep 17 00:00:00 2001 From: Kano Date: Mon, 30 Jul 2012 00:13:37 +1000 Subject: [PATCH 103/178] API-README poolpriority changes --- API-README | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git 
a/API-README b/API-README index 6c43929a..a62b6304 100644 --- a/API-README +++ b/API-README @@ -39,7 +39,7 @@ To give an IP address/subnet access to a group you use the group letter in front of the IP address instead of W: e.g. P:192.168.0/32 An IP address/subnet can only be a member of one group A sample API group would be: - --api-groups P:switchpool:enablepool:addpool:disablepool:removepool:* + --api-groups P:switchpool:enablepool:addpool:disablepool:removepool:poolpriority:* This would create a group 'P' that can do all current pool commands and all non-priviliged commands - the '*' means all non-priviledged commands Without the '*' the group would only have access to the pool commands @@ -275,11 +275,14 @@ The list of requests - a (*) means it requires privileged access - and replies a When you enable, disable or restart a GPU or PGA, you will also get Thread messages in the cgminer status window -The 'poolpriority' command can be used to reset the priority order of pools. +The 'poolpriority' command can be used to reset the priority order of multiple +pools with a single command - 'switchpool' only sets a single pool to first priority Each pool should be listed by id number in order of preference (first = most -preferred). Any pools not listed will be prioritized after the ones that are, -in an undefined order. If the priority change affects the miner's preference -for mining, it may switch immediately.
+preferred) +Any pools not listed will be prioritised after the ones that are listed, in the +priority order they were originally +If the priority change affects the miner's preference for mining, it may switch +immediately When you switch to a different pool to the current one (including by priority change), you will get a 'Switching to URL' message in the cgminer status From 70cba2ae255beffb44ed40cd5668ad91bfb01772 Mon Sep 17 00:00:00 2001 From: Kano Date: Mon, 30 Jul 2012 07:34:06 +1000 Subject: [PATCH 104/178] Style --- api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/api.c b/api.c index 7c856a7e..8e66e810 100644 --- a/api.c +++ b/api.c @@ -2155,9 +2155,8 @@ static void poolpriority(__maybe_unused SOCKETTYPE c, char *param, bool isjson, bool pools_changed[total_pools]; int new_prio[total_pools]; - for (i = 0; i < total_pools; ++i) { + for (i = 0; i < total_pools; ++i) pools_changed[i] = false; - } next = param; while (next && *next) { From c91148f193d3c93c389fa7f357b7a29e4e885596 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 30 Jul 2012 09:02:23 +1000 Subject: [PATCH 105/178] Remove the low hash count determinant of hardware being sick. A low hash rate can be for poor network connectivity or scrypt mining, neither of which are due to sick hardware. 
--- cgminer.c | 19 +++---------------- miner.h | 1 - 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/cgminer.c b/cgminer.c index 9fb1f1b9..9108d1b7 100644 --- a/cgminer.c +++ b/cgminer.c @@ -4567,7 +4567,6 @@ static void age_work(void) #define WATCHDOG_DEAD_TIME 600 #define WATCHDOG_SICK_COUNT (WATCHDOG_SICK_TIME/WATCHDOG_INTERVAL) #define WATCHDOG_DEAD_COUNT (WATCHDOG_DEAD_TIME/WATCHDOG_INTERVAL) -#define WATCHDOG_LOW_HASH 1.0 /* consider < 1MH too low for any device */ static void *watchdog_thread(void __maybe_unused *userdata) { @@ -4648,9 +4647,6 @@ static void *watchdog_thread(void __maybe_unused *userdata) struct cgpu_info *cgpu = devices[i]; struct thr_info *thr = cgpu->thr[0]; enum dev_enable *denable; - bool dev_count_well; - bool dev_count_sick; - bool dev_count_dead; char dev_str[8]; int gpu; @@ -4682,21 +4678,12 @@ static void *watchdog_thread(void __maybe_unused *userdata) if (!strcmp(cgpu->api->dname, "cpu")) continue; #endif - if (cgpu->rolling < WATCHDOG_LOW_HASH) - cgpu->low_count++; - else - cgpu->low_count = 0; - - dev_count_well = (cgpu->low_count < WATCHDOG_SICK_COUNT); - dev_count_sick = (cgpu->low_count > WATCHDOG_SICK_COUNT); - dev_count_dead = (cgpu->low_count > WATCHDOG_DEAD_COUNT); - - if (cgpu->status != LIFE_WELL && (now.tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME) && dev_count_well) { + if (cgpu->status != LIFE_WELL && (now.tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME)) { if (cgpu->status != LIFE_INIT) applog(LOG_ERR, "%s: Recovered, declaring WELL!", dev_str); cgpu->status = LIFE_WELL; cgpu->device_last_well = time(NULL); - } else if (cgpu->status == LIFE_WELL && ((now.tv_sec - thr->last.tv_sec > WATCHDOG_SICK_TIME) || dev_count_sick)) { + } else if (cgpu->status == LIFE_WELL && (now.tv_sec - thr->last.tv_sec > WATCHDOG_SICK_TIME)) { thr->rolling = cgpu->rolling = 0; cgpu->status = LIFE_SICK; applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str); @@ -4715,7 +4702,7 @@ static void 
*watchdog_thread(void __maybe_unused *userdata) applog(LOG_ERR, "%s: Attempting to restart", dev_str); reinit_device(cgpu); } - } else if (cgpu->status == LIFE_SICK && ((now.tv_sec - thr->last.tv_sec > WATCHDOG_DEAD_TIME) || dev_count_dead)) { + } else if (cgpu->status == LIFE_SICK && (now.tv_sec - thr->last.tv_sec > WATCHDOG_DEAD_TIME)) { cgpu->status = LIFE_DEAD; applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str); gettimeofday(&thr->sick, NULL); diff --git a/miner.h b/miner.h index 4f553cd2..09cb503c 100644 --- a/miner.h +++ b/miner.h @@ -336,7 +336,6 @@ struct cgpu_info { int accepted; int rejected; int hw_errors; - unsigned int low_count; double rolling; double total_mhashes; double utility; From 58cb42c2ef42307e4075566a13a9233c7d0f5dad Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 30 Jul 2012 09:22:55 +1000 Subject: [PATCH 106/178] Update README with more build instructions. --- README | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README b/README index ab508e5d..dbe7bfd1 100644 --- a/README +++ b/README @@ -33,22 +33,32 @@ READ EXECUTIVE SUMMARY BELOW FOR FIRST TIME USERS! 
Dependencies: curl dev library http://curl.haxx.se/libcurl/ (libcurl4-openssl-dev) + curses dev library (libncurses5-dev or libpdcurses on WIN32) + pkg-config http://www.freedesktop.org/wiki/Software/pkg-config + libtool http://www.gnu.org/software/libtool/ + jansson http://www.digip.org/jansson/ (jansson is included in-tree and not necessary) + yasm 1.0.1+ http://yasm.tortall.net/ (yasm is optional, gives assembly routines for CPU mining) + AMD APP SDK http://developer.amd.com/sdks/AMDAPPSDK (This sdk is mandatory for GPU mining) + AMD ADL SDK http://developer.amd.com/sdks/ADLSDK (This sdk is mandatory for ATI GPU monitoring & clocking) + libudev headers (This is only required for FPGA auto-detection and is linux only) + libusb headers (This is only required for ZTEX support) + CGMiner specific configuration options: --enable-cpumining Build with cpu mining support(default disabled) --disable-opencl Override detection and disable building with opencl @@ -57,6 +67,9 @@ CGMiner specific configuration options: --enable-icarus Compile support for Icarus Board(default disabled) --enable-modminer Compile support for ModMiner FPGAs(default disabled) --enable-ztex Compile support for Ztex Board(default disabled) + --enable-scrypt Compile support for scrypt litecoin mining (default disabled) + --without-curses Compile support for curses TUI (default enabled) + --without-libudev Autodetect FPGAs using libudev (default enabled) Basic *nix build instructions: To build with GPU mining support: From 52821d4241ffab3cc5e3cc75415287c2bf6efa7d Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 30 Jul 2012 09:30:32 +1000 Subject: [PATCH 107/178] Minor readme updates. 
--- README | 3 ++- SCRYPT-README | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README b/README index dbe7bfd1..5ff103a8 100644 --- a/README +++ b/README @@ -744,7 +744,8 @@ A: Cgminer is being packaged with other trojan scripts and some antivirus software is falsely accusing cgminer.exe as being the actual virus, rather than whatever it is being packaged with. If you installed cgminer yourself, then you do not have a virus on your computer. Complain to your antivirus -software company. +software company. They seem to be flagging even source code now from cgminer +as viruses, even though text source files can't do anything by themselves. Q: Can you modify the display to include more of one thing in the output and less of another, or can you change the quiet mode or can you add yet another diff --git a/SCRYPT-README b/SCRYPT-README index 62a3351b..a4b5df4e 100644 --- a/SCRYPT-README +++ b/SCRYPT-README @@ -15,7 +15,8 @@ Thanks to some innovative work by Artforz and mtrlt, this was proven to be wrong. However, it has very different requirements to bitcoin mining and is a lot more complicated to get working well. Note that it is a ram dependent workload, and requires you to have enough system ram as well as fast enough -GPU ram. +GPU ram. If you have less system ram than your GPU has, it may not be possible +to mine at any reasonable rate. There are 5 main parameters to tuning scrypt, 2 of which you MUST set, and the others are optional for further fine tuning. When you start scrypt mining @@ -123,7 +124,7 @@ see that it increases hashrate AND utility WITHOUT increasing your rejects.
Suggested values for 7970 for example: export GPU_MAX_ALLOC_PERCENT=100 ---shaders 2048 -g 5 --gpu-engine 1135 --gpu-memclock 1375 +--thread-concurrency 8192 -g 4 --gpu-engine 1135 --gpu-memclock 1375 --- From 4cf2b4da236aecfc316ebb94248c4beda7ee650e Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 30 Jul 2012 09:33:04 +1000 Subject: [PATCH 108/178] More scrypt intensity information. --- SCRYPT-README | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/SCRYPT-README b/SCRYPT-README index a4b5df4e..8d4c5a42 100644 --- a/SCRYPT-README +++ b/SCRYPT-README @@ -81,7 +81,10 @@ reason this is crucial is that too high an intensity can actually be disastrous with scrypt because it CAN run out of ram. Intensities over 13 start writing over the same ram and it is highly dependent on the GPU, but they can start actually DECREASING your hashrate, or even worse, start producing -garbage with rejects skyrocketing. +garbage with rejects skyrocketing. The low level detail is that intensity is +only guaranteed up to the power of 2 that most closely matches the thread +concurrency. i.e. a thread concurrency of 6144 has 8192 as the nearest power +of two above it, thus as 2^13=8192, that is an intensity of 13. Optional parameters to tune: From 2353233d18c262eec54bf495c0eefa3e51a8ef67 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 30 Jul 2012 13:58:07 +1000 Subject: [PATCH 109/178] News update for upcoming 2.6.1. --- NEWS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS b/NEWS index 80770bf8..be5f7f4c 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,10 @@ +Version 2.6.1 - July 30, 2012 + +- Remove the low hash count determinant of hardware being sick. 
A low hash rate +can be for poor network connectivity or scrypt mining, neither of which are due to sick hardware. +- api.c poolpriority changes + + Version 2.6.0 - July 29, 2012 - Display kilohash when suitable, but store the global mhash value still truly From 7a07c7d04c16e4f7b1cc643eed701511fa6aac8c Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 30 Jul 2012 14:03:15 +1000 Subject: [PATCH 110/178] Fix build warning about KL_SCRYPT when built without scrypt support. --- cgminer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index 9108d1b7..20fcabf4 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2903,11 +2903,9 @@ void write_config(FILE *fcfg) case KL_DIABLO: fprintf(fcfg, "diablo"); break; -#ifdef USE_SCRYPT case KL_SCRYPT: fprintf(fcfg, "scrypt"); break; -#endif } } #ifdef HAVE_ADL From 99204bd59a5fe2bd46c18dc17dfcc5b38437492e Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 30 Jul 2012 14:05:16 +1000 Subject: [PATCH 111/178] Display scrypt as being built in as well. --- cgminer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cgminer.c b/cgminer.c index 20fcabf4..0de93f04 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1191,6 +1191,9 @@ static char *opt_verusage_and_exit(const char *extra) #endif #ifdef USE_ZTEX "ztex " +#endif +#ifdef USE_SCRYPT + "scrypt " #endif "mining support.\n" , packagename); From fdb67e8b99ad439fa7643c7c6846fb0140771d9e Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 30 Jul 2012 13:58:53 +1000 Subject: [PATCH 112/178] Bump version to 2.6.1 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 332e79e0..d4357599 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [2]) m4_define([v_min], [6]) -m4_define([v_mic], [0]) +m4_define([v_mic], [1]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_ver], [v_maj.v_min.v_mic])
m4_define([lt_rev], m4_eval(v_maj + v_min)) From fd51e5ba01f25222d43472dc51f989d19e1acfb9 Mon Sep 17 00:00:00 2001 From: Tydus Date: Mon, 30 Jul 2012 18:27:33 +0800 Subject: [PATCH 113/178] Add scrypt support while writing conf --- cgminer.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cgminer.c b/cgminer.c index 0de93f04..7f8a7128 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2911,6 +2911,20 @@ void write_config(FILE *fcfg) break; } } +#ifdef USE_SCRYPT + fputs("\",\n\"lookup-gap\" : \"", fcfg); + for(i = 0; i < nDevs; i++) + fprintf(fcfg, "%s%d", i > 0 ? "," : "", + (int)gpus[i].lookup_gap); + fputs("\",\n\"thread-concurrency\" : \"", fcfg); + for(i = 0; i < nDevs; i++) + fprintf(fcfg, "%s%d", i > 0 ? "," : "", + (int)gpus[i].thread_concurrency); + fputs("\",\n\"shaders\" : \"", fcfg); + for(i = 0; i < nDevs; i++) + fprintf(fcfg, "%s%d", i > 0 ? "," : "", + (int)gpus[i].shaders); +#endif #ifdef HAVE_ADL fputs("\",\n\"gpu-engine\" : \"", fcfg); for(i = 0; i < nDevs; i++) From 23a8c60420c01b452394d7aaa2638b61d513adaa Mon Sep 17 00:00:00 2001 From: ckolivas Date: Tue, 31 Jul 2012 10:28:48 +1000 Subject: [PATCH 114/178] Revert "bitforce: Skip out of sending work if work restart requested" This reverts commit cf36331d815e7b87131d547b92b9ceaa218d114d. 
--- driver-bitforce.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 6b76288e..123c5fc4 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -287,8 +287,7 @@ re_send: BFgets(pdevbuf, sizeof(pdevbuf), fdDev); if (!pdevbuf[0] || !strncasecmp(pdevbuf, "B", 1)) { mutex_unlock(&bitforce->device_mutex); - if (!restart_wait(WORK_CHECK_INTERVAL_MS)) - return false; + nmsleep(WORK_CHECK_INTERVAL_MS); goto re_send; } else if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { mutex_unlock(&bitforce->device_mutex); From 3cbf835d3ecc489053bf0af931df5fff68271f4a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 31 Jul 2012 17:16:53 +1000 Subject: [PATCH 115/178] Update .gitignore for files that should not be tracked. --- .gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.gitignore b/.gitignore index 9ab93c0f..3e345162 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,13 @@ mingw32-config.cache ext_deps config.h.in config.h + +ccan/libccan.a +lib/arg-nonnull.h +lib/c++defs.h +lib/libgnu.a +lib/signal.h +lib/string.h +lib/warn-on-use.h + +mkinstalldirs From ddcf3d20cadf65dfe0f13c84cbcdd51775cbf858 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 31 Jul 2012 22:19:39 +1000 Subject: [PATCH 116/178] Differentiate between the send return value being a bool and the get return value when managing them in bitforce scanhash. 
--- driver-bitforce.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 123c5fc4..4606e29b 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -474,9 +474,10 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_ { struct cgpu_info *bitforce = thr->cgpu; unsigned int sleep_time; + bool send_ret; int64_t ret; - ret = bitforce_send_work(thr, work); + send_ret = bitforce_send_work(thr, work); if (!bitforce->nonce_range) { /* Initially wait 2/3 of the average cycle time so we can request more @@ -502,8 +503,10 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_ bitforce->wait_ms = sleep_time; } - if (ret) + if (send_ret) ret = bitforce_get_result(thr, work); + else + ret = -1; if (ret == -1) { ret = 0; From 33019b11ba2e305c425ce637646a9d4e5558e804 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 31 Jul 2012 22:39:04 +1000 Subject: [PATCH 117/178] Show the correct base units on GPU summary. --- driver-opencl.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 0d1d0ea8..2027a8f6 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -660,9 +660,19 @@ retry: for (gpu = 0; gpu < nDevs; gpu++) { struct cgpu_info *cgpu = &gpus[gpu]; + double displayed_rolling, displayed_total; + bool mhash_base = true; + + displayed_rolling = cgpu->rolling; + displayed_total = cgpu->total_mhashes / total_secs; + if (displayed_rolling < 1) { + displayed_rolling *= 1000; + displayed_total *= 1000; + mhash_base = false; + } - wlog("GPU %d: %.1f / %.1f Mh/s | A:%d R:%d HW:%d U:%.2f/m I:%d\n", - gpu, cgpu->rolling, cgpu->total_mhashes / total_secs, + wlog("GPU %d: %.1f / %.1f %sh/s | A:%d R:%d HW:%d U:%.2f/m I:%d\n", + gpu, displayed_rolling, displayed_total, mhash_base ? 
"M" : "K", cgpu->accepted, cgpu->rejected, cgpu->hw_errors, cgpu->utility, cgpu->intensity); #ifdef HAVE_ADL @@ -710,7 +720,10 @@ retry: if (thr->cgpu != cgpu) continue; get_datestamp(checkin, &thr->last); - wlog("Thread %d: %.1f Mh/s %s ", i, thr->rolling, cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled"); + displayed_rolling = thr->rolling; + if (!mhash_base) + displayed_rolling *= 1000; + wlog("Thread %d: %.1f %sh/s %s ", i, displayed_rolling, mhash_base ? "M" : "K" , cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled"); switch (cgpu->status) { default: case LIFE_WELL: From a688951d37d8a4c98d31f2aa44ef0f754ee87cc8 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 1 Aug 2012 19:34:43 +1000 Subject: [PATCH 118/178] Revert "Remove bitforce_thread_init" This reverts commit 2dfe0d628e05763bce09c51420860ef4f2ea7919. Roll back to init'ing bitforce devices at regular intervals. They may reinit more than previously thought. --- driver-bitforce.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/driver-bitforce.c b/driver-bitforce.c index 4606e29b..6a3f109f 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -525,6 +525,20 @@ static bool bitforce_get_stats(struct cgpu_info *bitforce) return bitforce_get_temp(bitforce); } +static bool bitforce_thread_init(struct thr_info *thr) +{ + struct cgpu_info *bitforce = thr->cgpu; + unsigned int wait; + + /* Pause each new thread a random time between 0-100ms + so the devices aren't making calls all at the same time. 
*/ + wait = (rand() * MAX_START_DELAY_US)/RAND_MAX; + applog(LOG_DEBUG, "BFL%i: Delaying start by %dms", bitforce->device_id, wait / 1000); + usleep(wait); + + return true; +} + static struct api_data *bitforce_api_stats(struct cgpu_info *cgpu) { struct api_data *root = NULL; @@ -548,6 +562,7 @@ struct device_api bitforce_api = { .get_statline_before = get_bitforce_statline_before, .get_stats = bitforce_get_stats, .thread_prepare = bitforce_thread_prepare, + .thread_init = bitforce_thread_init, .scanhash = bitforce_scanhash, .thread_shutdown = bitforce_shutdown, .thread_enable = biforce_thread_enable From 7aa809ca241bfeafacbfbc088798b09c725231ff Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 1 Aug 2012 19:36:00 +1000 Subject: [PATCH 119/178] Revert "Revert "Change BFL driver thread initialising to a constant 100ms delay between devices instead of a random arrangement."" This reverts commit 89e613b94361a68305a68095b1b8f4756f9ef8f8. Leave the delay between each device initialising. --- driver-bitforce.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 6a3f109f..f7e5b3e4 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -530,9 +530,9 @@ static bool bitforce_thread_init(struct thr_info *thr) struct cgpu_info *bitforce = thr->cgpu; unsigned int wait; - /* Pause each new thread a random time between 0-100ms - so the devices aren't making calls all at the same time. */ - wait = (rand() * MAX_START_DELAY_US)/RAND_MAX; + /* Pause each new thread at least 100ms between initialising + * so the devices aren't making calls all at the same time. 
*/ + wait = thr->id * MAX_START_DELAY_US; applog(LOG_DEBUG, "BFL%i: Delaying start by %dms", bitforce->device_id, wait / 1000); usleep(wait); From d37d044fb7b34e437e34315f5ae5e3cc443c289a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 1 Aug 2012 20:05:30 +1000 Subject: [PATCH 120/178] Add some headroom to the number of curls available per pool to allow for longpoll and sendwork curls. --- cgminer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgminer.c b/cgminer.c index 0de93f04..0cc3fc05 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2169,7 +2169,7 @@ static void recruit_curl(struct pool *pool) * network delays/outages. */ static struct curl_ent *pop_curl_entry(struct pool *pool) { - int curl_limit = opt_delaynet ? 5 : mining_threads; + int curl_limit = opt_delaynet ? 5 : mining_threads * 4 / 3; struct curl_ent *ce; mutex_lock(&pool->pool_lock); From 7f8250132a9ba2853354e392d2470eae943e06d1 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 1 Aug 2012 20:08:37 +1000 Subject: [PATCH 121/178] Clear bitforce buffer on init as previously. 
--- driver-bitforce.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver-bitforce.c b/driver-bitforce.c index f7e5b3e4..4005d251 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -185,6 +185,8 @@ void bitforce_init(struct cgpu_info *bitforce) applog(LOG_WARNING, "BFL%i: Re-initialising", bitforce->device_id); + biforce_clear_buffer(bitforce); + mutex_lock(&bitforce->device_mutex); if (fdDev) { BFclose(fdDev); From efba82fb56d1f38d486979b86ebafd90cf1a96f7 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 1 Aug 2012 20:39:12 +1000 Subject: [PATCH 122/178] Print the 3 parameters that are passed to applog for a debug line in bitforce.c --- driver-bitforce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 4005d251..ebfdb0a3 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -415,7 +415,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work) } if (delay_time_ms != bitforce->sleep_ms) - applog(LOG_DEBUG, "BFL%i: Wait time changed to: %d", bitforce->device_id, bitforce->sleep_ms, bitforce->wait_ms); + applog(LOG_DEBUG, "BFL%i: Wait time changed to: %d, waited %u", bitforce->device_id, bitforce->sleep_ms, bitforce->wait_ms); /* Work out the average time taken. Float for calculation, uint for display */ bitforce->avg_wait_f += (tv_to_ms(elapsed) - bitforce->avg_wait_f) / TIME_AVG_CONSTANT; From 8414a9a7e20f7583298192fc889ad22c40d149dd Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 1 Aug 2012 22:48:38 +1000 Subject: [PATCH 123/178] Add debugging output when work is found stale as to why. 
--- cgminer.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cgminer.c b/cgminer.c index 0cc3fc05..537c9cb8 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2306,14 +2306,20 @@ static bool stale_work(struct work *work, bool share) work_expiry = 5; gettimeofday(&now, NULL); - if ((now.tv_sec - work->tv_staged.tv_sec) >= work_expiry) + if ((now.tv_sec - work->tv_staged.tv_sec) >= work_expiry) { + applog(LOG_DEBUG, "Work stale due to expiry"); return true; + } - if (work->work_block != work_block) + if (work->work_block != work_block) { + applog(LOG_DEBUG, "Work stale due to block mismatch"); return true; + } - if (opt_fail_only && !share && pool != current_pool() && pool->enabled != POOL_REJECTING) + if (opt_fail_only && !share && pool != current_pool() && pool->enabled != POOL_REJECTING) { + applog(LOG_DEBUG, "Work stale due to fail only pool mismatch"); return true; + } return false; } From e067be421aa43558504c1b6198f4134d6bcb9e09 Mon Sep 17 00:00:00 2001 From: Kano Date: Wed, 1 Aug 2012 22:50:30 +1000 Subject: [PATCH 124/178] ICA support 57600 baud rate, up to 8 FPGA and partial working FPGA boards --- cgminer.c | 13 ++++ driver-icarus.c | 183 +++++++++++++++++++++++++++++++++++++++++++----- miner.h | 1 + 3 files changed, 181 insertions(+), 16 deletions(-) diff --git a/cgminer.c b/cgminer.c index 0de93f04..e2976fd8 100644 --- a/cgminer.c +++ b/cgminer.c @@ -142,6 +142,7 @@ bool opt_api_listen; bool opt_api_network; bool opt_delaynet; bool opt_disable_pool = true; +char *opt_icarus_options = NULL; char *opt_icarus_timing = NULL; char *opt_kernel_path; @@ -710,6 +711,13 @@ static char *set_api_description(const char *arg) } #ifdef USE_ICARUS +static char *set_icarus_options(const char *arg) +{ + opt_set_charp(arg, &opt_icarus_options); + + return NULL; +} + static char *set_icarus_timing(const char *arg) { opt_set_charp(arg, &opt_icarus_timing); @@ -873,6 +881,9 @@ static struct opt_table opt_config_table[] = { "Override sha256 kernel to 
use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"), #endif #ifdef USE_ICARUS + OPT_WITH_ARG("--icarus-options", + set_icarus_options, NULL, NULL, + opt_hidden), OPT_WITH_ARG("--icarus-timing", set_icarus_timing, NULL, NULL, opt_hidden), @@ -3011,6 +3022,8 @@ void write_config(FILE *fcfg) fprintf(fcfg, ",\n\"api-description\" : \"%s\"", opt_api_description); if (opt_api_groups) fprintf(fcfg, ",\n\"api-groups\" : \"%s\"", opt_api_groups); + if (opt_icarus_options) + fprintf(fcfg, ",\n\"icarus-options\" : \"%s\"", opt_icarus_options); if (opt_icarus_timing) fprintf(fcfg, ",\n\"icarus-timing\" : \"%s\"", opt_icarus_timing); fputs("\n}", fcfg); diff --git a/driver-icarus.c b/driver-icarus.c index 5f2c78ad..0b3aea7a 100644 --- a/driver-icarus.c +++ b/driver-icarus.c @@ -65,7 +65,7 @@ #define ASSERT1(condition) __maybe_unused static char sizeof_uint32_t_must_be_4[(condition)?1:-1] ASSERT1(sizeof(uint32_t) == 4); -#define ICARUS_READ_TIME ((double)ICARUS_READ_SIZE * (double)8.0 / (double)ICARUS_IO_SPEED) +#define ICARUS_READ_TIME(baud) ((double)ICARUS_READ_SIZE * (double)8.0 / (double)(baud)) // Fraction of a second, USB timeout is measured in // i.e. 10 means 1/10 of a second @@ -176,11 +176,36 @@ struct ICARUS_INFO { // (which will only affect W) uint64_t history_count; struct timeval history_time; + + // icarus-options + int baud; + int work_division; + int fpga_count; + uint32_t nonce_mask; }; +#define END_CONDITION 0x0000ffff + // One for each possible device static struct ICARUS_INFO **icarus_info; +// Looking for options in --icarus-timing and --icarus-options: +// +// Code increments this each time we start to look at a device +// However, this means that if other devices are checked by +// the Icarus code (e.g. 
BFL) they will count in the option offset +// +// This, however, is deterministic so that's OK +// +// If we were to increment after successfully finding an Icarus +// that would be random since an Icarus may fail and thus we'd +// not be able to predict the option order +// +// This also assumes that serial_detect() checks them sequentially +// and in the order specified on the command line +// +static int option_offset = -1; + struct device_api icarus_api; static void rev(unsigned char *s, size_t l) @@ -195,8 +220,8 @@ static void rev(unsigned char *s, size_t l) } } -#define icarus_open2(devpath, purge) serial_open(devpath, 115200, ICARUS_READ_FAULT_DECISECONDS, purge) -#define icarus_open(devpath) icarus_open2(devpath, false) +#define icarus_open2(devpath, baud, purge) serial_open(devpath, baud, ICARUS_READ_FAULT_DECISECONDS, purge) +#define icarus_open(devpath, baud) icarus_open2(devpath, baud, false) static int icarus_gets(unsigned char *buf, int fd, struct timeval *tv_finish, struct thr_info *thr, int read_count) { @@ -272,7 +297,7 @@ static const char *timing_mode_str(enum timing_mode timing_mode) } } -static void set_timing_mode(struct cgpu_info *icarus) +static void set_timing_mode(int this_option_offset, struct cgpu_info *icarus) { struct ICARUS_INFO *info = icarus_info[icarus->device_id]; double Hs; @@ -285,7 +310,7 @@ static void set_timing_mode(struct cgpu_info *icarus) buf[0] = '\0'; else { ptr = opt_icarus_timing; - for (i = 0; i < icarus->device_id; i++) { + for (i = 0; i < this_option_offset; i++) { comma = strchr(ptr, ','); if (comma == NULL) break; @@ -354,11 +379,122 @@ static void set_timing_mode(struct cgpu_info *icarus) applog(LOG_DEBUG, "Icarus: Init: %d mode=%s read_count=%d Hs=%e", icarus->device_id, timing_mode_str(info->timing_mode), info->read_count, info->Hs); +} + +static uint32_t mask(int work_division) +{ + char err_buf[BUFSIZ+1]; + uint32_t nonce_mask = 0x7fffffff; + + // yes we can calculate these, but this way it's easy to see 
what they are + switch (work_division) { + case 1: + nonce_mask = 0xffffffff; + break; + case 2: + nonce_mask = 0x7fffffff; + break; + case 4: + nonce_mask = 0x3fffffff; + break; + case 8: + nonce_mask = 0x1fffffff; + break; + default: + sprintf(err_buf, "Invalid2 icarus-options for work_division (%d) must be 1, 2, 4 or 8", work_division); + quit(1, err_buf); + } + + return nonce_mask; +} + +static void get_options(int this_option_offset, int *baud, int *work_division, int *fpga_count) +{ + char err_buf[BUFSIZ+1]; + char buf[BUFSIZ+1]; + char *ptr, *comma, *colon, *colon2; + size_t max; + int i, tmp; + + if (opt_icarus_options == NULL) + buf[0] = '\0'; + else { + ptr = opt_icarus_options; + for (i = 0; i < this_option_offset; i++) { + comma = strchr(ptr, ','); + if (comma == NULL) + break; + ptr = comma + 1; + } + + comma = strchr(ptr, ','); + if (comma == NULL) + max = strlen(ptr); + else + max = comma - ptr; + if (max > BUFSIZ) + max = BUFSIZ; + strncpy(buf, ptr, max); + buf[max] = '\0'; + } + + *baud = ICARUS_IO_SPEED; + *work_division = 2; + *fpga_count = 2; + + if (*buf) { + colon = strchr(buf, ':'); + if (colon) + *(colon++) = '\0'; + + if (*buf) { + tmp = atoi(buf); + switch (tmp) { + case 115200: + *baud = 115200; + break; + case 57600: + *baud = 57600; + break; + default: + sprintf(err_buf, "Invalid icarus-options for baud (%s) must be 115200 or 57600", buf); + quit(1, err_buf); + } + } + + if (colon && *colon) { + colon2 = strchr(colon, ':'); + if (colon2) + *(colon2++) = '\0'; + + if (*colon) { + tmp = atoi(colon); + if (tmp == 1 || tmp == 2 || tmp == 4 || tmp == 8) + *work_division = tmp; + else { + sprintf(err_buf, "Invalid icarus-options for work_division (%s) must be 1, 2, 4 or 8", colon); + quit(1, err_buf); + } + } + + if (colon2 && *colon2) { + tmp = atoi(colon2); + if (tmp > 0 && tmp <= *work_division) + *fpga_count = tmp; + else { + sprintf(err_buf, "Invalid icarus-options for fpga_count (%s) must be >0 and <=work_division (%d)", colon2, 
*work_division); + quit(1, err_buf); + } + } + } + } } static bool icarus_detect_one(const char *devpath) { + int this_option_offset = ++option_offset; + struct ICARUS_INFO *info; struct timeval tv_start, tv_finish; int fd; @@ -379,9 +515,13 @@ static bool icarus_detect_one(const char *devpath) unsigned char ob_bin[64], nonce_bin[ICARUS_READ_SIZE]; char *nonce_hex; + int baud, work_division, fpga_count; + + get_options(this_option_offset, &baud, &work_division, &fpga_count); + applog(LOG_DEBUG, "Icarus Detect: Attempting to open %s", devpath); - fd = icarus_open2(devpath, true); + fd = icarus_open2(devpath, baud, true); if (unlikely(fd == -1)) { applog(LOG_ERR, "Icarus Detect: Failed to open %s", devpath); return false; @@ -429,6 +569,9 @@ static bool icarus_detect_one(const char *devpath) applog(LOG_INFO, "Found Icarus at %s, mark as %d", devpath, icarus->device_id); + applog(LOG_DEBUG, "Icarus: Init: %d baud=%d work_division=%d fpga_count=%d", + icarus->device_id, baud, work_division, fpga_count); + // Since we are adding a new device on the end it needs to always be allocated icarus_info[icarus->device_id] = (struct ICARUS_INFO *)malloc(sizeof(struct ICARUS_INFO)); if (unlikely(!(icarus_info[icarus->device_id]))) @@ -439,10 +582,15 @@ static bool icarus_detect_one(const char *devpath) // Initialise everything to zero for a new device memset(info, 0, sizeof(struct ICARUS_INFO)); - info->golden_hashes = (golden_nonce_val & 0x7fffffff) << 1; + info->baud = baud; + info->work_division = work_division; + info->fpga_count = fpga_count; + info->nonce_mask = mask(work_division); + + info->golden_hashes = (golden_nonce_val & info->nonce_mask) * fpga_count; timersub(&tv_finish, &tv_start, &(info->golden_tv)); - set_timing_mode(icarus); + set_timing_mode(this_option_offset, icarus); return true; } @@ -458,7 +606,7 @@ static bool icarus_prepare(struct thr_info *thr) struct timeval now; - int fd = icarus_open(icarus->device_path); + int fd = icarus_open(icarus->device_path, 
icarus_info[icarus->device_id]->baud); if (unlikely(-1 == fd)) { applog(LOG_ERR, "Failed to open Icarus on %s", icarus->device_path); @@ -565,11 +713,9 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work, submit_nonce(thr, work, nonce); - hash_count = (nonce & 0x7fffffff); - if (hash_count++ == 0x7fffffff) - hash_count = 0xffffffff; - else - hash_count <<= 1; + hash_count = (nonce & info->nonce_mask); + hash_count++; + hash_count *= info->fpga_count; if (opt_debug || info->do_icarus_timing) timersub(&tv_finish, &tv_start, &elapsed); @@ -580,7 +726,9 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work, } // ignore possible end condition values - if (info->do_icarus_timing && (nonce & 0x7fffffff) > 0x000fffff && (nonce & 0x7fffffff) < 0x7ff00000) { + if (info->do_icarus_timing + && ((nonce & info->nonce_mask) > END_CONDITION) + && ((nonce & info->nonce_mask) < (info->nonce_mask & ~END_CONDITION))) { gettimeofday(&tv_history_start, NULL); history0 = &(info->history[0]); @@ -590,7 +738,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work, Ti = (double)(elapsed.tv_sec) + ((double)(elapsed.tv_usec))/((double)1000000) - - ICARUS_READ_TIME; + - ((double)ICARUS_READ_TIME(info->baud)); Xi = (double)hash_count; history0->sumXiTi += Xi * Ti; history0->sumXi += Xi; @@ -700,6 +848,9 @@ static struct api_data *icarus_api_stats(struct cgpu_info *cgpu) root = api_add_uint(root, "timing_values", &(info->history[0].values), false); root = api_add_const(root, "timing_mode", timing_mode_str(info->timing_mode), false); root = api_add_bool(root, "is_timing", &(info->do_icarus_timing), false); + root = api_add_int(root, "baud", &(info->baud), false); + root = api_add_int(root, "work_division", &(info->work_division), false); + root = api_add_int(root, "fpga_count", &(info->fpga_count), false); return root; } diff --git a/miner.h b/miner.h index 09cb503c..44d03421 100644 --- a/miner.h +++ b/miner.h @@ -557,6 +557,7 @@ 
extern bool opt_api_listen; extern bool opt_api_network; extern bool opt_delaynet; extern bool opt_restart; +extern char *opt_icarus_options; extern char *opt_icarus_timing; #ifdef USE_BITFORCE extern bool opt_bfl_noncerange; From 94c09b6c5403ed871ef5915e47d3a4c34538a1a5 Mon Sep 17 00:00:00 2001 From: Kano Date: Wed, 1 Aug 2012 23:08:02 +1000 Subject: [PATCH 125/178] FPGA-README document new hidden --icarus-options --- FPGA-README | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/FPGA-README b/FPGA-README index 7b9f004d..1c83d9fb 100644 --- a/FPGA-README +++ b/FPGA-README @@ -16,7 +16,23 @@ p2pool. Icarus -There is a hidden option in cgminer when Icarus support is compiled in: +There are two hidden options in cgminer when Icarus support is compiled in: + +--icarus-options Set specific FPGA board configurations - one set of values for all or comma separated + baud:work_division:fpga_count + + baud The Serial/USB baud rate - 115200 or 57600 only - default 115200 + work_division The fraction of work divided up for each FPGA chip - 1, 2, 4 or 8 + e.g. 2 means each FPGA does half the nonce range - default 2 + fpga_count The actual number of FPGA working - this would normally be the same + as work_division - range is from 1 up to 'work_division' + +If you define fewer comma seperated values than Icarus devices, the last values will be used +for all extra devices + +An example would be: --icarus-options 57600:2:1 +This would mean: use 57600 baud, the FPGA board divides the work in half however +only 1 FPGA actually runs on the board (e.g. like an early CM1 Icarus copy bitstream) --icarus-timing Set how the Icarus timing is calculated - one setting/value for all or comma separated default[=N] Use the default Icarus hash time (2.6316ns) @@ -24,6 +40,9 @@ There is a hidden option in cgminer when Icarus support is compiled in: long Re-calculate the hash time continuously value[=N] Specify the hash time in nanoseconds (e.g. 
2.6316) and abort time (e.g. 2.6316=80) +If you define fewer comma seperated values than Icarus devices, the last values will be used +for all extra devices + Icarus timing is required for devices that do not exactly match a default Icarus Rev3 in processing speed If you have an Icarus Rev3 you should not normally need to use --icarus-timing since the @@ -55,9 +74,9 @@ bitstream to the default one, use 'long' mode and give it at least a few hundred 'short' mode and take note of the final hash time value (Hs) calculated You can also use the RPC API 'stats' command to see the current hash time (Hs) at any time -The Icarus code currently only works with a dual FPGA device that supports the same commands as +The Icarus code currently only works with an FPGA device that supports the same commands as Icarus Rev3 requires and also is less than ~840MH/s and greater than 2MH/s -If a dual FPGA device does hash faster than ~840MH/s it should work correctly if you supply the +If an FPGA device does hash faster than ~840MH/s it should work correctly if you supply the correct hash time nanoseconds value The timing code itself will affect the Icarus performance since it increases the delay after From aa52db453978f77252a60d5c15720fec070b5e66 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 1 Aug 2012 23:42:00 +1000 Subject: [PATCH 126/178] Make test work for pool_active mandatory work items to smooth out staged work counts when in failover-only mode. 
--- cgminer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cgminer.c b/cgminer.c index 537c9cb8..510fe1df 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3590,6 +3590,7 @@ static bool pool_active(struct pool *pool, bool pinging) struct work *work = make_work(); bool rc; + work->mandatory = true; rc = work_decode(json_object_get(val, "result"), work); if (rc) { applog(LOG_DEBUG, "Successfully retrieved and deciphered work from pool %u %s", From b8b9c468e0ed2c069379b5e2ebd72c1ea3dc602b Mon Sep 17 00:00:00 2001 From: Kano Date: Thu, 2 Aug 2012 07:58:05 +1000 Subject: [PATCH 127/178] ICA default fpga_count to work_division if specified --- FPGA-README | 2 ++ driver-icarus.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/FPGA-README b/FPGA-README index 1c83d9fb..0c4da8a5 100644 --- a/FPGA-README +++ b/FPGA-README @@ -26,6 +26,8 @@ There are two hidden options in cgminer when Icarus support is compiled in: e.g. 2 means each FPGA does half the nonce range - default 2 fpga_count The actual number of FPGA working - this would normally be the same as work_division - range is from 1 up to 'work_division' + It defaults to the value of work_division - or 2 if you don't specify + work_division If you define fewer comma seperated values than Icarus devices, the last values will be used for all extra devices diff --git a/driver-icarus.c b/driver-icarus.c index 0b3aea7a..f1cf9d17 100644 --- a/driver-icarus.c +++ b/driver-icarus.c @@ -470,9 +470,10 @@ static void get_options(int this_option_offset, int *baud, int *work_division, i if (*colon) { tmp = atoi(colon); - if (tmp == 1 || tmp == 2 || tmp == 4 || tmp == 8) + if (tmp == 1 || tmp == 2 || tmp == 4 || tmp == 8) { *work_division = tmp; - else { + *fpga_count = tmp; // default to the same + } else { sprintf(err_buf, "Invalid icarus-options for work_division (%s) must be 1, 2, 4 or 8", colon); quit(1, err_buf); } From ed331e58a25a17ac3393a5616c28a00a1618e787 Mon Sep 17 00:00:00 2001 From: Kano Date: Thu, 2 
Aug 2012 16:02:31 +1000 Subject: [PATCH 128/178] miner.php support custom report section joins --- miner.php | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 190 insertions(+), 10 deletions(-) diff --git a/miner.php b/miner.php index be420aa6..08a419ec 100644 --- a/miner.php +++ b/miner.php @@ -87,11 +87,13 @@ $mobilepage = array( 'DATE' => null, 'RIGS' => null, 'SUMMARY' => array('Elapsed', 'MHS av', 'Found Blocks=Blks', 'Accepted', 'Rejected=Rej', 'Utility'), - 'DEVS' => array('ID', 'Name', 'GPU', 'Status', 'MHS av', 'Accepted', 'Rejected=Rej', 'Utility'), + 'DEVS+NOTIFY' => array('DEVS.Name=Name', 'DEVS.ID=ID', 'DEVS.Status=Status', 'DEVS.Temperature=Temp', + 'DEVS.MHS av=MHS av', 'DEVS.Accepted=Accept', 'DEVS.Rejected=Rej', + 'DEVS.Utility=Utility', 'NOTIFY.Last Not Well=Not Well'), 'POOL' => array('POOL', 'Status', 'Accepted', 'Rejected=Rej', 'Last Share Time')); $mobilesum = array( 'SUMMARY' => array('MHS av', 'Found Blocks', 'Accepted', 'Rejected', 'Utility'), - 'DEVS' => array('MHS av', 'Accepted', 'Rejected', 'Utility'), + 'DEVS+NOTIFY' => array('DEVS.MHS av', 'DEVS.Accepted', 'DEVS.Rejected', 'DEVS.Utility'), 'POOL' => array('Accepted', 'Rejected')); # # customsummarypages is an array of these Custom Summary Pages @@ -716,6 +718,9 @@ function fmt($section, $name, $value, $when, $alldata) if ($class == '' && ($rownum % 2) == 0) $class = $c2class; + if ($ret == '') + $ret = $b; + return array($ret, $class); } # @@ -1274,8 +1279,171 @@ $sectionmap = array( 'GPU' => 'devs', // You would normally use DEVS 'PGA' => 'devs', // You would normally use DEVS 'NOTIFY' => 'notify', + 'DEVDETAILS' => 'devdetails', + 'STATS' => 'stats', 'CONFIG' => 'config'); # +function joinfields($section1, $section2, $join, $results) +{ + global $sectionmap; + + $name1 = $sectionmap[$section1]; + $name2 = $sectionmap[$section2]; + $newres = array(); + + // foreach rig in section1 + foreach ($results[$name1] as $rig => $result) + { + $status = null; + + // 
foreach answer section in the rig api call + foreach ($result as $name1b => $fields1b) + { + if ($name1b == 'STATUS') + { + // remember the STATUS from section1 + $status = $result[$name1b]; + continue; + } + + // foreach answer section in the rig api call (for the other api command) + foreach ($results[$name2][$rig] as $name2b => $fields2b) + { + if ($name2b == 'STATUS') + continue; + + // If match the same field values of fields in $join + $match = true; + foreach ($join as $field) + if ($fields1b[$field] != $fields2b[$field]) + { + $match = false; + break; + } + + if ($match === true) + { + if ($status != null) + { + $newres[$rig]['STATUS'] = $status; + $status = null; + } + + $subsection = $section1.'+'.$section2; + $subsection .= preg_replace('/[^0-9]/', '', $name1b.$name2b); + + foreach ($fields1b as $nam => $val) + $newres[$rig][$subsection]["$section1.$nam"] = $val; + foreach ($fields2b as $nam => $val) + $newres[$rig][$subsection]["$section2.$nam"] = $val; + } + } + } + } + return $newres; +} +# +function joinall($section1, $section2, $results) +{ + global $sectionmap; + + $name1 = $sectionmap[$section1]; + $name2 = $sectionmap[$section2]; + $newres = array(); + + // foreach rig in section1 + foreach ($results[$name1] as $rig => $result) + { + // foreach answer section in the rig api call + foreach ($result as $name1b => $fields1b) + { + if ($name1b == 'STATUS') + { + // copy the STATUS from section1 + $newres[$rig][$name1b] = $result[$name1b]; + continue; + } + + // foreach answer section in the rig api call (for the other api command) + foreach ($results[$name2][$rig] as $name2b => $fields2b) + { + if ($name2b == 'STATUS') + continue; + + $subsection = $section1.'+'.$section2; + $subsection .= preg_replace('/[^0-9]/', '', $name1b.$name2b); + + foreach ($fields1b as $nam => $val) + $newres[$rig][$subsection]["$section1.$nam"] = $val; + foreach ($fields2b as $nam => $val) + $newres[$rig][$subsection]["$section2.$nam"] = $val; + } + } + } + return $newres; 
+} +# +function joinsections($sections, $results, $errors) +{ + global $sectionmap; + +#echo "results['pools']=".print_r($results['pools'],true)."
"; + + // GPU's don't have Name,ID fields - so create them + foreach ($results as $section => $res) + foreach ($res as $rig => $result) + foreach ($result as $name => $fields) + { + $subname = preg_replace('/[0-9]/', '', $name); + if ($subname == 'GPU' and isset($result[$name]['GPU'])) + { + $results[$section][$rig][$name]['Name'] = 'GPU'; + $results[$section][$rig][$name]['ID'] = $result[$name]['GPU']; + } + } + + foreach ($sections as $section => $fields) + if ($section != 'DATE' && !isset($sectionmap[$section])) + { + $both = explode('+', $section, 2); + if (count($both) > 1) + { + switch($both[0]) + { + case 'SUMMARY': + switch($both[1]) + { + case 'POOL': + case 'DEVS': + case 'CONFIG': + $sectionmap[$section] = $section; + $results[$section] = joinall($both[0], $both[1], $results); + break; + } + break; + case 'DEVS': + $join = array('Name', 'ID'); + switch($both[1]) + { + case 'NOTIFY': + case 'DEVDETAILS': + $sectionmap[$section] = $section; + $results[$section] = joinfields($both[0], $both[1], $join, $results); + break; + } + break; + default: + $errors[] = "Error: Invalid section '$section'"; + break; + } + } + else + $errors[] = "Error: Invalid section '$section'"; + } + + return array($results, $errors); +} +# function secmatch($section, $field) { if ($section == $field) @@ -1335,7 +1503,14 @@ function customset($showfields, $sum, $section, $rig, $isbutton, $result, $total $value = null; } - list($showvalue, $class) = fmt($secname, $name, $value, $when, $row); + if (strpos($secname, '+') === false) + list($showvalue, $class) = fmt($secname, $name, $value, $when, $row); + else + { + $parts = explode('.', $name, 2); + list($showvalue, $class) = fmt($parts[0], $parts[1], $value, $when, $row); + } + echo "$showvalue"; } endrow(); @@ -1356,15 +1531,19 @@ function processcustompage($pagename, $sections, $sum, $namemap) $errors = array(); foreach ($sections as $section => $fields) { - if (isset($sectionmap[$section])) + $all = explode('+', $section); + foreach 
($all as $section) { - $cmd = $sectionmap[$section]; - if (!isset($cmds[$cmd])) - $cmds[$cmd] = 1; + if (isset($sectionmap[$section])) + { + $cmd = $sectionmap[$section]; + if (!isset($cmds[$cmd])) + $cmds[$cmd] = 1; + } + else + if ($section != 'DATE') + $errors[] = "Error: unknown section '$section' in custom summary page '$pagename'"; } - else - if ($section != 'DATE') - $errors[] = "Error: unknown section '$section' in custom summary page '$pagename'"; } $results = array(); @@ -1399,6 +1578,7 @@ function processcustompage($pagename, $sections, $sum, $namemap) $shownsomething = false; if (count($results) > 0) { + list($results, $errors) = joinsections($sections, $results, $errors); $first = true; foreach ($sections as $section => $fields) { From d5dffa6aadf57171445430dfd83e1dcb5ebedeaa Mon Sep 17 00:00:00 2001 From: ckolivas Date: Thu, 2 Aug 2012 16:45:12 +1000 Subject: [PATCH 129/178] Don't make mandatory work and its clones last forever. --- cgminer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cgminer.c b/cgminer.c index 510fe1df..82d06271 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2278,9 +2278,6 @@ static bool stale_work(struct work *work, bool share) struct pool *pool; int getwork_delay; - if (work->mandatory) - return false; - if (share) { /* Technically the rolltime should be correct but some pools * advertise a broken expire= that is lower than a meaningful @@ -2316,7 +2313,7 @@ static bool stale_work(struct work *work, bool share) return true; } - if (opt_fail_only && !share && pool != current_pool() && pool->enabled != POOL_REJECTING) { + if (opt_fail_only && !share && pool != current_pool() && !work->mandatory) { applog(LOG_DEBUG, "Work stale due to fail only pool mismatch"); return true; } @@ -3842,6 +3839,7 @@ static struct work *make_clone(struct work *work) memcpy(work_clone, work, sizeof(struct work)); work_clone->clone = true; work_clone->longpoll = false; + work_clone->mandatory = false; /* Make cloned work appear 
slightly older to bias towards keeping the * master work item which can be further rolled */ work_clone->tv_staged.tv_sec -= 1; From caa9600fa3d309fb18f3ca3b8a7670b6957e3f12 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Thu, 2 Aug 2012 21:05:58 +1000 Subject: [PATCH 130/178] News cutoff fixed. --- NEWS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index be5f7f4c..c0e6cb7c 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,8 @@ Version 2.6.1 - July 30, 2012 - Remove the low hash count determinant of hardware being sick. A low hash rate -can be for poor network connectivity or scrypt mining, neither of which a +can be for poor network connectivity or scrypt mining, neither of which are due +to a sick device. - api.c poolpriority changes From 9410875e2f3742d530e231aaba38b8007a6eb20a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Thu, 2 Aug 2012 22:46:13 +1000 Subject: [PATCH 131/178] Make threads report in either side of the scanhash function in case we miss reporting in when restarting work. --- cgminer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cgminer.c b/cgminer.c index 82d06271..ed19b3a4 100644 --- a/cgminer.c +++ b/cgminer.c @@ -4197,7 +4197,9 @@ void *miner_thread(void *userdata) } pool_stats->getwork_calls++; + thread_reportin(mythr); hashes = api->scanhash(mythr, work, work->blk.nonce + max_nonce); + thread_reportin(mythr); gettimeofday(&getwork_start, NULL); From 2953aa25027fa62bf725b68811ed3f0ad3564dc8 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Thu, 2 Aug 2012 22:46:49 +1000 Subject: [PATCH 132/178] We dropped the temporary stopping of curl recruiting on submit_fail by mistake, reinstate it. 
--- cgminer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgminer.c b/cgminer.c index ed19b3a4..3a64b014 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2176,7 +2176,7 @@ static struct curl_ent *pop_curl_entry(struct pool *pool) if (!pool->curls) recruit_curl(pool); else if (list_empty(&pool->curlring)) { - if (pool->curls >= curl_limit) + if (pool->submit_fail || pool->curls >= curl_limit) pthread_cond_wait(&pool->cr_cond, &pool->pool_lock); else recruit_curl(pool); From 5118e3ee7c06bd0dcdaabdec5df8afd9d80e3956 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 3 Aug 2012 08:12:05 +1000 Subject: [PATCH 133/178] Check there is a cutoff temp actually set in bitforce before using it as a cut off value otherwise it may think it's set to zero degrees. --- driver-bitforce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index ebfdb0a3..7424b083 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -257,7 +257,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) if (temp > 0) { bitforce->temp = temp; - if (temp > bitforce->cutofftemp) { + if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) { applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id); bitforce->deven = DEV_RECOVER; From 7fa794a4997384a5c4227843ff20bc08ec231478 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 3 Aug 2012 09:26:43 +1000 Subject: [PATCH 134/178] It is not critical getting the temperature response in bitforce so don't mandatorily wait on the mutex lock. 
--- driver-bitforce.c | 6 +++++- miner.h | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 7424b083..88beed41 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -241,7 +241,11 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) if (!fdDev) return false; - mutex_lock(&bitforce->device_mutex); + /* It is not critical getting temperature so don't get stuck if we + * can't grab the mutex here */ + if (mutex_trylock(&bitforce->device_mutex)) + return false; + BFwrite(fdDev, "ZLX", 3); BFgets(pdevbuf, sizeof(pdevbuf), fdDev); mutex_unlock(&bitforce->device_mutex); diff --git a/miner.h b/miner.h index 09cb503c..02c488db 100644 --- a/miner.h +++ b/miner.h @@ -500,6 +500,11 @@ static inline void mutex_unlock(pthread_mutex_t *lock) quit(1, "WTF MUTEX ERROR ON UNLOCK!"); } +static inline int mutex_trylock(pthread_mutex_t *lock) +{ + return pthread_mutex_trylock(lock); +} + static inline void wr_lock(pthread_rwlock_t *lock) { if (unlikely(pthread_rwlock_wrlock(lock))) From 245552c5b5a1828c2e2ab461325a22cfab938729 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 3 Aug 2012 09:49:14 +1000 Subject: [PATCH 135/178] Clear the bitforce buffer whenever we get an unexpected result as it has likely throttled and we are getting cached responses out of order, and use the temperature monitoring as a kind of watchdog to flush unexpected results. 
--- driver-bitforce.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 88beed41..1cb958b1 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -157,7 +157,7 @@ static bool bitforce_thread_prepare(struct thr_info *thr) return true; } -static void biforce_clear_buffer(struct cgpu_info *bitforce) +static void bitforce_clear_buffer(struct cgpu_info *bitforce) { int fdDev = bitforce->device_fd; char pdevbuf[0x100]; @@ -185,7 +185,7 @@ void bitforce_init(struct cgpu_info *bitforce) applog(LOG_WARNING, "BFL%i: Re-initialising", bitforce->device_id); - biforce_clear_buffer(bitforce); + bitforce_clear_buffer(bitforce); mutex_lock(&bitforce->device_mutex); if (fdDev) { @@ -270,7 +270,15 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) bitforce->dev_thermal_cutoff_count++; } } + } else { + /* Use the temperature monitor as a kind of watchdog for when + * our responses are out of sync and flush the buffer to + * hopefully recover */ + applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer"); + bitforce_clear_buffer(bitforce); + return false;; } + return true; } @@ -305,6 +313,7 @@ re_send: goto re_send; } applog(LOG_ERR, "BFL%i: Error: Send work reports: %s", bitforce->device_id, pdevbuf); + bitforce_clear_buffer(bitforce); return false; } @@ -345,6 +354,7 @@ re_send: if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { applog(LOG_ERR, "BFL%i: Error: Send block data reports: %s", bitforce->device_id, pdevbuf); + bitforce_clear_buffer(bitforce); return false; } @@ -433,6 +443,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work) return 0; /* Device idle */ else if (strncasecmp(pdevbuf, "NONCE-FOUND", 11)) { applog(LOG_WARNING, "BFL%i: Error: Get result reports: %s", bitforce->device_id, pdevbuf); + bitforce_clear_buffer(bitforce); return 0; } @@ -521,7 +532,7 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, 
int64_ bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR; bitforce->dev_comms_error_count++; /* empty read buffer */ - biforce_clear_buffer(bitforce); + bitforce_clear_buffer(bitforce); } return ret; } From 3ee6c1d3103604a58e2452af912afaa0d70ee7b0 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 3 Aug 2012 10:09:25 +1000 Subject: [PATCH 136/178] Update NEWS. --- NEWS | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/NEWS b/NEWS index c0e6cb7c..d91e9d54 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,38 @@ +Version 2.6.2 - August 3, 2012 + +- Clear the bitforce buffer whenever we get an unexpected result as it has +likely throttled and we are getting cached responses out of order, and use the +temperature monitoring as a kind of watchdog to flush unexpected results. +- It is not critical getting the temperature response in bitforce so don't +mandatorily wait on the mutex lock. +- Check there is a cutoff temp actually set in bitforce before using it as a cut +off value otherwise it may think it's set to zero degrees. +- We dropped the temporary stopping of curl recruiting on submit_fail by +mistake, reinstate it. +- Make threads report in either side of the scanhash function in case we miss +reporting in when restarting work. +- Don't make mandatory work and its clones last forever. +- Make test work for pool_active mandatory work items to smooth out staged work +counts when in failover-only mode. +- Add debugging output when work is found stale as to why. +- Print the 3 parameters that are passed to applog for a debug line in +bitforce.c +- Clear bitforce buffer on init as previously. +- Add some headroom to the number of curls available per pool to allow for +longpoll and sendwork curls. +- Revert "Revert "Change BFL driver thread initialising to a constant 100ms +delay between devices instead of a random arrangement."" +- Revert "Remove bitforce_thread_init" +- Show the correct base units on GPU summary. 
+- Differentiate between the send return value being a bool and the get return +value when managing them in bitforce scanhash. +- 23a8c60 Revert "bitforce: Skip out of sending work if work restart requested" + + Version 2.6.1 - July 30, 2012 +- Display scrypt as being built in as well. +- Fix build warning about KL_SCRYPT when built without scrypt support. - Remove the low hash count determinant of hardware being sick. A low hash rate can be for poor network connectivity or scrypt mining, neither of which are due to a sick device. From 15dc4bb320c2a9460c446287cfb35b944e3ee1ed Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 3 Aug 2012 10:10:53 +1000 Subject: [PATCH 137/178] Scrypt mining does not support block testing yet so don't try to print it. --- cgminer.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cgminer.c b/cgminer.c index 3a64b014..67aacb20 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1770,10 +1770,9 @@ static bool submit_upstream_work(const struct work *work, CURL *curl) if (!QUIET) { hash32 = (uint32_t *)(work->hash); - if (opt_scrypt) { - sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[7]), (unsigned long)(hash32[6]), - work->block? " BLOCK!" : ""); - } else { + if (opt_scrypt) + sprintf(hashshow, "%08lx.%08lx", (unsigned long)(hash32[7]), (unsigned long)(hash32[6])); + else { sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]), work->block? " BLOCK!" : ""); } From e65a3b92d71a5ac086c91a2c59bb3a7a37393e07 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 3 Aug 2012 10:11:41 +1000 Subject: [PATCH 138/178] More NEWS. --- NEWS | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS b/NEWS index d91e9d54..fcf2f230 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,6 @@ Version 2.6.2 - August 3, 2012 +- Scrypt mining does not support block testing yet so don't try to print it. 
- Clear the bitforce buffer whenever we get an unexpected result as it has likely throttled and we are getting cached responses out of order, and use the temperature monitoring as a kind of watchdog to flush unexpected results. From 179885b2a6aaa7d14f9c73cbb9cc2bfb1df17b65 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Fri, 3 Aug 2012 10:13:11 +1000 Subject: [PATCH 139/178] Bump version to 2.6.2 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index d4357599..2f03c981 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [2]) m4_define([v_min], [6]) -m4_define([v_mic], [1]) +m4_define([v_mic], [2]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_ver], [v_maj.v_min.v_mic]) m4_define([lt_rev], m4_eval(v_maj + v_min)) From 76a02d8725b71848f32204dbc48816db06fc488d Mon Sep 17 00:00:00 2001 From: nushor Date: Fri, 3 Aug 2012 12:15:03 -0500 Subject: [PATCH 140/178] Update debian package configs to v2.6.2 --- debian/changelog | 186 +++--- debian/patches/series | 3 + debian/patches/v2.6.2 | 1275 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1388 insertions(+), 76 deletions(-) create mode 100644 debian/patches/series create mode 100644 debian/patches/v2.6.2 diff --git a/debian/changelog b/debian/changelog index 85339a96..df3ef2c3 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,112 +1,146 @@ +cgminer (2.6.2-1) precise; urgency=low + Version 2.6.2 - August 3, 2012 + + * Scrypt mining does not support block testing yet so don't try to print it. + * Clear the bitforce buffer whenever we get an unexpected result as it has + likely throttled and we are getting cached responses out of order, and use the + temperature monitoring as a kind of watchdog to flush unexpected results. 
+ * It is not critical getting the temperature response in bitforce so don't + mandatorily wait on the mutex lock. + * Check there is a cutoff temp actually set in bitforce before using it as a cut + off value otherwise it may think it's set to zero degrees. + * We dropped the temporary stopping of curl recruiting on submit_fail by + mistake, reinstate it. + * Make threads report in either side of the scanhash function in case we miss + reporting in when restarting work. + * Don't make mandatory work and its clones last forever. + * Make test work for pool_active mandatory work items to smooth out staged work + counts when in failover-only mode. + * Add debugging output when work is found stale as to why. + * Print the 3 parameters that are passed to applog for a debug line in + bitforce.c + * Clear bitforce buffer on init as previously. + * Add some headroom to the number of curls available per pool to allow for + longpoll and sendwork curls. + * Revert "Revert "Change BFL driver thread initialising to a constant 100ms + delay between devices instead of a random arrangement."" + * Revert "Remove bitforce_thread_init" + * Show the correct base units on GPU summary. + * Differentiate between the send return value being a bool and the get return + value when managing them in bitforce scanhash. 
+ * 23a8c60 Revert "bitforce: Skip out of sending work if work restart requested" + + -- nushor Fri, 03 Aug 2012 11:27:44 -0500 + cgminer (2.4.2-1) stable; urgency=medium Version 2.4.2 - June 2, 2012 - - API.class compiled with Java SE 6.0_03 - works with Win7x64 - - miner.php highlight devs too slow finding shares (possibly failing) - - API update version to V1.11 and document changes - - API save default config file if none specified - - api.c save success incorrectly returns error - - api.c replace BUFSIZ (linux/windows have different values) - - Move RPC API content out of README to API-README - - Open a longpoll connection if a pool is in the REJECTING state as it's the + * API.class compiled with Java SE 6.0_03 - works with Win7x64 + * miner.php highlight devs too slow finding shares (possibly failing) + * API update version to V1.11 and document changes + * API save default config file if none specified + * api.c save success incorrectly returns error + * api.c replace BUFSIZ (linux/windows have different values) + * Move RPC API content out of README to API-README + * Open a longpoll connection if a pool is in the REJECTING state as it's the only way to re-enable it automatically. - - Use only one longpoll as much as possible by using a pthread conditional + * Use only one longpoll as much as possible by using a pthread conditional broadcast that each longpoll thread waits on and checks if it's the current pool before - - If shares are known stale, don't use them to decide to disable a pool for + * If shares are known stale, don't use them to decide to disable a pool for sequential rejects. - - Restarting cgminer from within after ADL has been corrupted only leads to a + * Restarting cgminer from within after ADL has been corrupted only leads to a crash. Display a warning only and disable fanspeed monitoring. 
- - Icarus: fix abort calculation/allow user specified abort - - Icarus: make --icarus-timing hidden and document it in FPGA-README - - Icarus: high accuracy timing and other bitstream speed support - - add-MIPSEB-to-icarus-for-BIG_ENDIAN - - work_decode only needs swab32 on midstate under BIG ENDIAN - - add compile command to api-example.c - - save config bugfix: writing an extra ',' when no gpus - - Add dpkg-source commits + * Icarus: fix abort calculation/allow user specified abort + * Icarus: make --icarus-timing hidden and document it in FPGA-README + * Icarus: high accuracy timing and other bitstream speed support + * add-MIPSEB-to-icarus-for-BIG_ENDIAN + * work_decode only needs swab32 on midstate under BIG ENDIAN + * add compile command to api-example.c + * save config bugfix: writing an extra ',' when no gpus + * Add dpkg-source commits -- nushor Sun, 03 Jun 2012 22:02:03 -0500 cgminer (2.4.1-1) stable; urgency=low Version 2.4.1-1 - May 6, 2012 - - In the unlikely event of finding a block, display the block solved count with + * In the unlikely event of finding a block, display the block solved count with the pool it came from for auditing. - - Display the device summary on exit even if a device has been disabled. - - Use correct pool enabled enums in api.c. - - Import Debian packaging configs - - Ensure we test for a pool recovering from idle so long as it's not set to + * Display the device summary on exit even if a device has been disabled. + * Use correct pool enabled enums in api.c. + * Import Debian packaging configs + * Ensure we test for a pool recovering from idle so long as it's not set to disabled. - - Fix pool number display. - - Give cgminer -T message only if curses is in use. - - Reinit_adl is no longer used. 
- - API 'stats' allow devices to add their own stats also for testing/debug - - API add getwork stats to cgminer - accesable from API 'stats' - - Don't initialise variables to zero when in global scope since they're already + * Fix pool number display. + * Give cgminer -T message only if curses is in use. + * Reinit_adl is no longer used. + * API 'stats' allow devices to add their own stats also for testing/debug + * API add getwork stats to cgminer - accesable from API 'stats' + * Don't initialise variables to zero when in global scope since they're already initialised. - - Get rid of unitialised variable warning when it's false. - - Move a pool to POOL_REJECTING to be disabled only after 3 minutes of + * Get rid of unitialised variable warning when it's false. + * Move a pool to POOL_REJECTING to be disabled only after 3 minutes of continuous rejected shares. - - Some tweaks to reporting and logging. - - Change FPGA detection order since BFL hangs on an ICA - - API support new pool status - - Add a temporarily disabled state for enabled pools called POOL_REJECTING and + * Some tweaks to reporting and logging. + * Change FPGA detection order since BFL hangs on an ICA + * API support new pool status + * Add a temporarily disabled state for enabled pools called POOL_REJECTING and use the work from each longpoll to help determine when a rejecting pool has started working again. Switch pools based on the multipool strategy once a pool is re-enabled. - - Removing extra debug - - Fix the benchmark feature by bypassing the new networking code. - - Reset sequential reject counter after a pool is disabled for when it is + * Removing extra debug + * Fix the benchmark feature by bypassing the new networking code. + * Reset sequential reject counter after a pool is disabled for when it is re-enabled. 
- - Icarus - correct MH/s and U: with work restart set at 8 seconds - - ztex updateFreq was always reporting on fpga 0 - - Trying harder to get 1.15y working - - Specifying threads on multi fpga boards extra cgpu - - Missing the add cgpu per extra fpga on 1.15y boards - - API add last share time to each pool - - Don't try to reap curls if benchmarking is enabled. + * Icarus - correct MH/s and U: with work restart set at 8 seconds + * ztex updateFreq was always reporting on fpga 0 + * Trying harder to get 1.15y working + * Specifying threads on multi fpga boards extra cgpu + * Missing the add cgpu per extra fpga on 1.15y boards + * API add last share time to each pool + * Don't try to reap curls if benchmarking is enabled. -- nushor Sun, 06 May 2012 11:09:46 -0500 cgminer (2.4.0-1) stable; urgency=low Version 2.4.0 - May 3, 2012 - - Only show longpoll warning once when it has failed. - - Convert hashes to an unsigned long long as well. - - Detect pools that have issues represented by endless rejected shares and + * Only show longpoll warning once when it has failed. + * Convert hashes to an unsigned long long as well. + * Detect pools that have issues represented by endless rejected shares and disable them, with a parameter to optionally disable this feature. - - Bugfix: Use a 64-bit type for hashes_done (miner_thread) since it can overflow + * Bugfix: Use a 64-bit type for hashes_done (miner_thread) since it can overflow 32-bit on some FPGAs - - Implement an older header fix for a label existing before the pthread_cleanup + * Implement an older header fix for a label existing before the pthread_cleanup macro. - - Limit the number of curls we recruit on communication failures and with + * Limit the number of curls we recruit on communication failures and with delaynet enabled to 5 by maintaining a per-pool curl count, and using a pthread conditional that wakes up when one is returned to the ring buffer. 
- - Generalise add_pool() functions since they're repeated in add_pool_details. - - Bugfix: Return failure, rather than quit, if BFwrite fails - - Disable failing devices such that the user can attempt to re-enable them - - Bugfix: thread_shutdown shouldn't try to free the device, since it's needed + * Generalise add_pool() functions since they're repeated in add_pool_details. + * Bugfix: Return failure, rather than quit, if BFwrite fails + * Disable failing devices such that the user can attempt to re-enable them + * Bugfix: thread_shutdown shouldn't try to free the device, since it's needed afterward - - API bool's and 1TBS fixes - - Icarus - minimise code delays and name timer variables - - api.c V1.9 add 'restart' + redesign 'quit' so thread exits cleanly - - api.c bug - remove extra ']'s in notify command - - Increase pool watch interval to 30 seconds. - - Reap curls that are unused for over a minute. This allows connections to be + * API bool's and 1TBS fixes + * Icarus - minimise code delays and name timer variables + * api.c V1.9 add 'restart' + redesign 'quit' so thread exits cleanly + * api.c bug - remove extra ']'s in notify command + * Increase pool watch interval to 30 seconds. + * Reap curls that are unused for over a minute. This allows connections to be closed, thereby allowing the number of curl handles to always be the minimum necessary to not delay networking. - - Use the ringbuffer of curls from the same pool for submit as well as getwork + * Use the ringbuffer of curls from the same pool for submit as well as getwork threads. Since the curl handles were already connected to the same pool and are immediately available, share submission will not be delayed by getworks. - - Implement a scaleable networking framework designed to cope with any sized + * Implement a scaleable networking framework designed to cope with any sized network requirements, yet minimise the number of connections being reopened. 
Do this by create a ring buffer linked list of curl handles to be used by getwork, recruiting extra handles when none is immediately available. - - There is no need for the submit and getwork curls to be tied to the pool + * There is no need for the submit and getwork curls to be tied to the pool struct. - - Do not recruit extra connection threads if there have been connection errors + * Do not recruit extra connection threads if there have been connection errors to the pool in question. - - We should not retry submitting shares indefinitely or we may end up with a + * We should not retry submitting shares indefinitely or we may end up with a huge backlog during network outages, so discard stale shares if we failed to submit them and they've become stale in the interim. @@ -114,32 +148,32 @@ cgminer (2.4.0-1) stable; urgency=low cgminer (2.3.6-3) stable; urgency=low Version 2.3.6-3 - may 3, 2012 - - More bug fixes, Pre 2.4.1 release. + * More bug fixes, Pre 2.4.1 release. -- nushor Thurs, 03 May 2012 00:36:50 -0500 cgminer (2.3.6-2) stable; urgency=low Version 2.3.6-2 - May 2, 2012 - - Various bug fixes, latest build from repository. + * Various bug fixes, latest build from repository. -- nushor Wed, 02 May 2012 18:17:49 -0500 cgminer (2.3.6-1) stable; urgency=low Version 2.3.6 - April 29, 2012 - - Shorten stale share messages slightly. - - Protect the freeing of current_hash under mutex_lock to prevent racing on it + * Shorten stale share messages slightly. + * Protect the freeing of current_hash under mutex_lock to prevent racing on it when set_curblock is hit concurrently. - - Change default behaviour to submitting stale, removing the --submit-stale + * Change default behaviour to submitting stale, removing the --submit-stale option and adding a --no-submit-stale option. - - Make sure to start the getwork and submit threads when a pool is added on the + * Make sure to start the getwork and submit threads when a pool is added on the fly. 
This fixes a crash when a pool is added to running cgminer and then switched to. - - Faster hardware can easily outstrip the speed we can get work and submit + * Faster hardware can easily outstrip the speed we can get work and submit shares when using only one connection per pool. - - Test the queued list to see if any get/submits are already queued and if they + * Test the queued list to see if any get/submits are already queued and if they are, start recruiting extra connections by generating new threads. - - This allows us to reuse network connections at low loads but recuit new open + * This allows us to reuse network connections at low loads but recuit new open connections as they're needed, so that cgminer can scale to hardware of any size. diff --git a/debian/patches/series b/debian/patches/series new file mode 100644 index 00000000..539cc484 --- /dev/null +++ b/debian/patches/series @@ -0,0 +1,3 @@ +v2.4.1 +v2.4.2 +v2.6.2 diff --git a/debian/patches/v2.6.2 b/debian/patches/v2.6.2 new file mode 100644 index 00000000..2223a00e --- /dev/null +++ b/debian/patches/v2.6.2 @@ -0,0 +1,1275 @@ +--- a/FPGA-README ++++ b/FPGA-README +@@ -16,7 +16,25 @@ + + Icarus + +-There is a hidden option in cgminer when Icarus support is compiled in: ++There are two hidden options in cgminer when Icarus support is compiled in: ++ ++--icarus-options Set specific FPGA board configurations - one set of values for all or comma separated ++ baud:work_division:fpga_count ++ ++ baud The Serial/USB baud rate - 115200 or 57600 only - default 115200 ++ work_division The fraction of work divided up for each FPGA chip - 1, 2, 4 or 8 ++ e.g. 
2 means each FPGA does half the nonce range - default 2 ++ fpga_count The actual number of FPGA working - this would normally be the same ++ as work_division - range is from 1 up to 'work_division' ++ It defaults to the value of work_division - or 2 if you don't specify ++ work_division ++ ++If you define fewer comma seperated values than Icarus devices, the last values will be used ++for all extra devices ++ ++An example would be: --icarus-options 57600:2:1 ++This would mean: use 57600 baud, the FPGA board divides the work in half however ++only 1 FPGA actually runs on the board (e.g. like an early CM1 Icarus copy bitstream) + + --icarus-timing Set how the Icarus timing is calculated - one setting/value for all or comma separated + default[=N] Use the default Icarus hash time (2.6316ns) +@@ -24,6 +42,9 @@ + long Re-calculate the hash time continuously + value[=N] Specify the hash time in nanoseconds (e.g. 2.6316) and abort time (e.g. 2.6316=80) + ++If you define fewer comma seperated values than Icarus devices, the last values will be used ++for all extra devices ++ + Icarus timing is required for devices that do not exactly match a default Icarus Rev3 in + processing speed + If you have an Icarus Rev3 you should not normally need to use --icarus-timing since the +@@ -55,9 +76,9 @@ + 'short' mode and take note of the final hash time value (Hs) calculated + You can also use the RPC API 'stats' command to see the current hash time (Hs) at any time + +-The Icarus code currently only works with a dual FPGA device that supports the same commands as ++The Icarus code currently only works with an FPGA device that supports the same commands as + Icarus Rev3 requires and also is less than ~840MH/s and greater than 2MH/s +-If a dual FPGA device does hash faster than ~840MH/s it should work correctly if you supply the ++If an FPGA device does hash faster than ~840MH/s it should work correctly if you supply the + correct hash time nanoseconds value + + The timing code itself 
will affect the Icarus performance since it increases the delay after +--- a/NEWS ++++ b/NEWS +@@ -1,7 +1,42 @@ ++Version 2.6.2 - August 3, 2012 ++ ++- Scrypt mining does not support block testing yet so don't try to print it. ++- Clear the bitforce buffer whenever we get an unexpected result as it has ++likely throttled and we are getting cached responses out of order, and use the ++temperature monitoring as a kind of watchdog to flush unexpected results. ++- It is not critical getting the temperature response in bitforce so don't ++mandatorily wait on the mutex lock. ++- Check there is a cutoff temp actually set in bitforce before using it as a cut ++off value otherwise it may think it's set to zero degrees. ++- We dropped the temporary stopping of curl recruiting on submit_fail by ++mistake, reinstate it. ++- Make threads report in either side of the scanhash function in case we miss ++reporting in when restarting work. ++- Don't make mandatory work and its clones last forever. ++- Make test work for pool_active mandatory work items to smooth out staged work ++counts when in failover-only mode. ++- Add debugging output when work is found stale as to why. ++- Print the 3 parameters that are passed to applog for a debug line in ++bitforce.c ++- Clear bitforce buffer on init as previously. ++- Add some headroom to the number of curls available per pool to allow for ++longpoll and sendwork curls. ++- Revert "Revert "Change BFL driver thread initialising to a constant 100ms ++delay between devices instead of a random arrangement."" ++- Revert "Remove bitforce_thread_init" ++- Show the correct base units on GPU summary. ++- Differentiate between the send return value being a bool and the get return ++value when managing them in bitforce scanhash. ++- 23a8c60 Revert "bitforce: Skip out of sending work if work restart requested" ++ ++ + Version 2.6.1 - July 30, 2012 + ++- Display scrypt as being built in as well. 
++- Fix build warning about KL_SCRYPT when built without scrypt support. + - Remove the low hash count determinant of hardware being sick. A low hash rate +-can be for poor network connectivity or scrypt mining, neither of which a ++can be for poor network connectivity or scrypt mining, neither of which are due ++to a sick device. + - api.c poolpriority changes + + +--- a/cgminer.c ++++ b/cgminer.c +@@ -142,6 +142,7 @@ + bool opt_api_network; + bool opt_delaynet; + bool opt_disable_pool = true; ++char *opt_icarus_options = NULL; + char *opt_icarus_timing = NULL; + + char *opt_kernel_path; +@@ -710,6 +711,13 @@ + } + + #ifdef USE_ICARUS ++static char *set_icarus_options(const char *arg) ++{ ++ opt_set_charp(arg, &opt_icarus_options); ++ ++ return NULL; ++} ++ + static char *set_icarus_timing(const char *arg) + { + opt_set_charp(arg, &opt_icarus_timing); +@@ -873,6 +881,9 @@ + "Override sha256 kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"), + #endif + #ifdef USE_ICARUS ++ OPT_WITH_ARG("--icarus-options", ++ set_icarus_options, NULL, NULL, ++ opt_hidden), + OPT_WITH_ARG("--icarus-timing", + set_icarus_timing, NULL, NULL, + opt_hidden), +@@ -1770,10 +1781,9 @@ + + if (!QUIET) { + hash32 = (uint32_t *)(work->hash); +- if (opt_scrypt) { +- sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[7]), (unsigned long)(hash32[6]), +- work->block? " BLOCK!" : ""); +- } else { ++ if (opt_scrypt) ++ sprintf(hashshow, "%08lx.%08lx", (unsigned long)(hash32[7]), (unsigned long)(hash32[6])); ++ else { + sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]), + work->block? " BLOCK!" : ""); + } +@@ -2169,14 +2179,14 @@ + * network delays/outages. */ + static struct curl_ent *pop_curl_entry(struct pool *pool) + { +- int curl_limit = opt_delaynet ? 5 : mining_threads; ++ int curl_limit = opt_delaynet ? 
5 : mining_threads * 4 / 3; + struct curl_ent *ce; + + mutex_lock(&pool->pool_lock); + if (!pool->curls) + recruit_curl(pool); + else if (list_empty(&pool->curlring)) { +- if (pool->curls >= curl_limit) ++ if (pool->submit_fail || pool->curls >= curl_limit) + pthread_cond_wait(&pool->cr_cond, &pool->pool_lock); + else + recruit_curl(pool); +@@ -2278,9 +2288,6 @@ + struct pool *pool; + int getwork_delay; + +- if (work->mandatory) +- return false; +- + if (share) { + /* Technically the rolltime should be correct but some pools + * advertise a broken expire= that is lower than a meaningful +@@ -2306,14 +2313,20 @@ + work_expiry = 5; + + gettimeofday(&now, NULL); +- if ((now.tv_sec - work->tv_staged.tv_sec) >= work_expiry) ++ if ((now.tv_sec - work->tv_staged.tv_sec) >= work_expiry) { ++ applog(LOG_DEBUG, "Work stale due to expiry"); + return true; ++ } + +- if (work->work_block != work_block) ++ if (work->work_block != work_block) { ++ applog(LOG_DEBUG, "Work stale due to block mismatch"); + return true; ++ } + +- if (opt_fail_only && !share && pool != current_pool() && pool->enabled != POOL_REJECTING) ++ if (opt_fail_only && !share && pool != current_pool() && !work->mandatory) { ++ applog(LOG_DEBUG, "Work stale due to fail only pool mismatch"); + return true; ++ } + + return false; + } +@@ -3011,6 +3024,8 @@ + fprintf(fcfg, ",\n\"api-description\" : \"%s\"", opt_api_description); + if (opt_api_groups) + fprintf(fcfg, ",\n\"api-groups\" : \"%s\"", opt_api_groups); ++ if (opt_icarus_options) ++ fprintf(fcfg, ",\n\"icarus-options\" : \"%s\"", opt_icarus_options); + if (opt_icarus_timing) + fprintf(fcfg, ",\n\"icarus-timing\" : \"%s\"", opt_icarus_timing); + fputs("\n}", fcfg); +@@ -3584,6 +3599,7 @@ + struct work *work = make_work(); + bool rc; + ++ work->mandatory = true; + rc = work_decode(json_object_get(val, "result"), work); + if (rc) { + applog(LOG_DEBUG, "Successfully retrieved and deciphered work from pool %u %s", +@@ -3835,6 +3851,7 @@ + memcpy(work_clone, 
work, sizeof(struct work)); + work_clone->clone = true; + work_clone->longpoll = false; ++ work_clone->mandatory = false; + /* Make cloned work appear slightly older to bias towards keeping the + * master work item which can be further rolled */ + work_clone->tv_staged.tv_sec -= 1; +@@ -4192,7 +4209,9 @@ + } + pool_stats->getwork_calls++; + ++ thread_reportin(mythr); + hashes = api->scanhash(mythr, work, work->blk.nonce + max_nonce); ++ thread_reportin(mythr); + + gettimeofday(&getwork_start, NULL); + +--- a/configure.ac ++++ b/configure.ac +@@ -2,7 +2,7 @@ + ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## + m4_define([v_maj], [2]) + m4_define([v_min], [6]) +-m4_define([v_mic], [1]) ++m4_define([v_mic], [2]) + ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## + m4_define([v_ver], [v_maj.v_min.v_mic]) + m4_define([lt_rev], m4_eval(v_maj + v_min)) +--- a/debian/changelog ++++ b/debian/changelog +@@ -1,112 +1,151 @@ ++cgminer (2.4.3-1) stable; urgency=medium ++ Version 2.4.3 - June 14, 2012 ++ ++ * can_roll and should_roll should have no bearing on the cycle period within the ++ miner_thread so remove it. ++ * Check for strategy being changed to load balance when enabling LPs. ++ * Check that all threads on the device that called get_work are waiting on getwork ++ before considering the pool lagging. ++ * Iterate over each thread belonging to each device in the hashmeter instead of ++ searching for them now that they're a list. ++ * When using rotate pool strategy, ensure we only select from alive enabled pools. ++ * Start longpoll from every pool when load balance strategy is in use. ++ * Add mandatory and block fields to the work struct. Flag any shares that are ++ detected as blocks as mandatory to submit, along with longpoll work from a previously ++ rejecting pool. ++ * Consider the fan optimal if fanspeed is dropping but within the optimal speed window. 
++ * Fix typo in some API messages (succeess/success) ++ * api.c MMQ stat bugs ++ * Bugfix: Fix warnings when built without libudev support ++ * Bugfix: slay a variety of warnings ++ * Bugfix: modminer: Fix unsigned/signed comparison and similar warnings ++ * API add ModMinerQuad support ++ * Bugfix: Honour forceauto parameter in serial_detect functions ++ * modminer: Temperature sensor improvements ++ * modminer: Make log messages more consistent in format ++ * Only adjust GPU speed up if the fanspeed is within the normal fanrange and hasn't been ++ turned to maximum speed under overheat conditions. ++ * ModMiner use valid .name ++ * New driver: BTCFPGA ModMiner ++ * Abstract generally useful FPGA code into fpgautils.c ++ * API add stats for pool getworks ++ * miner.php option to hide specific fields from the display ++ * miner.php add version numbers to the summary page ++ * Update debian configs to v2.4.2 ++ * Add API and FPGA READMEs into Makefile to be included in source distribution. 
++ * Icarus - fix unit64_t printf warnings ++ ++ -- nushor Fri, 15 Jun 2012 11:31:51 -0500 ++ + cgminer (2.4.2-1) stable; urgency=medium + Version 2.4.2 - June 2, 2012 + +- - API.class compiled with Java SE 6.0_03 - works with Win7x64 +- - miner.php highlight devs too slow finding shares (possibly failing) +- - API update version to V1.11 and document changes +- - API save default config file if none specified +- - api.c save success incorrectly returns error +- - api.c replace BUFSIZ (linux/windows have different values) +- - Move RPC API content out of README to API-README +- - Open a longpoll connection if a pool is in the REJECTING state as it's the ++ * API.class compiled with Java SE 6.0_03 - works with Win7x64 ++ * miner.php highlight devs too slow finding shares (possibly failing) ++ * API update version to V1.11 and document changes ++ * API save default config file if none specified ++ * api.c save success incorrectly returns error ++ * api.c replace BUFSIZ (linux/windows have different values) ++ * Move RPC API content out of README to API-README ++ * Open a longpoll connection if a pool is in the REJECTING state as it's the + only way to re-enable it automatically. +- - Use only one longpoll as much as possible by using a pthread conditional ++ * Use only one longpoll as much as possible by using a pthread conditional + broadcast that each longpoll thread waits on and checks if it's the current pool + before +- - If shares are known stale, don't use them to decide to disable a pool for ++ * If shares are known stale, don't use them to decide to disable a pool for + sequential rejects. +- - Restarting cgminer from within after ADL has been corrupted only leads to a ++ * Restarting cgminer from within after ADL has been corrupted only leads to a + crash. Display a warning only and disable fanspeed monitoring. 
+- - Icarus: fix abort calculation/allow user specified abort +- - Icarus: make --icarus-timing hidden and document it in FPGA-README +- - Icarus: high accuracy timing and other bitstream speed support +- - add-MIPSEB-to-icarus-for-BIG_ENDIAN +- - work_decode only needs swab32 on midstate under BIG ENDIAN +- - add compile command to api-example.c +- - save config bugfix: writing an extra ',' when no gpus +- - Add dpkg-source commits ++ * Icarus: fix abort calculation/allow user specified abort ++ * Icarus: make --icarus-timing hidden and document it in FPGA-README ++ * Icarus: high accuracy timing and other bitstream speed support ++ * add-MIPSEB-to-icarus-for-BIG_ENDIAN ++ * work_decode only needs swab32 on midstate under BIG ENDIAN ++ * add compile command to api-example.c ++ * save config bugfix: writing an extra ',' when no gpus ++ * Add dpkg-source commits + + -- nushor Sun, 03 Jun 2012 22:02:03 -0500 + + cgminer (2.4.1-1) stable; urgency=low + Version 2.4.1-1 - May 6, 2012 +- - In the unlikely event of finding a block, display the block solved count with ++ * In the unlikely event of finding a block, display the block solved count with + the pool it came from for auditing. +- - Display the device summary on exit even if a device has been disabled. +- - Use correct pool enabled enums in api.c. +- - Import Debian packaging configs +- - Ensure we test for a pool recovering from idle so long as it's not set to ++ * Display the device summary on exit even if a device has been disabled. ++ * Use correct pool enabled enums in api.c. ++ * Import Debian packaging configs ++ * Ensure we test for a pool recovering from idle so long as it's not set to + disabled. +- - Fix pool number display. +- - Give cgminer -T message only if curses is in use. +- - Reinit_adl is no longer used. 
+- - API 'stats' allow devices to add their own stats also for testing/debug +- - API add getwork stats to cgminer - accesable from API 'stats' +- - Don't initialise variables to zero when in global scope since they're already ++ * Fix pool number display. ++ * Give cgminer -T message only if curses is in use. ++ * Reinit_adl is no longer used. ++ * API 'stats' allow devices to add their own stats also for testing/debug ++ * API add getwork stats to cgminer - accesable from API 'stats' ++ * Don't initialise variables to zero when in global scope since they're already + initialised. +- - Get rid of unitialised variable warning when it's false. +- - Move a pool to POOL_REJECTING to be disabled only after 3 minutes of ++ * Get rid of unitialised variable warning when it's false. ++ * Move a pool to POOL_REJECTING to be disabled only after 3 minutes of + continuous rejected shares. +- - Some tweaks to reporting and logging. +- - Change FPGA detection order since BFL hangs on an ICA +- - API support new pool status +- - Add a temporarily disabled state for enabled pools called POOL_REJECTING and ++ * Some tweaks to reporting and logging. ++ * Change FPGA detection order since BFL hangs on an ICA ++ * API support new pool status ++ * Add a temporarily disabled state for enabled pools called POOL_REJECTING and + use the work from each longpoll to help determine when a rejecting pool has + started working again. Switch pools based on the multipool strategy once a pool + is re-enabled. +- - Removing extra debug +- - Fix the benchmark feature by bypassing the new networking code. +- - Reset sequential reject counter after a pool is disabled for when it is ++ * Removing extra debug ++ * Fix the benchmark feature by bypassing the new networking code. ++ * Reset sequential reject counter after a pool is disabled for when it is + re-enabled. 
+- - Icarus - correct MH/s and U: with work restart set at 8 seconds +- - ztex updateFreq was always reporting on fpga 0 +- - Trying harder to get 1.15y working +- - Specifying threads on multi fpga boards extra cgpu +- - Missing the add cgpu per extra fpga on 1.15y boards +- - API add last share time to each pool +- - Don't try to reap curls if benchmarking is enabled. ++ * Icarus - correct MH/s and U: with work restart set at 8 seconds ++ * ztex updateFreq was always reporting on fpga 0 ++ * Trying harder to get 1.15y working ++ * Specifying threads on multi fpga boards extra cgpu ++ * Missing the add cgpu per extra fpga on 1.15y boards ++ * API add last share time to each pool ++ * Don't try to reap curls if benchmarking is enabled. + + -- nushor Sun, 06 May 2012 11:09:46 -0500 + + cgminer (2.4.0-1) stable; urgency=low + Version 2.4.0 - May 3, 2012 + +- - Only show longpoll warning once when it has failed. +- - Convert hashes to an unsigned long long as well. +- - Detect pools that have issues represented by endless rejected shares and ++ * Only show longpoll warning once when it has failed. ++ * Convert hashes to an unsigned long long as well. ++ * Detect pools that have issues represented by endless rejected shares and + disable them, with a parameter to optionally disable this feature. +- - Bugfix: Use a 64-bit type for hashes_done (miner_thread) since it can overflow ++ * Bugfix: Use a 64-bit type for hashes_done (miner_thread) since it can overflow + 32-bit on some FPGAs +- - Implement an older header fix for a label existing before the pthread_cleanup ++ * Implement an older header fix for a label existing before the pthread_cleanup + macro. +- - Limit the number of curls we recruit on communication failures and with ++ * Limit the number of curls we recruit on communication failures and with + delaynet enabled to 5 by maintaining a per-pool curl count, and using a pthread + conditional that wakes up when one is returned to the ring buffer. 
+- - Generalise add_pool() functions since they're repeated in add_pool_details. +- - Bugfix: Return failure, rather than quit, if BFwrite fails +- - Disable failing devices such that the user can attempt to re-enable them +- - Bugfix: thread_shutdown shouldn't try to free the device, since it's needed ++ * Generalise add_pool() functions since they're repeated in add_pool_details. ++ * Bugfix: Return failure, rather than quit, if BFwrite fails ++ * Disable failing devices such that the user can attempt to re-enable them ++ * Bugfix: thread_shutdown shouldn't try to free the device, since it's needed + afterward +- - API bool's and 1TBS fixes +- - Icarus - minimise code delays and name timer variables +- - api.c V1.9 add 'restart' + redesign 'quit' so thread exits cleanly +- - api.c bug - remove extra ']'s in notify command +- - Increase pool watch interval to 30 seconds. +- - Reap curls that are unused for over a minute. This allows connections to be ++ * API bool's and 1TBS fixes ++ * Icarus - minimise code delays and name timer variables ++ * api.c V1.9 add 'restart' + redesign 'quit' so thread exits cleanly ++ * api.c bug - remove extra ']'s in notify command ++ * Increase pool watch interval to 30 seconds. ++ * Reap curls that are unused for over a minute. This allows connections to be + closed, thereby allowing the number of curl handles to always be the minimum + necessary to not delay networking. +- - Use the ringbuffer of curls from the same pool for submit as well as getwork ++ * Use the ringbuffer of curls from the same pool for submit as well as getwork + threads. Since the curl handles were already connected to the same pool and are + immediately available, share submission will not be delayed by getworks. +- - Implement a scaleable networking framework designed to cope with any sized ++ * Implement a scaleable networking framework designed to cope with any sized + network requirements, yet minimise the number of connections being reopened. 
Do + this by create a ring buffer linked list of curl handles to be used by getwork, + recruiting extra handles when none is immediately available. +- - There is no need for the submit and getwork curls to be tied to the pool ++ * There is no need for the submit and getwork curls to be tied to the pool + struct. +- - Do not recruit extra connection threads if there have been connection errors ++ * Do not recruit extra connection threads if there have been connection errors + to the pool in question. +- - We should not retry submitting shares indefinitely or we may end up with a ++ * We should not retry submitting shares indefinitely or we may end up with a + huge backlog during network outages, so discard stale shares if we failed to + submit them and they've become stale in the interim. + +@@ -114,32 +153,32 @@ + + cgminer (2.3.6-3) stable; urgency=low + Version 2.3.6-3 - may 3, 2012 +- - More bug fixes, Pre 2.4.1 release. ++ * More bug fixes, Pre 2.4.1 release. + + -- nushor Thurs, 03 May 2012 00:36:50 -0500 + + cgminer (2.3.6-2) stable; urgency=low + Version 2.3.6-2 - May 2, 2012 +- - Various bug fixes, latest build from repository. ++ * Various bug fixes, latest build from repository. + + -- nushor Wed, 02 May 2012 18:17:49 -0500 + + cgminer (2.3.6-1) stable; urgency=low + + Version 2.3.6 - April 29, 2012 +- - Shorten stale share messages slightly. +- - Protect the freeing of current_hash under mutex_lock to prevent racing on it ++ * Shorten stale share messages slightly. ++ * Protect the freeing of current_hash under mutex_lock to prevent racing on it + when set_curblock is hit concurrently. +- - Change default behaviour to submitting stale, removing the --submit-stale ++ * Change default behaviour to submitting stale, removing the --submit-stale + option and adding a --no-submit-stale option. 
+- - Make sure to start the getwork and submit threads when a pool is added on the ++ * Make sure to start the getwork and submit threads when a pool is added on the + fly. This fixes a crash when a pool is added to running cgminer and then + switched to. +- - Faster hardware can easily outstrip the speed we can get work and submit ++ * Faster hardware can easily outstrip the speed we can get work and submit + shares when using only one connection per pool. +- - Test the queued list to see if any get/submits are already queued and if they ++ * Test the queued list to see if any get/submits are already queued and if they + are, start recruiting extra connections by generating new threads. +- - This allows us to reuse network connections at low loads but recuit new open ++ * This allows us to reuse network connections at low loads but recuit new open + connections as they're needed, so that cgminer can scale to hardware of any + size. + +--- a/driver-bitforce.c ++++ b/driver-bitforce.c +@@ -157,7 +157,7 @@ + return true; + } + +-static void biforce_clear_buffer(struct cgpu_info *bitforce) ++static void bitforce_clear_buffer(struct cgpu_info *bitforce) + { + int fdDev = bitforce->device_fd; + char pdevbuf[0x100]; +@@ -185,6 +185,8 @@ + + applog(LOG_WARNING, "BFL%i: Re-initialising", bitforce->device_id); + ++ bitforce_clear_buffer(bitforce); ++ + mutex_lock(&bitforce->device_mutex); + if (fdDev) { + BFclose(fdDev); +@@ -239,7 +241,11 @@ + if (!fdDev) + return false; + +- mutex_lock(&bitforce->device_mutex); ++ /* It is not critical getting temperature so don't get stuck if we ++ * can't grab the mutex here */ ++ if (mutex_trylock(&bitforce->device_mutex)) ++ return false; ++ + BFwrite(fdDev, "ZLX", 3); + BFgets(pdevbuf, sizeof(pdevbuf), fdDev); + mutex_unlock(&bitforce->device_mutex); +@@ -255,7 +261,7 @@ + + if (temp > 0) { + bitforce->temp = temp; +- if (temp > bitforce->cutofftemp) { ++ if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) { + 
applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id); + bitforce->deven = DEV_RECOVER; + +@@ -264,7 +270,15 @@ + bitforce->dev_thermal_cutoff_count++; + } + } ++ } else { ++ /* Use the temperature monitor as a kind of watchdog for when ++ * our responses are out of sync and flush the buffer to ++ * hopefully recover */ ++ applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer", bitforce->device_id); ++ bitforce_clear_buffer(bitforce); ++ return false; + } ++ + return true; + } + +@@ -287,8 +301,7 @@ + BFgets(pdevbuf, sizeof(pdevbuf), fdDev); + if (!pdevbuf[0] || !strncasecmp(pdevbuf, "B", 1)) { + mutex_unlock(&bitforce->device_mutex); +- if (!restart_wait(WORK_CHECK_INTERVAL_MS)) +- return false; ++ nmsleep(WORK_CHECK_INTERVAL_MS); + goto re_send; + } else if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { + mutex_unlock(&bitforce->device_mutex); +@@ -300,6 +313,7 @@ + goto re_send; + } + applog(LOG_ERR, "BFL%i: Error: Send work reports: %s", bitforce->device_id, pdevbuf); ++ bitforce_clear_buffer(bitforce); + return false; + } + +@@ -340,6 +354,7 @@ + + if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { + applog(LOG_ERR, "BFL%i: Error: Send block data reports: %s", bitforce->device_id, pdevbuf); ++ bitforce_clear_buffer(bitforce); + return false; + } + +@@ -414,7 +429,7 @@ + } + + if (delay_time_ms != bitforce->sleep_ms) +- applog(LOG_DEBUG, "BFL%i: Wait time changed to: %d", bitforce->device_id, bitforce->sleep_ms, bitforce->wait_ms); ++ applog(LOG_DEBUG, "BFL%i: Wait time changed to: %d, waited %u", bitforce->device_id, bitforce->sleep_ms, bitforce->wait_ms); + + /* Work out the average time taken. 
Float for calculation, uint for display */ + bitforce->avg_wait_f += (tv_to_ms(elapsed) - bitforce->avg_wait_f) / TIME_AVG_CONSTANT; +@@ -428,6 +443,7 @@ + return 0; /* Device idle */ + else if (strncasecmp(pdevbuf, "NONCE-FOUND", 11)) { + applog(LOG_WARNING, "BFL%i: Error: Get result reports: %s", bitforce->device_id, pdevbuf); ++ bitforce_clear_buffer(bitforce); + return 0; + } + +@@ -475,9 +491,10 @@ + { + struct cgpu_info *bitforce = thr->cgpu; + unsigned int sleep_time; ++ bool send_ret; + int64_t ret; + +- ret = bitforce_send_work(thr, work); ++ send_ret = bitforce_send_work(thr, work); + + if (!bitforce->nonce_range) { + /* Initially wait 2/3 of the average cycle time so we can request more +@@ -503,8 +520,10 @@ + bitforce->wait_ms = sleep_time; + } + +- if (ret) ++ if (send_ret) + ret = bitforce_get_result(thr, work); ++ else ++ ret = -1; + + if (ret == -1) { + ret = 0; +@@ -513,7 +532,7 @@ + bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR; + bitforce->dev_comms_error_count++; + /* empty read buffer */ +- biforce_clear_buffer(bitforce); ++ bitforce_clear_buffer(bitforce); + } + return ret; + } +@@ -523,6 +542,20 @@ + return bitforce_get_temp(bitforce); + } + ++static bool bitforce_thread_init(struct thr_info *thr) ++{ ++ struct cgpu_info *bitforce = thr->cgpu; ++ unsigned int wait; ++ ++ /* Pause each new thread at least 100ms between initialising ++ * so the devices aren't making calls all at the same time. 
*/ ++ wait = thr->id * MAX_START_DELAY_US; ++ applog(LOG_DEBUG, "BFL%i: Delaying start by %dms", bitforce->device_id, wait / 1000); ++ usleep(wait); ++ ++ return true; ++} ++ + static struct api_data *bitforce_api_stats(struct cgpu_info *cgpu) + { + struct api_data *root = NULL; +@@ -546,6 +579,7 @@ + .get_statline_before = get_bitforce_statline_before, + .get_stats = bitforce_get_stats, + .thread_prepare = bitforce_thread_prepare, ++ .thread_init = bitforce_thread_init, + .scanhash = bitforce_scanhash, + .thread_shutdown = bitforce_shutdown, + .thread_enable = biforce_thread_enable +--- a/driver-icarus.c ++++ b/driver-icarus.c +@@ -65,7 +65,7 @@ + #define ASSERT1(condition) __maybe_unused static char sizeof_uint32_t_must_be_4[(condition)?1:-1] + ASSERT1(sizeof(uint32_t) == 4); + +-#define ICARUS_READ_TIME ((double)ICARUS_READ_SIZE * (double)8.0 / (double)ICARUS_IO_SPEED) ++#define ICARUS_READ_TIME(baud) ((double)ICARUS_READ_SIZE * (double)8.0 / (double)(baud)) + + // Fraction of a second, USB timeout is measured in + // i.e. 10 means 1/10 of a second +@@ -176,11 +176,36 @@ + // (which will only affect W) + uint64_t history_count; + struct timeval history_time; ++ ++ // icarus-options ++ int baud; ++ int work_division; ++ int fpga_count; ++ uint32_t nonce_mask; + }; + ++#define END_CONDITION 0x0000ffff ++ + // One for each possible device + static struct ICARUS_INFO **icarus_info; + ++// Looking for options in --icarus-timing and --icarus-options: ++// ++// Code increments this each time we start to look at a device ++// However, this means that if other devices are checked by ++// the Icarus code (e.g. 
BFL) they will count in the option offset ++// ++// This, however, is deterministic so that's OK ++// ++// If we were to increment after successfully finding an Icarus ++// that would be random since an Icarus may fail and thus we'd ++// not be able to predict the option order ++// ++// This also assumes that serial_detect() checks them sequentially ++// and in the order specified on the command line ++// ++static int option_offset = -1; ++ + struct device_api icarus_api; + + static void rev(unsigned char *s, size_t l) +@@ -195,8 +220,8 @@ + } + } + +-#define icarus_open2(devpath, purge) serial_open(devpath, 115200, ICARUS_READ_FAULT_DECISECONDS, purge) +-#define icarus_open(devpath) icarus_open2(devpath, false) ++#define icarus_open2(devpath, baud, purge) serial_open(devpath, baud, ICARUS_READ_FAULT_DECISECONDS, purge) ++#define icarus_open(devpath, baud) icarus_open2(devpath, baud, false) + + static int icarus_gets(unsigned char *buf, int fd, struct timeval *tv_finish, struct thr_info *thr, int read_count) + { +@@ -272,7 +297,7 @@ + } + } + +-static void set_timing_mode(struct cgpu_info *icarus) ++static void set_timing_mode(int this_option_offset, struct cgpu_info *icarus) + { + struct ICARUS_INFO *info = icarus_info[icarus->device_id]; + double Hs; +@@ -285,7 +310,7 @@ + buf[0] = '\0'; + else { + ptr = opt_icarus_timing; +- for (i = 0; i < icarus->device_id; i++) { ++ for (i = 0; i < this_option_offset; i++) { + comma = strchr(ptr, ','); + if (comma == NULL) + break; +@@ -354,11 +379,123 @@ + + applog(LOG_DEBUG, "Icarus: Init: %d mode=%s read_count=%d Hs=%e", + icarus->device_id, timing_mode_str(info->timing_mode), info->read_count, info->Hs); ++} ++ ++static uint32_t mask(int work_division) ++{ ++ char err_buf[BUFSIZ+1]; ++ uint32_t nonce_mask = 0x7fffffff; + ++ // yes we can calculate these, but this way it's easy to see what they are ++ switch (work_division) { ++ case 1: ++ nonce_mask = 0xffffffff; ++ break; ++ case 2: ++ nonce_mask = 0x7fffffff; ++ break; 
++ case 4: ++ nonce_mask = 0x3fffffff; ++ break; ++ case 8: ++ nonce_mask = 0x1fffffff; ++ break; ++ default: ++ snprintf(err_buf, sizeof(err_buf), "Invalid2 icarus-options for work_division (%d) must be 1, 2, 4 or 8", work_division); ++ quit(1, "%s", err_buf); ++ } ++ ++ return nonce_mask; ++} ++ ++static void get_options(int this_option_offset, int *baud, int *work_division, int *fpga_count) ++{ ++ char err_buf[BUFSIZ+1]; ++ char buf[BUFSIZ+1]; ++ char *ptr, *comma, *colon, *colon2; ++ size_t max; ++ int i, tmp; ++ ++ if (opt_icarus_options == NULL) ++ buf[0] = '\0'; ++ else { ++ ptr = opt_icarus_options; ++ for (i = 0; i < this_option_offset; i++) { ++ comma = strchr(ptr, ','); ++ if (comma == NULL) ++ break; ++ ptr = comma + 1; ++ } ++ ++ comma = strchr(ptr, ','); ++ if (comma == NULL) ++ max = strlen(ptr); ++ else ++ max = comma - ptr; ++ ++ if (max > BUFSIZ) ++ max = BUFSIZ; ++ strncpy(buf, ptr, max); ++ buf[max] = '\0'; ++ } ++ ++ *baud = ICARUS_IO_SPEED; ++ *work_division = 2; ++ *fpga_count = 2; ++ ++ if (*buf) { ++ colon = strchr(buf, ':'); ++ if (colon) ++ *(colon++) = '\0'; ++ ++ if (*buf) { ++ tmp = atoi(buf); ++ switch (tmp) { ++ case 115200: ++ *baud = 115200; ++ break; ++ case 57600: ++ *baud = 57600; ++ break; ++ default: ++ snprintf(err_buf, sizeof(err_buf), "Invalid icarus-options for baud (%s) must be 115200 or 57600", buf); ++ quit(1, "%s", err_buf); ++ } ++ } ++ ++ if (colon && *colon) { ++ colon2 = strchr(colon, ':'); ++ if (colon2) ++ *(colon2++) = '\0'; ++ ++ if (*colon) { ++ tmp = atoi(colon); ++ if (tmp == 1 || tmp == 2 || tmp == 4 || tmp == 8) { ++ *work_division = tmp; ++ *fpga_count = tmp; // default to the same ++ } else { ++ snprintf(err_buf, sizeof(err_buf), "Invalid icarus-options for work_division (%s) must be 1, 2, 4 or 8", colon); ++ quit(1, "%s", err_buf); ++ } ++ } ++ ++ if (colon2 && *colon2) { ++ tmp = atoi(colon2); ++ if (tmp > 0 && tmp <= *work_division) ++ *fpga_count = tmp; ++ else { ++ snprintf(err_buf, sizeof(err_buf), "Invalid icarus-options for fpga_count (%s) must be >0 and <=work_division (%d)", colon2, 
*work_division); ++ quit(1, "%s", err_buf); ++ } ++ } ++ } ++ } + } + + static bool icarus_detect_one(const char *devpath) + { ++ int this_option_offset = ++option_offset; ++ + struct ICARUS_INFO *info; + struct timeval tv_start, tv_finish; + int fd; +@@ -379,9 +516,13 @@ + unsigned char ob_bin[64], nonce_bin[ICARUS_READ_SIZE]; + char *nonce_hex; + ++ int baud, work_division, fpga_count; ++ ++ get_options(this_option_offset, &baud, &work_division, &fpga_count); ++ + applog(LOG_DEBUG, "Icarus Detect: Attempting to open %s", devpath); + +- fd = icarus_open2(devpath, true); ++ fd = icarus_open2(devpath, baud, true); + if (unlikely(fd == -1)) { + applog(LOG_ERR, "Icarus Detect: Failed to open %s", devpath); + return false; +@@ -429,6 +570,9 @@ + applog(LOG_INFO, "Found Icarus at %s, mark as %d", + devpath, icarus->device_id); + ++ applog(LOG_DEBUG, "Icarus: Init: %d baud=%d work_division=%d fpga_count=%d", ++ icarus->device_id, baud, work_division, fpga_count); ++ + // Since we are adding a new device on the end it needs to always be allocated + icarus_info[icarus->device_id] = (struct ICARUS_INFO *)malloc(sizeof(struct ICARUS_INFO)); + if (unlikely(!(icarus_info[icarus->device_id]))) +@@ -439,10 +583,15 @@ + // Initialise everything to zero for a new device + memset(info, 0, sizeof(struct ICARUS_INFO)); + +- info->golden_hashes = (golden_nonce_val & 0x7fffffff) << 1; ++ info->baud = baud; ++ info->work_division = work_division; ++ info->fpga_count = fpga_count; ++ info->nonce_mask = mask(work_division); ++ ++ info->golden_hashes = (golden_nonce_val & info->nonce_mask) * fpga_count; + timersub(&tv_finish, &tv_start, &(info->golden_tv)); + +- set_timing_mode(icarus); ++ set_timing_mode(this_option_offset, icarus); + + return true; + } +@@ -458,7 +607,7 @@ + + struct timeval now; + +- int fd = icarus_open(icarus->device_path); ++ int fd = icarus_open(icarus->device_path, icarus_info[icarus->device_id]->baud); + if (unlikely(-1 == fd)) { + applog(LOG_ERR, "Failed to open Icarus 
on %s", + icarus->device_path); +@@ -565,11 +714,9 @@ + + submit_nonce(thr, work, nonce); + +- hash_count = (nonce & 0x7fffffff); +- if (hash_count++ == 0x7fffffff) +- hash_count = 0xffffffff; +- else +- hash_count <<= 1; ++ hash_count = (nonce & info->nonce_mask); ++ hash_count++; ++ hash_count *= info->fpga_count; + + if (opt_debug || info->do_icarus_timing) + timersub(&tv_finish, &tv_start, &elapsed); +@@ -580,7 +727,9 @@ + } + + // ignore possible end condition values +- if (info->do_icarus_timing && (nonce & 0x7fffffff) > 0x000fffff && (nonce & 0x7fffffff) < 0x7ff00000) { ++ if (info->do_icarus_timing ++ && ((nonce & info->nonce_mask) > END_CONDITION) ++ && ((nonce & info->nonce_mask) < (info->nonce_mask & ~END_CONDITION))) { + gettimeofday(&tv_history_start, NULL); + + history0 = &(info->history[0]); +@@ -590,7 +739,7 @@ + + Ti = (double)(elapsed.tv_sec) + + ((double)(elapsed.tv_usec))/((double)1000000) +- - ICARUS_READ_TIME; ++ - ((double)ICARUS_READ_TIME(info->baud)); + Xi = (double)hash_count; + history0->sumXiTi += Xi * Ti; + history0->sumXi += Xi; +@@ -700,6 +849,9 @@ + root = api_add_uint(root, "timing_values", &(info->history[0].values), false); + root = api_add_const(root, "timing_mode", timing_mode_str(info->timing_mode), false); + root = api_add_bool(root, "is_timing", &(info->do_icarus_timing), false); ++ root = api_add_int(root, "baud", &(info->baud), false); ++ root = api_add_int(root, "work_division", &(info->work_division), false); ++ root = api_add_int(root, "fpga_count", &(info->fpga_count), false); + + return root; + } +--- a/driver-opencl.c ++++ b/driver-opencl.c +@@ -660,9 +660,19 @@ + + for (gpu = 0; gpu < nDevs; gpu++) { + struct cgpu_info *cgpu = &gpus[gpu]; ++ double displayed_rolling, displayed_total; ++ bool mhash_base = true; + +- wlog("GPU %d: %.1f / %.1f Mh/s | A:%d R:%d HW:%d U:%.2f/m I:%d\n", +- gpu, cgpu->rolling, cgpu->total_mhashes / total_secs, ++ displayed_rolling = cgpu->rolling; ++ displayed_total = cgpu->total_mhashes / 
total_secs; ++ if (displayed_rolling < 1) { ++ displayed_rolling *= 1000; ++ displayed_total *= 1000; ++ mhash_base = false; ++ } ++ ++ wlog("GPU %d: %.1f / %.1f %sh/s | A:%d R:%d HW:%d U:%.2f/m I:%d\n", ++ gpu, displayed_rolling, displayed_total, mhash_base ? "M" : "K", + cgpu->accepted, cgpu->rejected, cgpu->hw_errors, + cgpu->utility, cgpu->intensity); + #ifdef HAVE_ADL +@@ -710,7 +720,10 @@ + if (thr->cgpu != cgpu) + continue; + get_datestamp(checkin, &thr->last); +- wlog("Thread %d: %.1f Mh/s %s ", i, thr->rolling, cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled"); ++ displayed_rolling = thr->rolling; ++ if (!mhash_base) ++ displayed_rolling *= 1000; ++ wlog("Thread %d: %.1f %sh/s %s ", i, displayed_rolling, mhash_base ? "M" : "K" , cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled"); + switch (cgpu->status) { + default: + case LIFE_WELL: +--- a/miner.h ++++ b/miner.h +@@ -500,6 +500,11 @@ + quit(1, "WTF MUTEX ERROR ON UNLOCK!"); + } + ++static inline int mutex_trylock(pthread_mutex_t *lock) ++{ ++ return pthread_mutex_trylock(lock); ++} ++ + static inline void wr_lock(pthread_rwlock_t *lock) + { + if (unlikely(pthread_rwlock_wrlock(lock))) +@@ -557,6 +562,7 @@ + extern bool opt_api_network; + extern bool opt_delaynet; + extern bool opt_restart; ++extern char *opt_icarus_options; + extern char *opt_icarus_timing; + #ifdef USE_BITFORCE + extern bool opt_bfl_noncerange; +--- a/miner.php ++++ b/miner.php +@@ -87,11 +87,13 @@ + 'DATE' => null, + 'RIGS' => null, + 'SUMMARY' => array('Elapsed', 'MHS av', 'Found Blocks=Blks', 'Accepted', 'Rejected=Rej', 'Utility'), +- 'DEVS' => array('ID', 'Name', 'GPU', 'Status', 'MHS av', 'Accepted', 'Rejected=Rej', 'Utility'), ++ 'DEVS+NOTIFY' => array('DEVS.Name=Name', 'DEVS.ID=ID', 'DEVS.Status=Status', 'DEVS.Temperature=Temp', ++ 'DEVS.MHS av=MHS av', 'DEVS.Accepted=Accept', 'DEVS.Rejected=Rej', ++ 'DEVS.Utility=Utility', 'NOTIFY.Last Not Well=Not Well'), + 'POOL' => array('POOL', 'Status', 'Accepted', 'Rejected=Rej', 
'Last Share Time')); + $mobilesum = array( + 'SUMMARY' => array('MHS av', 'Found Blocks', 'Accepted', 'Rejected', 'Utility'), +- 'DEVS' => array('MHS av', 'Accepted', 'Rejected', 'Utility'), ++ 'DEVS+NOTIFY' => array('DEVS.MHS av', 'DEVS.Accepted', 'DEVS.Rejected', 'DEVS.Utility'), + 'POOL' => array('Accepted', 'Rejected')); + # + # customsummarypages is an array of these Custom Summary Pages +@@ -716,6 +718,9 @@ + if ($class == '' && ($rownum % 2) == 0) + $class = $c2class; + ++ if ($ret == '') ++ $ret = $b; ++ + return array($ret, $class); + } + # +@@ -1274,8 +1279,171 @@ + 'GPU' => 'devs', // You would normally use DEVS + 'PGA' => 'devs', // You would normally use DEVS + 'NOTIFY' => 'notify', ++ 'DEVDETAILS' => 'devdetails', ++ 'STATS' => 'stats', + 'CONFIG' => 'config'); + # ++function joinfields($section1, $section2, $join, $results) ++{ ++ global $sectionmap; ++ ++ $name1 = $sectionmap[$section1]; ++ $name2 = $sectionmap[$section2]; ++ $newres = array(); ++ ++ // foreach rig in section1 ++ foreach ($results[$name1] as $rig => $result) ++ { ++ $status = null; ++ ++ // foreach answer section in the rig api call ++ foreach ($result as $name1b => $fields1b) ++ { ++ if ($name1b == 'STATUS') ++ { ++ // remember the STATUS from section1 ++ $status = $result[$name1b]; ++ continue; ++ } ++ ++ // foreach answer section in the rig api call (for the other api command) ++ foreach ($results[$name2][$rig] as $name2b => $fields2b) ++ { ++ if ($name2b == 'STATUS') ++ continue; ++ ++ // If match the same field values of fields in $join ++ $match = true; ++ foreach ($join as $field) ++ if ($fields1b[$field] != $fields2b[$field]) ++ { ++ $match = false; ++ break; ++ } ++ ++ if ($match === true) ++ { ++ if ($status != null) ++ { ++ $newres[$rig]['STATUS'] = $status; ++ $status = null; ++ } ++ ++ $subsection = $section1.'+'.$section2; ++ $subsection .= preg_replace('/[^0-9]/', '', $name1b.$name2b); ++ ++ foreach ($fields1b as $nam => $val) ++ 
$newres[$rig][$subsection]["$section1.$nam"] = $val; ++ foreach ($fields2b as $nam => $val) ++ $newres[$rig][$subsection]["$section2.$nam"] = $val; ++ } ++ } ++ } ++ } ++ return $newres; ++} ++# ++function joinall($section1, $section2, $results) ++{ ++ global $sectionmap; ++ ++ $name1 = $sectionmap[$section1]; ++ $name2 = $sectionmap[$section2]; ++ $newres = array(); ++ ++ // foreach rig in section1 ++ foreach ($results[$name1] as $rig => $result) ++ { ++ // foreach answer section in the rig api call ++ foreach ($result as $name1b => $fields1b) ++ { ++ if ($name1b == 'STATUS') ++ { ++ // copy the STATUS from section1 ++ $newres[$rig][$name1b] = $result[$name1b]; ++ continue; ++ } ++ ++ // foreach answer section in the rig api call (for the other api command) ++ foreach ($results[$name2][$rig] as $name2b => $fields2b) ++ { ++ if ($name2b == 'STATUS') ++ continue; ++ ++ $subsection = $section1.'+'.$section2; ++ $subsection .= preg_replace('/[^0-9]/', '', $name1b.$name2b); ++ ++ foreach ($fields1b as $nam => $val) ++ $newres[$rig][$subsection]["$section1.$nam"] = $val; ++ foreach ($fields2b as $nam => $val) ++ $newres[$rig][$subsection]["$section2.$nam"] = $val; ++ } ++ } ++ } ++ return $newres; ++} ++# ++function joinsections($sections, $results, $errors) ++{ ++ global $sectionmap; ++ ++#echo "results['pools']=".print_r($results['pools'],true)."
"; ++ ++ // GPU's don't have Name,ID fields - so create them ++ foreach ($results as $section => $res) ++ foreach ($res as $rig => $result) ++ foreach ($result as $name => $fields) ++ { ++ $subname = preg_replace('/[0-9]/', '', $name); ++ if ($subname == 'GPU' and isset($result[$name]['GPU'])) ++ { ++ $results[$section][$rig][$name]['Name'] = 'GPU'; ++ $results[$section][$rig][$name]['ID'] = $result[$name]['GPU']; ++ } ++ } ++ ++ foreach ($sections as $section => $fields) ++ if ($section != 'DATE' && !isset($sectionmap[$section])) ++ { ++ $both = explode('+', $section, 2); ++ if (count($both) > 1) ++ { ++ switch($both[0]) ++ { ++ case 'SUMMARY': ++ switch($both[1]) ++ { ++ case 'POOL': ++ case 'DEVS': ++ case 'CONFIG': ++ $sectionmap[$section] = $section; ++ $results[$section] = joinall($both[0], $both[1], $results); ++ break; ++ } ++ break; ++ case 'DEVS': ++ $join = array('Name', 'ID'); ++ switch($both[1]) ++ { ++ case 'NOTIFY': ++ case 'DEVDETAILS': ++ $sectionmap[$section] = $section; ++ $results[$section] = joinfields($both[0], $both[1], $join, $results); ++ break; ++ } ++ break; ++ default: ++ $errors[] = "Error: Invalid section '$section'"; ++ break; ++ } ++ } ++ else ++ $errors[] = "Error: Invalid section '$section'"; ++ } ++ ++ return array($results, $errors); ++} ++# + function secmatch($section, $field) + { + if ($section == $field) +@@ -1335,7 +1503,14 @@ + $value = null; + } + +- list($showvalue, $class) = fmt($secname, $name, $value, $when, $row); ++ if (strpos($secname, '+') === false) ++ list($showvalue, $class) = fmt($secname, $name, $value, $when, $row); ++ else ++ { ++ $parts = explode('.', $name, 2); ++ list($showvalue, $class) = fmt($parts[0], $parts[1], $value, $when, $row); ++ } ++ + echo "$showvalue"; + } + endrow(); +@@ -1356,15 +1531,19 @@ + $errors = array(); + foreach ($sections as $section => $fields) + { +- if (isset($sectionmap[$section])) ++ $all = explode('+', $section); ++ foreach ($all as $section) + { +- $cmd = 
$sectionmap[$section]; +- if (!isset($cmds[$cmd])) +- $cmds[$cmd] = 1; ++ if (isset($sectionmap[$section])) ++ { ++ $cmd = $sectionmap[$section]; ++ if (!isset($cmds[$cmd])) ++ $cmds[$cmd] = 1; ++ } ++ else ++ if ($section != 'DATE') ++ $errors[] = "Error: unknown section '$section' in custom summary page '$pagename'"; + } +- else +- if ($section != 'DATE') +- $errors[] = "Error: unknown section '$section' in custom summary page '$pagename'"; + } + + $results = array(); +@@ -1399,6 +1578,7 @@ + $shownsomething = false; + if (count($results) > 0) + { ++ list($results, $errors) = joinsections($sections, $results, $errors); + $first = true; + foreach ($sections as $section => $fields) + { From b66e0241f70d0053f094205e1d1fedc4a811988a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 4 Aug 2012 09:52:18 +1000 Subject: [PATCH 141/178] Queue an extra request whenever staged work drops below mining thread count in hash_pop. --- cgminer.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cgminer.c b/cgminer.c index 557461da..6f667004 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3784,6 +3784,7 @@ out: static struct work *hash_pop(const struct timespec *abstime) { struct work *work = NULL; + bool queue = false; int rc = 0; mutex_lock(stgd_lock); @@ -3795,9 +3796,14 @@ static struct work *hash_pop(const struct timespec *abstime) HASH_DEL(staged_work, work); if (work->clone) --staged_extras; + if (HASH_COUNT(staged_work) < mining_threads) + queue = true; } mutex_unlock(stgd_lock); + if (queue) + queue_request(NULL, false); + return work; } From ede0dd6a5271b0206424950b37e61059291929e9 Mon Sep 17 00:00:00 2001 From: Kano Date: Sat, 4 Aug 2012 16:48:55 +1000 Subject: [PATCH 142/178] api.c in linux allow to open a closed socket in TIME_WAIT --- api.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/api.c b/api.c index 8e66e810..8dcb5058 100644 --- a/api.c +++ b/api.c @@ -3197,6 +3197,20 @@ void api(int api_thr_id) serv.sin_port = htons(port); +#ifndef 
WIN32 + // On linux with SO_REUSEADDR, bind will get the port if the previous + // socket is closed (even if it is still in TIME_WAIT) but fail if + // another program has it open - which is what we want + int optval = 1; + // If it doesn't work, we don't really care - just show a debug message + if (SOCKETFAIL(setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (void *)(&optval), sizeof(optval)))) + applog(LOG_DEBUG, "API setsockopt SO_REUSEADDR failed (ignored): %s", SOCKERRMSG); +#else + // On windows a 2nd program can bind to a port>1024 already in use unless + // SO_EXCLUSIVEADDRUSE is used - however then the bind to a closed port + // in TIME_WAIT will fail until the timeout - so we leave the options alone +#endif + // try for more than 1 minute ... in case the old one hasn't completely gone yet bound = 0; bindstart = time(NULL); From 56f06e1e9b3b58907371ad7c49c87653958d94f1 Mon Sep 17 00:00:00 2001 From: Kano Date: Sat, 4 Aug 2012 16:55:55 +1000 Subject: [PATCH 143/178] API remove unused warning in non-GPU compile --- api.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/api.c b/api.c index 8dcb5058..75676ecc 100644 --- a/api.c +++ b/api.c @@ -176,7 +176,9 @@ static const char *ALIVE = "Alive"; static const char *REJECTING = "Rejecting"; static const char *UNKNOWN = "Unknown"; #define _DYNAMIC "D" +#ifdef HAVE_OPENCL static const char *DYNAMIC = _DYNAMIC; +#endif static const char *YES = "Y"; static const char *NO = "N"; From bf9c3247527965a4fe6af11286f5b4aafba6a5b4 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 13:40:46 +1000 Subject: [PATCH 144/178] The queueing mechanism has become a complex state machine that is no longer predictable. Rewrite it from scratch watching only current queues in flight and staged work available on a pool by pool basis. 
--- cgminer.c | 214 ++++++++++++++++++++---------------------------------- miner.h | 2 + 2 files changed, 82 insertions(+), 134 deletions(-) diff --git a/cgminer.c b/cgminer.c index 6f667004..83cdeeb4 100644 --- a/cgminer.c +++ b/cgminer.c @@ -246,7 +246,6 @@ struct thread_q *getq; static int total_work; struct work *staged_work = NULL; -static int staged_extras; struct schedtime { bool enable; @@ -1335,7 +1334,7 @@ void decay_time(double *f, double fadd) *f = (fadd + *f * 0.58) / 1.58; } -static int requests_staged(void) +static int total_staged(void) { int ret; @@ -1345,6 +1344,16 @@ static int requests_staged(void) return ret; } +static int pool_staged(struct pool *pool) +{ + int ret; + + mutex_lock(stgd_lock); + ret = pool->staged; + mutex_unlock(stgd_lock); + return ret; +} + #ifdef HAVE_CURSES WINDOW *mainwin, *statuswin, *logwin; #endif @@ -1451,7 +1460,7 @@ static void curses_print_status(void) mvwprintw(statuswin, 2, 0, " %s", statusline); wclrtoeol(statuswin); mvwprintw(statuswin, 3, 0, " TQ: %d ST: %d SS: %d DW: %d NB: %d LW: %d GF: %d RF: %d", - total_queued, requests_staged(), total_stale, total_discarded, new_blocks, + total_queued, total_staged(), total_stale, total_discarded, new_blocks, local_work, total_go, total_ro); wclrtoeol(statuswin); if (pool_strategy == POOL_LOADBALANCE && total_pools > 1) @@ -2207,6 +2216,41 @@ static void push_curl_entry(struct curl_ent *ce, struct pool *pool) mutex_unlock(&pool->pool_lock); } +/* This is overkill, but at least we'll know accurately how much work is + * queued to prevent ever being left without work */ +static void inc_queued(struct pool *pool) +{ + if (unlikely(!pool)) + return; + + mutex_lock(&qd_lock); + pool->queued++; + total_queued++; + mutex_unlock(&qd_lock); +} + +static void dec_queued(struct pool *pool) +{ + if (unlikely(!pool)) + return; + + mutex_lock(&qd_lock); + pool->queued--; + total_queued--; + mutex_unlock(&qd_lock); +} + +static int current_queued(void) +{ + struct pool *pool = 
current_pool(); + int ret; + + mutex_lock(&qd_lock); + ret = pool->queued; + mutex_unlock(&qd_lock); + return ret; +} + /* ce and pool may appear uninitialised at push_curl_entry, but they're always * set when we don't have opt_benchmark enabled */ static void *get_work_thread(void *userdata) @@ -2230,6 +2274,7 @@ static void *get_work_thread(void *userdata) get_benchmark_work(ret_work); else { pool = ret_work->pool = select_pool(wc->lagging); + inc_queued(pool); ce = pop_curl_entry(pool); @@ -2249,6 +2294,8 @@ static void *get_work_thread(void *userdata) fail_pause += opt_fail_pause; } fail_pause = opt_fail_pause; + + dec_queued(pool); } applog(LOG_DEBUG, "Pushing work to requesting thread"); @@ -2503,11 +2550,6 @@ void switch_pools(struct pool *selected) if (pool != last_pool) applog(LOG_WARNING, "Switching to %s", pool->rpc_url); - /* Reset the queued amount to allow more to be queued for the new pool */ - mutex_lock(&qd_lock); - total_queued = 0; - mutex_unlock(&qd_lock); - mutex_lock(&lp_lock); pthread_cond_broadcast(&lp_cond); mutex_unlock(&lp_lock); @@ -2525,58 +2567,16 @@ static void discard_work(struct work *work) free_work(work); } -/* This is overkill, but at least we'll know accurately how much work is - * queued to prevent ever being left without work */ -static void inc_queued(void) -{ - mutex_lock(&qd_lock); - total_queued++; - mutex_unlock(&qd_lock); -} - -static void dec_queued(struct work *work) -{ - if (work->clone) - return; - - mutex_lock(&qd_lock); - if (total_queued > 0) - total_queued--; - mutex_unlock(&qd_lock); -} - -static int requests_queued(void) -{ - int ret; - - mutex_lock(&qd_lock); - ret = total_queued; - mutex_unlock(&qd_lock); - return ret; -} - -static void subtract_queued(int work_units) -{ - mutex_lock(&qd_lock); - total_queued -= work_units; - if (total_queued < 0) - total_queued = 0; - mutex_unlock(&qd_lock); -} - static void discard_stale(void) { struct work *work, *tmp; - int stale = 0, nonclone = 0; + int stale = 0; 
mutex_lock(stgd_lock); HASH_ITER(hh, staged_work, work, tmp) { if (stale_work(work, false)) { HASH_DEL(staged_work, work); - if (work->clone) - --staged_extras; - else - nonclone++; + work->pool->staged--; discard_work(work); stale++; } @@ -2584,9 +2584,6 @@ static void discard_stale(void) mutex_unlock(stgd_lock); applog(LOG_DEBUG, "Discarded %d stales that didn't match current hash", stale); - - /* Dec queued outside the loop to not have recursive locks */ - subtract_queued(nonclone); } bool queue_request(struct thr_info *thr, bool needed); @@ -2749,9 +2746,8 @@ static bool hash_push(struct work *work) mutex_lock(stgd_lock); if (likely(!getq->frozen)) { HASH_ADD_INT(staged_work, id, work); + work->pool->staged++; HASH_SORT(staged_work, tv_sort); - if (work->clone) - ++staged_extras; } else rc = false; pthread_cond_signal(&getq->cond); @@ -3611,7 +3607,6 @@ static bool pool_active(struct pool *pool, bool pinging) tq_push(thr_info[stage_thr_id].q, work); total_getworks++; pool->getwork_requested++; - inc_queued(); ret = true; gettimeofday(&pool->tv_idle, NULL); } else { @@ -3692,93 +3687,48 @@ static void pool_resus(struct pool *pool) switch_pools(NULL); } -static time_t requested_tv_sec; - -static bool control_tset(bool *var) -{ - bool ret; - - mutex_lock(&control_lock); - ret = *var; - *var = true; - mutex_unlock(&control_lock); - - return ret; -} - -static void control_tclear(bool *var) -{ - mutex_lock(&control_lock); - *var = false; - mutex_unlock(&control_lock); -} - -static bool queueing; - bool queue_request(struct thr_info *thr, bool needed) { + int cq, ts, maxq = opt_queue + mining_threads; struct workio_cmd *wc; - struct timeval now; - time_t scan_post; - int rq, rs; bool ret = true; - /* Prevent multiple requests being executed at once */ - if (control_tset(&queueing)) - return ret; - - rq = requests_queued(); - rs = requests_staged(); - - /* Grab more work every 2/3 of the scan time to avoid all work expiring - * at the same time */ - scan_post = 
opt_scantime * 2 / 3; - if (scan_post < 5) - scan_post = 5; - - gettimeofday(&now, NULL); + cq = current_queued(); + ts = total_staged(); /* Test to make sure we have enough work for pools without rolltime * and enough original work for pools with rolltime */ - if ((rq >= mining_threads || rs >= mining_threads) && - rq > staged_extras + opt_queue && - now.tv_sec - requested_tv_sec < scan_post) - goto out; - - requested_tv_sec = now.tv_sec; - - inc_queued(); + if ((cq >= opt_queue && ts >= maxq) || cq >= maxq) { + /* If we're queueing work faster than we can stage it, consider the + * system lagging and allow work to be gathered from another pool if + * possible */ + if (needed & !ts) { + wc->lagging = true; + applog(LOG_DEBUG, "Pool lagging"); + } + return true; + } /* fill out work request message */ wc = calloc(1, sizeof(*wc)); if (unlikely(!wc)) { applog(LOG_ERR, "Failed to calloc wc in queue_request"); - ret = false; - goto out; + return false; } wc->cmd = WC_GET_WORK; wc->thr = thr; - /* If we're queueing work faster than we can stage it, consider the - * system lagging and allow work to be gathered from another pool if - * possible */ - if (rq && needed && !rs && !opt_fail_only) - wc->lagging = true; - applog(LOG_DEBUG, "Queueing getwork request to work thread"); /* send work request to workio thread */ if (unlikely(!tq_push(thr_info[work_thr_id].q, wc))) { applog(LOG_ERR, "Failed to tq_push in queue_request"); workio_cmd_free(wc); - ret = false; + return false; } -out: - control_tclear(&queueing); - - return ret; + return true; } static struct work *hash_pop(const struct timespec *abstime) @@ -3794,8 +3744,6 @@ static struct work *hash_pop(const struct timespec *abstime) if (HASH_COUNT(staged_work)) { work = staged_work; HASH_DEL(staged_work, work); - if (work->clone) - --staged_extras; if (HASH_COUNT(staged_work) < mining_threads) queue = true; } @@ -3870,7 +3818,7 @@ static struct work *make_clone(struct work *work) * the future */ static struct work 
*clone_work(struct work *work) { - int mrs = mining_threads - requests_staged(); + int mrs = mining_threads + opt_queue - total_staged(); struct work *work_clone; bool cloned; @@ -3910,8 +3858,8 @@ static bool get_work(struct work *work, bool requested, struct thr_info *thr, struct timespec abstime = {0, 0}; struct timeval now; struct work *work_heap; + int failures = 0, cq; struct pool *pool; - int failures = 0; /* Tell the watchdog thread this thread is waiting on getwork and * should not be restarted */ @@ -3922,9 +3870,11 @@ static bool get_work(struct work *work, bool requested, struct thr_info *thr, thread_reportin(thr); return true; } + + cq = current_queued(); retry: pool = current_pool(); - if (!requested || requests_queued() < opt_queue) { + if (!requested || cq < opt_queue) { if (unlikely(!queue_request(thr, true))) { applog(LOG_WARNING, "Failed to queue_request in get_work"); goto out; @@ -3937,7 +3887,7 @@ retry: goto out; } - if (!pool->lagging && requested && !newreq && !requests_staged() && requests_queued() >= mining_threads) { + if (!pool->lagging && requested && !newreq && !pool_staged(pool) && cq >= mining_threads + opt_queue) { struct cgpu_info *cgpu = thr->cgpu; bool stalled = true; int i; @@ -3974,7 +3924,6 @@ retry: } if (stale_work(work_heap, false)) { - dec_queued(work_heap); discard_work(work_heap); goto retry; } @@ -3989,7 +3938,6 @@ retry: work_heap = clone_work(work_heap); memcpy(work, work_heap, sizeof(struct work)); - dec_queued(work_heap); free_work(work_heap); ret = true; @@ -4368,10 +4316,8 @@ static void convert_to_work(json_t *val, int rolltime, struct pool *pool) if (unlikely(!stage_work(work))) free_work(work); - else { - inc_queued(); + else applog(LOG_DEBUG, "Converted longpoll data to work"); - } } /* If we want longpoll, enable it for the chosen default pool, or, if @@ -4568,12 +4514,12 @@ static void *watchpool_thread(void __maybe_unused *userdata) } /* Work is sorted according to age, so discard the oldest work items, 
leaving - * only 1 staged work item per mining thread */ + * only 1/3 more staged work item than mining threads */ static void age_work(void) { int discarded = 0; - while (requests_staged() > mining_threads * 4 / 3 + opt_queue) { + while (total_staged() > mining_threads * 4 / 3 + opt_queue) { struct work *work = hash_pop(NULL); if (unlikely(!work)) @@ -4609,8 +4555,8 @@ static void *watchdog_thread(void __maybe_unused *userdata) struct timeval now; sleep(interval); - if (requests_queued() < opt_queue) - queue_request(NULL, false); + + queue_request(NULL, false); age_work(); diff --git a/miner.h b/miner.h index db272da0..2f51e3a6 100644 --- a/miner.h +++ b/miner.h @@ -718,6 +718,8 @@ struct pool { int accepted, rejected; int seq_rejects; int solved; + int queued; + int staged; bool submit_fail; bool idle; From 9a45a6d9932e8b8354d3ac255e8c474c37015c8f Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 14:19:49 +1000 Subject: [PATCH 145/178] Check the current staged and global queued as well before queueing requests. Discard stales before ageing work in the watchdog thread. Queue requests after discarding and ageing work in watchdog thread. Display accurate global queued in curses output. Reuse variable in age_work(). 
--- cgminer.c | 54 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/cgminer.c b/cgminer.c index 83cdeeb4..b1ba16ed 100644 --- a/cgminer.c +++ b/cgminer.c @@ -1354,6 +1354,13 @@ static int pool_staged(struct pool *pool) return ret; } +static int current_staged(void) +{ + struct pool *pool = current_pool(); + + return pool_staged(pool); +} + #ifdef HAVE_CURSES WINDOW *mainwin, *statuswin, *logwin; #endif @@ -1443,6 +1450,8 @@ static void text_print_status(int thr_id) } } +static int global_queued(void); + #ifdef HAVE_CURSES /* Must be called with curses mutex lock held and curses_active */ static void curses_print_status(void) @@ -1460,7 +1469,7 @@ static void curses_print_status(void) mvwprintw(statuswin, 2, 0, " %s", statusline); wclrtoeol(statuswin); mvwprintw(statuswin, 3, 0, " TQ: %d ST: %d SS: %d DW: %d NB: %d LW: %d GF: %d RF: %d", - total_queued, total_staged(), total_stale, total_discarded, new_blocks, + global_queued(), total_staged(), total_stale, total_discarded, new_blocks, local_work, total_go, total_ro); wclrtoeol(statuswin); if (pool_strategy == POOL_LOADBALANCE && total_pools > 1) @@ -2251,6 +2260,16 @@ static int current_queued(void) return ret; } +static int global_queued(void) +{ + int ret; + + mutex_lock(&qd_lock); + ret = total_queued; + mutex_unlock(&qd_lock); + return ret; +} + /* ce and pool may appear uninitialised at push_curl_entry, but they're always * set when we don't have opt_benchmark enabled */ static void *get_work_thread(void *userdata) @@ -2567,6 +2586,8 @@ static void discard_work(struct work *work) free_work(work); } +bool queue_request(struct thr_info *thr, bool needed); + static void discard_stale(void) { struct work *work, *tmp; @@ -2583,11 +2604,12 @@ static void discard_stale(void) } mutex_unlock(stgd_lock); - applog(LOG_DEBUG, "Discarded %d stales that didn't match current hash", stale); + if (stale) { + applog(LOG_DEBUG, "Discarded %d stales that didn't 
match current hash", stale); + queue_request(NULL, false); + } } -bool queue_request(struct thr_info *thr, bool needed); - /* A generic wait function for threads that poll that will wait a specified * time tdiff waiting on the pthread conditional that is broadcast when a * work restart is required. Returns the value of pthread_cond_timedwait @@ -3689,25 +3711,20 @@ static void pool_resus(struct pool *pool) bool queue_request(struct thr_info *thr, bool needed) { - int cq, ts, maxq = opt_queue + mining_threads; + int cq, cs, ts, tq, maxq = opt_queue + mining_threads; struct workio_cmd *wc; bool ret = true; cq = current_queued(); + cs = current_staged(); ts = total_staged(); + tq = global_queued(); /* Test to make sure we have enough work for pools without rolltime * and enough original work for pools with rolltime */ - if ((cq >= opt_queue && ts >= maxq) || cq >= maxq) { - /* If we're queueing work faster than we can stage it, consider the - * system lagging and allow work to be gathered from another pool if - * possible */ - if (needed & !ts) { - wc->lagging = true; - applog(LOG_DEBUG, "Pool lagging"); - } + if (((cs || cq >= opt_queue) && ts >= maxq) || + ((cs || cq) && tq >= maxq)) return true; - } /* fill out work request message */ wc = calloc(1, sizeof(*wc)); @@ -3744,6 +3761,7 @@ static struct work *hash_pop(const struct timespec *abstime) if (HASH_COUNT(staged_work)) { work = staged_work; HASH_DEL(staged_work, work); + work->pool->staged--; if (HASH_COUNT(staged_work) < mining_threads) queue = true; } @@ -4517,9 +4535,9 @@ static void *watchpool_thread(void __maybe_unused *userdata) * only 1/3 more staged work item than mining threads */ static void age_work(void) { - int discarded = 0; + int discarded = 0, maxq = (mining_threads + opt_queue) * 4 / 3; - while (total_staged() > mining_threads * 4 / 3 + opt_queue) { + while (total_staged() > maxq) { struct work *work = hash_pop(NULL); if (unlikely(!work)) @@ -4556,10 +4574,12 @@ static void 
*watchdog_thread(void __maybe_unused *userdata) sleep(interval); - queue_request(NULL, false); + discard_stale(); age_work(); + queue_request(NULL, false); + hashmeter(-1, &zero_tv, 0); #ifdef HAVE_CURSES From e4326e3ca277a9be0300a7e69034fab4eb432d74 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 14:24:31 +1000 Subject: [PATCH 146/178] Fix harmless warnings. --- cgminer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index b1ba16ed..52fcaf64 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3713,7 +3713,6 @@ bool queue_request(struct thr_info *thr, bool needed) { int cq, cs, ts, tq, maxq = opt_queue + mining_threads; struct workio_cmd *wc; - bool ret = true; cq = current_queued(); cs = current_staged(); @@ -3762,7 +3761,7 @@ static struct work *hash_pop(const struct timespec *abstime) work = staged_work; HASH_DEL(staged_work, work); work->pool->staged--; - if (HASH_COUNT(staged_work) < mining_threads) + if (HASH_COUNT(staged_work) < (unsigned int)mining_threads) queue = true; } mutex_unlock(stgd_lock); From 499c594cba8d0233d7fada577749e7ccb7ad55d5 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 14:29:21 +1000 Subject: [PATCH 147/178] There is no need for pool active testing to be mandatory any more with queue request changes. 
--- cgminer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/cgminer.c b/cgminer.c index 52fcaf64..e824dee2 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3617,7 +3617,6 @@ static bool pool_active(struct pool *pool, bool pinging) struct work *work = make_work(); bool rc; - work->mandatory = true; rc = work_decode(json_object_get(val, "result"), work); if (rc) { applog(LOG_DEBUG, "Successfully retrieved and deciphered work from pool %u %s", From 1a041668a59beceadc980a28d939db974a973dae Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 14:37:49 +1000 Subject: [PATCH 148/178] Reinstate check for system queueing lag when the current pool's queue is maxed out, there is no staged work, and the work is needed now. --- cgminer.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/cgminer.c b/cgminer.c index e824dee2..3a9bd229 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3712,17 +3712,25 @@ bool queue_request(struct thr_info *thr, bool needed) { int cq, cs, ts, tq, maxq = opt_queue + mining_threads; struct workio_cmd *wc; + bool lag = false; cq = current_queued(); cs = current_staged(); ts = total_staged(); tq = global_queued(); - /* Test to make sure we have enough work for pools without rolltime - * and enough original work for pools with rolltime */ - if (((cs || cq >= opt_queue) && ts >= maxq) || - ((cs || cq) && tq >= maxq)) - return true; + if (needed && cq >= maxq && !ts && !opt_fail_only) { + /* If we're queueing work faster than we can stage it, consider + * the system lagging and allow work to be gathered from + * another pool if possible */ + lag = true; + } else { + /* Test to make sure we have enough work for pools without rolltime + * and enough original work for pools with rolltime */ + if (((cs || cq >= opt_queue) && ts >= maxq) || + ((cs || cq) && tq >= maxq)) + return true; + } /* fill out work request message */ wc = calloc(1, sizeof(*wc)); @@ -3733,6 +3741,7 @@ bool queue_request(struct thr_info *thr, bool 
needed) wc->cmd = WC_GET_WORK; wc->thr = thr; + wc->lagging = lag; applog(LOG_DEBUG, "Queueing getwork request to work thread"); From 4efa31d2e7297df944af82e3e720346f46eeb934 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 14:44:11 +1000 Subject: [PATCH 149/178] Display failover only mode in pool menu and allow it to be toggled live. --- cgminer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cgminer.c b/cgminer.c index 3a9bd229..5187aa9a 100644 --- a/cgminer.c +++ b/cgminer.c @@ -3090,6 +3090,7 @@ retry: strategies[pool_strategy]); if (pool_strategy == POOL_ROTATE) wlogprint("Set to rotate every %d minutes\n", opt_rotate_period); + wlogprint("[F]ailover only %s\n", opt_fail_only ? "enabled" : "disabled"); wlogprint("[A]dd pool [R]emove pool [D]isable pool [E]nable pool\n"); wlogprint("[C]hange management strategy [S]witch pool [I]nformation\n"); wlogprint("Or press any other key to continue\n"); @@ -3183,6 +3184,9 @@ retry: pool = pools[selected]; display_pool_summary(pool); goto retry; + } else if (!strncasecmp(&input, "f", 1)) { + opt_fail_only ^= true; + goto updated; } else clear_logwin(); From 7611499b111c10cf5ee83eeaf35cdb8918f3a03c Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 14:57:46 +1000 Subject: [PATCH 150/178] Queue a request on pool switch in case we have no work from the new pool yet. 
--- cgminer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cgminer.c b/cgminer.c index 5187aa9a..dd4b9a9d 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2572,6 +2572,8 @@ void switch_pools(struct pool *selected) mutex_lock(&lp_lock); pthread_cond_broadcast(&lp_cond); mutex_unlock(&lp_lock); + + queue_request(NULL, false); } static void discard_work(struct work *work) From fc44b6d7a1eebf07da08463003dd0eba4ce7a564 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 15:32:44 +1000 Subject: [PATCH 151/178] Use different variables for command line specified lookup gap and thread concurrency to differentiate user defined versus auto chosen values. --- driver-opencl.c | 12 ++++++------ miner.h | 4 ++-- ocl.c | 11 +++++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/driver-opencl.c b/driver-opencl.c index 2027a8f6..b6dfe12e 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -163,16 +163,16 @@ char *set_lookup_gap(char *arg) return "Invalid parameters for set lookup gap"; val = atoi(nextptr); - gpus[device++].lookup_gap = val; + gpus[device++].opt_lg = val; while ((nextptr = strtok(NULL, ",")) != NULL) { val = atoi(nextptr); - gpus[device++].lookup_gap = val; + gpus[device++].opt_lg = val; } if (device == 1) { for (i = device; i < MAX_GPUDEVICES; i++) - gpus[i].lookup_gap = gpus[0].lookup_gap; + gpus[i].opt_lg = gpus[0].opt_lg; } return NULL; @@ -188,16 +188,16 @@ char *set_thread_concurrency(char *arg) return "Invalid parameters for set thread concurrency"; val = atoi(nextptr); - gpus[device++].thread_concurrency = val; + gpus[device++].opt_tc = val; while ((nextptr = strtok(NULL, ",")) != NULL) { val = atoi(nextptr); - gpus[device++].thread_concurrency = val; + gpus[device++].opt_tc = val; } if (device == 1) { for (i = device; i < MAX_GPUDEVICES; i++) - gpus[i].thread_concurrency = gpus[0].thread_concurrency; + gpus[i].opt_tc = gpus[0].opt_tc; } return NULL; diff --git a/miner.h b/miner.h index 2f51e3a6..04b765c0 100644 --- 
a/miner.h +++ b/miner.h @@ -362,8 +362,8 @@ struct cgpu_info { cl_ulong max_alloc; #ifdef USE_SCRYPT - int lookup_gap; - int thread_concurrency; + int opt_lg, lookup_gap; + int opt_tc, thread_concurrency; int shaders; #endif struct timeval tv_gpustart;; diff --git a/ocl.c b/ocl.c index e71b9cc8..9e9ef022 100644 --- a/ocl.c +++ b/ocl.c @@ -476,11 +476,13 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) cl_ulong ma = cgpu->max_alloc, mt; int pow2 = 0; - if (!cgpu->lookup_gap) { + if (!cgpu->opt_lg) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); cgpu->lookup_gap = 2; - } - if (!cgpu->thread_concurrency) { + } else + cgpu->lookup_gap = cgpu->opt_lg; + + if (!cgpu->opt_tc) { cgpu->thread_concurrency = ma / 32768 / cgpu->lookup_gap; if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; @@ -489,7 +491,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) } applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, cgpu->thread_concurrency); - } + } else + cgpu->thread_concurrency = cgpu->opt_tc; /* If we have memory to spare, try to find a power of 2 value * >= required amount to map nicely to an intensity */ From 25fd6cd0fd572ebef5e5918e85a868841db3cbfa Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 15:35:37 +1000 Subject: [PATCH 152/178] Correct writing of scrypt parameters to config file based on command line parameters only. --- cgminer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index 05040275..c78e46f0 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2948,11 +2948,11 @@ void write_config(FILE *fcfg) fputs("\",\n\"lookup-gap\" : \"", fcfg); for(i = 0; i < nDevs; i++) fprintf(fcfg, "%s%d", i > 0 ? "," : "", - (int)gpus[i].lookup_gap); + (int)gpus[i].opt_lg); fputs("\",\n\"thread-concurrency\" : \"", fcfg); for(i = 0; i < nDevs; i++) fprintf(fcfg, "%s%d", i > 0 ? 
"," : "", - (int)gpus[i].thread_concurrency); + (int)gpus[i].opt_tc); fputs("\",\n\"shaders\" : \"", fcfg); for(i = 0; i < nDevs; i++) fprintf(fcfg, "%s%d", i > 0 ? "," : "", From 3576abf8a72fd1532dbe64994627390e6ac4634f Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 15:43:27 +1000 Subject: [PATCH 153/178] Make pool_disabled the first in the enums == 0, fixing the pool enabled count which compares if value is not enabled before enabling it. --- miner.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/miner.h b/miner.h index 04b765c0..5afa071b 100644 --- a/miner.h +++ b/miner.h @@ -706,9 +706,11 @@ struct curl_ent { struct timeval tv; }; +/* Disabled needs to be the lowest enum as a freshly calloced value will then + * equal disabled */ enum pool_enable { - POOL_ENABLED, POOL_DISABLED, + POOL_ENABLED, POOL_REJECTING, }; From 31b01c8ac516275bd5990a2931cf9689bb3e3b83 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 16:13:53 +1000 Subject: [PATCH 154/178] Author: Luke Dashjr Date: Thu Jul 12 16:49:26 2012 +0000 Use FTD2XX.DLL on Windows to autodetect BitFORCE SHA256 devices --- driver-bitforce.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/driver-bitforce.c b/driver-bitforce.c index 1cb958b1..c9926deb 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -17,6 +18,28 @@ #include "config.h" +#ifdef WIN32 + +#include + +#define dlsym (void*)GetProcAddress +#define dlclose FreeLibrary + +typedef unsigned long FT_STATUS; +typedef PVOID FT_HANDLE; +__stdcall FT_STATUS (*FT_ListDevices)(PVOID pArg1, PVOID pArg2, DWORD Flags); +__stdcall FT_STATUS (*FT_Open)(int idx, FT_HANDLE*); +__stdcall FT_STATUS (*FT_GetComPortNumber)(FT_HANDLE, LPLONG lplComPortNumber); +__stdcall FT_STATUS (*FT_Close)(FT_HANDLE); +const uint32_t FT_OPEN_BY_DESCRIPTION = 2; +const uint32_t FT_LIST_ALL = 0x20000000; 
+const uint32_t FT_LIST_NUMBER_ONLY = 0x80000000; +enum { + FT_OK, +}; + +#endif /* WIN32 */ + #include "compat.h" #include "fpgautils.h" #include "miner.h" @@ -114,10 +137,89 @@ static bool bitforce_detect_one(const char *devpath) return add_cgpu(bitforce); } +#define LOAD_SYM(sym) do { \ + if (!(sym = dlsym(dll, #sym))) { \ + applog(LOG_DEBUG, "Failed to load " #sym ", not using FTDI bitforce autodetect"); \ + goto nogood; \ + } \ +} while(0) + +static char bitforce_autodetect_ftdi() +{ +#ifdef WIN32 + FT_STATUS ftStatus; + DWORD numDevs; + HMODULE dll = LoadLibrary("FTD2XX.DLL"); + if (!dll) + { + applog(LOG_DEBUG, "FTD2XX.DLL failed to load, not using FTDI bitforce autodetect"); + return 0; + } + LOAD_SYM(FT_ListDevices); + LOAD_SYM(FT_Open); + LOAD_SYM(FT_GetComPortNumber); + LOAD_SYM(FT_Close); + + ftStatus = FT_ListDevices(&numDevs, NULL, FT_LIST_NUMBER_ONLY); + if (ftStatus != FT_OK) + { + applog(LOG_DEBUG, "FTDI device count failed, not using FTDI bitforce autodetect"); +nogood: + dlclose(dll); + return 0; + } + applog(LOG_DEBUG, "FTDI reports %u devices", (unsigned)numDevs); + + char buf[65 * numDevs]; + char*bufptrs[numDevs + 1]; + int i; + for (i = 0; i < numDevs; ++i) + bufptrs[i] = &buf[i * 65]; + bufptrs[numDevs] = NULL; + ftStatus = FT_ListDevices(bufptrs, &numDevs, FT_LIST_ALL | FT_OPEN_BY_DESCRIPTION); + if (ftStatus != FT_OK) + { + applog(LOG_DEBUG, "FTDI device list failed, not using FTDI bitforce autodetect"); + goto nogood; + } + + char devpath[] = "\\\\.\\COMnnnnn"; + char *devpathnum = &devpath[7]; + char found = 0; + for (i = numDevs; i > 0; ) + { + --i; + bufptrs[i][64] = '\0'; + + if (!(strstr(bufptrs[i], "BitFORCE") && strstr(bufptrs[i], "SHA256"))) + continue; + + FT_HANDLE ftHandle; + if (FT_OK != FT_Open(i, &ftHandle)) + continue; + LONG lComPortNumber; + ftStatus = FT_GetComPortNumber(ftHandle, &lComPortNumber); + FT_Close(ftHandle); + if (FT_OK != ftStatus || lComPortNumber < 0) + continue; + + sprintf(devpathnum, "%d", 
(int)lComPortNumber); + + if (bitforce_detect_one(devpath)) + ++found; + } + dlclose(dll); + return found; +#else /* NOT WIN32 */ + return 0; +#endif +} + static char bitforce_detect_auto() { return (serial_autodetect_udev (bitforce_detect_one, "BitFORCE*SHA256") ?: serial_autodetect_devserial(bitforce_detect_one, "BitFORCE_SHA256") ?: + bitforce_autodetect_ftdi() ?: 0); } From 0d6763462bca7b7012393d2c5d6a1eff5631f66a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 16:15:32 +1000 Subject: [PATCH 155/178] Style cleanups. --- driver-bitforce.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index c9926deb..66a4095e 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -150,8 +150,7 @@ static char bitforce_autodetect_ftdi() FT_STATUS ftStatus; DWORD numDevs; HMODULE dll = LoadLibrary("FTD2XX.DLL"); - if (!dll) - { + if (!dll) { applog(LOG_DEBUG, "FTD2XX.DLL failed to load, not using FTDI bitforce autodetect"); return 0; } @@ -161,8 +160,7 @@ static char bitforce_autodetect_ftdi() LOAD_SYM(FT_Close); ftStatus = FT_ListDevices(&numDevs, NULL, FT_LIST_NUMBER_ONLY); - if (ftStatus != FT_OK) - { + if (ftStatus != FT_OK) { applog(LOG_DEBUG, "FTDI device count failed, not using FTDI bitforce autodetect"); nogood: dlclose(dll); @@ -177,8 +175,7 @@ nogood: bufptrs[i] = &buf[i * 65]; bufptrs[numDevs] = NULL; ftStatus = FT_ListDevices(bufptrs, &numDevs, FT_LIST_ALL | FT_OPEN_BY_DESCRIPTION); - if (ftStatus != FT_OK) - { + if (ftStatus != FT_OK) { applog(LOG_DEBUG, "FTDI device list failed, not using FTDI bitforce autodetect"); goto nogood; } @@ -186,8 +183,7 @@ nogood: char devpath[] = "\\\\.\\COMnnnnn"; char *devpathnum = &devpath[7]; char found = 0; - for (i = numDevs; i > 0; ) - { + for (i = numDevs; i > 0; ) { --i; bufptrs[i][64] = '\0'; From d3e4ec8754f6a8cd857baaff6294f4ff40820b17 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 17:35:08 +1000 Subject: [PATCH 156/178] 
Update news. --- NEWS | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/NEWS b/NEWS index fcf2f230..b5e531f6 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,34 @@ +Version 2.6.3 - August 5, 2012 + +- Style cleanups. +- Use FTD2XX.DLL on Windows to autodetect BitFORCE SHA256 devices. +- Make pool_disabled the first in the enums == 0, fixing the pool enabled count +which compares if value is not enabled before enabling it. +- Correct writing of scrypt parameters to config file based on command line +parameters only. +- Use different variables for command line specified lookup gap and thread +concurrency to differentiate user defined versus auto chosen values. +- Queue a request on pool switch in case we have no work from the new pool yet. +- Display failover only mode in pool menu and allow it to be toggled live. +- Reinstate check for system queueing lag when the current pool's queue is maxed +out, there is no staged work, and the work is needed now. +- There is no need for pool active testing to be mandatory any more with queue +request changes. +- Fix harmless warnings. +- Check the current staged and global queued as well before queueing requests. +Discard stales before ageing work in the watchdog thread. Queue requests after +discarding and ageing work in watchdog thread. Display accurate global queued in +curses output. Reuse variable in age_work(). +- The queueing mechanism has become a complex state machine that is no longer +predictable. Rewrite it from scratch watching only current queues in flight and +staged work available on a pool by pool basis. +- API remove unused warning in non-GPU compile +- api.c in linux allow to open a closed socket in TIME_WAIT +- Queue an extra request whenever staged work drops below mining thread count in +hash_pop. +- Update debian package configs to v2.6.2 + + Version 2.6.2 - August 3, 2012 - Scrypt mining does not support block testing yet so don't try to print it. 
From 4443895f7787edfbacedd0ee912f29a6a584d992 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 17:51:00 +1000 Subject: [PATCH 157/178] Count likely throttling episodes on bitforce devices as hardware errors. --- driver-bitforce.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/driver-bitforce.c b/driver-bitforce.c index 66a4095e..943a7d30 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -373,6 +373,8 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) * our responses are out of sync and flush the buffer to * hopefully recover */ applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer"); + /* Count throttling episodes as hardware errors */ + bitforce->hw_errors++; bitforce_clear_buffer(bitforce); return false;; } @@ -411,6 +413,7 @@ re_send: goto re_send; } applog(LOG_ERR, "BFL%i: Error: Send work reports: %s", bitforce->device_id, pdevbuf); + bitforce->hw_errors++; bitforce_clear_buffer(bitforce); return false; } @@ -452,6 +455,7 @@ re_send: if (unlikely(strncasecmp(pdevbuf, "OK", 2))) { applog(LOG_ERR, "BFL%i: Error: Send block data reports: %s", bitforce->device_id, pdevbuf); + bitforce->hw_errors++; bitforce_clear_buffer(bitforce); return false; } @@ -540,6 +544,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work) else if (!strncasecmp(pdevbuf, "I", 1)) return 0; /* Device idle */ else if (strncasecmp(pdevbuf, "NONCE-FOUND", 11)) { + bitforce->hw_errors++; applog(LOG_WARNING, "BFL%i: Error: Get result reports: %s", bitforce->device_id, pdevbuf); bitforce_clear_buffer(bitforce); return 0; @@ -629,6 +634,7 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_ bitforce->device_last_not_well = time(NULL); bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR; bitforce->dev_comms_error_count++; + bitforce->hw_errors++; /* empty read buffer */ bitforce_clear_buffer(bitforce); } From a1b5bd7ad8ea385253a7e5054aff1389e9260f96 Mon Sep 17 
00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 17:55:11 +1000 Subject: [PATCH 158/178] More NEWS. --- NEWS | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS b/NEWS index b5e531f6..d34fc536 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,6 @@ Version 2.6.3 - August 5, 2012 +- Count likely throttling episodes on bitforce devices as hardware errors. - Style cleanups. - Use FTD2XX.DLL on Windows to autodetect BitFORCE SHA256 devices. - Make pool_disabled the first in the enums == 0, fixing the pool enabled count From f6518c06516c6082e719a233c6e2d9c64135250a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sun, 5 Aug 2012 17:42:49 +1000 Subject: [PATCH 159/178] Bump version to 2.6.3 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 2f03c981..87b9faae 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [2]) m4_define([v_min], [6]) -m4_define([v_mic], [2]) +m4_define([v_mic], [3]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_ver], [v_maj.v_min.v_mic]) m4_define([lt_rev], m4_eval(v_maj + v_min)) From 5a5e35635d395cecf15500e0ee8e26ea911d34b7 Mon Sep 17 00:00:00 2001 From: Kano Date: Sun, 5 Aug 2012 23:48:30 +1000 Subject: [PATCH 160/178] miner.php allow pool inputs: delete, addpool, poolpriority --- miner.php | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 129 insertions(+), 10 deletions(-) diff --git a/miner.php b/miner.php index 08a419ec..3ec9c870 100644 --- a/miner.php +++ b/miner.php @@ -3,7 +3,7 @@ session_start(); # global $miner, $port, $readonly, $notify, $rigs; global $socksndtimeoutsec, $sockrcvtimeoutsec; -global $checklastshare, $hidefields; +global $checklastshare, $poolinputs, $hidefields; global $ignorerefresh, $changerefresh, $autorefresh; global $allowcustompages, $customsummarypages; global $miner_font_family, $miner_font_size; @@ 
-23,13 +23,18 @@ $readonly = false; # coz it doesn't have notify - it just shows the error status table $notify = true; # -# set $checklastshare to true to do the following checks: +# Set $checklastshare to true to do the following checks: # If a device's last share is 12x expected ago then display as an error # If a device's last share is 8x expected ago then display as a warning # If either of the above is true, also display the whole line highlighted # This assumes shares are 1 difficulty shares $checklastshare = true; # +# Set $poolinputs to true to show the input fields for adding a pool +# and changing the pool priorities +# N.B. also if $readonly is true, it will not display the fields +$poolinputs = false; +# # Set $rigs to an array of your cgminer rigs that are running # format: 'IP:Port' or 'Host:Port' or 'Host:Port:Name' # If you only have one rig, it will just show the detail of that rig @@ -197,7 +202,7 @@ function getdom($domname) function htmlhead($checkapi, $rig, $pg = null) { global $miner_font_family, $miner_font_size; - global $error, $readonly, $here; + global $error, $readonly, $poolinputs, $here; global $ignorerefresh, $autorefresh; $extraparams = ''; @@ -252,6 +257,8 @@ if ($ignorerefresh == false) echo "function prc(a,m){pr('&arg='+a,m)} function prs(a,r){var c=a.substr(3);var z=c.split('|',2);var m=z[0].substr(0,1).toUpperCase()+z[0].substr(1)+' GPU '+z[1];prc(a+'&rig='+r,m)} function prs2(a,n,r){var v=document.getElementById('gi'+n).value;var c=a.substr(3);var z=c.split('|',2);var m='Set GPU '+z[1]+' '+z[0].substr(0,1).toUpperCase()+z[0].substr(1)+' to '+v;prc(a+','+v+'&rig='+r,m)}\n"; + if ($poolinputs === true) + echo "function cbs(s){var t=s.replace(/\\\\/g,'\\\\\\\\'); return t.replace(/,/g, '\\\\,')}\nfunction pla(r){var u=document.getElementById('purl').value;var w=document.getElementById('pwork').value;var p=document.getElementById('ppass').value;pr('&rig='+r+'&arg=addpool|'+cbs(u)+','+cbs(w)+','+cbs(p),'Add Pool '+u)}\nfunction 
psp(r){var p=document.getElementById('prio').value;pr('&rig='+r+'&arg=poolpriority|'+p,'Set Pool Priorities to '+p)}\n"; } ?> @@ -315,6 +322,46 @@ function readsockline($socket) return $line; } # +function api_convert_escape($str) +{ + $res = ''; + $len = strlen($str); + for ($i = 0; $i < $len; $i++) + { + $ch = substr($str, $i, 1); + if ($ch != '\\' || $i == ($len-1)) + $res .= $ch; + else + { + $i++; + $ch = substr($str, $i, 1); + switch ($ch) + { + case '|': + $res .= "\1"; + break; + case '\\': + $res .= "\2"; + break; + case '=': + $res .= "\3"; + break; + case ',': + $res .= "\4"; + break; + default: + $res .= $ch; + } + } + } + return $res; +} +# +function revert($str) +{ + return str_replace(array("\1", "\2", "\3", "\4"), array("|", "\\", "=", ","), $str); +} +# function api($cmd) { global $haderror, $error; @@ -336,6 +383,8 @@ function api($cmd) # print "$cmd returned '$line'\n"; + $line = api_convert_escape($line); + $data = array(); $objs = explode('|', $line); @@ -373,7 +422,7 @@ function api($cmd) continue; if (count($id) == 2) - $data[$name][$id[0]] = $id[1]; + $data[$name][$id[0]] = revert($id[1]); else $data[$name][$counter] = $id[0]; @@ -448,6 +497,9 @@ function classlastshare($when, $alldata, $warnclass, $errorclass) if (!isset($alldata['MHS av'])) return ''; + if ($alldata['MHS av'] == 0) + return ''; + if (!isset($alldata['Last Share Time'])) return ''; @@ -484,6 +536,10 @@ function fmt($section, $name, $value, $when, $alldata) $ret = $value; $class = ''; + $nams = explode('.', $name); + if (count($nams) > 1) + $name = $nams[count($nams)-1]; + if ($value === null) $ret = $b; else @@ -491,6 +547,7 @@ function fmt($section, $name, $value, $when, $alldata) { case 'GPU.Last Share Time': case 'PGA.Last Share Time': + case 'DEVS.Last Share Time': if ($value == 0 || (isset($alldata['Last Share Pool']) && $alldata['Last Share Pool'] == -1)) { @@ -511,6 +568,7 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.Last Share Pool': 
case 'PGA.Last Share Pool': + case 'DEVS.Last Share Pool': if ($value == -1) { $ret = 'None'; @@ -573,6 +631,7 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.Utility': case 'PGA.Utility': + case 'DEVS.Utility': case 'SUMMARY.Utility': case 'total.Utility': $ret = $value.'/m'; @@ -593,18 +652,24 @@ function fmt($section, $name, $value, $when, $alldata) } break; case 'PGA.Temperature': - $ret = $value.'°C'; - break; case 'GPU.Temperature': + case 'DEVS.Temperature': $ret = $value.'°C'; + if (!isset($alldata['GPU'])) + break; case 'GPU.GPU Clock': + case 'DEVS.GPU Clock': case 'GPU.Memory Clock': + case 'DEVS.Memory Clock': case 'GPU.GPU Voltage': + case 'DEVS.GPU Voltage': case 'GPU.GPU Activity': + case 'DEVS.GPU Activity': if ($value == 0) $class = $warnclass; break; case 'GPU.Fan Percent': + case 'DEVS.Fan Percent': if ($value == 0) $class = $warnclass; else @@ -617,6 +682,7 @@ function fmt($section, $name, $value, $when, $alldata) } break; case 'GPU.Fan Speed': + case 'DEVS.Fan Speed': if ($value == 0) $class = $warnclass; else @@ -632,6 +698,7 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.MHS av': case 'PGA.MHS av': + case 'DEVS.MHS av': case 'SUMMARY.MHS av': case 'total.MHS av': $parts = explode('.', $value, 2); @@ -658,6 +725,7 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.Total MH': case 'PGA.Total MH': + case 'DEVS.Total MH': case 'SUMMARY.Total MH': case 'total.Total MH': case 'SUMMARY.Getworks': @@ -665,11 +733,13 @@ function fmt($section, $name, $value, $when, $alldata) case 'total.Getworks': case 'GPU.Accepted': case 'PGA.Accepted': + case 'DEVS.Accepted': case 'SUMMARY.Accepted': case 'POOL.Accepted': case 'total.Accepted': case 'GPU.Rejected': case 'PGA.Rejected': + case 'DEVS.Rejected': case 'SUMMARY.Rejected': case 'POOL.Rejected': case 'total.Rejected': @@ -687,12 +757,14 @@ function fmt($section, $name, $value, $when, $alldata) break; case 'GPU.Status': case 
'PGA.Status': + case 'DEVS.Status': case 'POOL.Status': if ($value != 'Alive') $class = $errorclass; break; case 'GPU.Enabled': case 'PGA.Enabled': + case 'DEVS.Enabled': if ($value != 'Y') $class = $warnclass; break; @@ -727,7 +799,8 @@ function fmt($section, $name, $value, $when, $alldata) global $poolcmd; $poolcmd = array( 'Switch to' => 'switchpool', 'Enable' => 'enablepool', - 'Disable' => 'disablepool' ); + 'Disable' => 'disablepool', + 'Remove' => 'removepool' ); # function showhead($cmd, $values, $justnames = false) { @@ -938,6 +1011,43 @@ function processgpus($rig) } } # +function showpoolinputs($rig, $ans) +{ + global $readonly, $poolinputs; + + if ($readonly === true || $poolinputs === false) + return; + + newtable(); + newrow(); + + $inps = array('Pool URL' => array('purl', 20), + 'Worker Name' => array('pwork', 10), + 'Worker Password' => array('ppass', 10)); + $b = ' '; + + echo " Add a pool: "; + + foreach ($inps as $text => $name) + echo "$text: "; + + echo ""; + + endrow(); + + if (count($ans) > 1) + { + newrow(); + + echo ' Set pool priorities: '; + echo " Comma list of pool numbers: "; + echo ""; + + endrow(); + } + endtable(); +} +# function process($cmds, $rig) { global $error, $devs; @@ -957,12 +1067,15 @@ function process($cmds, $rig) { details($cmd, $process, $rig); + if ($cmd == 'devs') + $devs = $process; + + if ($cmd == 'pools') + showpoolinputs($rig, $process); + # Not after the last one if (--$count > 0) otherrow('

'); - - if ($cmd == 'devs') - $devs = $process; } } } @@ -1419,6 +1532,9 @@ function joinsections($sections, $results, $errors) $sectionmap[$section] = $section; $results[$section] = joinall($both[0], $both[1], $results); break; + default: + $errors[] = "Error: Invalid section '$section'"; + break; } break; case 'DEVS': @@ -1430,6 +1546,9 @@ function joinsections($sections, $results, $errors) $sectionmap[$section] = $section; $results[$section] = joinfields($both[0], $both[1], $join, $results); break; + default: + $errors[] = "Error: Invalid section '$section'"; + break; } break; default: From 64261e7a91ee2121e30e65b5c722ad520fe4af8c Mon Sep 17 00:00:00 2001 From: Kano Date: Mon, 6 Aug 2012 00:30:50 +1000 Subject: [PATCH 161/178] miner.php ignore arg when readonly --- miner.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/miner.php b/miner.php index 3ec9c870..f83a302c 100644 --- a/miner.php +++ b/miner.php @@ -1891,7 +1891,8 @@ function display() $miner = $parts[0]; $port = $parts[1]; - $preprocess = $arg; + if ($readonly !== true) + $preprocess = $arg; } } } From b33ea2674dd536f696e05975dc64be65ec78cbfd Mon Sep 17 00:00:00 2001 From: Kano Date: Mon, 6 Aug 2012 01:05:21 +1000 Subject: [PATCH 162/178] api.c update API start message and include port number --- api.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api.c b/api.c index 75676ecc..557aa5e8 100644 --- a/api.c +++ b/api.c @@ -3241,12 +3241,12 @@ void api(int api_thr_id) } if (opt_api_allow) - applog(LOG_WARNING, "API running in IP access mode"); + applog(LOG_WARNING, "API running in IP access mode on port %d", port); else { if (opt_api_network) - applog(LOG_WARNING, "API running in UNRESTRICTED access mode"); + applog(LOG_WARNING, "API running in UNRESTRICTED read access mode on port %d", port); else - applog(LOG_WARNING, "API running in local access mode"); + applog(LOG_WARNING, "API running in local read access mode on port %d", port); } io_buffer = 
malloc(MYBUFSIZ+1); From 920c56c93bc778b269f033b15c00d4ae5e0c1dc3 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 6 Aug 2012 11:39:32 +1000 Subject: [PATCH 163/178] Add specific information when ADL detects error -10 saying the device is not enabled. --- adl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/adl.c b/adl.c index 85ec0aaf..69c06dee 100644 --- a/adl.c +++ b/adl.c @@ -241,6 +241,8 @@ void init_adl(int nDevs) result = ADL_Adapter_ID_Get(iAdapterIndex, &lpAdapterID); if (result != ADL_OK) { applog(LOG_INFO, "Failed to ADL_Adapter_ID_Get. Error %d", result); + if (result == -10) + applog(LOG_INFO, "This error says the device is not enabled"); continue; } From 8897e0657507072448f4e951ce1caadca15b4b30 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Tue, 7 Aug 2012 11:05:23 +1000 Subject: [PATCH 164/178] Only add to the pool curlring and increment the counter under mutex lock. --- cgminer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cgminer.c b/cgminer.c index c78e46f0..d73ecac7 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2185,8 +2185,10 @@ static void recruit_curl(struct pool *pool) if (unlikely(!ce->curl || !ce)) quit(1, "Failed to init in recruit_curl"); + mutex_lock(&pool->pool_lock); list_add(&ce->node, &pool->curlring); pool->curls++; + mutex_unlock(&pool->pool_lock); applog(LOG_DEBUG, "Recruited curl %d for pool %d", pool->curls, pool->pool_no); } From 145f04ccc749073cc10c079002372f3008a0f935 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Tue, 7 Aug 2012 11:10:59 +1000 Subject: [PATCH 165/178] Display reaped debug message outside mutex lock to avoid recursive locking. 
--- cgminer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cgminer.c b/cgminer.c index d73ecac7..d1cb971a 100644 --- a/cgminer.c +++ b/cgminer.c @@ -4509,12 +4509,13 @@ static void reap_curl(struct pool *pool) { struct curl_ent *ent, *iter; struct timeval now; + int reaped = 0; gettimeofday(&now, NULL); mutex_lock(&pool->pool_lock); list_for_each_entry_safe(ent, iter, &pool->curlring, node) { if (now.tv_sec - ent->tv.tv_sec > 60) { - applog(LOG_DEBUG, "Reaped curl %d from pool %d", pool->curls, pool->pool_no); + reaped++; pool->curls--; list_del(&ent->node); curl_easy_cleanup(ent->curl); @@ -4522,6 +4523,8 @@ static void reap_curl(struct pool *pool) } } mutex_unlock(&pool->pool_lock); + if (reaped) + applog(LOG_DEBUG, "Reaped %d curl%s from pool %d", reaped, reaped > 1 ? "s" : "", pool->pool_no); } static void *watchpool_thread(void __maybe_unused *userdata) From ad8c4b7755e8d1cb01bded4260b5d278d345502d Mon Sep 17 00:00:00 2001 From: ckolivas Date: Tue, 7 Aug 2012 11:52:37 +1000 Subject: [PATCH 166/178] Revert "Only add to the pool curlring and increment the counter under mutex lock." This reverts commit 8897e0657507072448f4e951ce1caadca15b4b30. Wrong fix. Recursive locking now. --- cgminer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index d1cb971a..39e52bd6 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2185,10 +2185,8 @@ static void recruit_curl(struct pool *pool) if (unlikely(!ce->curl || !ce)) quit(1, "Failed to init in recruit_curl"); - mutex_lock(&pool->pool_lock); list_add(&ce->node, &pool->curlring); pool->curls++; - mutex_unlock(&pool->pool_lock); applog(LOG_DEBUG, "Recruited curl %d for pool %d", pool->curls, pool->pool_no); } From c7bcad653b79ee54429aec4964f4aa80e49db1d1 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Tue, 7 Aug 2012 11:59:54 +1000 Subject: [PATCH 167/178] Need to recheck the pool->curls count on regaining the pool lock after the pthread conditional wait returns. 
--- cgminer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cgminer.c b/cgminer.c index 39e52bd6..cc6f9490 100644 --- a/cgminer.c +++ b/cgminer.c @@ -2201,12 +2201,14 @@ static struct curl_ent *pop_curl_entry(struct pool *pool) struct curl_ent *ce; mutex_lock(&pool->pool_lock); +retry: if (!pool->curls) recruit_curl(pool); else if (list_empty(&pool->curlring)) { - if (pool->submit_fail || pool->curls >= curl_limit) + if (pool->submit_fail || pool->curls >= curl_limit) { pthread_cond_wait(&pool->cr_cond, &pool->pool_lock); - else + goto retry; + } else recruit_curl(pool); } ce = list_entry(pool->curlring.next, struct curl_ent, node); From 3dd1658e1f3410dfdf00b267593a3cbe7041c645 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Tue, 7 Aug 2012 12:10:01 +1000 Subject: [PATCH 168/178] We may as well leave one curl still available per pool instead of reaping the last one. --- cgminer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cgminer.c b/cgminer.c index cc6f9490..a662c1f0 100644 --- a/cgminer.c +++ b/cgminer.c @@ -4514,6 +4514,8 @@ static void reap_curl(struct pool *pool) gettimeofday(&now, NULL); mutex_lock(&pool->pool_lock); list_for_each_entry_safe(ent, iter, &pool->curlring, node) { + if (pool->curls < 2) + break; if (now.tv_sec - ent->tv.tv_sec > 60) { reaped++; pool->curls--; From e49bd98196b9ee4b77822e49b610da808fb16b7f Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 19:56:43 +1000 Subject: [PATCH 169/178] Use the scrypt CPU code to confirm results from OCL code, and mark failures as HW errors, making it easier to tune scrypt parameters. 
--- Makefile.am | 8 ++++---- findnonce.c | 24 ++++++++++++++++++------ scrypt.c | 29 +++++++++++++++++++++-------- scrypt.h | 13 +++++++++++++ 4 files changed, 56 insertions(+), 18 deletions(-) create mode 100644 scrypt.h diff --git a/Makefile.am b/Makefile.am index 013fd340..42b7a33a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -45,6 +45,10 @@ cgminer_SOURCES += ocl.c ocl.h findnonce.c findnonce.h cgminer_SOURCES += adl.c adl.h adl_functions.h cgminer_SOURCES += *.cl +if HAS_SCRYPT +cgminer_SOURCES += scrypt.c +endif + if HAS_CPUMINE # original CPU related sources, unchanged cgminer_SOURCES += \ @@ -56,10 +60,6 @@ cgminer_SOURCES += \ # the CPU portion extracted from original main.c cgminer_SOURCES += driver-cpu.h driver-cpu.c -if HAS_SCRYPT -cgminer_SOURCES += scrypt.c -endif - if HAS_YASM AM_CFLAGS = -DHAS_YASM if HAVE_x86_64 diff --git a/findnonce.c b/findnonce.c index a11333a1..9980a704 100644 --- a/findnonce.c +++ b/findnonce.c @@ -17,6 +17,7 @@ #include #include "findnonce.h" +#include "scrypt.h" const uint32_t SHA256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, @@ -173,7 +174,7 @@ struct pc_data { pthread_t pth; }; -static void send_nonce(struct pc_data *pcd, cl_uint nonce) +static void send_sha_nonce(struct pc_data *pcd, cl_uint nonce) { dev_blk_ctx *blk = &pcd->work->blk; struct thr_info *thr = pcd->thr; @@ -220,6 +221,19 @@ static void send_nonce(struct pc_data *pcd, cl_uint nonce) } } +static void send_scrypt_nonce(struct pc_data *pcd, uint32_t nonce) +{ + struct thr_info *thr = pcd->thr; + struct work *work = pcd->work; + + if (scrypt_test(work->data, work->target, nonce)) + submit_nonce(thr, pcd->work, nonce); + else { + applog(LOG_INFO, "Scrypt error, review settings"); + thr->cgpu->hw_errors++; + } +} + static void *postcalc_hash(void *userdata) { struct pc_data *pcd = (struct pc_data *)userdata; @@ -233,13 +247,11 @@ static void *postcalc_hash(void *userdata) if (nonce) { applog(LOG_DEBUG, "OCL NONCE %u", nonce); -#ifdef USE_SCRYPT if 
(opt_scrypt) - submit_nonce(thr, pcd->work, nonce); + send_scrypt_nonce(pcd, nonce); else -#endif - send_nonce(pcd, nonce); - nonces++; + send_sha_nonce(pcd, nonce); + nonces++; } } diff --git a/scrypt.c b/scrypt.c index 4334bcf0..70c3fd3e 100644 --- a/scrypt.c +++ b/scrypt.c @@ -407,19 +407,32 @@ static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad) return PBKDF2_SHA256_80_128_32(input, X); } -bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsigned char *pdata, - unsigned char *phash1, unsigned char *phash, - const unsigned char *ptarget, - uint32_t max_nonce, uint32_t *last_nonce, - uint32_t n) +/* Used externally as confirmation of correct OCL code */ +bool scrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = ((const uint32_t *)ptarget)[7]; + char *scratchbuf; + uint32_t data[20]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = byteswap(nonce); + scratchbuf = alloca(131584); + tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf); + + return (tmp_hash7 <= Htarg); +} + +bool scanhash_scrypt(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) { uint32_t *nonce = (uint32_t *)(pdata + 76); - unsigned char *scratchbuf; + char *scratchbuf; uint32_t data[20]; uint32_t tmp_hash7; uint32_t Htarg = ((const uint32_t *)ptarget)[7]; bool ret = false; - int i; be32enc_vect(data, (const uint32_t *)pdata, 19); @@ -446,7 +459,7 @@ bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsig break; } } -out_ret: + free(scratchbuf);; return ret; } diff --git a/scrypt.h b/scrypt.h new file mode 100644 index 00000000..45dd46bf --- /dev/null +++ b/scrypt.h @@ -0,0 +1,13 @@ +#ifndef SCRYPT_H +#define SCRYPT_H + +#ifdef USE_SCRYPT 
+extern bool scrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +#else /* USE_SCRYPT */ +static inline bool scrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + return false; +} +#endif /* USE_SCRYPT */ + +#endif /* SCRYPT_H */ From b406dbfe78dd1d567cb74ab0320fca7ca2c6fa7e Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 20:07:01 +1000 Subject: [PATCH 170/178] Update SCRYPT README with information about HW errors. --- SCRYPT-README | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/SCRYPT-README b/SCRYPT-README index 8d4c5a42..02c9c44d 100644 --- a/SCRYPT-README +++ b/SCRYPT-README @@ -24,6 +24,9 @@ with the --scrypt option, cgminer will fail IN RANDOM WAYS. They are all due to parameters being outside what the GPU can cope with. Not giving cgminer a hint as to your GPU type, it will hardly ever perform well. +NOTE that if it does not fail at startup, the presence of hardware errors (HW) +is a sure sign that you have set the parameters too high. + Step 1 on linux: export GPU_MAX_ALLOC_PERCENT=100 @@ -81,7 +84,7 @@ reason this is crucial is that too high an intensity can actually be disastrous with scrypt because it CAN run out of ram. Intensities over 13 start writing over the same ram and it is highly dependent on the GPU, but they can start actually DECREASING your hashrate, or even worse, start producing -garbage with rejects skyrocketing. The low level detail is that intensity is +garbage with HW errors skyrocketing. The low level detail is that intensity is only guaranteed up to the power of 2 that most closely matches the thread concurrency. i.e. a thread concurrency of 6144 has 8192 as the nearest power of two above it, thus as 2^13=8192, that is an intensity of 13. 
@@ -122,7 +125,7 @@ default value, and then start overclocking as you are running it, you should find a sweet spot where the hashrate peaks and then it might actually drop if you increase the engine clock speed further. Unless you wish to run with a dynamic intensity, do not go over 13 without testing it while it's running to -see that it increases hashrate AND utility WITHOUT increasing your rejects. +see that it increases hashrate AND utility WITHOUT increasing your HW errors. Suggested values for 7970 for example: From e52762c57ad6bd74fad83c734e36ab3fa76b9677 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 20:26:24 +1000 Subject: [PATCH 171/178] There is no point zeroing temperature in BFL if we fail to get a response, and we should register it as a HW error, suggesting throttling. --- driver-bitforce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 943a7d30..e1c01e12 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -350,7 +350,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce) if (unlikely(!pdevbuf[0])) { applog(LOG_ERR, "BFL%i: Error: Get temp returned empty string/timed out", bitforce->device_id); - bitforce->temp = 0; + bitforce->hw_errors++; return false; } From 7dffa07ed995aed4ec08e84b57c585b61675c8e5 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 20:47:18 +1000 Subject: [PATCH 172/178] Deuglify windows autodetect code for BFL. 
--- driver-bitforce.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index e1c01e12..f2d8ffe4 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -144,9 +144,16 @@ static bool bitforce_detect_one(const char *devpath) } \ } while(0) -static char bitforce_autodetect_ftdi() -{ #ifdef WIN32 +static char bitforce_autodetect_ftdi(void) +{ + char buf[65 * numDevs]; + char*bufptrs[numDevs + 1]; + char devpath[] = "\\\\.\\COMnnnnn"; + char *devpathnum = &devpath[7]; + char found = 0; + int i; + FT_STATUS ftStatus; DWORD numDevs; HMODULE dll = LoadLibrary("FTD2XX.DLL"); @@ -162,27 +169,19 @@ static char bitforce_autodetect_ftdi() ftStatus = FT_ListDevices(&numDevs, NULL, FT_LIST_NUMBER_ONLY); if (ftStatus != FT_OK) { applog(LOG_DEBUG, "FTDI device count failed, not using FTDI bitforce autodetect"); -nogood: - dlclose(dll); - return 0; + goto out; } applog(LOG_DEBUG, "FTDI reports %u devices", (unsigned)numDevs); - char buf[65 * numDevs]; - char*bufptrs[numDevs + 1]; - int i; for (i = 0; i < numDevs; ++i) bufptrs[i] = &buf[i * 65]; bufptrs[numDevs] = NULL; ftStatus = FT_ListDevices(bufptrs, &numDevs, FT_LIST_ALL | FT_OPEN_BY_DESCRIPTION); if (ftStatus != FT_OK) { applog(LOG_DEBUG, "FTDI device list failed, not using FTDI bitforce autodetect"); - goto nogood; + goto out; } - char devpath[] = "\\\\.\\COMnnnnn"; - char *devpathnum = &devpath[7]; - char found = 0; for (i = numDevs; i > 0; ) { --i; bufptrs[i][64] = '\0'; @@ -204,14 +203,19 @@ nogood: if (bitforce_detect_one(devpath)) ++found; } + +out: dlclose(dll); return found; -#else /* NOT WIN32 */ +} +#else +static char bitforce_autodetect_ftdi(void) +{ return 0; -#endif } +#endif -static char bitforce_detect_auto() +static char bitforce_detect_auto(void) { return (serial_autodetect_udev (bitforce_detect_one, "BitFORCE*SHA256") ?: serial_autodetect_devserial(bitforce_detect_one, "BitFORCE_SHA256") ?: @@ -219,7 +223,7 @@ 
static char bitforce_detect_auto() 0); } -static void bitforce_detect() +static void bitforce_detect(void) { serial_detect_auto(bitforce_api.dname, bitforce_detect_one, bitforce_detect_auto); } From 9cae9a9d7f806ae410eb142f90b6643c23abbfd6 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 20:49:47 +1000 Subject: [PATCH 173/178] Make the serial open timeout for BFL generically 1 second on windows. --- driver-bitforce.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index f2d8ffe4..0f2ea658 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -38,6 +38,10 @@ enum { FT_OK, }; +// Code must deal with a timeout. Make it 1 second on windows, 0.1 on linux. +#define BFopen(devpath) serial_open(devpath, 0, 10, true) +#else /* WIN32 */ +#define BFopen(devpath) serial_open(devpath, 0, 1, true) #endif /* WIN32 */ #include "compat.h" @@ -60,9 +64,6 @@ enum { struct device_api bitforce_api; -// Code must deal with a timeout -#define BFopen(devpath) serial_open(devpath, 0, 1, true) - static void BFgets(char *buf, size_t bufLen, int fd) { do { From 0a79de9375eaaefab7953e39871786d08d4ec68a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 20:55:55 +1000 Subject: [PATCH 174/178] Convert the serial autodetect functions to use int instead of char to enumerate devices. 
--- driver-bitforce.c | 8 ++++---- driver-modminer.c | 2 +- fpgautils.c | 8 ++++---- fpgautils.h | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 0f2ea658..5b8570eb 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -146,13 +146,13 @@ static bool bitforce_detect_one(const char *devpath) } while(0) #ifdef WIN32 -static char bitforce_autodetect_ftdi(void) +static int bitforce_autodetect_ftdi(void) { char buf[65 * numDevs]; char*bufptrs[numDevs + 1]; char devpath[] = "\\\\.\\COMnnnnn"; char *devpathnum = &devpath[7]; - char found = 0; + int found = 0; int i; FT_STATUS ftStatus; @@ -210,13 +210,13 @@ out: return found; } #else -static char bitforce_autodetect_ftdi(void) +static int bitforce_autodetect_ftdi(void) { return 0; } #endif -static char bitforce_detect_auto(void) +static int bitforce_detect_auto(void) { return (serial_autodetect_udev (bitforce_detect_one, "BitFORCE*SHA256") ?: serial_autodetect_devserial(bitforce_detect_one, "BitFORCE_SHA256") ?: diff --git a/driver-modminer.c b/driver-modminer.c index ff96ee45..040100db 100644 --- a/driver-modminer.c +++ b/driver-modminer.c @@ -91,7 +91,7 @@ modminer_detect_one(const char *devpath) #undef bailout -static char +static int modminer_detect_auto() { return diff --git a/fpgautils.c b/fpgautils.c index 5f488740..a62b4913 100644 --- a/fpgautils.c +++ b/fpgautils.c @@ -38,7 +38,7 @@ #include "miner.h" #ifdef HAVE_LIBUDEV -char +int serial_autodetect_udev(detectone_func_t detectone, const char*prodname) { struct udev *udev = udev_new(); @@ -69,14 +69,14 @@ serial_autodetect_udev(detectone_func_t detectone, const char*prodname) return found; } #else -char +int serial_autodetect_udev(__maybe_unused detectone_func_t detectone, __maybe_unused const char*prodname) { return 0; } #endif -char +int serial_autodetect_devserial(detectone_func_t detectone, const char*prodname) { #ifndef WIN32 @@ -107,7 +107,7 @@ 
serial_autodetect_devserial(detectone_func_t detectone, const char*prodname) #endif } -char +int _serial_detect(const char*dname, detectone_func_t detectone, autoscan_func_t autoscan, bool forceauto) { struct string_elist *iter, *tmp; diff --git a/fpgautils.h b/fpgautils.h index c45183b7..5b743bc5 100644 --- a/fpgautils.h +++ b/fpgautils.h @@ -14,17 +14,17 @@ #include typedef bool(*detectone_func_t)(const char*); -typedef char(*autoscan_func_t)(); +typedef int(*autoscan_func_t)(); -extern char _serial_detect(const char*dname, detectone_func_t, autoscan_func_t, bool force_autoscan); +extern int _serial_detect(const char*dname, detectone_func_t, autoscan_func_t, bool force_autoscan); #define serial_detect_fauto(dname, detectone, autoscan) \ _serial_detect(dname, detectone, autoscan, true) #define serial_detect_auto(dname, detectone, autoscan) \ _serial_detect(dname, detectone, autoscan, false) #define serial_detect(dname, detectone) \ _serial_detect(dname, detectone, NULL, false) -extern char serial_autodetect_devserial(detectone_func_t, const char*prodname); -extern char serial_autodetect_udev (detectone_func_t, const char*prodname); +extern int serial_autodetect_devserial(detectone_func_t, const char*prodname); +extern int serial_autodetect_udev (detectone_func_t, const char*prodname); extern int serial_open(const char*devpath, unsigned long baud, signed short timeout, bool purge); extern ssize_t _serial_read(int fd, char *buf, size_t buflen, char*eol); From be57725271cc1bf440c5d3cef964e3560b1f9bda Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 21:02:56 +1000 Subject: [PATCH 175/178] Update NEWS. --- NEWS | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/NEWS b/NEWS index d34fc536..7bb9524f 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,26 @@ +Version 2.6.4 - August 7, 2012 + +- Convert the serial autodetect functions to use int instead of char to +enumerate devices. 
+- Make the serial open timeout for BFL generically 1 second on windows. +- Deuglify windows autodetect code for BFL. +- There is no point zeroing temperature in BFL if we fail to get a response, and +we should register it as a HW error, suggesting throttling. +- Update SCRYPT README with information about HW errors. +- Use the scrypt CPU code to confirm results from OCL code, and mark failures as +HW errors, making it easier to tune scrypt parameters. +- We may as well leave one curl still available per pool instead of reaping the +last one. +- Need to recheck the pool->curls count on regaining the pool lock after the +pthread conditional wait returns. +- Display reaped debug message outside mutex lock to avoid recursive locking. +- Add specific information when ADL detects error -10 saying the device is not +enabled. +- api.c update API start message and include port number +- miner.php ignore arg when readonly +- miner.php allow pool inputs: delete, addpool, poolpriority + + Version 2.6.3 - August 5, 2012 - Count likely throttling episodes on bitforce devices as hardware errors. From 0adbcd5d07d426c51aa8af1e47b9aac624ecc2dc Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 21:21:01 +1000 Subject: [PATCH 176/178] Fix windows bitforce build. 
--- driver-bitforce.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/driver-bitforce.c b/driver-bitforce.c index 5b8570eb..bacac16c 100644 --- a/driver-bitforce.c +++ b/driver-bitforce.c @@ -141,17 +141,17 @@ static bool bitforce_detect_one(const char *devpath) #define LOAD_SYM(sym) do { \ if (!(sym = dlsym(dll, #sym))) { \ applog(LOG_DEBUG, "Failed to load " #sym ", not using FTDI bitforce autodetect"); \ - goto nogood; \ + goto out; \ } \ } while(0) #ifdef WIN32 static int bitforce_autodetect_ftdi(void) { - char buf[65 * numDevs]; - char*bufptrs[numDevs + 1]; char devpath[] = "\\\\.\\COMnnnnn"; char *devpathnum = &devpath[7]; + char **bufptrs; + char *buf; int found = 0; int i; @@ -173,7 +173,10 @@ static int bitforce_autodetect_ftdi(void) goto out; } applog(LOG_DEBUG, "FTDI reports %u devices", (unsigned)numDevs); - + + buf = alloca(65 * numDevs); + bufptrs = alloca(sizeof(*bufptrs) * (numDevs + 1)); /* numDevs pointers plus the NULL terminator, sized in bytes */ + for (i = 0; i < numDevs; ++i) bufptrs[i] = &buf[i * 65]; bufptrs[numDevs] = NULL; From d64c32ac320309f6b99730fe9de90a4a878e050d Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 21:49:11 +1000 Subject: [PATCH 177/178] Include scrypt.h in Makefile.
--- Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 42b7a33a..d9389101 100644 --- a/Makefile.am +++ b/Makefile.am @@ -46,7 +46,7 @@ cgminer_SOURCES += adl.c adl.h adl_functions.h cgminer_SOURCES += *.cl if HAS_SCRYPT -cgminer_SOURCES += scrypt.c +cgminer_SOURCES += scrypt.c scrypt.h endif if HAS_CPUMINE From 1c98e0cf2164f3f7f577aeb3eddbf62c27321f0d Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 7 Aug 2012 21:48:18 +1000 Subject: [PATCH 178/178] Bump version to 2.6.4 --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 87b9faae..abc6d9fc 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [2]) m4_define([v_min], [6]) -m4_define([v_mic], [3]) +m4_define([v_mic], [4]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_ver], [v_maj.v_min.v_mic]) m4_define([lt_rev], m4_eval(v_maj + v_min))