From 857902a13d5a2669770cbd3cd5d5f18103daf441 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 12 Jul 2011 22:23:03 +1000 Subject: [PATCH] Commit a new phatk kernel renamed to force new binary building and add proper support in makefiles. --- Makefile.am | 9 +- ocl.c | 6 +- phatk.cl | 436 ----------------------------------------------- phatk110711.cl | 449 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 458 insertions(+), 442 deletions(-) delete mode 100644 phatk.cl create mode 100644 phatk110711.cl diff --git a/Makefile.am b/Makefile.am index eaf17fed..f7025427 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,11 +15,14 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) bin_PROGRAMS = cgminer +bin_SCRIPTS = phatk110711.cl poclbm.cl + cgminer_SOURCES = elist.h miner.h compat.h \ - main.c util.c \ - ocl.c findnonce.c \ + main.c util.c \ + ocl.c ocl.h findnonce.c findnonce.h \ sha256_generic.c sha256_4way.c sha256_via.c \ - sha256_cryptopp.c sha256_sse2_amd64.c + sha256_cryptopp.c sha256_sse2_amd64.c \ + phatk110711.cl poclbm.cl cgminer_LDFLAGS = $(PTHREAD_FLAGS) cgminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @OPENCL_LIBS@ @NCURSES_LIBS@ @PDCURSES_LIBS@ lib/libgnu.a ccan/libccan.a diff --git a/ocl.c b/ocl.c index e4472ae0..9db9df81 100644 --- a/ocl.c +++ b/ocl.c @@ -339,9 +339,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) */ char binaryfilename[255]; char numbuf[10]; - char filename[10]; + char filename[15]; if (clState->hasBitAlign) - strcpy(filename, "phatk.cl"); + strcpy(filename, "phatk110711.cl"); else strcpy(filename, "poclbm.cl"); FILE *binaryfile; @@ -370,7 +370,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) strcpy(binaryfilename, name); if (clState->hasBitAlign) { - strcat(binaryfilename, "phatk"); + strcat(binaryfilename, "phatk110711"); strcat(binaryfilename, "bitalign"); } else strcat(binaryfilename, "poclbm"); diff --git a/phatk.cl b/phatk.cl deleted file mode 100644 index e75d5584..00000000 --- a/phatk.cl +++ /dev/null @@ -1,436 +0,0 @@ -// -ck modified kernel taken from Phoenix taken from phatk -// This file is taken and modified from the public-domain poclbm project, and -// we have therefore decided to keep it public-domain in Phoenix. -// Modified version copyright 2011 Con Kolivas - -// The X is a placeholder for patching to suit hardware -#define VECTORSX - -#ifdef VECTORS4 - typedef uint4 u; -#elif defined VECTORS2 - typedef uint2 u; -#else - typedef uint u; -#endif - -__constant uint K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -__constant uint H[8] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 -}; - -#define BFI_INTX -#define BITALIGNX - -#ifdef BITALIGN -#pragma OPENCL EXTENSION cl_amd_media_ops : enable -#define rot(x, y) amd_bitalign(x, x, (u)(32-y)) -#else -#define rotr(x, y) rotate((u)x, (u)(32-y)) -#endif - -// This part is not from the stock poclbm kernel. It's part of an optimization -// added in the Phoenix Miner. - -// Some AMD devices have the BFI_INT opcode, which behaves exactly like the -// SHA-256 Ch function, but provides it in exactly one instruction. If -// detected, use it for Ch. Otherwise, construct Ch out of simpler logical -// primitives. - -#ifdef BFI_INT - // Well, slight problem... It turns out BFI_INT isn't actually exposed to - // OpenCL (or CAL IL for that matter) in any way. However, there is - // a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via - // amd_bytealign, takes the same inputs, and provides the same output. - // We can use that as a placeholder for BFI_INT and have the application - // patch it after compilation. - - // This is the BFI_INT function - #define Ch(x, y, z) amd_bytealign(x, y, z) - - // Ma can also be implemented in terms of BFI_INT... - #define Ma(a, b, c) amd_bytealign((c ^ a), (b), (a)) -#else - #define Ch(x, y, z) (z ^ (x & (y ^ z))) - #define Ma(x, y, z) ((x & z) | (y & (x | z))) -#endif - -//Various intermediate calculations for each SHA round -#define s0(n) (rot(Vals[(0 + 128 - (n)) % 8], 30)^rot(Vals[(0 + 128 - (n)) % 8], 19)^rot(Vals[(0 + 128 - (n)) % 8], 10)) -#define s1(n) (rot(Vals[(4 + 128 - (n)) % 8], 26)^rot(Vals[(4 + 128 - (n)) % 8], 21)^rot(Vals[(4 + 128 - (n)) % 8], 7)) -#define ch(n) (Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8])) -#define maj(n) (Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8])) -#define t1(n) (Vals[(7 + 128 - (n)) % 8] + K[(n) % 64]+ W[(n)] + ch(n) + s1(n)) -#define t1W(n) (Vals[(7 + 128 - (n)) % 8] + K[(n) % 64]+ w(n) + ch(n) + s1(n)) -#define t2(n) (s0(n) + maj(n)) - -//W calculation used for SHA round -#define w(n) (W[n] = P1(n) + P2(n) + P3(n) + P4(n)) - -//Full W calculation -#define R(x) (W[x] = (rot(W[x-2],15)^rot(W[x-2],13)^((W[x-2])>>10U)) + W[x-7] + (rot(W[x-15],25)^rot(W[x-15],14)^((W[x-15])>>3U)) + W[x-16]) - -//Partial W calculations (used for the begining where only some values are nonzero) -#define r0(x) ((rot(x,25)^rot(x,14)^((x)>>3U))) -#define r1(x) ((rot(x],15)^rot(x,13)^((x)>>10U))) -#define R0(n) ((rot(W[(n)],25)^rot(W[(n)],14)^((W[(n)])>>3U))) -#define R1(n) ((rot(W[(n)],15)^rot(W[(n)],13)^((W[(n)])>>10U))) -#define P1(x) R1(x-2) -#define P2(x) R0(x-15) -#define P3(x) W[x-7] -#define P4(x) W[x-16] - -//SHA round with built in W calc -#define sharound2(n) { Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n); } -//SHA round without W calc -#define sharound(n) {t1 = t1(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n); } - -//Partial SHA calculations (used for begining and end) -#define partround(n) {Vals[(7 + 128 - n) % 8]=(Vals[(7 + 128 - n) % 8]+W[n]); Vals[(3 + 128 - n) % 8]+=Vals[(7 + 128 - n) % 8]; Vals[(7 + 128 - n) % 8]+=t1;} - -__kernel - -void search( const uint state0, const uint state1, const uint state2, const uint state3, - const uint state4, const uint state5, const uint state6, const uint state7, - const uint B1, const uint C1, const uint D1, - const uint F1, const uint G1, const uint H1, - const uint base, - const uint W2, - const uint W16, const uint W17, - const uint PreVal4, const uint T1, - __global uint * output) -{ - - u W[128]; - u Vals[8]; - u t1 = T1; - - Vals[0]=state0; - Vals[1]=B1; - Vals[2]=C1; - Vals[3]=D1; - Vals[4]=PreVal4; - Vals[5]=F1; - Vals[6]=G1; - Vals[7]=H1; - - W[2] = W2; - W[4]=0x80000000U; - W[5]=0x00000000U; - W[6]=0x00000000U; - W[7]=0x00000000U; - W[8]=0x00000000U; - W[9]=0x00000000U; - W[10]=0x00000000U; - W[11]=0x00000000U; - W[12]=0x00000000U; - W[13]=0x00000000U; - W[14]=0x00000000U; - W[15]=0x00000280U; - W[16] = W16; - W[17] = W17; - - W[19] = P1(19) + P2(19) + P3(19); - W[18] = P1(18) + P3(18) + P4(18); - W[20] = P2(20) + P3(20) + P4(20); - uint it = get_local_id(0); - -#ifdef VECTORS4 - W[3] = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3); -#elif defined VECTORS2 - W[3] = base + (get_global_id(0)<<1) + (uint2)(0, 1); -#else - W[3] = base + get_global_id(0); -#endif - -//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions - W[31] = P2(31) + P4(31); - W[18] += P2(18); - partround(3); - W[19] += P4(19); - sharound(4); - W[20] += P1(20); - sharound(5); - W[32] = P2(32) + P4(32); - W[21] = P1(21); - sharound(6); - W[22] = P3(22) + P1(22); - W[23] = P3(23) + P1(23); - sharound(7); - W[24] = P1(24) + P3(24); - sharound(8); - W[25] = P1(25) + P3(25); - sharound(9); - W[26] = P1(26) + P3(26); - W[27] = P1(27) + P3(27); - sharound(10); - sharound(11); - W[28] = P1(28) + P3(28); - sharound(12); - W[29] = P1(29) + P3(29); - W[30] = P1(30) + P2(30) + P3(30); - sharound(13); - sharound(14); - W[31] += (P1(31) + P3(31)); - sharound(15); - sharound(16); - W[32] += (P1(32) + P3(32)); - sharound(17); - sharound(18); - sharound(19); - sharound(20); - sharound(21); - sharound(22); - sharound(23); - sharound(24); - sharound(25); - sharound(26); - sharound(27); - sharound(28); - sharound(29); - sharound(30); - sharound(31); - sharound(32); - sharound2(33); - sharound2(34); - sharound2(35); - sharound2(36); - sharound2(37); - sharound2(38); - sharound2(39); - sharound2(40); - sharound2(41); - sharound2(42); - sharound2(43); - sharound2(44); - sharound2(45); - sharound2(46); -//for some reason, this is faster than using all sharound2... - R(47); - sharound(47); - R(48); - sharound(48); - R(49); - sharound(49); - R(50); - sharound(50); - R(51); - sharound(51); - R(52); - sharound(52); - R(53); - sharound(53); - R(54); - sharound(54); - R(55); - sharound(55); - R(56); - sharound(56); - R(57); - sharound(57); - R(58); - sharound(58); - R(59); - sharound(59); - R(60); - sharound(60); - R(61); - sharound(61); - sharound2(62); - sharound2(63); - - W[64]=state0+Vals[0]; - W[65]=state1+Vals[1]; - W[66]=state2+Vals[2]; - W[67]=state3+Vals[3]; - W[68]=state4+Vals[4]; - W[69]=state5+Vals[5]; - W[70]=state6+Vals[6]; - W[71]=state7+Vals[7]; - - W[64 + 8]=0x80000000U; - W[64 + 9]=0x00000000U; - W[64 + 10]=0x00000000U; - W[64 + 11]=0x00000000U; - W[64 + 12]=0x00000000U; - W[64 + 13]=0x00000000U; - W[64 + 14]=0x00000000U; - W[64 + 15]=0x00000100U; - - Vals[0]=H[0]; - Vals[1]=H[1]; - Vals[2]=H[2]; - Vals[3]=H[3]; - Vals[4]=H[4]; - Vals[5]=H[5]; - Vals[6]=H[6]; - Vals[7]=H[7]; - - Vals[7] = 0xb0edbdd0 + K[0] + W[64] + 0x08909ae5U; - Vals[3] = 0xa54ff53a + 0xb0edbdd0 + K[0] + W[64]; - - R(64 + 16); - - sharound(64 + 1); - sharound(64 + 2); - W[64 + 17] = P1(64 + 17) + P2(64 + 17) + P4(64 + 17); - W[64 + 18] = P1(64 + 18) + P2(64 + 18) + P4(64 + 18); - sharound(64 + 3); - W[64 + 19] = P1(64 + 19) + P2(64 + 19) + P4(64 + 19); - sharound(64 + 4); - W[64 + 20] = P1(64 + 20) + P2(64 + 20) + P4(64 + 20); - sharound(64 + 5); - W[64 + 21] = P1(64 + 21) + P2(64 + 21) + P4(64 + 21); - sharound(64 + 6); - R(64 + 22); - sharound(64 + 7); - sharound(64 + 8); - R(64 + 23); - W[64 + 24] = P1(64 + 24) + P3(64 + 24) + P4(64 + 24); - sharound(64 + 9); - sharound(64 + 10); - W[64 + 25] = P1(64 + 25) + P3(64 + 25); - W[64 + 26] = P1(64 + 26) + P3(64 + 26); - sharound(64 + 11); - sharound(64 + 12); - W[64 + 27] = P1(64 + 27) + P3(64 + 27); - W[64 + 28] = P1(64 + 28) + P3(64 + 28); - sharound(64 + 13); - sharound(64 + 14); - sharound(64 + 15); - sharound(64 + 16); - sharound(64 + 17); - sharound(64 + 18); - sharound(64 + 19); - sharound(64 + 20); - sharound(64 + 21); - sharound(64 + 22); - sharound(64 + 23); - sharound(64 + 24); - sharound(64 + 25); - sharound(64 + 26); - sharound(64 + 27); - sharound(64 + 28); - sharound2(64 + 29); - sharound2(64 + 30); - sharound2(64 + 31); - sharound2(64 + 32); - sharound2(64 + 33); - sharound2(64 + 34); - sharound2(64 + 35); - sharound2(64 + 36); - sharound2(64 + 37); - sharound2(64 + 38); - sharound2(64 + 39); - sharound2(64 + 40); - sharound2(64 + 41); - sharound2(64 + 42); - sharound2(64 + 43); - sharound2(64 + 44); - sharound2(64 + 45); - sharound2(64 + 46); - sharound2(64 + 47); - sharound2(64 + 48); - sharound2(64 + 49); - R(64 + 50); - sharound(64 + 50); - R(64 + 51); - sharound(64 + 51); - R(64 + 52); - sharound(64 + 52); - R(64 + 53); - sharound(64 + 53); - R(64 + 54); - sharound(64 + 54); - R(64 + 55); - sharound(64 + 55); - sharound2(64 + 56); - sharound2(64 + 57); - sharound2(64 + 58); - sharound2(64 + 59); - -//Faster to write it this way... - Vals[3] += K[60] +s1(124) + ch(124); - R(64+60); - partround(64 + 60); - Vals[7] += H[7]; - -#define MAXBUFFERS (4 * 512) - -#if defined(VECTORS4) || defined(VECTORS2) - if (Vals[7].x == 0) - { - // Unlikely event there is something here already ! - if (output[it]) { - for (it = 0; it < MAXBUFFERS; it++) { - if (!output[it]) - break; - } - } - output[it] = W[3].x; - output[MAXBUFFERS] = 1; - } - if (Vals[7].y == 0) - { - it += 512; - if (output[it]) { - for (it = 0; it < MAXBUFFERS; it++) { - if (!output[it]) - break; - } - } - output[it] = W[3].y; - output[MAXBUFFERS] = 1; - } -#ifdef VECTORS4 - if (Vals[7].z == 0) - { - it += 1024; - if (output[it]) { - for (it = 0; it < MAXBUFFERS; it++) { - if (!output[it]) - break; - } - } - output[it] = W[3].z; - output[MAXBUFFERS] = 1; - } - if (Vals[7].w == 0) - { - it += 1536; - if (output[it]) { - for (it = 0; it < MAXBUFFERS; it++) { - if (!output[it]) - break; - } - } - output[it] = W[3].w; - output[MAXBUFFERS] = 1; - } -#endif -#else - if (Vals[7] == 0) - { - if (output[it]) { - for (it = 0; it < MAXBUFFERS; it++) { - if (!output[it]) - break; - } - } - output[it] = W[3]; - output[MAXBUFFERS] = 1; - } -#endif - -} - diff --git a/phatk110711.cl b/phatk110711.cl new file mode 100644 index 00000000..5f1aa6c8 --- /dev/null +++ b/phatk110711.cl @@ -0,0 +1,449 @@ +// This file is taken and modified from the public-domain poclbm project, and +// we have therefore decided to keep it public-domain in Phoenix. + +// 2011-07-11: further modified by Diapolo and still public-domain +// -ck version to be compatible with cgminer + +#define VECTORSX + +#ifdef VECTORS4 + typedef uint4 u; +#elif defined VECTORS2 + typedef uint2 u; +#else + typedef uint u; +#endif + +__constant uint K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +// H[6] = 0x08909ae5U + 0xb0edbdd0 + K[0] == 0xfc08884d +// H[7] = -0x5be0cd19 - (0x90befffa) K[60] == -0xec9fcd13 +__constant uint H[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0xfc08884d, 0xec9fcd13 +}; + +// L = 0xa54ff53a + 0xb0edbdd0 + K[0] == 0x198c7e2a2 +__constant ulong L = 0x198c7e2a2; + +#define BFI_INTX +#define BITALIGNX + +#ifdef BITALIGN + #pragma OPENCL EXTENSION cl_amd_media_ops : enable + #define rot(x, y) amd_bitalign(x, x, (u)(32 - y)) +#else + #define rot(x, y) rotate(x, (u)y) +#endif + +#ifdef BFI_INT + #define Ch(x, y, z) amd_bytealign(x, y, z) +#else + #define Ch(x, y, z) bitselect(z, y, x) +#endif + +// Ma now uses the Ch function, if BFI_INT is enabled, the optimized Ch version is used +#define Ma(x, y, z) Ch((z ^ x), y, x) + +// Various intermediate calculations for each SHA round +#define s0(n) (rot(Vals[(128 - n) % 8], 30) ^ rot(Vals[(128 - n) % 8], 19) ^ rot(Vals[(128 - n) % 8], 10)) +#define s1(n) (rot(Vals[(132 - n) % 8], 26) ^ rot(Vals[(132 - n) % 8], 21) ^ rot(Vals[(132 - n) % 8], 7)) +#define ch(n) (Ch(Vals[(132 - n) % 8], Vals[(133 - n) % 8], Vals[(134 - n) % 8])) +#define ma(n) (Ma(Vals[(129 - n) % 8], Vals[(130 - n) % 8], Vals[(128 - n) % 8])) +#define t1(n) (K[n % 64] + Vals[(135 - n) % 8] + W[n] + s1(n) + ch(n)) + +// intermediate W calculations +#define P1(x) (rot(W[x - 2], 15) ^ rot(W[x - 2], 13) ^ (W[x - 2] >> 10U)) +#define P2(x) (rot(W[x - 15], 25) ^ rot(W[x - 15], 14) ^ (W[x - 15] >> 3U)) +#define P3(x) W[x - 7] +#define P4(x) W[x - 16] + +// full W calculation +#define W(x) (W[x] = P4(x) + P3(x) + P2(x) + P1(x)) + +// SHA round without W calc +#define sharound(n) { Vals[(131 - n) % 8] += t1(n); Vals[(135 - n) % 8] = t1(n) + s0(n) + ma(n); } + +__kernel void search( const uint state0, const uint state1, const uint state2, const uint state3, + const uint state4, const uint state5, const uint state6, const uint state7, + const uint B1, const uint C1, const uint D1, + const uint F1, const uint G1, const uint H1, + const uint base, + const uint W2, + const uint W16, const uint W17, + const uint PreVal4, const uint T1, + __global uint * output) +{ + u W[124]; + u Vals[8]; + uint it; + + Vals[1] = B1; + Vals[2] = C1; + Vals[5] = F1; + Vals[6] = G1; + + W[2] = W2; +#ifdef VECTORS4 + Vals[4] = (W[3] = base + (get_global_id(0) << 2) + (uint4)(0, 1, 2, 3)) + PreVal4; +#elif defined VECTORS2 + Vals[4] = (W[3] = base + (get_global_id(0) << 1) + (uint2)(0, 1)) + PreVal4; +#else + Vals[4] = (W[3] = base + get_global_id(0)) + PreVal4; +#endif + // used in: P2(19) == 285220864 (0x11002000), P4(20) + W[4] = 0x80000000U; + // P1(x) is 0 for x == 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + // P2(x) is 0 for x == 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 + // P3(x) is 0 for x == 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + // P4(x) is 0 for x == 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + // W[x] in sharound(x) is 0 for x == 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + W[14] = W[13] = W[12] = W[11] = W[10] = W[9] = W[8] = W[7] = W[6] = W[5] = 0x00000000U; + // used in: P2(30) == 10485845 (0xA00055), P3(22), P4(31) + // K[15] + W[15] == 0xc19bf174 + 0x00000280U = 0xc19bf3f4 + W[15] = 0x00000280U; + + W[16] = W16; + W[17] = W17; + // removed P3(18) from add because it is == 0 + W[18] = P1(18) + P4(18) + P2(18); + // removed P3(19) from add because it is == 0 + W[19] = (u)0x11002000 + P1(19) + P4(19); + // removed P2(20), P3(20) from add because it is == 0 + W[20] = P1(20) + P4(20); + W[21] = P1(21); + W[22] = P1(22) + P3(22); + W[23] = P1(23) + P3(23); + W[24] = P1(24) + P3(24); + W[25] = P1(25) + P3(25); + W[26] = P1(26) + P3(26); + W[27] = P1(27) + P3(27); + W[28] = P1(28) + P3(28); + W[29] = P1(29) + P3(29); + W[30] = (u)0xA00055 + P1(30) + P3(30); + + // Round 3 + Vals[0] = state0 + Vals[4]; + Vals[4] += T1; + + // Round 4 + // K[4] + W[4] == 0x3956c25b + 0x80000000U = 0xb956c25b + Vals[7] = (Vals[3] = (u)0xb956c25b + D1 + s1(4) + ch(4)) + H1; + Vals[3] += s0(4) + ma(4); + + // Round 5 + Vals[2] = K[5] + C1 + s1(5) + ch(5) + s0(5) + ma(5); + Vals[6] = K[5] + C1 + G1 + s1(5) + ch(5); + + sharound(6); + sharound(7); + sharound(8); + sharound(9); + sharound(10); + sharound(11); + sharound(12); + sharound(13); + sharound(14); + sharound(15); + sharound(16); + sharound(17); + sharound(18); + sharound(19); + sharound(20); + sharound(21); + sharound(22); + sharound(23); + sharound(24); + sharound(25); + sharound(26); + sharound(27); + sharound(28); + sharound(29); + sharound(30); + + W(31); + sharound(31); + W(32); + sharound(32); + W(33); + sharound(33); + W(34); + sharound(34); + W(35); + sharound(35); + W(36); + sharound(36); + W(37); + sharound(37); + W(38); + sharound(38); + W(39); + sharound(39); + W(40); + sharound(40); + W(41); + sharound(41); + W(42); + sharound(42); + W(43); + sharound(43); + W(44); + sharound(44); + W(45); + sharound(45); + W(46); + sharound(46); + W(47); + sharound(47); + W(48); + sharound(48); + W(49); + sharound(49); + W(50); + sharound(50); + W(51); + sharound(51); + W(52); + sharound(52); + W(53); + sharound(53); + W(54); + sharound(54); + W(55); + sharound(55); + W(56); + sharound(56); + W(57); + sharound(57); + W(58); + sharound(58); + W(59); + sharound(59); + W(60); + sharound(60); + W(61); + sharound(61); + W(62); + sharound(62); + W(63); + sharound(63); + + W[64] = state0 + Vals[0]; + W[65] = state1 + Vals[1]; + W[66] = state2 + Vals[2]; + W[67] = state3 + Vals[3]; + W[68] = state4 + Vals[4]; + W[69] = state5 + Vals[5]; + W[70] = state6 + Vals[6]; + W[71] = state7 + Vals[7]; + // used in: P2(87) = 285220864 (0x11002000), P4(88) + // K[72] + W[72] == + W[72] = 0x80000000U; + // P1(x) is 0 for x == 75, 76, 77, 78, 79, 80 + // P2(x) is 0 for x == 88, 89, 90, 91, 92, 93 + // P3(x) is 0 for x == 80, 81, 82, 83, 84, 85 + // P4(x) is 0 for x == 89, 90, 91, 92, 93, 94 + // W[x] in sharound(x) is 0 for x == 73, 74, 75, 76, 77, 78 + W[78] = W[77] = W[76] = W[75] = W[74] = W[73] = 0x00000000U; + // used in: P1(81) = 10485760 (0xA00000), P2(94) = 4194338 (0x400022), P3(86), P4(95) + // K[79] + W[79] == + W[79] = 0x00000100U; + + Vals[0] = H[0]; + Vals[1] = H[1]; + Vals[2] = H[2]; + Vals[3] = (u)L + W[64]; + Vals[4] = H[3]; + Vals[5] = H[4]; + Vals[6] = H[5]; + Vals[7] = H[6] + W[64]; + + sharound(65); + sharound(66); + sharound(67); + sharound(68); + sharound(69); + sharound(70); + sharound(71); + sharound(72); + sharound(73); + sharound(74); + sharound(75); + sharound(76); + sharound(77); + sharound(78); + sharound(79); + + // removed P1(80), P3(80) from add because it is == 0 + W[80] = P2(80) + P4(80); + W[81] = (u)0xA00000 + P4(81) + P2(81); + W[82] = P4(82) + P2(82) + P1(82); + W[83] = P4(83) + P2(83) + P1(83); + W[84] = P4(84) + P2(84) + P1(84); + W[85] = P4(85) + P2(85) + P1(85); + W(86); + + sharound(80); + sharound(81); + sharound(82); + sharound(83); + sharound(84); + sharound(85); + sharound(86); + + W[87] = (u)0x11002000 + P4(87) + P3(87) + P1(87); + sharound(87); + W[88] = P4(88) + P3(88) + P1(88); + sharound(88); + W[89] = P3(89) + P1(89); + sharound(89); + W[90] = P3(90) + P1(90); + sharound(90); + W[91] = P3(91) + P1(91); + sharound(91); + W[92] = P3(92) + P1(92); + sharound(92); + // removed P2(93), P4(93) from add because it is == 0 + W[93] = P3(93) + P1(93); + sharound(93); + // removed P4(94) from add because it is == 0 + W[94] = (u)0x400022 + P3(94) + P1(94); + sharound(94); + + W(95); + sharound(95); + W(96); + sharound(96); + W(97); + sharound(97); + W(98); + sharound(98); + W(99); + sharound(99); + W(100); + sharound(100); + W(101); + sharound(101); + W(102); + sharound(102); + W(103); + sharound(103); + W(104); + sharound(104); + W(105); + sharound(105); + W(106); + sharound(106); + W(107); + sharound(107); + W(108); + sharound(108); + W(109); + sharound(109); + W(110); + sharound(110); + W(111); + sharound(111); + W(112); + sharound(112); + W(113); + sharound(113); + W(114); + sharound(114); + W(115); + sharound(115); + W(116); + sharound(116); + W(117); + sharound(117); + W(118); + sharound(118); + W(119); + sharound(119); + W(120); + sharound(120); + W(121); + sharound(121); + W(122); + sharound(122); + W(123); + sharound(123); + + // Round 124 + Vals[7] += Vals[3] + P4(124) + P3(124) + P2(124) + P1(124) + s1(124) + ch(124); + +#define MAXBUFFERS (4 * 512) + +#if defined(VECTORS4) || defined(VECTORS2) + if (Vals[7].x == -H[7]) + { + // Unlikely event there is something here already ! + if (output[it]) { + for (it = 0; it < MAXBUFFERS; it++) { + if (!output[it]) + break; + } + } + output[it] = W[3].x; + output[MAXBUFFERS] = 1; + } + if (Vals[7].y == -H[7]) + { + it += 512; + if (output[it]) { + for (it = 0; it < MAXBUFFERS; it++) { + if (!output[it]) + break; + } + } + output[it] = W[3].y; + output[MAXBUFFERS] = 1; + } +#ifdef VECTORS4 + if (Vals[7].z == -H[7]) + { + it += 1024; + if (output[it]) { + for (it = 0; it < MAXBUFFERS; it++) { + if (!output[it]) + break; + } + } + output[it] = W[3].z; + output[MAXBUFFERS] = 1; + } + if (Vals[7].w == -H[7]) + { + it += 1536; + if (output[it]) { + for (it = 0; it < MAXBUFFERS; it++) { + if (!output[it]) + break; + } + } + output[it] = W[3].w; + output[MAXBUFFERS] = 1; + } +#endif +#else + if (Vals[7] == -H[7]) + { + if (output[it]) { + for (it = 0; it < MAXBUFFERS; it++) { + if (!output[it]) + break; + } + } + output[it] = W[3]; + output[MAXBUFFERS] = 1; + } +#endif +} \ No newline at end of file