From 3596e3752e0d9b5bf4b4ba6cce766b520e8e12b3 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Wed, 8 Feb 2012 10:25:45 +1100 Subject: [PATCH 1/8] Import diakgcn kernel. --- diakgcn120208.cl | 610 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 610 insertions(+) create mode 100644 diakgcn120208.cl diff --git a/diakgcn120208.cl b/diakgcn120208.cl new file mode 100644 index 00000000..f8b263fb --- /dev/null +++ b/diakgcn120208.cl @@ -0,0 +1,610 @@ +// DiaKGCN 04-02-2012 - OpenCL kernel by Diapolo +// +// Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3. +// The kernel was rewritten by me (Diapolo) and is still public-domain! + +#ifdef VECTORS8 + typedef uint8 u; +#elif defined VECTORS4 + typedef uint4 u; +#elif defined VECTORS2 + typedef uint2 u; +#else + typedef uint u; +#endif + +#ifdef BFI_INT + #pragma OPENCL EXTENSION cl_amd_media_ops : enable + #define Ch(x, y, z) amd_bytealign(x, y, z) + #define Ma(x, y, z) amd_bytealign(z ^ x, y, x) +#else + #define Ch(x, y, z) bitselect(z, y, x) + #if defined(VECTORS2) || defined(VECTORS4) || defined(VECTORS8) + // GCN - VEC2 or VEC4 + #define Ma(z, x, y) bitselect(z, y, z ^ x) + #else + // GCN - no VEC + #define Ma(z, x, y) Ch(z ^ x, y, x) + #endif +#endif + +#ifdef GOFFSET + typedef uint uu; +#else + #ifdef VECTORS8 + typedef uint8 uu; + #elif defined VECTORS4 + typedef uint4 uu; + #elif defined VECTORS2 + typedef uint2 uu; + #else + typedef uint uu; + #endif +#endif + +#define ch(n) Ch(V[(4 + 128 - n) % 8], V[(5 + 128 - n) % 8], V[(6 + 128 - n) % 8]) +#define ma(n) Ma(V[(1 + 128 - n) % 8], V[(2 + 128 - n) % 8], V[(0 + 128 - n) % 8]) + +#define rot15(n) (rotate(n, 15U) ^ rotate(n, 13U) ^ (n >> 10U)) +#define rot25(n) (rotate(n, 25U) ^ rotate(n, 14U) ^ (n >> 3U)) +#define rot26(n) (rotate(n, 26U) ^ rotate(n, 21U) ^ rotate(n, 7U)) +#define rot30(n) (rotate(n, 30U) ^ rotate(n, 19U) ^ rotate(n, 10U)) + +__kernel + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) + void search( const uu base, const uint PreVal4, + const uint H1, const uint D1, const uint PreVal0, const uint B1, const uint C1, + const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7, + const uint W16addK16, const uint W17addK17, + const uint PreW18, const uint PreW19, + const uint W16, const uint W17, + const uint PreW31, const uint PreW32, + const uint state0, const uint state1, const uint state2, const uint state3, + const uint state4, const uint state5, const uint state6, const uint state7, + const uint state0A, const uint state0B, + __global ulong * output) +{ + u W[17]; + u V[8]; + +#ifdef VECTORS8 + #ifdef GOFFSET + u nonce = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7); + #else + u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + ((uint)get_local_id(0) * 8U) + base; + #endif +#elif defined VECTORS4 + #ifdef GOFFSET + u nonce = ((uint)get_global_id(0) << 2) + (u)(0, 1, 2, 3); + #else + u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + ((uint)get_local_id(0) * 4U) + base; + #endif +#elif defined VECTORS2 + #ifdef GOFFSET + u nonce = ((uint)get_global_id(0) << 1) + (u)(0, 1); + #else + u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) * 2U) + ((uint)get_local_id(0) * 2U) + base; + #endif +#else + #ifdef GOFFSET + u nonce = (uint)get_global_id(0); + #else + u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + (uint)get_local_id(0) + base; + #endif +#endif + + V[4] = PreVal4 + nonce; + + V[7] = H1 + (V[3] = D1 + Ch((PreVal0 + nonce), B1, C1) + rot26(PreVal0 + nonce)); + V[3] += rot30(V[4]) + Ma(F1, G1, V[4]); + + V[6] = G1 + (V[2] = C1addK5 + Ch(V[7], (PreVal0 + nonce), B1) + rot26(V[7])); + V[2] += rot30(V[3]) + Ma(V[4], F1, V[3]); + + V[5] = F1 + (V[1] = B1addK6 + Ch(V[6], V[7], (PreVal0 + nonce)) + rot26(V[6])); + V[1] += rot30(V[2]) + Ma(V[3], V[4], V[2]); + + V[4] += nonce + PreVal0addK7 + Ch(V[5], V[6], V[7]) + rot26(V[5]); + V[0] = nonce + PreVal0addK7 + Ch(V[5], V[6], V[7]) + rot26(V[5]) +rot30(V[1]) + Ma(V[2], V[3], V[1]); + + V[3] += 0xd807aa98 + V[7] + Ch(V[4], V[5], V[6]) + rot26(V[4]); + V[7] = 0xd807aa98 + V[7] + Ch(V[4], V[5], V[6]) + rot26(V[4]) + rot30(V[0]) + Ma(V[1], V[2], V[0]); + + V[2] += 0x12835b01 + V[6] + Ch(V[3], V[4], V[5]) + rot26(V[3]); + V[6] = 0x12835b01 + V[6] + Ch(V[3], V[4], V[5]) + rot26(V[3]) + rot30(V[7]) + Ma(V[0], V[1], V[7]); + + V[1] += 0x243185be + V[5] + Ch(V[2], V[3], V[4]) + rot26(V[2]); + V[5] = 0x243185be + V[5] + Ch(V[2], V[3], V[4]) + rot26(V[2]) + rot30(V[6]) + Ma(V[7], V[0], V[6]); + + V[0] += 0x550c7dc3 + V[4] + Ch(V[1], V[2], V[3]) + rot26(V[1]); + V[4] = 0x550c7dc3 + V[4] + Ch(V[1], V[2], V[3]) + rot26(V[1]) + rot30(V[5]) + Ma(V[6], V[7], V[5]); + +//--------------- ch() + ma() replaced above --------------- + + V[7] += 0x72be5d74 + V[3] + ch(12) + rot26(V[0]); + V[3] = 0x72be5d74 + V[3] + ch(12) + rot26(V[0]) + rot30(V[4]) + ma(12); + + V[6] += 0x80deb1fe + V[2] + ch(13) + rot26(V[7]); + V[2] = 0x80deb1fe + V[2] + ch(13) + rot26(V[7]) + rot30(V[3]) + ma(13); + + V[5] += 0x9bdc06a7 + V[1] + ch(14) + rot26(V[6]); + V[1] = 0x9bdc06a7 + V[1] + ch(14) + rot26(V[6]) + rot30(V[2]) + ma(14); + + V[4] += 0xc19bf3f4 + V[0] + ch(15) + rot26(V[5]); + V[0] = 0xc19bf3f4 + V[0] + ch(15) + rot26(V[5]) + rot30(V[1]) + ma(15); + + V[3] += W16addK16 + V[7] + ch(16) + rot26(V[4]); + V[7] = W16addK16 + V[7] + ch(16) + rot26(V[4]) + rot30(V[0]) + ma(16); + + V[2] += W17addK17 + V[6] + ch(17) + rot26(V[3]); + V[6] = W17addK17 + V[6] + ch(17) + rot26(V[3]) + rot30(V[7]) + ma(17); + +//---------------------------------------------------------------------------------- + +#ifdef VECTORS8 + W[0] = PreW18 + (u)(rot25(nonce.s0), rot25(nonce.s0) ^ 0x2004000, rot25(nonce.s0) ^ 0x4008000, rot25(nonce.s0) ^ 0x600C000, + rot25(nonce.s0) ^ 0x8010000, rot25(nonce.s0) ^ 0xa014000, rot25(nonce.s0) ^ 0xc018000, rot25(nonce.s0) ^ 0xe01c000); +#elif defined VECTORS4 + W[0] = PreW18 + (u)(rot25(nonce.x), rot25(nonce.x) ^ 0x2004000, rot25(nonce.x) ^ 0x4008000, rot25(nonce.x) ^ 0x600C000); +#elif defined VECTORS2 + W[0] = PreW18 + (u)(rot25(nonce.x), rot25(nonce.x) ^ 0x2004000); +#else + W[0] = PreW18 + rot25(nonce); +#endif + W[1] = PreW19 + nonce; + W[2] = 0x80000000 + rot15(W[0]); + W[3] = rot15(W[1]); + W[4] = 0x00000280 + rot15(W[2]); + W[5] = W16 + rot15(W[3]); + W[6] = W17 + rot15(W[4]); + W[7] = W[0] + rot15(W[5]); + W[8] = W[1] + rot15(W[6]); + W[9] = W[2] + rot15(W[7]); + W[10] = W[3] + rot15(W[8]); + W[11] = W[4] + rot15(W[9]); + W[12] = 0x00a00055 + W[5] + rot15(W[10]); + W[13] = PreW31 + W[6] + rot15(W[11]); + W[14] = PreW32 + W[7] + rot15(W[12]); + W[15] = W17 + W[8] + rot15(W[13]) + rot25(W[0]); + W[16] = W[0] + W[9] + rot15(W[14]) + rot25(W[1]); + + V[1] += 0x0fc19dc6 + V[5] + W[0] + ch(18) + rot26(V[2]); + V[5] = 0x0fc19dc6 + V[5] + W[0] + ch(18) + rot26(V[2]) + rot30(V[6]) + ma(18); + + V[0] += 0x240ca1cc + V[4] + W[1] + ch(19) + rot26(V[1]); + V[4] = 0x240ca1cc + V[4] + W[1] + ch(19) + rot26(V[1]) + rot30(V[5]) + ma(19); + + V[7] += 0x2de92c6f + V[3] + W[2] + ch(20) + rot26(V[0]); + V[3] = 0x2de92c6f + V[3] + W[2] + ch(20) + rot26(V[0]) + rot30(V[4]) + ma(20); + + V[6] += 0x4a7484aa + V[2] + W[3] + ch(21) + rot26(V[7]); + V[2] = 0x4a7484aa + V[2] + W[3] + ch(21) + rot26(V[7]) + rot30(V[3]) + ma(21); + + V[5] += 0x5cb0a9dc + V[1] + W[4] + ch(22) + rot26(V[6]); + V[1] = 0x5cb0a9dc + V[1] + W[4] + ch(22) + rot26(V[6]) + rot30(V[2]) + ma(22); + + V[4] += 0x76f988da + V[0] + W[5] + ch(23) + rot26(V[5]); + V[0] = 0x76f988da + V[0] + W[5] + ch(23) + rot26(V[5]) + rot30(V[1]) + ma(23); + + V[3] += 0x983e5152 + V[7] + W[6] + ch(24) + rot26(V[4]); + V[7] = 0x983e5152 + V[7] + W[6] + ch(24) + rot26(V[4]) + rot30(V[0]) + ma(24); + + V[2] += 0xa831c66d + V[6] + W[7] + ch(25) + rot26(V[3]); + V[6] = 0xa831c66d + V[6] + W[7] + ch(25) + rot26(V[3]) + rot30(V[7]) + ma(25); + + V[1] += 0xb00327c8 + V[5] + W[8] + ch(26) + rot26(V[2]); + V[5] = 0xb00327c8 + V[5] + W[8] + ch(26) + rot26(V[2]) + rot30(V[6]) + ma(26); + + V[0] += 0xbf597fc7 + V[4] + W[9] + ch(27) + rot26(V[1]); + V[4] = 0xbf597fc7 + V[4] + W[9] + ch(27) + rot26(V[1]) + rot30(V[5]) + ma(27); + + V[7] += 0xc6e00bf3 + V[3] + W[10] + ch(28) + rot26(V[0]); + V[3] = 0xc6e00bf3 + V[3] + W[10] + ch(28) + rot26(V[0]) + rot30(V[4]) + ma(28); + + V[6] += 0xd5a79147 + V[2] + W[11] + ch(29) + rot26(V[7]); + V[2] = 0xd5a79147 + V[2] + W[11] + ch(29) + rot26(V[7]) + rot30(V[3]) + ma(29); + + V[5] += 0x06ca6351 + V[1] + W[12] + ch(30) + rot26(V[6]); + V[1] = 0x06ca6351 + V[1] + W[12] + ch(30) + rot26(V[6]) + rot30(V[2]) + ma(30); + + V[4] += 0x14292967 + V[0] + W[13] + ch(31) + rot26(V[5]); + V[0] = 0x14292967 + V[0] + W[13] + ch(31) + rot26(V[5]) + rot30(V[1]) + ma(31); + + V[3] += 0x27b70a85 + V[7] + W[14] + ch(32) + rot26(V[4]); + V[7] = 0x27b70a85 + V[7] + W[14] + ch(32) + rot26(V[4]) + rot30(V[0]) + ma(32); + + V[2] += 0x2e1b2138 + V[6] + W[15] + ch(33) + rot26(V[3]); + V[6] = 0x2e1b2138 + V[6] + W[15] + ch(33) + rot26(V[3]) + rot30(V[7]) + ma(33); + + V[1] += 0x4d2c6dfc + V[5] + W[16] + ch(34) + rot26(V[2]); + V[5] = 0x4d2c6dfc + V[5] + W[16] + ch(34) + rot26(V[2]) + rot30(V[6]) + ma(34); + +//---------------------------------------------------------------------------------- + + W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); + W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); + W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); + W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); + W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); + W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); + W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); + W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); + W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); + W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); + W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); + W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); + W[12] = W[13] + W[5] + rot15(W[10]) + rot25(W[14]); + W[13] = W[14] + W[6] + rot15(W[11]) + rot25(W[15]); + W[14] = W[15] + W[7] + rot15(W[12]) + rot25(W[16]); + W[15] = W[16] + W[8] + rot15(W[13]) + rot25( W[0]); + W[16] = W[0] + W[9] + rot15(W[14]) + rot25( W[1]); + + V[0] += 0x53380d13 + V[4] + W[0] + ch(35) + rot26(V[1]); + V[4] = 0x53380d13 + V[4] + W[0] + ch(35) + rot26(V[1]) + rot30(V[5]) + ma(35); + + V[7] += 0x650a7354 + V[3] + W[1] + ch(36) + rot26(V[0]); + V[3] = 0x650a7354 + V[3] + W[1] + ch(36) + rot26(V[0]) + rot30(V[4]) + ma(36); + + V[6] += 0x766a0abb + V[2] + W[2] + ch(37) + rot26(V[7]); + V[2] = 0x766a0abb + V[2] + W[2] + ch(37) + rot26(V[7]) + rot30(V[3]) + ma(37); + + V[5] += 0x81c2c92e + V[1] + W[3] + ch(38) + rot26(V[6]); + V[1] = 0x81c2c92e + V[1] + W[3] + ch(38) + rot26(V[6]) + rot30(V[2]) + ma(38); + + V[4] += 0x92722c85 + V[0] + W[4] + ch(39) + rot26(V[5]); + V[0] = 0x92722c85 + V[0] + W[4] + ch(39) + rot26(V[5]) + rot30(V[1]) + ma(39); + + V[3] += 0xa2bfe8a1 + V[7] + W[5] + ch(40) + rot26(V[4]); + V[7] = 0xa2bfe8a1 + V[7] + W[5] + ch(40) + rot26(V[4]) + rot30(V[0]) + ma(40); + + V[2] += 0xa81a664b + V[6] + W[6] + ch(41) + rot26(V[3]); + V[6] = 0xa81a664b + V[6] + W[6] + ch(41) + rot26(V[3]) + rot30(V[7]) + ma(41); + + V[1] += 0xc24b8b70 + V[5] + W[7] + ch(42) + rot26(V[2]); + V[5] = 0xc24b8b70 + V[5] + W[7] + ch(42) + rot26(V[2]) + rot30(V[6]) + ma(42); + + V[0] += 0xc76c51a3 + V[4] + W[8] + ch(43) + rot26(V[1]); + V[4] = 0xc76c51a3 + V[4] + W[8] + ch(43) + rot26(V[1]) + rot30(V[5]) + ma(43); + + V[7] += 0xd192e819 + V[3] + W[9] + ch(44) + rot26(V[0]); + V[3] = 0xd192e819 + V[3] + W[9] + ch(44) + rot26(V[0]) + rot30(V[4]) + ma(44); + + V[6] += 0xd6990624 + V[2] + W[10] + ch(45) + rot26(V[7]); + V[2] = 0xd6990624 + V[2] + W[10] + ch(45) + rot26(V[7]) + rot30(V[3]) + ma(45); + + V[5] += 0xf40e3585 + V[1] + W[11] + ch(46) + rot26(V[6]); + V[1] = 0xf40e3585 + V[1] + W[11] + ch(46) + rot26(V[6]) + rot30(V[2]) + ma(46); + + V[4] += 0x106aa070 + V[0] + W[12] + ch(47) + rot26(V[5]); + V[0] = 0x106aa070 + V[0] + W[12] + ch(47) + rot26(V[5]) + rot30(V[1]) + ma(47); + + V[3] += 0x19a4c116 + V[7] + W[13] + ch(48) + rot26(V[4]); + V[7] = 0x19a4c116 + V[7] + W[13] + ch(48) + rot26(V[4]) + rot30(V[0]) + ma(48); + + V[2] += 0x1e376c08 + V[6] + W[14] + ch(49) + rot26(V[3]); + V[6] = 0x1e376c08 + V[6] + W[14] + ch(49) + rot26(V[3]) + rot30(V[7]) + ma(49); + + V[1] += 0x2748774c + V[5] + W[15] + ch(50) + rot26(V[2]); + V[5] = 0x2748774c + V[5] + W[15] + ch(50) + rot26(V[2]) + rot30(V[6]) + ma(50); + + V[0] += 0x34b0bcb5 + V[4] + W[16] + ch(51) + rot26(V[1]); + V[4] = 0x34b0bcb5 + V[4] + W[16] + ch(51) + rot26(V[1]) + rot30(V[5]) + ma(51); + +//---------------------------------------------------------------------------------- + + W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); + W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); + W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); + W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); + W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); + W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); + W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); + W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); + W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); + W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); + W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); + W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); + + V[7] += 0x391c0cb3 + V[3] + W[0] + ch(52) + rot26(V[0]); + V[3] = 0x391c0cb3 + V[3] + W[0] + ch(52) + rot26(V[0]) + rot30(V[4]) + ma(52); + + V[6] += 0x4ed8aa4a + V[2] + W[1] + ch(53) + rot26(V[7]); + V[2] = 0x4ed8aa4a + V[2] + W[1] + ch(53) + rot26(V[7]) + rot30(V[3]) + ma(53); + + V[5] += 0x5b9cca4f + V[1] + W[2] + ch(54) + rot26(V[6]); + V[1] = 0x5b9cca4f + V[1] + W[2] + ch(54) + rot26(V[6]) + rot30(V[2]) + ma(54); + + V[4] += 0x682e6ff3 + V[0] + W[3] + ch(55) + rot26(V[5]); + V[0] = 0x682e6ff3 + V[0] + W[3] + ch(55) + rot26(V[5]) + rot30(V[1]) + ma(55); + + V[3] += 0x748f82ee + V[7] + W[4] + ch(56) + rot26(V[4]); + V[7] = 0x748f82ee + V[7] + W[4] + ch(56) + rot26(V[4]) + rot30(V[0]) + ma(56); + + V[2] += 0x78a5636f + V[6] + W[5] + ch(57) + rot26(V[3]); + V[6] = 0x78a5636f + V[6] + W[5] + ch(57) + rot26(V[3]) + rot30(V[7]) + ma(57); + + V[1] += 0x84c87814 + V[5] + W[6] + ch(58) + rot26(V[2]); + V[5] = 0x84c87814 + V[5] + W[6] + ch(58) + rot26(V[2]) + rot30(V[6]) + ma(58); + + V[0] += 0x8cc70208 + V[4] + W[7] + ch(59) + rot26(V[1]); + V[4] = 0x8cc70208 + V[4] + W[7] + ch(59) + rot26(V[1]) + rot30(V[5]) + ma(59); + + V[7] += 0x90befffa + V[3] + W[8] + ch(60) + rot26(V[0]); + V[3] = 0x90befffa + V[3] + W[8] + ch(60) + rot26(V[0]) + rot30(V[4]) + ma(60); + + V[6] += 0xa4506ceb + V[2] + W[9] + ch(61) + rot26(V[7]); + V[2] = 0xa4506ceb + V[2] + W[9] + ch(61) + rot26(V[7]) + rot30(V[3]) + ma(61); + + V[5] += 0xbef9a3f7 + V[1] + W[10] + ch(62) + rot26(V[6]); + V[1] = 0xbef9a3f7 + V[1] + W[10] + ch(62) + rot26(V[6]) + rot30(V[2]) + ma(62); + + V[4] += 0xc67178f2 + V[0] + W[11] + ch(63) + rot26(V[5]); + V[0] = 0xc67178f2 + V[0] + W[11] + ch(63) + rot26(V[5]) + rot30(V[1]) + ma(63); + +//---------------------------------------------------------------------------------- + + W[0] = state0 + V[0]; + W[1] = state1 + V[1]; + W[2] = state2 + V[2]; + W[3] = state3 + V[3]; + W[4] = state4 + V[4]; + W[5] = state5 + V[5]; + W[6] = state6 + V[6]; + W[7] = state7 + V[7]; + + // 0x98c7e2a2 + W[0] + u state0AaddV0 = state0A + V[0]; + // 0xfc08884d + W[0] + u state0BaddV0 = state0B + V[0]; + + V[2] = 0x3c6ef372 + (V[6] = 0x90bb1e3c + W[1] + Ch(state0AaddV0, 0x510e527fU, 0x9b05688cU) + rot26(state0AaddV0)); + V[6] += rot30(state0BaddV0) + Ma(0x6a09e667U, 0xbb67ae85U, state0BaddV0); + + V[1] = 0xbb67ae85 + (V[5] = 0x50c6645b + W[2] + Ch(V[2], state0AaddV0, 0x510e527fU) + rot26(V[2])); + V[5] += rot30(V[6]) + Ma(state0BaddV0, 0x6a09e667U, V[6]); + + V[0] = 0x6a09e667 + (V[4] = 0x3ac42e24 + W[3] + Ch(V[1], V[2], state0AaddV0) + rot26(V[1])); + V[4] += rot30(V[5]) + Ma(V[6], state0BaddV0, V[5]); + + V[7] = (state0BaddV0) + (V[3] = 0x3956c25b + state0AaddV0 + W[4] + Ch(V[0], V[1], V[2]) + rot26(V[0])); + V[3] += rot30(V[4]) + Ma(V[5], V[6], V[4]); + +//--------------- ch() + ma() replaced above --------------- + + V[6] += 0x59f111f1 + V[2] + W[5] + ch(69) + rot26(V[7]); + V[2] = 0x59f111f1 + V[2] + W[5] + ch(69) + rot26(V[7]) + rot30(V[3]) + ma(69); + + V[5] += 0x923f82a4 + V[1] + W[6] + ch(70) + rot26(V[6]); + V[1] = 0x923f82a4 + V[1] + W[6] + ch(70) + rot26(V[6]) + rot30(V[2]) + ma(70); + + V[4] += 0xab1c5ed5 + V[0] + W[7] + ch(71) + rot26(V[5]); + V[0] = 0xab1c5ed5 + V[0] + W[7] + ch(71) + rot26(V[5]) + rot30(V[1]) + ma(71); + + V[3] += 0x5807aa98 + V[7] + ch(72) + rot26(V[4]); + V[7] = 0x5807aa98 + V[7] + ch(72) + rot26(V[4]) + rot30(V[0]) + ma(72); + + V[2] += 0x12835b01 + V[6] + ch(73) + rot26(V[3]); + V[6] = 0x12835b01 + V[6] + ch(73) + rot26(V[3]) + rot30(V[7]) + ma(73); + + V[1] += 0x243185be + V[5] + ch(74) + rot26(V[2]); + V[5] = 0x243185be + V[5] + ch(74) + rot26(V[2]) + rot30(V[6]) + ma(74); + + V[0] += 0x550c7dc3 + V[4] + ch(75) + rot26(V[1]); + V[4] = 0x550c7dc3 + V[4] + ch(75) + rot26(V[1]) + rot30(V[5]) + ma(75); + + V[7] += 0x72be5d74 + V[3] + ch(76) + rot26(V[0]); + V[3] = 0x72be5d74 + V[3] + ch(76) + rot26(V[0]) + rot30(V[4]) + ma(76); + + V[6] += 0x80deb1fe + V[2] + ch(77) + rot26(V[7]); + V[2] = 0x80deb1fe + V[2] + ch(77) + rot26(V[7]) + rot30(V[3]) + ma(77); + + V[5] += 0x9bdc06a7 + V[1] + ch(78) + rot26(V[6]); + V[1] = 0x9bdc06a7 + V[1] + ch(78) + rot26(V[6]) + rot30(V[2]) + ma(78); + + V[4] += 0xc19bf274 + V[0] + ch(79) + rot26(V[5]); + V[0] = 0xc19bf274 + V[0] + ch(79) + rot26(V[5]) + rot30(V[1]) + ma(79); + +//---------------------------------------------------------------------------------- + + W[0] = W[0] + rot25(W[1]); + W[1] = 0x00a00000 + W[1] + rot25(W[2]); + W[2] = W[2] + rot15(W[0]) + rot25(W[3]); + W[3] = W[3] + rot15(W[1]) + rot25(W[4]); + W[4] = W[4] + rot15(W[2]) + rot25(W[5]); + W[5] = W[5] + rot15(W[3]) + rot25(W[6]); + W[6] = 0x00000100 + W[6] + rot15(W[4]) + rot25(W[7]); + W[7] = 0x11002000 + W[7] + W[0] + rot15(W[5]); + W[8] = 0x80000000 + W[1] + rot15(W[6]); + W[9] = W[2] + rot15(W[7]); + W[10] = W[3] + rot15(W[8]); + W[11] = W[4] + rot15(W[9]); + W[12] = W[5] + rot15(W[10]); + W[13] = W[6] + rot15(W[11]); + W[14] = 0x00400022 + W[7] + rot15( W[12]); + W[15] = 0x00000100 + W[8] + rot15( W[13]) + rot25(W[0]); + W[16] = W[0] + W[9] + rot15( W[14]) + rot25(W[1]); + + V[3] += 0xe49b69c1 + V[7] + W[0] + ch(80) + rot26(V[4]); + V[7] = 0xe49b69c1 + V[7] + W[0] + ch(80) + rot26(V[4]) + rot30(V[0]) + ma(80); + + V[2] += 0xefbe4786 + V[6] + W[1] + ch(81) + rot26(V[3]); + V[6] = 0xefbe4786 + V[6] + W[1] + ch(81) + rot26(V[3]) + rot30(V[7]) + ma(81); + + V[1] += 0x0fc19dc6 + V[5] + W[2] + ch(82) + rot26(V[2]); + V[5] = 0x0fc19dc6 + V[5] + W[2] + ch(82) + rot26(V[2]) + rot30(V[6]) + ma(82); + + V[0] += 0x240ca1cc + V[4] + W[3] + ch(83) + rot26(V[1]); + V[4] = 0x240ca1cc + V[4] + W[3] + ch(83) + rot26(V[1]) + rot30(V[5]) + ma(83); + + V[7] += 0x2de92c6f + V[3] + W[4] + ch(84) + rot26(V[0]); + V[3] = 0x2de92c6f + V[3] + W[4] + ch(84) + rot26(V[0]) + rot30(V[4]) + ma(84); + + V[6] += 0x4a7484aa + V[2] + W[5] + ch(85) + rot26(V[7]); + V[2] = 0x4a7484aa + V[2] + W[5] + ch(85) + rot26(V[7]) + rot30(V[3]) + ma(85); + + V[5] += 0x5cb0a9dc + V[1] + W[6] + ch(86) + rot26(V[6]); + V[1] = 0x5cb0a9dc + V[1] + W[6] + ch(86) + rot26(V[6]) + rot30(V[2]) + ma(86); + + V[4] += 0x76f988da + V[0] + W[7] + ch(87) + rot26(V[5]); + V[0] = 0x76f988da + V[0] + W[7] + ch(87) + rot26(V[5]) + rot30(V[1]) + ma(87); + + V[3] += 0x983e5152 + V[7] + W[8] + ch(88) + rot26(V[4]); + V[7] = 0x983e5152 + V[7] + W[8] + ch(88) + rot26(V[4]) + rot30(V[0]) + ma(88); + + V[2] += 0xa831c66d + V[6] + W[9] + ch(89) + rot26(V[3]); + V[6] = 0xa831c66d + V[6] + W[9] + ch(89) + rot26(V[3]) + rot30(V[7]) + ma(89); + + V[1] += 0xb00327c8 + V[5] + W[10] + ch(90) + rot26(V[2]); + V[5] = 0xb00327c8 + V[5] + W[10] + ch(90) + rot26(V[2]) + rot30(V[6]) + ma(90); + + V[0] += 0xbf597fc7 + V[4] + W[11] + ch(91) + rot26(V[1]); + V[4] = 0xbf597fc7 + V[4] + W[11] + ch(91) + rot26(V[1]) + rot30(V[5]) + ma(91); + + V[7] += 0xc6e00bf3 + V[3] + W[12] + ch(92) + rot26(V[0]); + V[3] = 0xc6e00bf3 + V[3] + W[12] + ch(92) + rot26(V[0]) + rot30(V[4]) + ma(92); + + V[6] += 0xd5a79147 + V[2] + W[13] + ch(93) + rot26(V[7]); + V[2] = 0xd5a79147 + V[2] + W[13] + ch(93) + rot26(V[7]) + rot30(V[3]) + ma(93); + + V[5] += 0x06ca6351 + V[1] + W[14] + ch(94) + rot26(V[6]); + V[1] = 0x06ca6351 + V[1] + W[14] + ch(94) + rot26(V[6]) + rot30(V[2]) + ma(94); + + V[4] += 0x14292967 + V[0] + W[15] + ch(95) + rot26(V[5]); + V[0] = 0x14292967 + V[0] + W[15] + ch(95) + rot26(V[5]) + rot30(V[1]) + ma(95); + + V[3] += 0x27b70a85 + V[7] + W[16] + ch(96) + rot26(V[4]); + V[7] = 0x27b70a85 + V[7] + W[16] + ch(96) + rot26(V[4]) + rot30(V[0]) + ma(96); + +//---------------------------------------------------------------------------------- + + W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); + W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); + W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); + W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); + W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); + W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); + W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); + W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); + W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); + W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); + W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); + W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); + W[12] = W[13] + W[5] + rot15(W[10]) + rot25(W[14]); + W[13] = W[14] + W[6] + rot15(W[11]) + rot25(W[15]); + W[14] = W[15] + W[7] + rot15(W[12]) + rot25(W[16]); + W[15] = W[16] + W[8] + rot15(W[13]) + rot25( W[0]); + W[16] = W[0] + W[9] + rot15(W[14]) + rot25( W[1]); + + V[2] += 0x2e1b2138 + V[6] + W[0] + ch(97) + rot26(V[3]); + V[6] = 0x2e1b2138 + V[6] + W[0] + ch(97) + rot26(V[3]) + rot30(V[7]) + ma(97); + + V[1] += 0x4d2c6dfc + V[5] + W[1] + ch(98) + rot26(V[2]); + V[5] = 0x4d2c6dfc + V[5] + W[1] + ch(98) + rot26(V[2]) + rot30(V[6]) + ma(98); + + V[0] += 0x53380d13 + V[4] + W[2] + ch(99) + rot26(V[1]); + V[4] = 0x53380d13 + V[4] + W[2] + ch(99) + rot26(V[1]) + rot30(V[5]) + ma(99); + + V[7] += 0x650a7354 + V[3] + W[3] + ch(100) + rot26(V[0]); + V[3] = 0x650a7354 + V[3] + W[3] + ch(100) + rot26(V[0]) + rot30(V[4]) + ma(100); + + V[6] += 0x766a0abb + V[2] + W[4] + ch(101) + rot26(V[7]); + V[2] = 0x766a0abb + V[2] + W[4] + ch(101) + rot26(V[7]) + rot30(V[3]) + ma(101); + + V[5] += 0x81c2c92e + V[1] + W[5] + ch(102) + rot26(V[6]); + V[1] = 0x81c2c92e + V[1] + W[5] + ch(102) + rot26(V[6]) + rot30(V[2]) + ma(102); + + V[4] += 0x92722c85 + V[0] + W[6] + ch(103) + rot26(V[5]); + V[0] = 0x92722c85 + V[0] + W[6] + ch(103) + rot26(V[5]) + rot30(V[1]) + ma(103); + + V[3] += 0xa2bfe8a1 + V[7] + W[7] + ch(104) + rot26(V[4]); + V[7] = 0xa2bfe8a1 + V[7] + W[7] + ch(104) + rot26(V[4]) + rot30(V[0]) + ma(104); + + V[2] += 0xa81a664b + V[6] + W[8] + ch(105) + rot26(V[3]); + V[6] = 0xa81a664b + V[6] + W[8] + ch(105) + rot26(V[3]) + rot30(V[7]) + ma(105); + + V[1] += 0xc24b8b70 + V[5] + W[9] + ch(106) + rot26(V[2]); + V[5] = 0xc24b8b70 + V[5] + W[9] + ch(106) + rot26(V[2]) + rot30(V[6]) + ma(106); + + V[0] += 0xc76c51a3 + V[4] + W[10] + ch(107) + rot26(V[1]); + V[4] = 0xc76c51a3 + V[4] + W[10] + ch(107) + rot26(V[1]) + rot30(V[5]) + ma(107); + + V[7] += 0xd192e819 + V[3] + W[11] + ch(108) + rot26(V[0]); + V[3] = 0xd192e819 + V[3] + W[11] + ch(108) + rot26(V[0]) + rot30(V[4]) + ma(108); + + V[6] += 0xd6990624 + V[2] + W[12] + ch(109) + rot26(V[7]); + V[2] = 0xd6990624 + V[2] + W[12] + ch(109) + rot26(V[7]) + rot30(V[3]) + ma(109); + + V[5] += 0xf40e3585 + V[1] + W[13] + ch(110) + rot26(V[6]); + V[1] = 0xf40e3585 + V[1] + W[13] + ch(110) + rot26(V[6]) + rot30(V[2]) + ma(110); + + V[4] += 0x106aa070 + V[0] + W[14] + ch(111) + rot26(V[5]); + V[0] = 0x106aa070 + V[0] + W[14] + ch(111) + rot26(V[5]) + rot30(V[1]) + ma(111); + + V[3] += 0x19a4c116 + V[7] + W[15] + ch(112) + rot26(V[4]); + V[7] = 0x19a4c116 + V[7] + W[15] + ch(112) + rot26(V[4]) + rot30(V[0]) + ma(112); + + V[2] += 0x1e376c08 + V[6] + W[16] + ch(113) + rot26(V[3]); + V[6] = 0x1e376c08 + V[6] + W[16] + ch(113) + rot26(V[3]) + rot30(V[7]) + ma(113); + +//---------------------------------------------------------------------------------- + + W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); + W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); + W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); + W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); + W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); + W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); + W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); + W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); + W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); + W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); + W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); + + V[1] += 0x2748774c + V[5] + W[0] + ch(114) + rot26(V[2]); + V[5] = 0x2748774c + V[5] + W[0] + ch(114) + rot26(V[2]) + rot30(V[6]) + ma(114); + + V[0] += 0x34b0bcb5 + V[4] + W[1] + ch(115) + rot26(V[1]); + V[4] = 0x34b0bcb5 + V[4] + W[1] + ch(115) + rot26(V[1]) + rot30(V[5]) + ma(115); + + V[7] += 0x391c0cb3 + V[3] + W[2] + ch(116) + rot26(V[0]); + V[3] = 0x391c0cb3 + V[3] + W[2] + ch(116) + rot26(V[0]) + rot30(V[4]) + ma(116); + + V[6] += 0x4ed8aa4a + V[2] + W[3] + ch(117) + rot26(V[7]); + V[2] = 0x4ed8aa4a + V[2] + W[3] + ch(117) + rot26(V[7]) + rot30(V[3]) + ma(117); + + V[5] += 0x5b9cca4f + V[1] + W[4] + ch(118) + rot26(V[6]); + V[1] = 0x5b9cca4f + V[1] + W[4] + ch(118) + rot26(V[6]) + rot30(V[2]) + ma(118); + + V[4] += 0x682e6ff3 + V[0] + W[5] + ch(119) + rot26(V[5]); + V[0] = 0x682e6ff3 + V[0] + W[5] + ch(119) + rot26(V[5]) + rot30(V[1]) + ma(119); + + V[3] += 0x748f82ee + V[7] + W[6] + ch(120) + rot26(V[4]); + V[7] = 0x748f82ee + V[7] + W[6] + ch(120) + rot26(V[4]) + rot30(V[0]) + ma(120); + + V[2] += 0x78a5636f + V[6] + W[7] + ch(121) + rot26(V[3]); + + V[1] += 0x84c87814 + V[5] + W[8] + ch(122) + rot26(V[2]); + + V[0] += 0x8cc70208 + V[4] + W[9] + ch(123) + rot26(V[1]); + + V[7] += V[3] + W[10] + ch(124) + rot26(V[0]); + +#ifdef VECTORS8 + u result = (u)(((V[7].s0 == 0x136032ed) * nonce.s0), ((V[7].s1 == 0x136032ed) * nonce.s1), ((V[7].s2 == 0x136032ed) * nonce.s2), ((V[7].s3 == 0x136032ed) * nonce.s3), + ((V[7].s4 == 0x136032ed) * nonce.s4), ((V[7].s5 == 0x136032ed) * nonce.s5), ((V[7].s6 == 0x136032ed) * nonce.s6), ((V[7].s7 == 0x136032ed) * nonce.s7)); + output[0 + (upsample(result.s0, result.s1) > 0)] = upsample(result.s0, result.s1); + output[2 + (upsample(result.s2, result.s3) > 1)] = upsample(result.s2, result.s3); + output[4 + (upsample(result.s4, result.s5) > 0)] = upsample(result.s4, result.s5); + output[6 + (upsample(result.s6, result.s7) > 1)] = upsample(result.s6, result.s7); +#elif defined VECTORS4 + u result = (u)(((V[7].x == 0x136032ed) * nonce.x), ((V[7].y == 0x136032ed) * nonce.y), ((V[7].z == 0x136032ed) * nonce.z), ((V[7].w == 0x136032ed) * nonce.w)); + output[0 + (upsample(result.x, result.y) > 0)] = upsample(result.x, result.y); + output[2 + (upsample(result.z, result.w) > 1)] = upsample(result.z, result.w); +#elif defined VECTORS2 + u result = (u)(((V[7].x == 0x136032ed) * nonce.x), ((V[7].y == 0x136032ed) * nonce.y)); + output[upsample(result.x, result.y) > 0] = upsample(result.x, result.y); +#else + u result = (V[7] == 0x136032ed) * nonce; + output[result != 0] = result; +#endif +} From a6c6866a0dd29de911f6ffa1010fdcbb08a3c184 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Wed, 8 Feb 2012 13:45:56 +1100 Subject: [PATCH 2/8] Add basic build ability with diakgcn and put all kernel names in configure.ac to avoid changing them in mutliple places. --- Makefile.am | 4 ++-- configure.ac | 5 +++++ device-gpu.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- miner.h | 1 + ocl.c | 16 +++++++++++----- 5 files changed, 64 insertions(+), 10 deletions(-) diff --git a/Makefile.am b/Makefile.am index 9c8c2944..347b4af4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -17,7 +17,7 @@ INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) bin_PROGRAMS = cgminer -bin_SCRIPTS = phatk120203.cl poclbm120203.cl +bin_SCRIPTS = *.cl cgminer_LDFLAGS = $(PTHREAD_FLAGS) cgminer_LDADD = $(DLOPEN_FLAGS) @LIBCURL_LIBS@ @JANSSON_LIBS@ @PTHREAD_LIBS@ \ @@ -44,7 +44,7 @@ cgminer_SOURCES += device-gpu.h device-gpu.c # the original GPU related sources, unchanged cgminer_SOURCES += ocl.c ocl.h findnonce.c findnonce.h cgminer_SOURCES += adl.c adl.h adl_functions.h -cgminer_SOURCES += phatk120203.cl poclbm120203.cl +cgminer_SOURCES += *.cl if HAS_CPUMINE # original CPU related sources, unchanged diff --git a/configure.ac b/configure.ac index 9dcd9b7a..b8a26b39 100644 --- a/configure.ac +++ b/configure.ac @@ -293,6 +293,11 @@ fi AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to cgminer install]) +AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120203"], [Filename for phatk kernel]) +AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120203"], [Filename for poclbm kernel]) +AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120208"], [Filename for diakgcn kernel]) + + AC_SUBST(OPENCL_LIBS) AC_SUBST(OPENCL_FLAGS) AC_SUBST(JANSSON_LIBS) diff --git a/device-gpu.c b/device-gpu.c index e05f5546..d40db48c 100644 --- a/device-gpu.c +++ b/device-gpu.c @@ -754,6 +754,43 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk) return status; } +static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk) +{ + cl_kernel *kernel = &clState->kernel; + cl_int status = 0; + int num = 0; + + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce); + + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e); + status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2); + + status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer), + (void *)&clState->outputBuffer); + + return status; +} + static void set_threads_hashes(unsigned int vectors, unsigned int *threads, unsigned int *hashes, size_t *globalThreads, unsigned int minthreads, int intensity) @@ -905,9 +942,11 @@ static void opencl_detect() return; if (opt_kernel) { - if (strcmp(opt_kernel, "poclbm") && strcmp(opt_kernel, "phatk")) - quit(1, "Invalid kernel name specified - must be poclbm or phatk"); - if (!strcmp(opt_kernel, "poclbm")) + if (strcmp(opt_kernel, "poclbm") && strcmp(opt_kernel, "phatk") && strcmp(opt_kernel, "diakgcn")) + quit(1, "Invalid kernel name specified - must be poclbm, phatk or diakgcn"); + if (!strcmp(opt_kernel, "diakgcn")) + chosen_kernel = KL_DIAKGCN; + else if (!strcmp(opt_kernel, "poclbm")) chosen_kernel = KL_POCLBM; else chosen_kernel = KL_PHATK; @@ -1042,6 +1081,9 @@ static bool opencl_thread_init(struct thr_info *thr) default: thrdata->queue_kernel_parameters = &queue_phatk_kernel; break; + case KL_DIAKGCN: + thrdata->queue_kernel_parameters = &queue_diakgcn_kernel; + break; } thrdata->res = calloc(BUFFERSIZE, 1); diff --git a/miner.h b/miner.h index 44e6a537..adedaaac 100644 --- a/miner.h +++ b/miner.h @@ -675,6 +675,7 @@ enum cl_kernel { KL_NONE, KL_POCLBM, KL_PHATK, + KL_DIAKGCN, }; extern void get_datestamp(char *, struct timeval *); diff --git a/ocl.c b/ocl.c index 76ceb605..53115576 100644 --- a/ocl.c +++ b/ocl.c @@ -348,21 +348,27 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) char filename[16]; if (chosen_kernel == KL_NONE) { - if (!clState->hasBitAlign || strstr(name, "Tahiti")) + if (strstr(name, "Tahiti")) + chosen_kernel = KL_DIAKGCN; + else if (!clState->hasBitAlign) chosen_kernel = KL_POCLBM; else chosen_kernel = KL_PHATK; } switch (chosen_kernel) { + case KL_DIAKGCN: + strcpy(filename, DIAKGCN_KERNNAME".cl"); + strcpy(binaryfilename, DIAKGCN_KERNNAME); + break; case KL_POCLBM: - strcpy(filename, "poclbm120203.cl"); - strcpy(binaryfilename, "poclbm120203"); + strcpy(filename, POCLBM_KERNNAME".cl"); + strcpy(binaryfilename, POCLBM_KERNNAME); break; case KL_NONE: /* Shouldn't happen */ case KL_PHATK: - strcpy(filename, "phatk120203.cl"); - strcpy(binaryfilename, "phatk120203"); + strcpy(filename, PHATK_KERNNAME".cl"); + strcpy(binaryfilename, PHATK_KERNNAME); break; } From 53d33c753fc50d32fe03e7e1776d74444763943d Mon Sep 17 00:00:00 2001 From: ckolivas Date: Wed, 8 Feb 2012 15:40:10 +1100 Subject: [PATCH 3/8] First working port of the diakgcn kernel. --- device-gpu.c | 175 +++++++++++++++++++++++++---------------------- diakgcn120208.cl | 78 +++++++++++---------- findnonce.c | 10 ++- miner.h | 4 ++ 4 files changed, 151 insertions(+), 116 deletions(-) diff --git a/device-gpu.c b/device-gpu.c index d40db48c..e39bff4b 100644 --- a/device-gpu.c +++ b/device-gpu.c @@ -673,39 +673,42 @@ void manage_gpu(void) #ifdef HAVE_OPENCL static _clState *clStates[MAX_GPUDEVICES]; +#define CL_SET_BLKARG(blkvar) status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->blkvar) +#define CL_SET_ARG(var) status |= clSetKernelArg(*kernel, num++, sizeof(var), (void *)&var) +#define CL_SET_VARG(args, var) status |= clSetKernelArg(*kernel, num++, args * sizeof(uint), (void *)var) + static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk) { cl_kernel *kernel = &clState->kernel; cl_int status = 0; int num = 0; - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce); - - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2); - - status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer), - (void *)&clState->outputBuffer); + CL_SET_BLKARG(ctx_a); + CL_SET_BLKARG(ctx_b); + CL_SET_BLKARG(ctx_c); + CL_SET_BLKARG(ctx_d); + CL_SET_BLKARG(ctx_e); + CL_SET_BLKARG(ctx_f); + CL_SET_BLKARG(ctx_g); + CL_SET_BLKARG(ctx_h); + CL_SET_BLKARG(cty_b); + CL_SET_BLKARG(cty_c); + CL_SET_BLKARG(cty_d); + CL_SET_BLKARG(cty_f); + CL_SET_BLKARG(cty_g); + CL_SET_BLKARG(cty_h); + CL_SET_BLKARG(nonce); + + CL_SET_BLKARG(fW0); + CL_SET_BLKARG(fW1); + CL_SET_BLKARG(fW2); + CL_SET_BLKARG(fW3); + CL_SET_BLKARG(fW15); + CL_SET_BLKARG(fW01r); + CL_SET_BLKARG(fcty_e); + CL_SET_BLKARG(fcty_e2); + + CL_SET_ARG(clState->outputBuffer); return status; } @@ -718,75 +721,87 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk) int i, num = 0; uint *nonces; - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h); - - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h); + CL_SET_BLKARG(ctx_a); + CL_SET_BLKARG(ctx_b); + CL_SET_BLKARG(ctx_c); + CL_SET_BLKARG(ctx_d); + CL_SET_BLKARG(ctx_e); + CL_SET_BLKARG(ctx_f); + CL_SET_BLKARG(ctx_g); + CL_SET_BLKARG(ctx_h); + + CL_SET_BLKARG(cty_b); + CL_SET_BLKARG(cty_c); + CL_SET_BLKARG(cty_d); + CL_SET_BLKARG(cty_f); + CL_SET_BLKARG(cty_g); + CL_SET_BLKARG(cty_h); nonces = alloca(sizeof(uint) * vwidth); for (i = 0; i < vwidth; i++) nonces[i] = blk->nonce + i; status |= clSetKernelArg(*kernel, num++, vwidth * sizeof(uint), (void *)nonces); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W16); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W17); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreVal4_2); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreVal0); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW18); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW19); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW31); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW32); + CL_SET_BLKARG(W16); + CL_SET_BLKARG(W17); + CL_SET_BLKARG(PreVal4_2); + CL_SET_BLKARG(PreVal0); + CL_SET_BLKARG(PreW18); + CL_SET_BLKARG(PreW19); + CL_SET_BLKARG(PreW31); + CL_SET_BLKARG(PreW32); - status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer), - (void *)&clState->outputBuffer); + CL_SET_ARG(clState->outputBuffer); return status; } static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk) { + cl_uint vwidth = clState->preferred_vwidth; cl_kernel *kernel = &clState->kernel; cl_int status = 0; - int num = 0; + int i, num = 0; + uint *nonces; - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce); - - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2); - - status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer), - (void *)&clState->outputBuffer); + nonces = alloca(sizeof(uint) * vwidth); + for (i = 0; i < vwidth; i++) + nonces[i] = blk->nonce + i; + CL_SET_VARG(vwidth, nonces); + + CL_SET_BLKARG(PreVal4); + CL_SET_BLKARG(cty_h); + CL_SET_BLKARG(cty_d); + CL_SET_BLKARG(PreVal0); + CL_SET_BLKARG(cty_b); + CL_SET_BLKARG(cty_c); + CL_SET_BLKARG(cty_f); + CL_SET_BLKARG(cty_g); + CL_SET_BLKARG(C1addK5); + CL_SET_BLKARG(B1addK6); + CL_SET_BLKARG(PreVal0addK7); + CL_SET_BLKARG(W16addK16); + CL_SET_BLKARG(W17addK17); + CL_SET_BLKARG(PreW18); + CL_SET_BLKARG(PreW19); + CL_SET_BLKARG(W16); + CL_SET_BLKARG(W17); + CL_SET_BLKARG(PreW31); + CL_SET_BLKARG(PreW32); + + CL_SET_BLKARG(ctx_a); + CL_SET_BLKARG(ctx_b); + CL_SET_BLKARG(ctx_c); + CL_SET_BLKARG(ctx_d); + CL_SET_BLKARG(ctx_e); + CL_SET_BLKARG(ctx_f); + CL_SET_BLKARG(ctx_g); + CL_SET_BLKARG(ctx_h); + + CL_SET_BLKARG(A0); + CL_SET_BLKARG(B0); + + CL_SET_ARG(clState->outputBuffer); return status; } diff --git a/diakgcn120208.cl b/diakgcn120208.cl index f8b263fb..84f02e11 100644 --- a/diakgcn120208.cl +++ b/diakgcn120208.cl @@ -3,9 +3,7 @@ // Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3. // The kernel was rewritten by me (Diapolo) and is still public-domain! -#ifdef VECTORS8 - typedef uint8 u; -#elif defined VECTORS4 +#if defined VECTORS4 typedef uint4 u; #elif defined VECTORS2 typedef uint2 u; @@ -31,9 +29,7 @@ #ifdef GOFFSET typedef uint uu; #else - #ifdef VECTORS8 - typedef uint8 uu; - #elif defined VECTORS4 + #if defined VECTORS4 typedef uint4 uu; #elif defined VECTORS2 typedef uint2 uu; @@ -67,29 +63,23 @@ __kernel u W[17]; u V[8]; -#ifdef VECTORS8 +#if defined VECTORS4 #ifdef GOFFSET - u nonce = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7); + u nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3); #else - u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + ((uint)get_local_id(0) * 8U) + base; - #endif -#elif defined VECTORS4 - #ifdef GOFFSET - u nonce = ((uint)get_global_id(0) << 2) + (u)(0, 1, 2, 3); - #else - u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + ((uint)get_local_id(0) * 4U) + base; + u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); #endif #elif defined VECTORS2 #ifdef GOFFSET - u nonce = ((uint)get_global_id(0) << 1) + (u)(0, 1); + u nonce = base + (get_global_id(0)<<1) + (uint2)(0, 1); #else - u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) * 2U) + ((uint)get_local_id(0) * 2U) + base; + u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); #endif #else #ifdef GOFFSET - u nonce = (uint)get_global_id(0); + u nonce = base + get_global_id(0); #else - u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + (uint)get_local_id(0) + base; + u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); #endif #endif @@ -589,22 +579,40 @@ __kernel V[7] += V[3] + W[10] + ch(124) + rot26(V[0]); -#ifdef VECTORS8 - u result = (u)(((V[7].s0 == 0x136032ed) * nonce.s0), ((V[7].s1 == 0x136032ed) * nonce.s1), ((V[7].s2 == 0x136032ed) * nonce.s2), ((V[7].s3 == 0x136032ed) * nonce.s3), - ((V[7].s4 == 0x136032ed) * nonce.s4), ((V[7].s5 == 0x136032ed) * nonce.s5), ((V[7].s6 == 0x136032ed) * nonce.s6), ((V[7].s7 == 0x136032ed) * nonce.s7)); - output[0 + (upsample(result.s0, result.s1) > 0)] = upsample(result.s0, result.s1); - output[2 + (upsample(result.s2, result.s3) > 1)] = upsample(result.s2, result.s3); - output[4 + (upsample(result.s4, result.s5) > 0)] = upsample(result.s4, result.s5); - output[6 + (upsample(result.s6, result.s7) > 1)] = upsample(result.s6, result.s7); -#elif defined VECTORS4 - u result = (u)(((V[7].x == 0x136032ed) * nonce.x), ((V[7].y == 0x136032ed) * nonce.y), ((V[7].z == 0x136032ed) * nonce.z), ((V[7].w == 0x136032ed) * nonce.w)); - output[0 + (upsample(result.x, result.y) > 0)] = upsample(result.x, result.y); - output[2 + (upsample(result.z, result.w) > 1)] = upsample(result.z, result.w); -#elif defined VECTORS2 - u result = (u)(((V[7].x == 0x136032ed) * nonce.x), ((V[7].y == 0x136032ed) * nonce.y)); - output[upsample(result.x, result.y) > 0] = upsample(result.x, result.y); + +#define FOUND (0x80) +#define NFLAG (0x7F) + +#ifdef VECTORS4 + V[7] ^= 0x136032ed; + + bool result = V[7].x & V[7].y & V[7].z & V[7].w; + + if (!result) { + if (!V[7].x) + output[FOUND] = output[NFLAG & W[3].x] = W[3].x; + if (!V[7].y) + output[FOUND] = output[NFLAG & W[3].y] = W[3].y; + if (!V[7].z) + output[FOUND] = output[NFLAG & W[3].z] = W[3].z; + if (!V[7].w) + output[FOUND] = output[NFLAG & W[3].w] = W[3].w; + } #else - u result = (V[7] == 0x136032ed) * nonce; - output[result != 0] = result; + #ifdef VECTORS2 + V[7] ^= 0x136032ed; + + bool result = V[7].x & V[7].y; + + if (!result) { + if (!V[7].x) + output[FOUND] = output[NFLAG & W[3].x] = W[3].x; + if (!V[7].y) + output[FOUND] = output[NFLAG & W[3].y] = W[3].y; + } + #else + if (V[7] == 0x136032ed) + output[FOUND] = output[NFLAG & W[3]] = W[3]; + #endif #endif } diff --git a/findnonce.c b/findnonce.c index 35fd14e0..da9c4ecb 100644 --- a/findnonce.c +++ b/findnonce.c @@ -115,7 +115,15 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { blk->PreVal4addT1 = blk->PreVal4 + blk->T1; - blk->T1substate0 = state[0] - blk->T1; + blk->T1substate0 = blk->ctx_a - blk->T1; + + blk->B1addK6 = blk->cty_b + 0x923f82a4; + blk->PreVal0addK7 = blk->PreVal0 + 0xab1c5ed5; + blk->W16addK16 = blk->W16 + 0xe49b69c1; + blk->W17addK17 = blk->W17 + 0xefbe4786; + + blk->A0 = blk->ctx_a + 0x98c7e2a2; + blk->B0 = blk->ctx_a + 0xfc08884d; } #define P(t) (W[(t)&0xF] = W[(t-16)&0xF] + (rotate(W[(t-15)&0xF], 25) ^ rotate(W[(t-15)&0xF], 14) ^ (W[(t-15)&0xF] >> 3)) + W[(t-7)&0xF] + (rotate(W[(t-2)&0xF], 15) ^ rotate(W[(t-2)&0xF], 13) ^ (W[(t-2)&0xF] >> 10))) diff --git a/miner.h b/miner.h index adedaaac..a13113c2 100644 --- a/miner.h +++ b/miner.h @@ -610,6 +610,10 @@ typedef struct { cl_uint PreW19; cl_uint PreW31; cl_uint PreW32; + + /* For diakgcn */ + cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17; + cl_uint A0, B0; } dev_blk_ctx; #else typedef struct { From 5eab0e76aae20e5e9fafacc084ea801c1e6c42b0 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Wed, 8 Feb 2012 16:02:05 +1100 Subject: [PATCH 4/8] Use correct variable on output in diakgcn kernel. --- diakgcn120208.cl | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/diakgcn120208.cl b/diakgcn120208.cl index 84f02e11..f128ab7e 100644 --- a/diakgcn120208.cl +++ b/diakgcn120208.cl @@ -590,13 +590,13 @@ __kernel if (!result) { if (!V[7].x) - output[FOUND] = output[NFLAG & W[3].x] = W[3].x; + output[FOUND] = output[NFLAG & nonce.x] = nonce.x; if (!V[7].y) - output[FOUND] = output[NFLAG & W[3].y] = W[3].y; + output[FOUND] = output[NFLAG & nonce.y] = nonce.y; if (!V[7].z) - output[FOUND] = output[NFLAG & W[3].z] = W[3].z; + output[FOUND] = output[NFLAG & nonce.z] = nonce.z; if (!V[7].w) - output[FOUND] = output[NFLAG & W[3].w] = W[3].w; + output[FOUND] = output[NFLAG & nonce.w] = nonce.w; } #else #ifdef VECTORS2 @@ -606,13 +606,13 @@ __kernel if (!result) { if (!V[7].x) - output[FOUND] = output[NFLAG & W[3].x] = W[3].x; + output[FOUND] = output[NFLAG & nonce.x] = nonce.x; if (!V[7].y) - output[FOUND] = output[NFLAG & W[3].y] = W[3].y; + output[FOUND] = output[NFLAG & nonce.y] = nonce.y; } #else if (V[7] == 0x136032ed) - output[FOUND] = output[NFLAG & W[3]] = W[3]; + output[FOUND] = output[NFLAG & nonce] = nonce; #endif #endif } From 7a069b93af94eb4b969ef167aa356ec45c3b15a3 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Wed, 8 Feb 2012 16:24:46 +1100 Subject: [PATCH 5/8] Use K array explicitly to make it clear what is being added. --- findnonce.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/findnonce.c b/findnonce.c index da9c4ecb..299452ff 100644 --- a/findnonce.c +++ b/findnonce.c @@ -67,7 +67,7 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { blk->cty_b = B; blk->cty_c = C; - blk->C1addK5 = C + 0x59f111f1; + blk->C1addK5 = C + SHA256_K[5]; blk->cty_d = D; @@ -117,10 +117,10 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { blk->PreVal4addT1 = blk->PreVal4 + blk->T1; blk->T1substate0 = blk->ctx_a - blk->T1; - blk->B1addK6 = blk->cty_b + 0x923f82a4; - blk->PreVal0addK7 = blk->PreVal0 + 0xab1c5ed5; - blk->W16addK16 = blk->W16 + 0xe49b69c1; - blk->W17addK17 = blk->W17 + 0xefbe4786; + blk->B1addK6 = blk->cty_b + SHA256_K[6]; + blk->PreVal0addK7 = blk->PreVal0 + SHA256_K[7]; + blk->W16addK16 = blk->W16 + SHA256_K[16]; + blk->W17addK17 = blk->W17 + SHA256_K[17]; blk->A0 = blk->ctx_a + 0x98c7e2a2; blk->B0 = blk->ctx_a + 0xfc08884d; From c5142b7f124f0e05cdb576f19bad341fed4e1e8a Mon Sep 17 00:00:00 2001 From: ckolivas Date: Wed, 8 Feb 2012 16:45:37 +1100 Subject: [PATCH 6/8] Convert to Unix EOL. --- diakgcn120208.cl | 1236 +++++++++++++++++++++++----------------------- 1 file changed, 618 insertions(+), 618 deletions(-) diff --git a/diakgcn120208.cl b/diakgcn120208.cl index f128ab7e..b5d31969 100644 --- a/diakgcn120208.cl +++ b/diakgcn120208.cl @@ -1,618 +1,618 @@ -// DiaKGCN 04-02-2012 - OpenCL kernel by Diapolo -// -// Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3. -// The kernel was rewritten by me (Diapolo) and is still public-domain! - -#if defined VECTORS4 - typedef uint4 u; -#elif defined VECTORS2 - typedef uint2 u; -#else - typedef uint u; -#endif - -#ifdef BFI_INT - #pragma OPENCL EXTENSION cl_amd_media_ops : enable - #define Ch(x, y, z) amd_bytealign(x, y, z) - #define Ma(x, y, z) amd_bytealign(z ^ x, y, x) -#else - #define Ch(x, y, z) bitselect(z, y, x) - #if defined(VECTORS2) || defined(VECTORS4) || defined(VECTORS8) - // GCN - VEC2 or VEC4 - #define Ma(z, x, y) bitselect(z, y, z ^ x) - #else - // GCN - no VEC - #define Ma(z, x, y) Ch(z ^ x, y, x) - #endif -#endif - -#ifdef GOFFSET - typedef uint uu; -#else - #if defined VECTORS4 - typedef uint4 uu; - #elif defined VECTORS2 - typedef uint2 uu; - #else - typedef uint uu; - #endif -#endif - -#define ch(n) Ch(V[(4 + 128 - n) % 8], V[(5 + 128 - n) % 8], V[(6 + 128 - n) % 8]) -#define ma(n) Ma(V[(1 + 128 - n) % 8], V[(2 + 128 - n) % 8], V[(0 + 128 - n) % 8]) - -#define rot15(n) (rotate(n, 15U) ^ rotate(n, 13U) ^ (n >> 10U)) -#define rot25(n) (rotate(n, 25U) ^ rotate(n, 14U) ^ (n >> 3U)) -#define rot26(n) (rotate(n, 26U) ^ rotate(n, 21U) ^ rotate(n, 7U)) -#define rot30(n) (rotate(n, 30U) ^ rotate(n, 19U) ^ rotate(n, 10U)) - -__kernel - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) - void search( const uu base, const uint PreVal4, - const uint H1, const uint D1, const uint PreVal0, const uint B1, const uint C1, - const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7, - const uint W16addK16, const uint W17addK17, - const uint PreW18, const uint PreW19, - const uint W16, const uint W17, - const uint PreW31, const uint PreW32, - const uint state0, const uint state1, const uint state2, const uint state3, - const uint state4, const uint state5, const uint state6, const uint state7, - const uint state0A, const uint state0B, - __global ulong * output) -{ - u W[17]; - u V[8]; - -#if defined VECTORS4 - #ifdef GOFFSET - u nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3); - #else - u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); - #endif -#elif defined VECTORS2 - #ifdef GOFFSET - u nonce = base + (get_global_id(0)<<1) + (uint2)(0, 1); - #else - u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); - #endif -#else - #ifdef GOFFSET - u nonce = base + get_global_id(0); - #else - u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); - #endif -#endif - - V[4] = PreVal4 + nonce; - - V[7] = H1 + (V[3] = D1 + Ch((PreVal0 + nonce), B1, C1) + rot26(PreVal0 + nonce)); - V[3] += rot30(V[4]) + Ma(F1, G1, V[4]); - - V[6] = G1 + (V[2] = C1addK5 + Ch(V[7], (PreVal0 + nonce), B1) + rot26(V[7])); - V[2] += rot30(V[3]) + Ma(V[4], F1, V[3]); - - V[5] = F1 + (V[1] = B1addK6 + Ch(V[6], V[7], (PreVal0 + nonce)) + rot26(V[6])); - V[1] += rot30(V[2]) + Ma(V[3], V[4], V[2]); - - V[4] += nonce + PreVal0addK7 + Ch(V[5], V[6], V[7]) + rot26(V[5]); - V[0] = nonce + PreVal0addK7 + Ch(V[5], V[6], V[7]) + rot26(V[5]) +rot30(V[1]) + Ma(V[2], V[3], V[1]); - - V[3] += 0xd807aa98 + V[7] + Ch(V[4], V[5], V[6]) + rot26(V[4]); - V[7] = 0xd807aa98 + V[7] + Ch(V[4], V[5], V[6]) + rot26(V[4]) + rot30(V[0]) + Ma(V[1], V[2], V[0]); - - V[2] += 0x12835b01 + V[6] + Ch(V[3], V[4], V[5]) + rot26(V[3]); - V[6] = 0x12835b01 + V[6] + Ch(V[3], V[4], V[5]) + rot26(V[3]) + rot30(V[7]) + Ma(V[0], V[1], V[7]); - - V[1] += 0x243185be + V[5] + Ch(V[2], V[3], V[4]) + rot26(V[2]); - V[5] = 0x243185be + V[5] + Ch(V[2], V[3], V[4]) + rot26(V[2]) + rot30(V[6]) + Ma(V[7], V[0], V[6]); - - V[0] += 0x550c7dc3 + V[4] + Ch(V[1], V[2], V[3]) + rot26(V[1]); - V[4] = 0x550c7dc3 + V[4] + Ch(V[1], V[2], V[3]) + rot26(V[1]) + rot30(V[5]) + Ma(V[6], V[7], V[5]); - -//--------------- ch() + ma() replaced above --------------- - - V[7] += 0x72be5d74 + V[3] + ch(12) + rot26(V[0]); - V[3] = 0x72be5d74 + V[3] + ch(12) + rot26(V[0]) + rot30(V[4]) + ma(12); - - V[6] += 0x80deb1fe + V[2] + ch(13) + rot26(V[7]); - V[2] = 0x80deb1fe + V[2] + ch(13) + rot26(V[7]) + rot30(V[3]) + ma(13); - - V[5] += 0x9bdc06a7 + V[1] + ch(14) + rot26(V[6]); - V[1] = 0x9bdc06a7 + V[1] + ch(14) + rot26(V[6]) + rot30(V[2]) + ma(14); - - V[4] += 0xc19bf3f4 + V[0] + ch(15) + rot26(V[5]); - V[0] = 0xc19bf3f4 + V[0] + ch(15) + rot26(V[5]) + rot30(V[1]) + ma(15); - - V[3] += W16addK16 + V[7] + ch(16) + rot26(V[4]); - V[7] = W16addK16 + V[7] + ch(16) + rot26(V[4]) + rot30(V[0]) + ma(16); - - V[2] += W17addK17 + V[6] + ch(17) + rot26(V[3]); - V[6] = W17addK17 + V[6] + ch(17) + rot26(V[3]) + rot30(V[7]) + ma(17); - -//---------------------------------------------------------------------------------- - -#ifdef VECTORS8 - W[0] = PreW18 + (u)(rot25(nonce.s0), rot25(nonce.s0) ^ 0x2004000, rot25(nonce.s0) ^ 0x4008000, rot25(nonce.s0) ^ 0x600C000, - rot25(nonce.s0) ^ 0x8010000, rot25(nonce.s0) ^ 0xa014000, rot25(nonce.s0) ^ 0xc018000, rot25(nonce.s0) ^ 0xe01c000); -#elif defined VECTORS4 - W[0] = PreW18 + (u)(rot25(nonce.x), rot25(nonce.x) ^ 0x2004000, rot25(nonce.x) ^ 0x4008000, rot25(nonce.x) ^ 0x600C000); -#elif defined VECTORS2 - W[0] = PreW18 + (u)(rot25(nonce.x), rot25(nonce.x) ^ 0x2004000); -#else - W[0] = PreW18 + rot25(nonce); -#endif - W[1] = PreW19 + nonce; - W[2] = 0x80000000 + rot15(W[0]); - W[3] = rot15(W[1]); - W[4] = 0x00000280 + rot15(W[2]); - W[5] = W16 + rot15(W[3]); - W[6] = W17 + rot15(W[4]); - W[7] = W[0] + rot15(W[5]); - W[8] = W[1] + rot15(W[6]); - W[9] = W[2] + rot15(W[7]); - W[10] = W[3] + rot15(W[8]); - W[11] = W[4] + rot15(W[9]); - W[12] = 0x00a00055 + W[5] + rot15(W[10]); - W[13] = PreW31 + W[6] + rot15(W[11]); - W[14] = PreW32 + W[7] + rot15(W[12]); - W[15] = W17 + W[8] + rot15(W[13]) + rot25(W[0]); - W[16] = W[0] + W[9] + rot15(W[14]) + rot25(W[1]); - - V[1] += 0x0fc19dc6 + V[5] + W[0] + ch(18) + rot26(V[2]); - V[5] = 0x0fc19dc6 + V[5] + W[0] + ch(18) + rot26(V[2]) + rot30(V[6]) + ma(18); - - V[0] += 0x240ca1cc + V[4] + W[1] + ch(19) + rot26(V[1]); - V[4] = 0x240ca1cc + V[4] + W[1] + ch(19) + rot26(V[1]) + rot30(V[5]) + ma(19); - - V[7] += 0x2de92c6f + V[3] + W[2] + ch(20) + rot26(V[0]); - V[3] = 0x2de92c6f + V[3] + W[2] + ch(20) + rot26(V[0]) + rot30(V[4]) + ma(20); - - V[6] += 0x4a7484aa + V[2] + W[3] + ch(21) + rot26(V[7]); - V[2] = 0x4a7484aa + V[2] + W[3] + ch(21) + rot26(V[7]) + rot30(V[3]) + ma(21); - - V[5] += 0x5cb0a9dc + V[1] + W[4] + ch(22) + rot26(V[6]); - V[1] = 0x5cb0a9dc + V[1] + W[4] + ch(22) + rot26(V[6]) + rot30(V[2]) + ma(22); - - V[4] += 0x76f988da + V[0] + W[5] + ch(23) + rot26(V[5]); - V[0] = 0x76f988da + V[0] + W[5] + ch(23) + rot26(V[5]) + rot30(V[1]) + ma(23); - - V[3] += 0x983e5152 + V[7] + W[6] + ch(24) + rot26(V[4]); - V[7] = 0x983e5152 + V[7] + W[6] + ch(24) + rot26(V[4]) + rot30(V[0]) + ma(24); - - V[2] += 0xa831c66d + V[6] + W[7] + ch(25) + rot26(V[3]); - V[6] = 0xa831c66d + V[6] + W[7] + ch(25) + rot26(V[3]) + rot30(V[7]) + ma(25); - - V[1] += 0xb00327c8 + V[5] + W[8] + ch(26) + rot26(V[2]); - V[5] = 0xb00327c8 + V[5] + W[8] + ch(26) + rot26(V[2]) + rot30(V[6]) + ma(26); - - V[0] += 0xbf597fc7 + V[4] + W[9] + ch(27) + rot26(V[1]); - V[4] = 0xbf597fc7 + V[4] + W[9] + ch(27) + rot26(V[1]) + rot30(V[5]) + ma(27); - - V[7] += 0xc6e00bf3 + V[3] + W[10] + ch(28) + rot26(V[0]); - V[3] = 0xc6e00bf3 + V[3] + W[10] + ch(28) + rot26(V[0]) + rot30(V[4]) + ma(28); - - V[6] += 0xd5a79147 + V[2] + W[11] + ch(29) + rot26(V[7]); - V[2] = 0xd5a79147 + V[2] + W[11] + ch(29) + rot26(V[7]) + rot30(V[3]) + ma(29); - - V[5] += 0x06ca6351 + V[1] + W[12] + ch(30) + rot26(V[6]); - V[1] = 0x06ca6351 + V[1] + W[12] + ch(30) + rot26(V[6]) + rot30(V[2]) + ma(30); - - V[4] += 0x14292967 + V[0] + W[13] + ch(31) + rot26(V[5]); - V[0] = 0x14292967 + V[0] + W[13] + ch(31) + rot26(V[5]) + rot30(V[1]) + ma(31); - - V[3] += 0x27b70a85 + V[7] + W[14] + ch(32) + rot26(V[4]); - V[7] = 0x27b70a85 + V[7] + W[14] + ch(32) + rot26(V[4]) + rot30(V[0]) + ma(32); - - V[2] += 0x2e1b2138 + V[6] + W[15] + ch(33) + rot26(V[3]); - V[6] = 0x2e1b2138 + V[6] + W[15] + ch(33) + rot26(V[3]) + rot30(V[7]) + ma(33); - - V[1] += 0x4d2c6dfc + V[5] + W[16] + ch(34) + rot26(V[2]); - V[5] = 0x4d2c6dfc + V[5] + W[16] + ch(34) + rot26(V[2]) + rot30(V[6]) + ma(34); - -//---------------------------------------------------------------------------------- - - W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); - W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); - W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); - W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); - W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); - W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); - W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); - W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); - W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); - W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); - W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); - W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); - W[12] = W[13] + W[5] + rot15(W[10]) + rot25(W[14]); - W[13] = W[14] + W[6] + rot15(W[11]) + rot25(W[15]); - W[14] = W[15] + W[7] + rot15(W[12]) + rot25(W[16]); - W[15] = W[16] + W[8] + rot15(W[13]) + rot25( W[0]); - W[16] = W[0] + W[9] + rot15(W[14]) + rot25( W[1]); - - V[0] += 0x53380d13 + V[4] + W[0] + ch(35) + rot26(V[1]); - V[4] = 0x53380d13 + V[4] + W[0] + ch(35) + rot26(V[1]) + rot30(V[5]) + ma(35); - - V[7] += 0x650a7354 + V[3] + W[1] + ch(36) + rot26(V[0]); - V[3] = 0x650a7354 + V[3] + W[1] + ch(36) + rot26(V[0]) + rot30(V[4]) + ma(36); - - V[6] += 0x766a0abb + V[2] + W[2] + ch(37) + rot26(V[7]); - V[2] = 0x766a0abb + V[2] + W[2] + ch(37) + rot26(V[7]) + rot30(V[3]) + ma(37); - - V[5] += 0x81c2c92e + V[1] + W[3] + ch(38) + rot26(V[6]); - V[1] = 0x81c2c92e + V[1] + W[3] + ch(38) + rot26(V[6]) + rot30(V[2]) + ma(38); - - V[4] += 0x92722c85 + V[0] + W[4] + ch(39) + rot26(V[5]); - V[0] = 0x92722c85 + V[0] + W[4] + ch(39) + rot26(V[5]) + rot30(V[1]) + ma(39); - - V[3] += 0xa2bfe8a1 + V[7] + W[5] + ch(40) + rot26(V[4]); - V[7] = 0xa2bfe8a1 + V[7] + W[5] + ch(40) + rot26(V[4]) + rot30(V[0]) + ma(40); - - V[2] += 0xa81a664b + V[6] + W[6] + ch(41) + rot26(V[3]); - V[6] = 0xa81a664b + V[6] + W[6] + ch(41) + rot26(V[3]) + rot30(V[7]) + ma(41); - - V[1] += 0xc24b8b70 + V[5] + W[7] + ch(42) + rot26(V[2]); - V[5] = 0xc24b8b70 + V[5] + W[7] + ch(42) + rot26(V[2]) + rot30(V[6]) + ma(42); - - V[0] += 0xc76c51a3 + V[4] + W[8] + ch(43) + rot26(V[1]); - V[4] = 0xc76c51a3 + V[4] + W[8] + ch(43) + rot26(V[1]) + rot30(V[5]) + ma(43); - - V[7] += 0xd192e819 + V[3] + W[9] + ch(44) + rot26(V[0]); - V[3] = 0xd192e819 + V[3] + W[9] + ch(44) + rot26(V[0]) + rot30(V[4]) + ma(44); - - V[6] += 0xd6990624 + V[2] + W[10] + ch(45) + rot26(V[7]); - V[2] = 0xd6990624 + V[2] + W[10] + ch(45) + rot26(V[7]) + rot30(V[3]) + ma(45); - - V[5] += 0xf40e3585 + V[1] + W[11] + ch(46) + rot26(V[6]); - V[1] = 0xf40e3585 + V[1] + W[11] + ch(46) + rot26(V[6]) + rot30(V[2]) + ma(46); - - V[4] += 0x106aa070 + V[0] + W[12] + ch(47) + rot26(V[5]); - V[0] = 0x106aa070 + V[0] + W[12] + ch(47) + rot26(V[5]) + rot30(V[1]) + ma(47); - - V[3] += 0x19a4c116 + V[7] + W[13] + ch(48) + rot26(V[4]); - V[7] = 0x19a4c116 + V[7] + W[13] + ch(48) + rot26(V[4]) + rot30(V[0]) + ma(48); - - V[2] += 0x1e376c08 + V[6] + W[14] + ch(49) + rot26(V[3]); - V[6] = 0x1e376c08 + V[6] + W[14] + ch(49) + rot26(V[3]) + rot30(V[7]) + ma(49); - - V[1] += 0x2748774c + V[5] + W[15] + ch(50) + rot26(V[2]); - V[5] = 0x2748774c + V[5] + W[15] + ch(50) + rot26(V[2]) + rot30(V[6]) + ma(50); - - V[0] += 0x34b0bcb5 + V[4] + W[16] + ch(51) + rot26(V[1]); - V[4] = 0x34b0bcb5 + V[4] + W[16] + ch(51) + rot26(V[1]) + rot30(V[5]) + ma(51); - -//---------------------------------------------------------------------------------- - - W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); - W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); - W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); - W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); - W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); - W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); - W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); - W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); - W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); - W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); - W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); - W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); - - V[7] += 0x391c0cb3 + V[3] + W[0] + ch(52) + rot26(V[0]); - V[3] = 0x391c0cb3 + V[3] + W[0] + ch(52) + rot26(V[0]) + rot30(V[4]) + ma(52); - - V[6] += 0x4ed8aa4a + V[2] + W[1] + ch(53) + rot26(V[7]); - V[2] = 0x4ed8aa4a + V[2] + W[1] + ch(53) + rot26(V[7]) + rot30(V[3]) + ma(53); - - V[5] += 0x5b9cca4f + V[1] + W[2] + ch(54) + rot26(V[6]); - V[1] = 0x5b9cca4f + V[1] + W[2] + ch(54) + rot26(V[6]) + rot30(V[2]) + ma(54); - - V[4] += 0x682e6ff3 + V[0] + W[3] + ch(55) + rot26(V[5]); - V[0] = 0x682e6ff3 + V[0] + W[3] + ch(55) + rot26(V[5]) + rot30(V[1]) + ma(55); - - V[3] += 0x748f82ee + V[7] + W[4] + ch(56) + rot26(V[4]); - V[7] = 0x748f82ee + V[7] + W[4] + ch(56) + rot26(V[4]) + rot30(V[0]) + ma(56); - - V[2] += 0x78a5636f + V[6] + W[5] + ch(57) + rot26(V[3]); - V[6] = 0x78a5636f + V[6] + W[5] + ch(57) + rot26(V[3]) + rot30(V[7]) + ma(57); - - V[1] += 0x84c87814 + V[5] + W[6] + ch(58) + rot26(V[2]); - V[5] = 0x84c87814 + V[5] + W[6] + ch(58) + rot26(V[2]) + rot30(V[6]) + ma(58); - - V[0] += 0x8cc70208 + V[4] + W[7] + ch(59) + rot26(V[1]); - V[4] = 0x8cc70208 + V[4] + W[7] + ch(59) + rot26(V[1]) + rot30(V[5]) + ma(59); - - V[7] += 0x90befffa + V[3] + W[8] + ch(60) + rot26(V[0]); - V[3] = 0x90befffa + V[3] + W[8] + ch(60) + rot26(V[0]) + rot30(V[4]) + ma(60); - - V[6] += 0xa4506ceb + V[2] + W[9] + ch(61) + rot26(V[7]); - V[2] = 0xa4506ceb + V[2] + W[9] + ch(61) + rot26(V[7]) + rot30(V[3]) + ma(61); - - V[5] += 0xbef9a3f7 + V[1] + W[10] + ch(62) + rot26(V[6]); - V[1] = 0xbef9a3f7 + V[1] + W[10] + ch(62) + rot26(V[6]) + rot30(V[2]) + ma(62); - - V[4] += 0xc67178f2 + V[0] + W[11] + ch(63) + rot26(V[5]); - V[0] = 0xc67178f2 + V[0] + W[11] + ch(63) + rot26(V[5]) + rot30(V[1]) + ma(63); - -//---------------------------------------------------------------------------------- - - W[0] = state0 + V[0]; - W[1] = state1 + V[1]; - W[2] = state2 + V[2]; - W[3] = state3 + V[3]; - W[4] = state4 + V[4]; - W[5] = state5 + V[5]; - W[6] = state6 + V[6]; - W[7] = state7 + V[7]; - - // 0x98c7e2a2 + W[0] - u state0AaddV0 = state0A + V[0]; - // 0xfc08884d + W[0] - u state0BaddV0 = state0B + V[0]; - - V[2] = 0x3c6ef372 + (V[6] = 0x90bb1e3c + W[1] + Ch(state0AaddV0, 0x510e527fU, 0x9b05688cU) + rot26(state0AaddV0)); - V[6] += rot30(state0BaddV0) + Ma(0x6a09e667U, 0xbb67ae85U, state0BaddV0); - - V[1] = 0xbb67ae85 + (V[5] = 0x50c6645b + W[2] + Ch(V[2], state0AaddV0, 0x510e527fU) + rot26(V[2])); - V[5] += rot30(V[6]) + Ma(state0BaddV0, 0x6a09e667U, V[6]); - - V[0] = 0x6a09e667 + (V[4] = 0x3ac42e24 + W[3] + Ch(V[1], V[2], state0AaddV0) + rot26(V[1])); - V[4] += rot30(V[5]) + Ma(V[6], state0BaddV0, V[5]); - - V[7] = (state0BaddV0) + (V[3] = 0x3956c25b + state0AaddV0 + W[4] + Ch(V[0], V[1], V[2]) + rot26(V[0])); - V[3] += rot30(V[4]) + Ma(V[5], V[6], V[4]); - -//--------------- ch() + ma() replaced above --------------- - - V[6] += 0x59f111f1 + V[2] + W[5] + ch(69) + rot26(V[7]); - V[2] = 0x59f111f1 + V[2] + W[5] + ch(69) + rot26(V[7]) + rot30(V[3]) + ma(69); - - V[5] += 0x923f82a4 + V[1] + W[6] + ch(70) + rot26(V[6]); - V[1] = 0x923f82a4 + V[1] + W[6] + ch(70) + rot26(V[6]) + rot30(V[2]) + ma(70); - - V[4] += 0xab1c5ed5 + V[0] + W[7] + ch(71) + rot26(V[5]); - V[0] = 0xab1c5ed5 + V[0] + W[7] + ch(71) + rot26(V[5]) + rot30(V[1]) + ma(71); - - V[3] += 0x5807aa98 + V[7] + ch(72) + rot26(V[4]); - V[7] = 0x5807aa98 + V[7] + ch(72) + rot26(V[4]) + rot30(V[0]) + ma(72); - - V[2] += 0x12835b01 + V[6] + ch(73) + rot26(V[3]); - V[6] = 0x12835b01 + V[6] + ch(73) + rot26(V[3]) + rot30(V[7]) + ma(73); - - V[1] += 0x243185be + V[5] + ch(74) + rot26(V[2]); - V[5] = 0x243185be + V[5] + ch(74) + rot26(V[2]) + rot30(V[6]) + ma(74); - - V[0] += 0x550c7dc3 + V[4] + ch(75) + rot26(V[1]); - V[4] = 0x550c7dc3 + V[4] + ch(75) + rot26(V[1]) + rot30(V[5]) + ma(75); - - V[7] += 0x72be5d74 + V[3] + ch(76) + rot26(V[0]); - V[3] = 0x72be5d74 + V[3] + ch(76) + rot26(V[0]) + rot30(V[4]) + ma(76); - - V[6] += 0x80deb1fe + V[2] + ch(77) + rot26(V[7]); - V[2] = 0x80deb1fe + V[2] + ch(77) + rot26(V[7]) + rot30(V[3]) + ma(77); - - V[5] += 0x9bdc06a7 + V[1] + ch(78) + rot26(V[6]); - V[1] = 0x9bdc06a7 + V[1] + ch(78) + rot26(V[6]) + rot30(V[2]) + ma(78); - - V[4] += 0xc19bf274 + V[0] + ch(79) + rot26(V[5]); - V[0] = 0xc19bf274 + V[0] + ch(79) + rot26(V[5]) + rot30(V[1]) + ma(79); - -//---------------------------------------------------------------------------------- - - W[0] = W[0] + rot25(W[1]); - W[1] = 0x00a00000 + W[1] + rot25(W[2]); - W[2] = W[2] + rot15(W[0]) + rot25(W[3]); - W[3] = W[3] + rot15(W[1]) + rot25(W[4]); - W[4] = W[4] + rot15(W[2]) + rot25(W[5]); - W[5] = W[5] + rot15(W[3]) + rot25(W[6]); - W[6] = 0x00000100 + W[6] + rot15(W[4]) + rot25(W[7]); - W[7] = 0x11002000 + W[7] + W[0] + rot15(W[5]); - W[8] = 0x80000000 + W[1] + rot15(W[6]); - W[9] = W[2] + rot15(W[7]); - W[10] = W[3] + rot15(W[8]); - W[11] = W[4] + rot15(W[9]); - W[12] = W[5] + rot15(W[10]); - W[13] = W[6] + rot15(W[11]); - W[14] = 0x00400022 + W[7] + rot15( W[12]); - W[15] = 0x00000100 + W[8] + rot15( W[13]) + rot25(W[0]); - W[16] = W[0] + W[9] + rot15( W[14]) + rot25(W[1]); - - V[3] += 0xe49b69c1 + V[7] + W[0] + ch(80) + rot26(V[4]); - V[7] = 0xe49b69c1 + V[7] + W[0] + ch(80) + rot26(V[4]) + rot30(V[0]) + ma(80); - - V[2] += 0xefbe4786 + V[6] + W[1] + ch(81) + rot26(V[3]); - V[6] = 0xefbe4786 + V[6] + W[1] + ch(81) + rot26(V[3]) + rot30(V[7]) + ma(81); - - V[1] += 0x0fc19dc6 + V[5] + W[2] + ch(82) + rot26(V[2]); - V[5] = 0x0fc19dc6 + V[5] + W[2] + ch(82) + rot26(V[2]) + rot30(V[6]) + ma(82); - - V[0] += 0x240ca1cc + V[4] + W[3] + ch(83) + rot26(V[1]); - V[4] = 0x240ca1cc + V[4] + W[3] + ch(83) + rot26(V[1]) + rot30(V[5]) + ma(83); - - V[7] += 0x2de92c6f + V[3] + W[4] + ch(84) + rot26(V[0]); - V[3] = 0x2de92c6f + V[3] + W[4] + ch(84) + rot26(V[0]) + rot30(V[4]) + ma(84); - - V[6] += 0x4a7484aa + V[2] + W[5] + ch(85) + rot26(V[7]); - V[2] = 0x4a7484aa + V[2] + W[5] + ch(85) + rot26(V[7]) + rot30(V[3]) + ma(85); - - V[5] += 0x5cb0a9dc + V[1] + W[6] + ch(86) + rot26(V[6]); - V[1] = 0x5cb0a9dc + V[1] + W[6] + ch(86) + rot26(V[6]) + rot30(V[2]) + ma(86); - - V[4] += 0x76f988da + V[0] + W[7] + ch(87) + rot26(V[5]); - V[0] = 0x76f988da + V[0] + W[7] + ch(87) + rot26(V[5]) + rot30(V[1]) + ma(87); - - V[3] += 0x983e5152 + V[7] + W[8] + ch(88) + rot26(V[4]); - V[7] = 0x983e5152 + V[7] + W[8] + ch(88) + rot26(V[4]) + rot30(V[0]) + ma(88); - - V[2] += 0xa831c66d + V[6] + W[9] + ch(89) + rot26(V[3]); - V[6] = 0xa831c66d + V[6] + W[9] + ch(89) + rot26(V[3]) + rot30(V[7]) + ma(89); - - V[1] += 0xb00327c8 + V[5] + W[10] + ch(90) + rot26(V[2]); - V[5] = 0xb00327c8 + V[5] + W[10] + ch(90) + rot26(V[2]) + rot30(V[6]) + ma(90); - - V[0] += 0xbf597fc7 + V[4] + W[11] + ch(91) + rot26(V[1]); - V[4] = 0xbf597fc7 + V[4] + W[11] + ch(91) + rot26(V[1]) + rot30(V[5]) + ma(91); - - V[7] += 0xc6e00bf3 + V[3] + W[12] + ch(92) + rot26(V[0]); - V[3] = 0xc6e00bf3 + V[3] + W[12] + ch(92) + rot26(V[0]) + rot30(V[4]) + ma(92); - - V[6] += 0xd5a79147 + V[2] + W[13] + ch(93) + rot26(V[7]); - V[2] = 0xd5a79147 + V[2] + W[13] + ch(93) + rot26(V[7]) + rot30(V[3]) + ma(93); - - V[5] += 0x06ca6351 + V[1] + W[14] + ch(94) + rot26(V[6]); - V[1] = 0x06ca6351 + V[1] + W[14] + ch(94) + rot26(V[6]) + rot30(V[2]) + ma(94); - - V[4] += 0x14292967 + V[0] + W[15] + ch(95) + rot26(V[5]); - V[0] = 0x14292967 + V[0] + W[15] + ch(95) + rot26(V[5]) + rot30(V[1]) + ma(95); - - V[3] += 0x27b70a85 + V[7] + W[16] + ch(96) + rot26(V[4]); - V[7] = 0x27b70a85 + V[7] + W[16] + ch(96) + rot26(V[4]) + rot30(V[0]) + ma(96); - -//---------------------------------------------------------------------------------- - - W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); - W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); - W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); - W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); - W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); - W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); - W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); - W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); - W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); - W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); - W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); - W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); - W[12] = W[13] + W[5] + rot15(W[10]) + rot25(W[14]); - W[13] = W[14] + W[6] + rot15(W[11]) + rot25(W[15]); - W[14] = W[15] + W[7] + rot15(W[12]) + rot25(W[16]); - W[15] = W[16] + W[8] + rot15(W[13]) + rot25( W[0]); - W[16] = W[0] + W[9] + rot15(W[14]) + rot25( W[1]); - - V[2] += 0x2e1b2138 + V[6] + W[0] + ch(97) + rot26(V[3]); - V[6] = 0x2e1b2138 + V[6] + W[0] + ch(97) + rot26(V[3]) + rot30(V[7]) + ma(97); - - V[1] += 0x4d2c6dfc + V[5] + W[1] + ch(98) + rot26(V[2]); - V[5] = 0x4d2c6dfc + V[5] + W[1] + ch(98) + rot26(V[2]) + rot30(V[6]) + ma(98); - - V[0] += 0x53380d13 + V[4] + W[2] + ch(99) + rot26(V[1]); - V[4] = 0x53380d13 + V[4] + W[2] + ch(99) + rot26(V[1]) + rot30(V[5]) + ma(99); - - V[7] += 0x650a7354 + V[3] + W[3] + ch(100) + rot26(V[0]); - V[3] = 0x650a7354 + V[3] + W[3] + ch(100) + rot26(V[0]) + rot30(V[4]) + ma(100); - - V[6] += 0x766a0abb + V[2] + W[4] + ch(101) + rot26(V[7]); - V[2] = 0x766a0abb + V[2] + W[4] + ch(101) + rot26(V[7]) + rot30(V[3]) + ma(101); - - V[5] += 0x81c2c92e + V[1] + W[5] + ch(102) + rot26(V[6]); - V[1] = 0x81c2c92e + V[1] + W[5] + ch(102) + rot26(V[6]) + rot30(V[2]) + ma(102); - - V[4] += 0x92722c85 + V[0] + W[6] + ch(103) + rot26(V[5]); - V[0] = 0x92722c85 + V[0] + W[6] + ch(103) + rot26(V[5]) + rot30(V[1]) + ma(103); - - V[3] += 0xa2bfe8a1 + V[7] + W[7] + ch(104) + rot26(V[4]); - V[7] = 0xa2bfe8a1 + V[7] + W[7] + ch(104) + rot26(V[4]) + rot30(V[0]) + ma(104); - - V[2] += 0xa81a664b + V[6] + W[8] + ch(105) + rot26(V[3]); - V[6] = 0xa81a664b + V[6] + W[8] + ch(105) + rot26(V[3]) + rot30(V[7]) + ma(105); - - V[1] += 0xc24b8b70 + V[5] + W[9] + ch(106) + rot26(V[2]); - V[5] = 0xc24b8b70 + V[5] + W[9] + ch(106) + rot26(V[2]) + rot30(V[6]) + ma(106); - - V[0] += 0xc76c51a3 + V[4] + W[10] + ch(107) + rot26(V[1]); - V[4] = 0xc76c51a3 + V[4] + W[10] + ch(107) + rot26(V[1]) + rot30(V[5]) + ma(107); - - V[7] += 0xd192e819 + V[3] + W[11] + ch(108) + rot26(V[0]); - V[3] = 0xd192e819 + V[3] + W[11] + ch(108) + rot26(V[0]) + rot30(V[4]) + ma(108); - - V[6] += 0xd6990624 + V[2] + W[12] + ch(109) + rot26(V[7]); - V[2] = 0xd6990624 + V[2] + W[12] + ch(109) + rot26(V[7]) + rot30(V[3]) + ma(109); - - V[5] += 0xf40e3585 + V[1] + W[13] + ch(110) + rot26(V[6]); - V[1] = 0xf40e3585 + V[1] + W[13] + ch(110) + rot26(V[6]) + rot30(V[2]) + ma(110); - - V[4] += 0x106aa070 + V[0] + W[14] + ch(111) + rot26(V[5]); - V[0] = 0x106aa070 + V[0] + W[14] + ch(111) + rot26(V[5]) + rot30(V[1]) + ma(111); - - V[3] += 0x19a4c116 + V[7] + W[15] + ch(112) + rot26(V[4]); - V[7] = 0x19a4c116 + V[7] + W[15] + ch(112) + rot26(V[4]) + rot30(V[0]) + ma(112); - - V[2] += 0x1e376c08 + V[6] + W[16] + ch(113) + rot26(V[3]); - V[6] = 0x1e376c08 + V[6] + W[16] + ch(113) + rot26(V[3]) + rot30(V[7]) + ma(113); - -//---------------------------------------------------------------------------------- - - W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); - W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); - W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); - W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); - W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); - W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); - W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); - W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); - W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); - W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); - W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); - - V[1] += 0x2748774c + V[5] + W[0] + ch(114) + rot26(V[2]); - V[5] = 0x2748774c + V[5] + W[0] + ch(114) + rot26(V[2]) + rot30(V[6]) + ma(114); - - V[0] += 0x34b0bcb5 + V[4] + W[1] + ch(115) + rot26(V[1]); - V[4] = 0x34b0bcb5 + V[4] + W[1] + ch(115) + rot26(V[1]) + rot30(V[5]) + ma(115); - - V[7] += 0x391c0cb3 + V[3] + W[2] + ch(116) + rot26(V[0]); - V[3] = 0x391c0cb3 + V[3] + W[2] + ch(116) + rot26(V[0]) + rot30(V[4]) + ma(116); - - V[6] += 0x4ed8aa4a + V[2] + W[3] + ch(117) + rot26(V[7]); - V[2] = 0x4ed8aa4a + V[2] + W[3] + ch(117) + rot26(V[7]) + rot30(V[3]) + ma(117); - - V[5] += 0x5b9cca4f + V[1] + W[4] + ch(118) + rot26(V[6]); - V[1] = 0x5b9cca4f + V[1] + W[4] + ch(118) + rot26(V[6]) + rot30(V[2]) + ma(118); - - V[4] += 0x682e6ff3 + V[0] + W[5] + ch(119) + rot26(V[5]); - V[0] = 0x682e6ff3 + V[0] + W[5] + ch(119) + rot26(V[5]) + rot30(V[1]) + ma(119); - - V[3] += 0x748f82ee + V[7] + W[6] + ch(120) + rot26(V[4]); - V[7] = 0x748f82ee + V[7] + W[6] + ch(120) + rot26(V[4]) + rot30(V[0]) + ma(120); - - V[2] += 0x78a5636f + V[6] + W[7] + ch(121) + rot26(V[3]); - - V[1] += 0x84c87814 + V[5] + W[8] + ch(122) + rot26(V[2]); - - V[0] += 0x8cc70208 + V[4] + W[9] + ch(123) + rot26(V[1]); - - V[7] += V[3] + W[10] + ch(124) + rot26(V[0]); - - -#define FOUND (0x80) -#define NFLAG (0x7F) - -#ifdef VECTORS4 - V[7] ^= 0x136032ed; - - bool result = V[7].x & V[7].y & V[7].z & V[7].w; - - if (!result) { - if (!V[7].x) - output[FOUND] = output[NFLAG & nonce.x] = nonce.x; - if (!V[7].y) - output[FOUND] = output[NFLAG & nonce.y] = nonce.y; - if (!V[7].z) - output[FOUND] = output[NFLAG & nonce.z] = nonce.z; - if (!V[7].w) - output[FOUND] = output[NFLAG & nonce.w] = nonce.w; - } -#else - #ifdef VECTORS2 - V[7] ^= 0x136032ed; - - bool result = V[7].x & V[7].y; - - if (!result) { - if (!V[7].x) - output[FOUND] = output[NFLAG & nonce.x] = nonce.x; - if (!V[7].y) - output[FOUND] = output[NFLAG & nonce.y] = nonce.y; - } - #else - if (V[7] == 0x136032ed) - output[FOUND] = output[NFLAG & nonce] = nonce; - #endif -#endif -} +// DiaKGCN 04-02-2012 - OpenCL kernel by Diapolo +// +// Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3. +// The kernel was rewritten by me (Diapolo) and is still public-domain! + +#if defined VECTORS4 + typedef uint4 u; +#elif defined VECTORS2 + typedef uint2 u; +#else + typedef uint u; +#endif + +#ifdef BFI_INT + #pragma OPENCL EXTENSION cl_amd_media_ops : enable + #define Ch(x, y, z) amd_bytealign(x, y, z) + #define Ma(x, y, z) amd_bytealign(z ^ x, y, x) +#else + #define Ch(x, y, z) bitselect(z, y, x) + #if defined(VECTORS2) || defined(VECTORS4) || defined(VECTORS8) + // GCN - VEC2 or VEC4 + #define Ma(z, x, y) bitselect(z, y, z ^ x) + #else + // GCN - no VEC + #define Ma(z, x, y) Ch(z ^ x, y, x) + #endif +#endif + +#ifdef GOFFSET + typedef uint uu; +#else + #if defined VECTORS4 + typedef uint4 uu; + #elif defined VECTORS2 + typedef uint2 uu; + #else + typedef uint uu; + #endif +#endif + +#define ch(n) Ch(V[(4 + 128 - n) % 8], V[(5 + 128 - n) % 8], V[(6 + 128 - n) % 8]) +#define ma(n) Ma(V[(1 + 128 - n) % 8], V[(2 + 128 - n) % 8], V[(0 + 128 - n) % 8]) + +#define rot15(n) (rotate(n, 15U) ^ rotate(n, 13U) ^ (n >> 10U)) +#define rot25(n) (rotate(n, 25U) ^ rotate(n, 14U) ^ (n >> 3U)) +#define rot26(n) (rotate(n, 26U) ^ rotate(n, 21U) ^ rotate(n, 7U)) +#define rot30(n) (rotate(n, 30U) ^ rotate(n, 19U) ^ rotate(n, 10U)) + +__kernel + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) + void search( const uu base, const uint PreVal4, + const uint H1, const uint D1, const uint PreVal0, const uint B1, const uint C1, + const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7, + const uint W16addK16, const uint W17addK17, + const uint PreW18, const uint PreW19, + const uint W16, const uint W17, + const uint PreW31, const uint PreW32, + const uint state0, const uint state1, const uint state2, const uint state3, + const uint state4, const uint state5, const uint state6, const uint state7, + const uint state0A, const uint state0B, + __global ulong * output) +{ + u W[17]; + u V[8]; + +#if defined VECTORS4 + #ifdef GOFFSET + u nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3); + #else + u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); + #endif +#elif defined VECTORS2 + #ifdef GOFFSET + u nonce = base + (get_global_id(0)<<1) + (uint2)(0, 1); + #else + u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); + #endif +#else + #ifdef GOFFSET + u nonce = base + get_global_id(0); + #else + u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); + #endif +#endif + + V[4] = PreVal4 + nonce; + + V[7] = H1 + (V[3] = D1 + Ch((PreVal0 + nonce), B1, C1) + rot26(PreVal0 + nonce)); + V[3] += rot30(V[4]) + Ma(F1, G1, V[4]); + + V[6] = G1 + (V[2] = C1addK5 + Ch(V[7], (PreVal0 + nonce), B1) + rot26(V[7])); + V[2] += rot30(V[3]) + Ma(V[4], F1, V[3]); + + V[5] = F1 + (V[1] = B1addK6 + Ch(V[6], V[7], (PreVal0 + nonce)) + rot26(V[6])); + V[1] += rot30(V[2]) + Ma(V[3], V[4], V[2]); + + V[4] += nonce + PreVal0addK7 + Ch(V[5], V[6], V[7]) + rot26(V[5]); + V[0] = nonce + PreVal0addK7 + Ch(V[5], V[6], V[7]) + rot26(V[5]) +rot30(V[1]) + Ma(V[2], V[3], V[1]); + + V[3] += 0xd807aa98 + V[7] + Ch(V[4], V[5], V[6]) + rot26(V[4]); + V[7] = 0xd807aa98 + V[7] + Ch(V[4], V[5], V[6]) + rot26(V[4]) + rot30(V[0]) + Ma(V[1], V[2], V[0]); + + V[2] += 0x12835b01 + V[6] + Ch(V[3], V[4], V[5]) + rot26(V[3]); + V[6] = 0x12835b01 + V[6] + Ch(V[3], V[4], V[5]) + rot26(V[3]) + rot30(V[7]) + Ma(V[0], V[1], V[7]); + + V[1] += 0x243185be + V[5] + Ch(V[2], V[3], V[4]) + rot26(V[2]); + V[5] = 0x243185be + V[5] + Ch(V[2], V[3], V[4]) + rot26(V[2]) + rot30(V[6]) + Ma(V[7], V[0], V[6]); + + V[0] += 0x550c7dc3 + V[4] + Ch(V[1], V[2], V[3]) + rot26(V[1]); + V[4] = 0x550c7dc3 + V[4] + Ch(V[1], V[2], V[3]) + rot26(V[1]) + rot30(V[5]) + Ma(V[6], V[7], V[5]); + +//--------------- ch() + ma() replaced above --------------- + + V[7] += 0x72be5d74 + V[3] + ch(12) + rot26(V[0]); + V[3] = 0x72be5d74 + V[3] + ch(12) + rot26(V[0]) + rot30(V[4]) + ma(12); + + V[6] += 0x80deb1fe + V[2] + ch(13) + rot26(V[7]); + V[2] = 0x80deb1fe + V[2] + ch(13) + rot26(V[7]) + rot30(V[3]) + ma(13); + + V[5] += 0x9bdc06a7 + V[1] + ch(14) + rot26(V[6]); + V[1] = 0x9bdc06a7 + V[1] + ch(14) + rot26(V[6]) + rot30(V[2]) + ma(14); + + V[4] += 0xc19bf3f4 + V[0] + ch(15) + rot26(V[5]); + V[0] = 0xc19bf3f4 + V[0] + ch(15) + rot26(V[5]) + rot30(V[1]) + ma(15); + + V[3] += W16addK16 + V[7] + ch(16) + rot26(V[4]); + V[7] = W16addK16 + V[7] + ch(16) + rot26(V[4]) + rot30(V[0]) + ma(16); + + V[2] += W17addK17 + V[6] + ch(17) + rot26(V[3]); + V[6] = W17addK17 + V[6] + ch(17) + rot26(V[3]) + rot30(V[7]) + ma(17); + +//---------------------------------------------------------------------------------- + +#ifdef VECTORS8 + W[0] = PreW18 + (u)(rot25(nonce.s0), rot25(nonce.s0) ^ 0x2004000, rot25(nonce.s0) ^ 0x4008000, rot25(nonce.s0) ^ 0x600C000, + rot25(nonce.s0) ^ 0x8010000, rot25(nonce.s0) ^ 0xa014000, rot25(nonce.s0) ^ 0xc018000, rot25(nonce.s0) ^ 0xe01c000); +#elif defined VECTORS4 + W[0] = PreW18 + (u)(rot25(nonce.x), rot25(nonce.x) ^ 0x2004000, rot25(nonce.x) ^ 0x4008000, rot25(nonce.x) ^ 0x600C000); +#elif defined VECTORS2 + W[0] = PreW18 + (u)(rot25(nonce.x), rot25(nonce.x) ^ 0x2004000); +#else + W[0] = PreW18 + rot25(nonce); +#endif + W[1] = PreW19 + nonce; + W[2] = 0x80000000 + rot15(W[0]); + W[3] = rot15(W[1]); + W[4] = 0x00000280 + rot15(W[2]); + W[5] = W16 + rot15(W[3]); + W[6] = W17 + rot15(W[4]); + W[7] = W[0] + rot15(W[5]); + W[8] = W[1] + rot15(W[6]); + W[9] = W[2] + rot15(W[7]); + W[10] = W[3] + rot15(W[8]); + W[11] = W[4] + rot15(W[9]); + W[12] = 0x00a00055 + W[5] + rot15(W[10]); + W[13] = PreW31 + W[6] + rot15(W[11]); + W[14] = PreW32 + W[7] + rot15(W[12]); + W[15] = W17 + W[8] + rot15(W[13]) + rot25(W[0]); + W[16] = W[0] + W[9] + rot15(W[14]) + rot25(W[1]); + + V[1] += 0x0fc19dc6 + V[5] + W[0] + ch(18) + rot26(V[2]); + V[5] = 0x0fc19dc6 + V[5] + W[0] + ch(18) + rot26(V[2]) + rot30(V[6]) + ma(18); + + V[0] += 0x240ca1cc + V[4] + W[1] + ch(19) + rot26(V[1]); + V[4] = 0x240ca1cc + V[4] + W[1] + ch(19) + rot26(V[1]) + rot30(V[5]) + ma(19); + + V[7] += 0x2de92c6f + V[3] + W[2] + ch(20) + rot26(V[0]); + V[3] = 0x2de92c6f + V[3] + W[2] + ch(20) + rot26(V[0]) + rot30(V[4]) + ma(20); + + V[6] += 0x4a7484aa + V[2] + W[3] + ch(21) + rot26(V[7]); + V[2] = 0x4a7484aa + V[2] + W[3] + ch(21) + rot26(V[7]) + rot30(V[3]) + ma(21); + + V[5] += 0x5cb0a9dc + V[1] + W[4] + ch(22) + rot26(V[6]); + V[1] = 0x5cb0a9dc + V[1] + W[4] + ch(22) + rot26(V[6]) + rot30(V[2]) + ma(22); + + V[4] += 0x76f988da + V[0] + W[5] + ch(23) + rot26(V[5]); + V[0] = 0x76f988da + V[0] + W[5] + ch(23) + rot26(V[5]) + rot30(V[1]) + ma(23); + + V[3] += 0x983e5152 + V[7] + W[6] + ch(24) + rot26(V[4]); + V[7] = 0x983e5152 + V[7] + W[6] + ch(24) + rot26(V[4]) + rot30(V[0]) + ma(24); + + V[2] += 0xa831c66d + V[6] + W[7] + ch(25) + rot26(V[3]); + V[6] = 0xa831c66d + V[6] + W[7] + ch(25) + rot26(V[3]) + rot30(V[7]) + ma(25); + + V[1] += 0xb00327c8 + V[5] + W[8] + ch(26) + rot26(V[2]); + V[5] = 0xb00327c8 + V[5] + W[8] + ch(26) + rot26(V[2]) + rot30(V[6]) + ma(26); + + V[0] += 0xbf597fc7 + V[4] + W[9] + ch(27) + rot26(V[1]); + V[4] = 0xbf597fc7 + V[4] + W[9] + ch(27) + rot26(V[1]) + rot30(V[5]) + ma(27); + + V[7] += 0xc6e00bf3 + V[3] + W[10] + ch(28) + rot26(V[0]); + V[3] = 0xc6e00bf3 + V[3] + W[10] + ch(28) + rot26(V[0]) + rot30(V[4]) + ma(28); + + V[6] += 0xd5a79147 + V[2] + W[11] + ch(29) + rot26(V[7]); + V[2] = 0xd5a79147 + V[2] + W[11] + ch(29) + rot26(V[7]) + rot30(V[3]) + ma(29); + + V[5] += 0x06ca6351 + V[1] + W[12] + ch(30) + rot26(V[6]); + V[1] = 0x06ca6351 + V[1] + W[12] + ch(30) + rot26(V[6]) + rot30(V[2]) + ma(30); + + V[4] += 0x14292967 + V[0] + W[13] + ch(31) + rot26(V[5]); + V[0] = 0x14292967 + V[0] + W[13] + ch(31) + rot26(V[5]) + rot30(V[1]) + ma(31); + + V[3] += 0x27b70a85 + V[7] + W[14] + ch(32) + rot26(V[4]); + V[7] = 0x27b70a85 + V[7] + W[14] + ch(32) + rot26(V[4]) + rot30(V[0]) + ma(32); + + V[2] += 0x2e1b2138 + V[6] + W[15] + ch(33) + rot26(V[3]); + V[6] = 0x2e1b2138 + V[6] + W[15] + ch(33) + rot26(V[3]) + rot30(V[7]) + ma(33); + + V[1] += 0x4d2c6dfc + V[5] + W[16] + ch(34) + rot26(V[2]); + V[5] = 0x4d2c6dfc + V[5] + W[16] + ch(34) + rot26(V[2]) + rot30(V[6]) + ma(34); + +//---------------------------------------------------------------------------------- + + W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); + W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); + W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); + W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); + W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); + W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); + W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); + W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); + W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); + W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); + W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); + W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); + W[12] = W[13] + W[5] + rot15(W[10]) + rot25(W[14]); + W[13] = W[14] + W[6] + rot15(W[11]) + rot25(W[15]); + W[14] = W[15] + W[7] + rot15(W[12]) + rot25(W[16]); + W[15] = W[16] + W[8] + rot15(W[13]) + rot25( W[0]); + W[16] = W[0] + W[9] + rot15(W[14]) + rot25( W[1]); + + V[0] += 0x53380d13 + V[4] + W[0] + ch(35) + rot26(V[1]); + V[4] = 0x53380d13 + V[4] + W[0] + ch(35) + rot26(V[1]) + rot30(V[5]) + ma(35); + + V[7] += 0x650a7354 + V[3] + W[1] + ch(36) + rot26(V[0]); + V[3] = 0x650a7354 + V[3] + W[1] + ch(36) + rot26(V[0]) + rot30(V[4]) + ma(36); + + V[6] += 0x766a0abb + V[2] + W[2] + ch(37) + rot26(V[7]); + V[2] = 0x766a0abb + V[2] + W[2] + ch(37) + rot26(V[7]) + rot30(V[3]) + ma(37); + + V[5] += 0x81c2c92e + V[1] + W[3] + ch(38) + rot26(V[6]); + V[1] = 0x81c2c92e + V[1] + W[3] + ch(38) + rot26(V[6]) + rot30(V[2]) + ma(38); + + V[4] += 0x92722c85 + V[0] + W[4] + ch(39) + rot26(V[5]); + V[0] = 0x92722c85 + V[0] + W[4] + ch(39) + rot26(V[5]) + rot30(V[1]) + ma(39); + + V[3] += 0xa2bfe8a1 + V[7] + W[5] + ch(40) + rot26(V[4]); + V[7] = 0xa2bfe8a1 + V[7] + W[5] + ch(40) + rot26(V[4]) + rot30(V[0]) + ma(40); + + V[2] += 0xa81a664b + V[6] + W[6] + ch(41) + rot26(V[3]); + V[6] = 0xa81a664b + V[6] + W[6] + ch(41) + rot26(V[3]) + rot30(V[7]) + ma(41); + + V[1] += 0xc24b8b70 + V[5] + W[7] + ch(42) + rot26(V[2]); + V[5] = 0xc24b8b70 + V[5] + W[7] + ch(42) + rot26(V[2]) + rot30(V[6]) + ma(42); + + V[0] += 0xc76c51a3 + V[4] + W[8] + ch(43) + rot26(V[1]); + V[4] = 0xc76c51a3 + V[4] + W[8] + ch(43) + rot26(V[1]) + rot30(V[5]) + ma(43); + + V[7] += 0xd192e819 + V[3] + W[9] + ch(44) + rot26(V[0]); + V[3] = 0xd192e819 + V[3] + W[9] + ch(44) + rot26(V[0]) + rot30(V[4]) + ma(44); + + V[6] += 0xd6990624 + V[2] + W[10] + ch(45) + rot26(V[7]); + V[2] = 0xd6990624 + V[2] + W[10] + ch(45) + rot26(V[7]) + rot30(V[3]) + ma(45); + + V[5] += 0xf40e3585 + V[1] + W[11] + ch(46) + rot26(V[6]); + V[1] = 0xf40e3585 + V[1] + W[11] + ch(46) + rot26(V[6]) + rot30(V[2]) + ma(46); + + V[4] += 0x106aa070 + V[0] + W[12] + ch(47) + rot26(V[5]); + V[0] = 0x106aa070 + V[0] + W[12] + ch(47) + rot26(V[5]) + rot30(V[1]) + ma(47); + + V[3] += 0x19a4c116 + V[7] + W[13] + ch(48) + rot26(V[4]); + V[7] = 0x19a4c116 + V[7] + W[13] + ch(48) + rot26(V[4]) + rot30(V[0]) + ma(48); + + V[2] += 0x1e376c08 + V[6] + W[14] + ch(49) + rot26(V[3]); + V[6] = 0x1e376c08 + V[6] + W[14] + ch(49) + rot26(V[3]) + rot30(V[7]) + ma(49); + + V[1] += 0x2748774c + V[5] + W[15] + ch(50) + rot26(V[2]); + V[5] = 0x2748774c + V[5] + W[15] + ch(50) + rot26(V[2]) + rot30(V[6]) + ma(50); + + V[0] += 0x34b0bcb5 + V[4] + W[16] + ch(51) + rot26(V[1]); + V[4] = 0x34b0bcb5 + V[4] + W[16] + ch(51) + rot26(V[1]) + rot30(V[5]) + ma(51); + +//---------------------------------------------------------------------------------- + + W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); + W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); + W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); + W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); + W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); + W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); + W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); + W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); + W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); + W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); + W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); + W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); + + V[7] += 0x391c0cb3 + V[3] + W[0] + ch(52) + rot26(V[0]); + V[3] = 0x391c0cb3 + V[3] + W[0] + ch(52) + rot26(V[0]) + rot30(V[4]) + ma(52); + + V[6] += 0x4ed8aa4a + V[2] + W[1] + ch(53) + rot26(V[7]); + V[2] = 0x4ed8aa4a + V[2] + W[1] + ch(53) + rot26(V[7]) + rot30(V[3]) + ma(53); + + V[5] += 0x5b9cca4f + V[1] + W[2] + ch(54) + rot26(V[6]); + V[1] = 0x5b9cca4f + V[1] + W[2] + ch(54) + rot26(V[6]) + rot30(V[2]) + ma(54); + + V[4] += 0x682e6ff3 + V[0] + W[3] + ch(55) + rot26(V[5]); + V[0] = 0x682e6ff3 + V[0] + W[3] + ch(55) + rot26(V[5]) + rot30(V[1]) + ma(55); + + V[3] += 0x748f82ee + V[7] + W[4] + ch(56) + rot26(V[4]); + V[7] = 0x748f82ee + V[7] + W[4] + ch(56) + rot26(V[4]) + rot30(V[0]) + ma(56); + + V[2] += 0x78a5636f + V[6] + W[5] + ch(57) + rot26(V[3]); + V[6] = 0x78a5636f + V[6] + W[5] + ch(57) + rot26(V[3]) + rot30(V[7]) + ma(57); + + V[1] += 0x84c87814 + V[5] + W[6] + ch(58) + rot26(V[2]); + V[5] = 0x84c87814 + V[5] + W[6] + ch(58) + rot26(V[2]) + rot30(V[6]) + ma(58); + + V[0] += 0x8cc70208 + V[4] + W[7] + ch(59) + rot26(V[1]); + V[4] = 0x8cc70208 + V[4] + W[7] + ch(59) + rot26(V[1]) + rot30(V[5]) + ma(59); + + V[7] += 0x90befffa + V[3] + W[8] + ch(60) + rot26(V[0]); + V[3] = 0x90befffa + V[3] + W[8] + ch(60) + rot26(V[0]) + rot30(V[4]) + ma(60); + + V[6] += 0xa4506ceb + V[2] + W[9] + ch(61) + rot26(V[7]); + V[2] = 0xa4506ceb + V[2] + W[9] + ch(61) + rot26(V[7]) + rot30(V[3]) + ma(61); + + V[5] += 0xbef9a3f7 + V[1] + W[10] + ch(62) + rot26(V[6]); + V[1] = 0xbef9a3f7 + V[1] + W[10] + ch(62) + rot26(V[6]) + rot30(V[2]) + ma(62); + + V[4] += 0xc67178f2 + V[0] + W[11] + ch(63) + rot26(V[5]); + V[0] = 0xc67178f2 + V[0] + W[11] + ch(63) + rot26(V[5]) + rot30(V[1]) + ma(63); + +//---------------------------------------------------------------------------------- + + W[0] = state0 + V[0]; + W[1] = state1 + V[1]; + W[2] = state2 + V[2]; + W[3] = state3 + V[3]; + W[4] = state4 + V[4]; + W[5] = state5 + V[5]; + W[6] = state6 + V[6]; + W[7] = state7 + V[7]; + + // 0x98c7e2a2 + W[0] + u state0AaddV0 = state0A + V[0]; + // 0xfc08884d + W[0] + u state0BaddV0 = state0B + V[0]; + + V[2] = 0x3c6ef372 + (V[6] = 0x90bb1e3c + W[1] + Ch(state0AaddV0, 0x510e527fU, 0x9b05688cU) + rot26(state0AaddV0)); + V[6] += rot30(state0BaddV0) + Ma(0x6a09e667U, 0xbb67ae85U, state0BaddV0); + + V[1] = 0xbb67ae85 + (V[5] = 0x50c6645b + W[2] + Ch(V[2], state0AaddV0, 0x510e527fU) + rot26(V[2])); + V[5] += rot30(V[6]) + Ma(state0BaddV0, 0x6a09e667U, V[6]); + + V[0] = 0x6a09e667 + (V[4] = 0x3ac42e24 + W[3] + Ch(V[1], V[2], state0AaddV0) + rot26(V[1])); + V[4] += rot30(V[5]) + Ma(V[6], state0BaddV0, V[5]); + + V[7] = (state0BaddV0) + (V[3] = 0x3956c25b + state0AaddV0 + W[4] + Ch(V[0], V[1], V[2]) + rot26(V[0])); + V[3] += rot30(V[4]) + Ma(V[5], V[6], V[4]); + +//--------------- ch() + ma() replaced above --------------- + + V[6] += 0x59f111f1 + V[2] + W[5] + ch(69) + rot26(V[7]); + V[2] = 0x59f111f1 + V[2] + W[5] + ch(69) + rot26(V[7]) + rot30(V[3]) + ma(69); + + V[5] += 0x923f82a4 + V[1] + W[6] + ch(70) + rot26(V[6]); + V[1] = 0x923f82a4 + V[1] + W[6] + ch(70) + rot26(V[6]) + rot30(V[2]) + ma(70); + + V[4] += 0xab1c5ed5 + V[0] + W[7] + ch(71) + rot26(V[5]); + V[0] = 0xab1c5ed5 + V[0] + W[7] + ch(71) + rot26(V[5]) + rot30(V[1]) + ma(71); + + V[3] += 0x5807aa98 + V[7] + ch(72) + rot26(V[4]); + V[7] = 0x5807aa98 + V[7] + ch(72) + rot26(V[4]) + rot30(V[0]) + ma(72); + + V[2] += 0x12835b01 + V[6] + ch(73) + rot26(V[3]); + V[6] = 0x12835b01 + V[6] + ch(73) + rot26(V[3]) + rot30(V[7]) + ma(73); + + V[1] += 0x243185be + V[5] + ch(74) + rot26(V[2]); + V[5] = 0x243185be + V[5] + ch(74) + rot26(V[2]) + rot30(V[6]) + ma(74); + + V[0] += 0x550c7dc3 + V[4] + ch(75) + rot26(V[1]); + V[4] = 0x550c7dc3 + V[4] + ch(75) + rot26(V[1]) + rot30(V[5]) + ma(75); + + V[7] += 0x72be5d74 + V[3] + ch(76) + rot26(V[0]); + V[3] = 0x72be5d74 + V[3] + ch(76) + rot26(V[0]) + rot30(V[4]) + ma(76); + + V[6] += 0x80deb1fe + V[2] + ch(77) + rot26(V[7]); + V[2] = 0x80deb1fe + V[2] + ch(77) + rot26(V[7]) + rot30(V[3]) + ma(77); + + V[5] += 0x9bdc06a7 + V[1] + ch(78) + rot26(V[6]); + V[1] = 0x9bdc06a7 + V[1] + ch(78) + rot26(V[6]) + rot30(V[2]) + ma(78); + + V[4] += 0xc19bf274 + V[0] + ch(79) + rot26(V[5]); + V[0] = 0xc19bf274 + V[0] + ch(79) + rot26(V[5]) + rot30(V[1]) + ma(79); + +//---------------------------------------------------------------------------------- + + W[0] = W[0] + rot25(W[1]); + W[1] = 0x00a00000 + W[1] + rot25(W[2]); + W[2] = W[2] + rot15(W[0]) + rot25(W[3]); + W[3] = W[3] + rot15(W[1]) + rot25(W[4]); + W[4] = W[4] + rot15(W[2]) + rot25(W[5]); + W[5] = W[5] + rot15(W[3]) + rot25(W[6]); + W[6] = 0x00000100 + W[6] + rot15(W[4]) + rot25(W[7]); + W[7] = 0x11002000 + W[7] + W[0] + rot15(W[5]); + W[8] = 0x80000000 + W[1] + rot15(W[6]); + W[9] = W[2] + rot15(W[7]); + W[10] = W[3] + rot15(W[8]); + W[11] = W[4] + rot15(W[9]); + W[12] = W[5] + rot15(W[10]); + W[13] = W[6] + rot15(W[11]); + W[14] = 0x00400022 + W[7] + rot15( W[12]); + W[15] = 0x00000100 + W[8] + rot15( W[13]) + rot25(W[0]); + W[16] = W[0] + W[9] + rot15( W[14]) + rot25(W[1]); + + V[3] += 0xe49b69c1 + V[7] + W[0] + ch(80) + rot26(V[4]); + V[7] = 0xe49b69c1 + V[7] + W[0] + ch(80) + rot26(V[4]) + rot30(V[0]) + ma(80); + + V[2] += 0xefbe4786 + V[6] + W[1] + ch(81) + rot26(V[3]); + V[6] = 0xefbe4786 + V[6] + W[1] + ch(81) + rot26(V[3]) + rot30(V[7]) + ma(81); + + V[1] += 0x0fc19dc6 + V[5] + W[2] + ch(82) + rot26(V[2]); + V[5] = 0x0fc19dc6 + V[5] + W[2] + ch(82) + rot26(V[2]) + rot30(V[6]) + ma(82); + + V[0] += 0x240ca1cc + V[4] + W[3] + ch(83) + rot26(V[1]); + V[4] = 0x240ca1cc + V[4] + W[3] + ch(83) + rot26(V[1]) + rot30(V[5]) + ma(83); + + V[7] += 0x2de92c6f + V[3] + W[4] + ch(84) + rot26(V[0]); + V[3] = 0x2de92c6f + V[3] + W[4] + ch(84) + rot26(V[0]) + rot30(V[4]) + ma(84); + + V[6] += 0x4a7484aa + V[2] + W[5] + ch(85) + rot26(V[7]); + V[2] = 0x4a7484aa + V[2] + W[5] + ch(85) + rot26(V[7]) + rot30(V[3]) + ma(85); + + V[5] += 0x5cb0a9dc + V[1] + W[6] + ch(86) + rot26(V[6]); + V[1] = 0x5cb0a9dc + V[1] + W[6] + ch(86) + rot26(V[6]) + rot30(V[2]) + ma(86); + + V[4] += 0x76f988da + V[0] + W[7] + ch(87) + rot26(V[5]); + V[0] = 0x76f988da + V[0] + W[7] + ch(87) + rot26(V[5]) + rot30(V[1]) + ma(87); + + V[3] += 0x983e5152 + V[7] + W[8] + ch(88) + rot26(V[4]); + V[7] = 0x983e5152 + V[7] + W[8] + ch(88) + rot26(V[4]) + rot30(V[0]) + ma(88); + + V[2] += 0xa831c66d + V[6] + W[9] + ch(89) + rot26(V[3]); + V[6] = 0xa831c66d + V[6] + W[9] + ch(89) + rot26(V[3]) + rot30(V[7]) + ma(89); + + V[1] += 0xb00327c8 + V[5] + W[10] + ch(90) + rot26(V[2]); + V[5] = 0xb00327c8 + V[5] + W[10] + ch(90) + rot26(V[2]) + rot30(V[6]) + ma(90); + + V[0] += 0xbf597fc7 + V[4] + W[11] + ch(91) + rot26(V[1]); + V[4] = 0xbf597fc7 + V[4] + W[11] + ch(91) + rot26(V[1]) + rot30(V[5]) + ma(91); + + V[7] += 0xc6e00bf3 + V[3] + W[12] + ch(92) + rot26(V[0]); + V[3] = 0xc6e00bf3 + V[3] + W[12] + ch(92) + rot26(V[0]) + rot30(V[4]) + ma(92); + + V[6] += 0xd5a79147 + V[2] + W[13] + ch(93) + rot26(V[7]); + V[2] = 0xd5a79147 + V[2] + W[13] + ch(93) + rot26(V[7]) + rot30(V[3]) + ma(93); + + V[5] += 0x06ca6351 + V[1] + W[14] + ch(94) + rot26(V[6]); + V[1] = 0x06ca6351 + V[1] + W[14] + ch(94) + rot26(V[6]) + rot30(V[2]) + ma(94); + + V[4] += 0x14292967 + V[0] + W[15] + ch(95) + rot26(V[5]); + V[0] = 0x14292967 + V[0] + W[15] + ch(95) + rot26(V[5]) + rot30(V[1]) + ma(95); + + V[3] += 0x27b70a85 + V[7] + W[16] + ch(96) + rot26(V[4]); + V[7] = 0x27b70a85 + V[7] + W[16] + ch(96) + rot26(V[4]) + rot30(V[0]) + ma(96); + +//---------------------------------------------------------------------------------- + + W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); + W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); + W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); + W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); + W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); + W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); + W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); + W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); + W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); + W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); + W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); + W[11] = W[12] + W[4] + rot15( W[9]) + rot25(W[13]); + W[12] = W[13] + W[5] + rot15(W[10]) + rot25(W[14]); + W[13] = W[14] + W[6] + rot15(W[11]) + rot25(W[15]); + W[14] = W[15] + W[7] + rot15(W[12]) + rot25(W[16]); + W[15] = W[16] + W[8] + rot15(W[13]) + rot25( W[0]); + W[16] = W[0] + W[9] + rot15(W[14]) + rot25( W[1]); + + V[2] += 0x2e1b2138 + V[6] + W[0] + ch(97) + rot26(V[3]); + V[6] = 0x2e1b2138 + V[6] + W[0] + ch(97) + rot26(V[3]) + rot30(V[7]) + ma(97); + + V[1] += 0x4d2c6dfc + V[5] + W[1] + ch(98) + rot26(V[2]); + V[5] = 0x4d2c6dfc + V[5] + W[1] + ch(98) + rot26(V[2]) + rot30(V[6]) + ma(98); + + V[0] += 0x53380d13 + V[4] + W[2] + ch(99) + rot26(V[1]); + V[4] = 0x53380d13 + V[4] + W[2] + ch(99) + rot26(V[1]) + rot30(V[5]) + ma(99); + + V[7] += 0x650a7354 + V[3] + W[3] + ch(100) + rot26(V[0]); + V[3] = 0x650a7354 + V[3] + W[3] + ch(100) + rot26(V[0]) + rot30(V[4]) + ma(100); + + V[6] += 0x766a0abb + V[2] + W[4] + ch(101) + rot26(V[7]); + V[2] = 0x766a0abb + V[2] + W[4] + ch(101) + rot26(V[7]) + rot30(V[3]) + ma(101); + + V[5] += 0x81c2c92e + V[1] + W[5] + ch(102) + rot26(V[6]); + V[1] = 0x81c2c92e + V[1] + W[5] + ch(102) + rot26(V[6]) + rot30(V[2]) + ma(102); + + V[4] += 0x92722c85 + V[0] + W[6] + ch(103) + rot26(V[5]); + V[0] = 0x92722c85 + V[0] + W[6] + ch(103) + rot26(V[5]) + rot30(V[1]) + ma(103); + + V[3] += 0xa2bfe8a1 + V[7] + W[7] + ch(104) + rot26(V[4]); + V[7] = 0xa2bfe8a1 + V[7] + W[7] + ch(104) + rot26(V[4]) + rot30(V[0]) + ma(104); + + V[2] += 0xa81a664b + V[6] + W[8] + ch(105) + rot26(V[3]); + V[6] = 0xa81a664b + V[6] + W[8] + ch(105) + rot26(V[3]) + rot30(V[7]) + ma(105); + + V[1] += 0xc24b8b70 + V[5] + W[9] + ch(106) + rot26(V[2]); + V[5] = 0xc24b8b70 + V[5] + W[9] + ch(106) + rot26(V[2]) + rot30(V[6]) + ma(106); + + V[0] += 0xc76c51a3 + V[4] + W[10] + ch(107) + rot26(V[1]); + V[4] = 0xc76c51a3 + V[4] + W[10] + ch(107) + rot26(V[1]) + rot30(V[5]) + ma(107); + + V[7] += 0xd192e819 + V[3] + W[11] + ch(108) + rot26(V[0]); + V[3] = 0xd192e819 + V[3] + W[11] + ch(108) + rot26(V[0]) + rot30(V[4]) + ma(108); + + V[6] += 0xd6990624 + V[2] + W[12] + ch(109) + rot26(V[7]); + V[2] = 0xd6990624 + V[2] + W[12] + ch(109) + rot26(V[7]) + rot30(V[3]) + ma(109); + + V[5] += 0xf40e3585 + V[1] + W[13] + ch(110) + rot26(V[6]); + V[1] = 0xf40e3585 + V[1] + W[13] + ch(110) + rot26(V[6]) + rot30(V[2]) + ma(110); + + V[4] += 0x106aa070 + V[0] + W[14] + ch(111) + rot26(V[5]); + V[0] = 0x106aa070 + V[0] + W[14] + ch(111) + rot26(V[5]) + rot30(V[1]) + ma(111); + + V[3] += 0x19a4c116 + V[7] + W[15] + ch(112) + rot26(V[4]); + V[7] = 0x19a4c116 + V[7] + W[15] + ch(112) + rot26(V[4]) + rot30(V[0]) + ma(112); + + V[2] += 0x1e376c08 + V[6] + W[16] + ch(113) + rot26(V[3]); + V[6] = 0x1e376c08 + V[6] + W[16] + ch(113) + rot26(V[3]) + rot30(V[7]) + ma(113); + +//---------------------------------------------------------------------------------- + + W[0] = W[1] + W[10] + rot15(W[15]) + rot25( W[2]); + W[1] = W[2] + W[11] + rot15(W[16]) + rot25( W[3]); + W[2] = W[3] + W[12] + rot15( W[0]) + rot25( W[4]); + W[3] = W[4] + W[13] + rot15( W[1]) + rot25( W[5]); + W[4] = W[5] + W[14] + rot15( W[2]) + rot25( W[6]); + W[5] = W[6] + W[15] + rot15( W[3]) + rot25( W[7]); + W[6] = W[7] + W[16] + rot15( W[4]) + rot25( W[8]); + W[7] = W[8] + W[0] + rot15( W[5]) + rot25( W[9]); + W[8] = W[9] + W[1] + rot15( W[6]) + rot25(W[10]); + W[9] = W[10] + W[2] + rot15( W[7]) + rot25(W[11]); + W[10] = W[11] + W[3] + rot15( W[8]) + rot25(W[12]); + + V[1] += 0x2748774c + V[5] + W[0] + ch(114) + rot26(V[2]); + V[5] = 0x2748774c + V[5] + W[0] + ch(114) + rot26(V[2]) + rot30(V[6]) + ma(114); + + V[0] += 0x34b0bcb5 + V[4] + W[1] + ch(115) + rot26(V[1]); + V[4] = 0x34b0bcb5 + V[4] + W[1] + ch(115) + rot26(V[1]) + rot30(V[5]) + ma(115); + + V[7] += 0x391c0cb3 + V[3] + W[2] + ch(116) + rot26(V[0]); + V[3] = 0x391c0cb3 + V[3] + W[2] + ch(116) + rot26(V[0]) + rot30(V[4]) + ma(116); + + V[6] += 0x4ed8aa4a + V[2] + W[3] + ch(117) + rot26(V[7]); + V[2] = 0x4ed8aa4a + V[2] + W[3] + ch(117) + rot26(V[7]) + rot30(V[3]) + ma(117); + + V[5] += 0x5b9cca4f + V[1] + W[4] + ch(118) + rot26(V[6]); + V[1] = 0x5b9cca4f + V[1] + W[4] + ch(118) + rot26(V[6]) + rot30(V[2]) + ma(118); + + V[4] += 0x682e6ff3 + V[0] + W[5] + ch(119) + rot26(V[5]); + V[0] = 0x682e6ff3 + V[0] + W[5] + ch(119) + rot26(V[5]) + rot30(V[1]) + ma(119); + + V[3] += 0x748f82ee + V[7] + W[6] + ch(120) + rot26(V[4]); + V[7] = 0x748f82ee + V[7] + W[6] + ch(120) + rot26(V[4]) + rot30(V[0]) + ma(120); + + V[2] += 0x78a5636f + V[6] + W[7] + ch(121) + rot26(V[3]); + + V[1] += 0x84c87814 + V[5] + W[8] + ch(122) + rot26(V[2]); + + V[0] += 0x8cc70208 + V[4] + W[9] + ch(123) + rot26(V[1]); + + V[7] += V[3] + W[10] + ch(124) + rot26(V[0]); + + +#define FOUND (0x80) +#define NFLAG (0x7F) + +#ifdef VECTORS4 + V[7] ^= 0x136032ed; + + bool result = V[7].x & V[7].y & V[7].z & V[7].w; + + if (!result) { + if (!V[7].x) + output[FOUND] = output[NFLAG & nonce.x] = nonce.x; + if (!V[7].y) + output[FOUND] = output[NFLAG & nonce.y] = nonce.y; + if (!V[7].z) + output[FOUND] = output[NFLAG & nonce.z] = nonce.z; + if (!V[7].w) + output[FOUND] = output[NFLAG & nonce.w] = nonce.w; + } +#else + #ifdef VECTORS2 + V[7] ^= 0x136032ed; + + bool result = V[7].x & V[7].y; + + if (!result) { + if (!V[7].x) + output[FOUND] = output[NFLAG & nonce.x] = nonce.x; + if (!V[7].y) + output[FOUND] = output[NFLAG & nonce.y] = nonce.y; + } + #else + if (V[7] == 0x136032ed) + output[FOUND] = output[NFLAG & nonce] = nonce; + #endif +#endif +} From 590497658053941b756c7648a3cd97b0e0e98a56 Mon Sep 17 00:00:00 2001 From: ckolivas Date: Wed, 8 Feb 2012 18:13:32 +1100 Subject: [PATCH 7/8] Reinstate original code as much as possible for comparison. --- diakgcn120208.cl | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/diakgcn120208.cl b/diakgcn120208.cl index b5d31969..7a87207a 100644 --- a/diakgcn120208.cl +++ b/diakgcn120208.cl @@ -3,7 +3,9 @@ // Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3. // The kernel was rewritten by me (Diapolo) and is still public-domain! -#if defined VECTORS4 +#ifdef VECTORS8 + typedef uint8 u; +#elif defined VECTORS4 typedef uint4 u; #elif defined VECTORS2 typedef uint2 u; @@ -29,7 +31,9 @@ #ifdef GOFFSET typedef uint uu; #else - #if defined VECTORS4 + #ifdef VECTORS8 + typedef uint8 uu; + #elif defined VECTORS4 typedef uint4 uu; #elif defined VECTORS2 typedef uint2 uu; @@ -63,23 +67,29 @@ __kernel u W[17]; u V[8]; -#if defined VECTORS4 +#ifdef VECTORS8 + #ifdef GOFFSET + u nonce = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7); + #else + u nonce = ((uint)get_group_id(0) * (uint)WORKSIZE * 8U) + ((uint)get_local_id(0) * 8U) + base; + #endif +#elif defined VECTORS4 #ifdef GOFFSET - u nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3); + u nonce = ((uint)get_global_id(0) << 2) + (u)(0, 1, 2, 3); #else - u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); + u nonce = ((uint)get_group_id(0) * (uint)WORKSIZE * 4U) + ((uint)get_local_id(0) * 4U) + base; #endif #elif defined VECTORS2 #ifdef GOFFSET - u nonce = base + (get_global_id(0)<<1) + (uint2)(0, 1); + u nonce = ((uint)get_global_id(0) << 1) + (u)(0, 1); #else - u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); + u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) * 2U) + ((uint)get_local_id(0) * 2U) + base; #endif #else #ifdef GOFFSET - u nonce = base + get_global_id(0); + u nonce = (uint)get_global_id(0); #else - u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); + u nonce = ((uint)get_group_id(0) * (uint)WORKSIZE) + (uint)get_local_id(0) + base; #endif #endif From 041d6689bc0406818a5295c6fde54527d444524a Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Wed, 8 Feb 2012 21:03:20 +1100 Subject: [PATCH 8/8] Diakgcn should be using the modified PreVal4 which includes T1. --- device-gpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/device-gpu.c b/device-gpu.c index e39bff4b..f7cbcc24 100644 --- a/device-gpu.c +++ b/device-gpu.c @@ -769,7 +769,7 @@ static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk) nonces[i] = blk->nonce + i; CL_SET_VARG(vwidth, nonces); - CL_SET_BLKARG(PreVal4); + CL_SET_BLKARG(PreVal4_2); CL_SET_BLKARG(cty_h); CL_SET_BLKARG(cty_d); CL_SET_BLKARG(PreVal0);