diff --git a/kernel/credits.cl b/kernel/credits.cl new file mode 100644 index 00000000..4e32e7bd --- /dev/null +++ b/kernel/credits.cl @@ -0,0 +1,232 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + + +#define ROL32(x, n) rotate(x, (uint) n) +#define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) +#define SWAP64(x) as_ulong(as_uchar8(x).s32107654) /// hmm... + +#define SHR(x, n) ((x) >> n) + +#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) +#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) + +#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) +#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) + +#define P(a,b,c,d,e,f,g,h,x,K) \ +{ \ + temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ + d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ +} + +#define F0(y, x, z) bitselect(z, y, z ^ x) +#define F1(x, y, z) bitselect(z, y, x) + +#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) +#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) +#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) +#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) +#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) +#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) +#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) +#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) +#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) +#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) +#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) +#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) +#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) +#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) +#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) +#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) + +#define RD14 (S1(W12) + W7 + S0(W15) + W14) +#define RD15 (S1(W13) + W8 + S0(W0) + W15) + +/// generic sha transform +inline uint8 sha256_Transform(uint16 data, uint8 state) +{ + uint temp1; + uint8 res = state; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + +#define v0 res.s0 +#define v1 res.s1 +#define v2 res.s2 +#define v3 res.s3 +#define v4 res.s4 +#define v5 res.s5 +#define v6 res.s6 +#define v7 res.s7 + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef v4 +#undef v5 +#undef v6 +#undef v7 + return (res + state); +} + + + +static __constant uint8 H256 = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, + 0xA54FF53A, 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + + +static __constant uint8 pad_data = +{ + 0x00000000, 0x00000000, 0x80000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000540 +}; + +static __constant uint8 pad_state = +{ + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output,const ulong target, uint8 midstate ) +{ + + + uint nonce = get_global_id(0); + uint16 in; + uint8 state1; + + in.lo = ((__global const uint8 *)input)[4]; + in.hi = pad_data; + in.hi.s0 = ((__global const uint *)input)[40]; + in.hi.s1 = ((__global const uint *)input)[41]; + in.s3 = nonce; + state1 = sha256_Transform(in, midstate); + in.lo = state1; + in.hi = pad_state; + state1 = sha256_Transform(in,H256); + +if (SWAP64(state1.s67) <= target) + output[atomic_inc(output + 0xFF)] = nonce; + +} +