diff --git a/Makefile.am b/Makefile.am index ac2ecdad..c36ea1db 100644 --- a/Makefile.am +++ b/Makefile.am @@ -73,10 +73,12 @@ sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h sgminer_SOURCES += algorithm/whirlpoolx.c algorithm/whirlpoolx.h sgminer_SOURCES += algorithm/lyra2re.c algorithm/lyra2re.h algorithm/lyra2.c algorithm/lyra2.h algorithm/sponge.c algorithm/sponge.h -sgminer_SOURCES += algorithm/lyra2re_old.c algorithm/lyra2re_old.h +sgminer_SOURCES += algorithm/lyra2rev2.c algorithm/lyra2rev2.h sgminer_SOURCES += algorithm/pluck.c algorithm/pluck.h sgminer_SOURCES += algorithm/credits.c algorithm/credits.h sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h +sgminer_SOURCES += algorithm/blake256.c algorithm/blake256.h +sgminer_SOURCES += algorithm/blakecoin.c algorithm/blakecoin.h bin_SCRIPTS = $(top_srcdir)/kernel/*.cl diff --git a/algorithm.c b/algorithm.c index 6acab924..8d436e17 100644 --- a/algorithm.c +++ b/algorithm.c @@ -33,10 +33,12 @@ #include "algorithm/neoscrypt.h" #include "algorithm/whirlpoolx.h" #include "algorithm/lyra2re.h" -#include "algorithm/lyra2re_old.h" +#include "algorithm/lyra2rev2.h" #include "algorithm/pluck.h" #include "algorithm/yescrypt.h" #include "algorithm/credits.h" +#include "algorithm/blake256.h" +#include "algorithm/blakecoin.h" #include "compat.h" @@ -62,10 +64,13 @@ const char *algorithm_type_str[] = { "Neoscrypt", "WhirlpoolX", "Lyra2RE", - "Lyra2REv2" + "Lyra2REV2", - "Pluck" + "Pluck", "Yescrypt", - "Yescrypt-multi" + "Yescrypt-multi", + "Blakecoin", + "Blake", + "Vanilla" }; void sha256(const unsigned char *message, unsigned int len, unsigned char *digest) @@ -798,7 +803,7 @@ static cl_int queue_whirlpoolx_kernel(struct __clState *clState, struct _dev_blk return status; } -static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +static cl_int queue_lyra2re_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel; unsigned int num; @@ -842,7 +847,7 @@ static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ct return status; } -static cl_int queue_lyra2REv2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +static cl_int queue_lyra2rev2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel; unsigned int num; @@ -915,10 +920,38 @@ static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_un return status; } +static cl_int queue_blake_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; + cl_int status = 0; + cl_ulong le_target; + + le_target = *(cl_ulong *)(blk->work->device_target + 24); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(blk->work->blk.ctx_a); + CL_SET_ARG(blk->work->blk.ctx_b); + CL_SET_ARG(blk->work->blk.ctx_c); + CL_SET_ARG(blk->work->blk.ctx_d); + CL_SET_ARG(blk->work->blk.ctx_e); + CL_SET_ARG(blk->work->blk.ctx_f); + CL_SET_ARG(blk->work->blk.ctx_g); + CL_SET_ARG(blk->work->blk.ctx_h); + + CL_SET_ARG(blk->work->blk.cty_a); + 
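// Argument order mirrors the search() kernel (see kernel/blake256r14.cl below): ctx_a..ctx_h hold the blake midstate of the first 64-byte block, computed CPU-side by precalc_hash_blake256/precalc_hash_blakecoin, and cty_a..cty_c carry the last 12 bytes of the 80-byte header; the kernel derives the nonce itself from get_global_id(0). + 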
CL_SET_ARG(blk->work->blk.cty_b); + CL_SET_ARG(blk->work->blk.cty_c); + + return status; +} + static algorithm_settings_t algos[] = { // kernels starting from this will have difficulty calculated by using litecoin algorithm #define A_SCRYPT(a) \ - { a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options } + { a, ALGO_SCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFFFFFFULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, scrypt_regenhash, NULL, queue_scrypt_kernel, gen_hash, append_scrypt_compiler_options } A_SCRYPT("ckolivas"), A_SCRYPT("alexkarnew"), A_SCRYPT("alexkarnold"), @@ -929,36 +962,33 @@ static algorithm_settings_t algos[] = { #undef A_SCRYPT #define A_NEOSCRYPT(a) \ - { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options } + { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, NULL, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options } A_NEOSCRYPT("neoscrypt"), #undef A_NEOSCRYPT #define A_PLUCK(a) \ - { a, ALGO_PLUCK, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, pluck_regenhash, queue_pluck_kernel, gen_hash, append_neoscrypt_compiler_options } + { a, ALGO_PLUCK, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, pluck_regenhash, NULL, queue_pluck_kernel, gen_hash, append_neoscrypt_compiler_options } A_PLUCK("pluck"), #undef A_PLUCK #define A_CREDITS(a) \ - { a, ALGO_CRE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, credits_regenhash, queue_credits_kernel, gen_hash, NULL} + { a, ALGO_CRE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, credits_regenhash, NULL, queue_credits_kernel, gen_hash, NULL} A_CREDITS("credits"), #undef A_CREDITS - - #define A_YESCRYPT(a) \ - { a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options} + { a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, NULL, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options} A_YESCRYPT("yescrypt"), #undef A_YESCRYPT #define A_YESCRYPT_MULTI(a) \ - { a, ALGO_YESCRYPT_MULTI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 6,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, queue_yescrypt_multikernel, gen_hash, append_neoscrypt_compiler_options} + { a, ALGO_YESCRYPT_MULTI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 6,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, NULL, queue_yescrypt_multikernel, gen_hash, append_neoscrypt_compiler_options} A_YESCRYPT_MULTI("yescrypt-multi"), #undef A_YESCRYPT_MULTI - // kernels starting from this will have difficulty calculated by using quarkcoin algorithm #define A_QUARK(a, b) \ - { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 
0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } + { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options } A_QUARK("quarkcoin", quarkcoin_regenhash), A_QUARK("qubitcoin", qubitcoin_regenhash), A_QUARK("animecoin", animecoin_regenhash), @@ -967,46 +997,48 @@ static algorithm_settings_t algos[] = { // kernels starting from this will have difficulty calculated by using bitcoin algorithm #define A_DARK(a, b) \ - { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } + { a, ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options } A_DARK("darkcoin", darkcoin_regenhash), A_DARK("inkcoin", inkcoin_regenhash), A_DARK("myriadcoin-groestl", myriadcoin_groestl_regenhash), #undef A_DARK - { "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, queue_sph_kernel, sha256, NULL }, - { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, queue_maxcoin_kernel, sha256, NULL }, - - { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, + { "twecoin", ALGO_TWE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, twecoin_regenhash, NULL, queue_sph_kernel, sha256, NULL }, + { "maxcoin", ALGO_KECCAK, "", 1, 256, 1, 4, 15, 0x0F, 0xFFFFULL, 0x000000ffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, maxcoin_regenhash, NULL, queue_maxcoin_kernel, sha256, NULL }, - { "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, queue_sph_kernel, gen_hash, append_x13_compiler_options }, - { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options }, - { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options }, + { "darkcoin-mod", ALGO_X11, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, darkcoin_regenhash, NULL, queue_darkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, - { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_kernel, gen_hash, append_x13_compiler_options }, - { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, queue_x14_old_kernel, gen_hash, append_x13_compiler_options }, + { "marucoin", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, marucoin_regenhash, NULL, queue_sph_kernel, gen_hash, append_x13_compiler_options }, + { "marucoin-mod", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 12, 8 * 16 * 4194304, 0, marucoin_regenhash, NULL, 
queue_marucoin_mod_kernel, gen_hash, append_x13_compiler_options }, + { "marucoin-modold", ALGO_X13, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, marucoin_regenhash, NULL, queue_marucoin_mod_old_kernel, gen_hash, append_x13_compiler_options }, - { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblock_kernel, gen_hash, append_x13_compiler_options }, - { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options }, + { "x14", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 13, 8 * 16 * 4194304, 0, x14_regenhash, NULL, queue_x14_kernel, gen_hash, append_x13_compiler_options }, + { "x14old", ALGO_X14, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 8 * 16 * 4194304, 0, x14_regenhash, NULL, queue_x14_old_kernel, gen_hash, append_x13_compiler_options }, - { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, + { "bitblock", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 14, 4 * 16 * 4194304, 0, bitblock_regenhash, NULL, queue_bitblock_kernel, gen_hash, append_x13_compiler_options }, + { "bitblockold", ALGO_X15, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 10, 4 * 16 * 4194304, 0, bitblock_regenhash, NULL, queue_bitblockold_kernel, gen_hash, append_x13_compiler_options }, - { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL }, + { "talkcoin-mod", ALGO_NIST, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 8 * 16 * 4194304, 0, talkcoin_regenhash, NULL, queue_talkcoin_mod_kernel, gen_hash, append_x11_compiler_options }, - { "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2reold_regenhash, queue_lyra2RE_kernel, gen_hash, NULL }, - - { "lyra2rev2", ALGO_LYRA2REv2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2re_regenhash, queue_lyra2REv2_kernel, gen_hash, append_neoscrypt_compiler_options }, + { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, NULL, queue_fresh_kernel, gen_hash, NULL }, + { "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2re_regenhash, precalc_hash_blake256, queue_lyra2re_kernel, gen_hash, NULL }, + { "lyra2rev2", ALGO_LYRA2REV2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2rev2_regenhash, precalc_hash_blake256, queue_lyra2rev2_kernel, gen_hash, append_neoscrypt_compiler_options }, // kernels starting from this will have difficulty calculated by using fuguecoin algorithm #define A_FUGUE(a, b, c) \ - { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, c, NULL } + { a, ALGO_FUGUE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, queue_sph_kernel, c, NULL } A_FUGUE("fuguecoin", fuguecoin_regenhash, sha256), A_FUGUE("groestlcoin", groestlcoin_regenhash, sha256), A_FUGUE("diamond", groestlcoin_regenhash, gen_hash), #undef A_FUGUE - { 
"whirlcoin", ALGO_WHIRL, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 3, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, whirlcoin_regenhash, queue_whirlcoin_kernel, sha256, NULL }, - { "whirlpoolx", ALGO_WHIRLPOOLX, "", 1, 1, 1, 0, 0, 0xFFU, 0xFFFFULL, 0x0000FFFFUL, 0, 0, 0, whirlpoolx_regenhash, queue_whirlpoolx_kernel, gen_hash, NULL }, + { "whirlcoin", ALGO_WHIRL, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 3, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, whirlcoin_regenhash, NULL, queue_whirlcoin_kernel, sha256, NULL }, + { "whirlpoolx", ALGO_WHIRLPOOLX, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000FFFFUL, 0, 0, 0, whirlpoolx_regenhash, NULL, queue_whirlpoolx_kernel, gen_hash, NULL }, + + { "blake256r8", ALGO_BLAKECOIN, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x000000ffUL, 0, 128, 0, blakecoin_regenhash, precalc_hash_blakecoin, queue_blake_kernel, sha256, NULL }, + { "blake256r14", ALGO_BLAKE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x00000000UL, 0, 128, 0, blake256_regenhash, precalc_hash_blake256, queue_blake_kernel, gen_hash, NULL }, + { "vanilla", ALGO_VANILLA, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x000000ffUL, 0, 128, 0, blakecoin_regenhash, precalc_hash_blakecoin, queue_blake_kernel, gen_hash, NULL }, // Terminator (do not remove) { NULL, ALGO_UNK, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NULL, NULL, NULL, NULL } @@ -1019,7 +1051,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo) // Find algorithm settings and copy for (src = algos; src->name; src++) { - if (strcmp(src->name, algo) == 0) + if (strcasecmp(src->name, algo) == 0) { strcpy(dest->name, src->name); dest->kernelfile = src->kernelfile; @@ -1037,6 +1069,7 @@ void copy_algorithm_settings(algorithm_t* dest, const char* algo) dest->rw_buffer_size = src->rw_buffer_size; dest->cq_properties = src->cq_properties; dest->regenhash = src->regenhash; + dest->precalc_hash = src->precalc_hash; dest->queue_kernel = src->queue_kernel; dest->gen_hash = src->gen_hash; dest->set_compile_options = src->set_compile_options; @@ -1081,6 +1114,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa ALGO_ALIAS("whirlpool", "whirlcoin"); ALGO_ALIAS("lyra2", "lyra2re"); ALGO_ALIAS("lyra2v2", "lyra2rev2"); + ALGO_ALIAS("blakecoin", "blake256r8"); + ALGO_ALIAS("blake", "blake256r14"); #undef ALGO_ALIAS #undef ALGO_ALIAS_NF diff --git a/algorithm.h b/algorithm.h index 8b7185a4..afbd2f7a 100644 --- a/algorithm.h +++ b/algorithm.h @@ -30,10 +30,13 @@ typedef enum { ALGO_NEOSCRYPT, ALGO_WHIRLPOOLX, ALGO_LYRA2RE, - ALGO_LYRA2REv2, + ALGO_LYRA2REV2, ALGO_PLUCK, ALGO_YESCRYPT, ALGO_YESCRYPT_MULTI, + ALGO_BLAKECOIN, + ALGO_BLAKE, + ALGO_VANILLA } algorithm_type_t; extern const char *algorithm_type_str[]; @@ -67,6 +70,7 @@ typedef struct _algorithm_t { long rw_buffer_size; cl_command_queue_properties cq_properties; void(*regenhash)(struct work *); + void(*precalc_hash)(struct _dev_blk_ctx *, uint32_t *, uint32_t *); cl_int(*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint); void(*gen_hash)(const unsigned char *, unsigned int, unsigned char *); void(*set_compile_options)(struct _build_kernel_data *, struct cgpu_info *, struct _algorithm_t *); @@ -89,6 +93,7 @@ typedef struct _algorithm_settings_t long rw_buffer_size; cl_command_queue_properties cq_properties; void (*regenhash)(struct work *); + void (*precalc_hash)(struct _dev_blk_ctx *, uint32_t *, uint32_t *); cl_int (*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint); void (*gen_hash)(const unsigned char *, 
unsigned int, unsigned char *); void (*set_compile_options)(build_kernel_data *, struct cgpu_info *, algorithm_t *); diff --git a/algorithm/blake256.c b/algorithm/blake256.c new file mode 100644 index 00000000..ddd86122 --- /dev/null +++ b/algorithm/blake256.c @@ -0,0 +1,163 @@ +/* + * BLAKE implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + * + * Modified for more speed by BlueDragon747 for the Blakecoin project + */ + +#include +#include +#include +#include + +#include "sph/sph_blake.h" +#include "algorithm/blake256.h" + +/* +* Encode a length len/4 vector of (uint32_t) into a length len vector of +* (unsigned char) in big-endian form. Assumes len is a multiple of 4. 
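+* (As used in this file, both src and dst are in fact uint32_t arrays and len counts 32-bit words; each word is byte-swapped to big-endian with htobe32.)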
+*/ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + +static const uint32_t diff1targ_blake256 = 0x000000ff; + +inline void blake256hash(void *state, const void *input) +{ + sph_blake256_context ctx_blake; + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, state); +} + +void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *pdata) +{ + sph_blake256_context ctx_blake; + uint32_t data[16]; + + be32enc_vect(data, (const uint32_t *)pdata, 16); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, data, 64); + + blk->ctx_a = ctx_blake.H[0]; + blk->ctx_b = ctx_blake.H[1]; + blk->ctx_c = ctx_blake.H[2]; + blk->ctx_d = ctx_blake.H[3]; + blk->ctx_e = ctx_blake.H[4]; + blk->ctx_f = ctx_blake.H[5]; + blk->ctx_g = ctx_blake.H[6]; + blk->ctx_h = ctx_blake.H[7]; + + blk->cty_a = pdata[16]; + blk->cty_b = pdata[17]; + blk->cty_c = pdata[18]; +} + + +static const uint32_t diff1targ = 0x0000ffff; + +/* Used externally as confirmation of correct OCL code */ +int blake256_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + blake256hash(ohash, data); + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + if (tmp_hash7 > diff1targ) + return -1; + if (tmp_hash7 > Htarg) + return 0; + return 1; +} + +void blake256_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + blake256hash(ohash, data); +} + +bool scanhash_blake256(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while(1) { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + blake256hash(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", + (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) { + *last_nonce = n; + break; + } + } + + return ret; +} diff --git a/algorithm/blake256.h b/algorithm/blake256.h new file mode 100644 index 00000000..f021585d --- /dev/null +++ b/algorithm/blake256.h @@ -0,0 +1,10 @@ +#ifndef BLAKE256_H +#define BLAKE256_H + +#include "miner.h" + +extern int blake256_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *pdata); +extern void blake256_regenhash(struct work *work); + +#endif /* BLAKE256_H */ \ No newline at end of file diff --git a/algorithm/blakecoin.c b/algorithm/blakecoin.c new file mode 
100644 index 00000000..6a118cef --- /dev/null +++ b/algorithm/blakecoin.c @@ -0,0 +1,163 @@ +/* + * BLAKE implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + * + * Modified for more speed by BlueDragon747 for the Blakecoin project + */ + +#include +#include +#include +#include + +#include "sph/sph_blake.h" +#include "algorithm/blakecoin.h" + +/* +* Encode a length len/4 vector of (uint32_t) into a length len vector of +* (unsigned char) in big-endian form. Assumes len is a multiple of 4. 
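+* (As in blake256.c: src and dst are uint32_t arrays, len counts 32-bit words, each byte-swapped via htobe32.)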
+*/ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + +static const uint32_t diff1targ_blake256 = 0x000000ff; + +inline void blakecoinhash(void *state, const void *input) +{ + sph_blake256_context ctx_blake; + sph_blake256_init(&ctx_blake); + sph_blake256r8(&ctx_blake, input, 80); + sph_blake256r8_close(&ctx_blake, state); +} + +void precalc_hash_blakecoin(dev_blk_ctx *blk, uint32_t *state, uint32_t *pdata) +{ + sph_blake256_context ctx_blake; + uint32_t data[16]; + + be32enc_vect(data, (const uint32_t *)pdata, 16); + + sph_blake256_init(&ctx_blake); + sph_blake256r8(&ctx_blake, data, 64); + + blk->ctx_a = ctx_blake.H[0]; + blk->ctx_b = ctx_blake.H[1]; + blk->ctx_c = ctx_blake.H[2]; + blk->ctx_d = ctx_blake.H[3]; + blk->ctx_e = ctx_blake.H[4]; + blk->ctx_f = ctx_blake.H[5]; + blk->ctx_g = ctx_blake.H[6]; + blk->ctx_h = ctx_blake.H[7]; + + blk->cty_a = pdata[16]; + blk->cty_b = pdata[17]; + blk->cty_c = pdata[18]; +} + + +static const uint32_t diff1targ = 0x0000ffff; + +/* Used externally as confirmation of correct OCL code */ +int blakecoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + blakecoinhash(ohash, data); + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + if (tmp_hash7 > diff1targ) + return -1; + if (tmp_hash7 > Htarg) + return 0; + return 1; +} + +void blakecoin_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + blakecoinhash(ohash, data); +} + +bool scanhash_blakecoin(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while(1) { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + blakecoinhash(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", + (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) { + *last_nonce = n; + break; + } + } + + return ret; +} diff --git a/algorithm/blakecoin.h b/algorithm/blakecoin.h new file mode 100644 index 00000000..dcaeff71 --- /dev/null +++ b/algorithm/blakecoin.h @@ -0,0 +1,10 @@ +#ifndef BLAKECOIN_H +#define BLAKECOIN_H + +#include "miner.h" + +extern int blakecoin_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void precalc_hash_blakecoin(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); +extern void blakecoin_regenhash(struct work *work); + +#endif /* BLAKECOIN_H */ \ No newline at end of file diff --git a/algorithm/lyra2.c b/algorithm/lyra2.c 
index 42640e76..865d8e17 100644 --- a/algorithm/lyra2.c +++ b/algorithm/lyra2.c @@ -61,16 +61,18 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = malloc(i); + uint64_t *wholeMatrix = (uint64_t*)malloc(i); if (wholeMatrix == NULL) { return -1; } memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); + uint64_t **memMatrix = (uint64_t**)malloc(nRows * sizeof (uint64_t*)); if (memMatrix == NULL) { return -1; } @@ -122,7 +124,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = malloc(16 * sizeof (uint64_t)); + uint64_t *state = (uint64_t*)malloc(16 * sizeof (uint64_t)); if (state == NULL) { return -1; } @@ -134,7 +136,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * ptrWord = wholeMatrix; for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) } //Initializes M[0] and M[1] @@ -196,7 +198,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * absorbBlock(state, memMatrix[rowa]); //Squeezes the key - squeeze(state, K, kLen); + squeeze(state, (unsigned char*)K, kLen); //==========================================================================/ //========================= Freeing the memory =============================// diff --git a/algorithm/lyra2re.c b/algorithm/lyra2re.c index cfc5adbb..61f2b34f 100644 --- a/algorithm/lyra2re.c +++ b/algorithm/lyra2re.c @@ -36,8 +36,6 @@ #include "sph/sph_groestl.h" #include "sph/sph_skein.h" #include "sph/sph_keccak.h" -#include "sph/sph_bmw.h" -#include "sph/sph_cubehash.h" #include "lyra2.h" /* @@ -57,10 +55,9 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) inline void lyra2rehash(void *state, const void *input) { sph_blake256_context ctx_blake; - sph_bmw256_context ctx_bmw; + sph_groestl256_context ctx_groestl; sph_keccak256_context ctx_keccak; sph_skein256_context ctx_skein; - sph_cubehash256_context ctx_cube; uint32_t hashA[8], hashB[8]; @@ -72,23 +69,17 @@ inline void lyra2rehash(void *state, const void *input) sph_keccak256 (&ctx_keccak,hashA, 32); sph_keccak256_close(&ctx_keccak, hashB); - sph_cubehash256_init(&ctx_cube); - sph_cubehash256(&ctx_cube, hashB, 32); - sph_cubehash256_close(&ctx_cube, hashA); + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); - LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); sph_skein256_init(&ctx_skein); - sph_skein256 (&ctx_skein, hashB, 32); - sph_skein256_close(&ctx_skein, hashA); + sph_skein256 (&ctx_skein, hashA, 32); + sph_skein256_close(&ctx_skein, hashB); - sph_cubehash256_init(&ctx_cube); - sph_cubehash256(&ctx_cube, hashA, 32); - sph_cubehash256_close(&ctx_cube, hashB); - sph_bmw256_init(&ctx_bmw); - 
sph_bmw256 (&ctx_bmw, hashB, 32); - sph_bmw256_close(&ctx_bmw, hashA); + sph_groestl256_init(&ctx_groestl); + sph_groestl256 (&ctx_groestl, hashB, 32); + sph_groestl256_close(&ctx_groestl, hashA); memcpy(state, hashA, 32); } diff --git a/algorithm/lyra2re.h b/algorithm/lyra2re.h index 8bc52ac4..8a58e747 100644 --- a/algorithm/lyra2re.h +++ b/algorithm/lyra2re.h @@ -2,8 +2,6 @@ #define LYRA2RE_H #include "miner.h" -#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size [12][4][4] uint64_t or equivalent -#define LYRA_SECBUF_SIZE (4) // (not used) extern int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); diff --git a/algorithm/lyra2re_old.h b/algorithm/lyra2re_old.h deleted file mode 100644 index 0788dfb3..00000000 --- a/algorithm/lyra2re_old.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef LYRA2REOLD_H -#define LYRA2REOLD_H - -#include "miner.h" - -extern int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, - uint32_t nonce); -extern void lyra2reold_regenhash(struct work *work); - -#endif /* LYRA2RE_H */ diff --git a/algorithm/lyra2re_old.c b/algorithm/lyra2rev2.c similarity index 77% rename from algorithm/lyra2re_old.c rename to algorithm/lyra2rev2.c index 31a0a1ab..aea0082a 100644 --- a/algorithm/lyra2re_old.c +++ b/algorithm/lyra2rev2.c @@ -36,6 +36,8 @@ #include "sph/sph_groestl.h" #include "sph/sph_skein.h" #include "sph/sph_keccak.h" +#include "sph/sph_bmw.h" +#include "sph/sph_cubehash.h" #include "lyra2.h" /* @@ -52,13 +54,13 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) } -inline void lyra2rehash_old(void *state, const void *input) +inline void lyra2rev2hash(void *state, const void *input) { sph_blake256_context ctx_blake; - sph_groestl256_context ctx_groestl; + sph_bmw256_context ctx_bmw; sph_keccak256_context ctx_keccak; sph_skein256_context ctx_skein; - + sph_cubehash256_context ctx_cube; uint32_t hashA[8], hashB[8]; sph_blake256_init(&ctx_blake); @@ -69,32 +71,41 @@ inline void lyra2rehash_old(void *state, const void *input) sph_keccak256 (&ctx_keccak,hashA, 32); sph_keccak256_close(&ctx_keccak, hashB); - LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_skein256_init(&ctx_skein); + sph_skein256 (&ctx_skein, hashB, 32); + sph_skein256_close(&ctx_skein, hashA); - sph_skein256_init(&ctx_skein); - sph_skein256 (&ctx_skein, hashA, 32); - sph_skein256_close(&ctx_skein, hashB); + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + sph_bmw256_init(&ctx_bmw); + sph_bmw256 (&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); - sph_groestl256_init(&ctx_groestl); - sph_groestl256 (&ctx_groestl, hashB, 32); - sph_groestl256_close(&ctx_groestl, hashA); +//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]); - memcpy(state, hashA, 32); + memcpy(state, hashA, 32); } static const uint32_t diff1targ = 0x0000ffff; /* Used externally as confirmation of correct OCL code */ -int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +int lyra2rev2_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) { uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); uint32_t data[20], ohash[8]; be32enc_vect(data, (const uint32_t *)pdata, 19); data[19] = htobe32(nonce); - lyra2rehash_old(ohash, data); + 
lyra2rev2hash(ohash, data); tmp_hash7 = be32toh(ohash[7]); applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", @@ -108,7 +119,7 @@ int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t return 1; } -void lyra2reold_regenhash(struct work *work) +void lyra2rev2_regenhash(struct work *work) { uint32_t data[20]; uint32_t *nonce = (uint32_t *)(work->data + 76); @@ -116,10 +127,10 @@ void lyra2reold_regenhash(struct work *work) be32enc_vect(data, (const uint32_t *)work->data, 19); data[19] = htobe32(*nonce); - lyra2rehash_old(ohash, data); + lyra2rev2hash(ohash, data); } -bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, +bool scanhash_lyra2rev2(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, unsigned char *pdata, unsigned char __maybe_unused *phash1, unsigned char __maybe_unused *phash, const unsigned char *ptarget, uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) @@ -137,7 +148,7 @@ bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unuse *nonce = ++n; data[19] = (n); - lyra2rehash_old(ostate, data); + lyra2rev2hash(ostate, data); tmp_hash7 = (ostate[7]); applog(LOG_INFO, "data7 %08lx", diff --git a/algorithm/lyra2rev2.h b/algorithm/lyra2rev2.h new file mode 100644 index 00000000..1a31f76f --- /dev/null +++ b/algorithm/lyra2rev2.h @@ -0,0 +1,11 @@ +#ifndef LYRA2REV2_H +#define LYRA2REV2_H + +#include "miner.h" +#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size [12][4][4] uint64_t or equivalent +#define LYRA_SECBUF_SIZE (4) // (not used) +extern int lyra2rev2_test(unsigned char *pdata, const unsigned char *ptarget, + uint32_t nonce); +extern void lyra2rev2_regenhash(struct work *work); + +#endif /* LYRA2REV2_H */ diff --git a/algorithm/yescrypt-opt.c b/algorithm/yescrypt-opt.c index b54be469..6adef7e7 100644 --- a/algorithm/yescrypt-opt.c +++ b/algorithm/yescrypt-opt.c @@ -99,7 +99,7 @@ alloc_region(yescrypt_region_t * region, size_t size) if (size + 63 < size) { errno = ENOMEM; } - else if ((base = malloc(size + 63)) != NULL) { + else if ((base = (uint8_t *)malloc(size + 63)) != NULL) { aligned = base + 63; aligned -= (uintptr_t)aligned & 63; } @@ -520,7 +520,7 @@ smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, uint64_t * XY, uint64_t * S) { void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? blockmix_pwxform : blockmix_salsa8); - const uint64_t * VROM = shared->shared1.aligned; + const uint64_t * VROM = (uint64_t *)shared->shared1.aligned; uint32_t VROM_mask = shared->mask1; size_t s = 16 * r; uint64_t * X = V; @@ -671,7 +671,7 @@ smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? blockmix_pwxform : blockmix_salsa8); - const uint64_t * VROM = shared->shared1.aligned; + const uint64_t * VROM = (uint64_t *)shared->shared1.aligned; uint32_t VROM_mask = shared->mask1 | 1; size_t s = 16 * r; yescrypt_flags_t rw = flags & YESCRYPT_RW; @@ -835,7 +835,7 @@ smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, uint64_t * Sp = S ? 
&S[i * S_SIZE_ALL] : S; if (Sp) - smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); + smix1(Bp, 1, S_SIZE_ALL / 16, (yescrypt_flags_t)flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); diff --git a/api.c b/api.c index 08701b8f..1efa3004 100644 --- a/api.c +++ b/api.c @@ -1334,7 +1334,7 @@ static void apiversion(struct io_data *io_data, __maybe_unused SOCKETTYPE c, __m io_open = io_add(io_data, isjson ? COMSTR JSON_VERSION : _VERSION COMSTR); root = api_add_string(root, "Miner", PACKAGE " " VERSION, false); - root = api_add_string(root, "CGMiner", CGMINER_VERSION, false); + root = api_add_string(root, "SGMiner", CGMINER_VERSION, false); root = api_add_const(root, "API", APIVERSION, false); root = print_data(root, buf, isjson, false); diff --git a/configure.ac b/configure.ac index 681c2b9c..659ad843 100644 --- a/configure.ac +++ b/configure.ac @@ -1,7 +1,7 @@ ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## m4_define([v_maj], [5]) -m4_define([v_min], [2]) +m4_define([v_min], [3]) m4_define([v_mic], [0]) m4_define([v_rev], [nicehash]) ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--## diff --git a/driver-opencl.c b/driver-opencl.c index 0e45e555..0e368782 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1366,13 +1366,8 @@ static bool opencl_thread_init(struct thr_info *thr) static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work) { - if (work->pool->algorithm.type == ALGO_LYRA2RE || work->pool->algorithm.type == ALGO_LYRA2REv2) { - work->blk.work = work; - precalc_hash_blake256(&work->blk, 0, (uint32_t *)(work->data)); - } - else { - work->blk.work = work; - } + work->blk.work = work; + if (work->pool->algorithm.precalc_hash) work->pool->algorithm.precalc_hash(&work->blk, 0, (uint32_t *)(work->data)); thr->pool_no = work->pool->pool_no; return true; } diff --git a/findnonce.c b/findnonce.c index 8858cfa6..72dcaaef 100644 --- a/findnonce.c +++ b/findnonce.c @@ -234,136 +234,3 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) free(pcd); } } - -// BLAKE 256 14 rounds (standard) - -typedef struct -{ - uint32_t h[8]; - uint32_t t; -} blake_state256; - -#define NB_ROUNDS32 14 - -const uint8_t blake_sigma[][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } -}; - -const uint32_t blake_u256[16] = -{ - 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, - 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, - 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c, - 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 
0xb5470917 -}; - -#define ROT32(x,n) (((x)<<(32-n))|( (x)>>(n))) -//#define ROT32(x,n) (rotate((uint)x, (uint)32-n)) -#define ADD32(x,y) ((uint32_t)((x) + (y))) -#define XOR32(x,y) ((uint32_t)((x) ^ (y))) - -#define G(a,b,c,d,i) \ -do { \ - v[a] += XOR32(m[blake_sigma[r][i]], blake_u256[blake_sigma[r][i + 1]]) + v[b]; \ - v[d] = ROT32(XOR32(v[d], v[a]), 16); \ - v[c] += v[d]; \ - v[b] = ROT32(XOR32(v[b], v[c]), 12); \ - v[a] += XOR32(m[blake_sigma[r][i + 1]], blake_u256[blake_sigma[r][i]]) + v[b]; \ - v[d] = ROT32(XOR32(v[d], v[a]), 8); \ - v[c] += v[d]; \ - v[b] = ROT32(XOR32(v[b], v[c]), 7); \ -} while (0) - -// compress a block -void blake256_compress_block(blake_state256 *S, uint32_t *m) -{ - uint32_t v[16]; - int i, r; - for (i = 0; i < 8; ++i) v[i] = S->h[i]; - - v[8] = blake_u256[0]; - v[9] = blake_u256[1]; - v[10] = blake_u256[2]; - v[11] = blake_u256[3]; - v[12] = blake_u256[4]; - v[13] = blake_u256[5]; - v[14] = blake_u256[6]; - v[15] = blake_u256[7]; - - v[12] ^= S->t; - v[13] ^= S->t; - - for (r = 0; r < NB_ROUNDS32; ++r) - { - /* column step */ - G(0, 4, 8, 12, 0); - G(1, 5, 9, 13, 2); - G(2, 6, 10, 14, 4); - G(3, 7, 11, 15, 6); - /* diagonal step */ - G(0, 5, 10, 15, 8); - G(1, 6, 11, 12, 10); - G(2, 7, 8, 13, 12); - G(3, 4, 9, 14, 14); - } - - for (i = 0; i < 16; ++i) S->h[i & 7] ^= v[i]; -} - -void blake256_init(blake_state256 *S) -{ - S->h[0] = 0x6a09e667; - S->h[1] = 0xbb67ae85; - S->h[2] = 0x3c6ef372; - S->h[3] = 0xa54ff53a; - S->h[4] = 0x510e527f; - S->h[5] = 0x9b05688c; - S->h[6] = 0x1f83d9ab; - S->h[7] = 0x5be0cd19; - S->t = 0; -} - -void blake256_update(blake_state256 *S, const uint32_t *in) -{ - uint32_t m[16]; - int i; - S->t = 512; - for (i = 0; i < 16; ++i) m[i] = in[i]; - blake256_compress_block(S, m); -} - -void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) -{ - blake_state256 S; - blake256_init(&S); - blake256_update(&S, data); - - blk->ctx_a = S.h[0]; - blk->ctx_b = S.h[1]; - blk->ctx_c = S.h[2]; - blk->ctx_d = S.h[3]; - blk->ctx_e = S.h[4]; - blk->ctx_f = S.h[5]; - blk->ctx_g = S.h[6]; - blk->ctx_h = S.h[7]; - - blk->cty_a = data[16]; - blk->cty_b = data[17]; - blk->cty_c = data[18]; -} diff --git a/findnonce.h b/findnonce.h index 49b1aa9a..9376a57b 100644 --- a/findnonce.h +++ b/findnonce.h @@ -10,6 +10,5 @@ extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); extern void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res); -extern void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data); #endif /*FINDNONCE_H*/ diff --git a/kernel/blake256r14.cl b/kernel/blake256r14.cl new file mode 100644 index 00000000..e94b4d7c --- /dev/null +++ b/kernel/blake256r14.cl @@ -0,0 +1,157 @@ +// (c) 2013 originally written by smolen, modified by kr105 + +#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n))) + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + volatile __global uint * restrict output, + // precalc hash from first part of message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint M0, M1, M2, M3, M4, M5, M6, M7; + uint M8, M9, MA, MB, MC, MD, ME, MF; + uint V0, V1, V2, V3, V4, V5, V6, V7; + uint V8, V9, VA, VB, VC, VD, VE, VF; + uint pre7; + uint nonce = get_global_id(0); + + V0 = h0; + V1 = h1; + V2 = h2; + V3 = h3; + V4 = h4; + V5 = h5; + V6 = 
h6; + pre7 = V7 = h7; + M0 = in16; + M1 = in17; + M2 = in18; + M3 = nonce; + + V8 = 0x243F6A88UL; + V9 = 0x85A308D3UL; + VA = 0x13198A2EUL; + VB = 0x03707344UL; + VC = 640 ^ 0xA4093822UL; + VD = 640 ^ 0x299F31D0UL; + VE = 0x082EFA98UL; + VF = 0xEC4E6C89UL; + + M4 = 0x80000000; + M5 = 0; + M6 = 0; + M7 = 0; + M8 = 0; + M9 = 0; + MA = 0; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 640; + + V0 = (V0 + V4 + (M0 ^ 0x85A308D3UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M1 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M2 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M3 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M4 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M5 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M6 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M7 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M8 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M9 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MA ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MB ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MC ^ 0xC97C50DDUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MD ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (ME ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MF ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (ME ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MA ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M9 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MF ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MD ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M6 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M1 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MC ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M0 ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M2 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = 
SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MB ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M7 ^ 0x34E90C6CUL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M5 ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M3 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MB ^ 0x452821E6UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M8 ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (MC ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M0 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M5 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M2 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MF ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MD ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (MA ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (ME ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M7 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M1 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M9 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M4 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M7 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M9 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M3 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M1 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MD ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MC ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MB ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (ME ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M2 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M6 ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M5 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 
+ V6 + (MA ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M4 ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M0 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (MF ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M8 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M9 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M0 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M5 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M7 ^ 0x299F31D0UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M2 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M4 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MA ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MF ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (ME ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M1 ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MB ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MC ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M3 ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MD ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M2 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MC ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M6 ^ 0xBE5466CFUL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MA ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M0 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MB ^ 0x243F6A88UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M8 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M3 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M4 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MD ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M7 ^ 0x299F31D0UL)); VC = 
SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M5 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MF ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (ME ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M1 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M9 ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MC ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M5 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M1 ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MF ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (ME ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MD ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M4 ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MA ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M0 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M7 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M9 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M2 ^ 0x38D01377UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M8 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MB ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MD ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MB ^ 0xC97C50DDUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M7 ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (ME ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MC ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M1 ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M3 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M9 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M5 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M0 ^ 0x299F31D0UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA 
+ VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MF ^ 0xA4093822UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M4 ^ 0xB5470917UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M2 ^ 0xBE5466CFUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MA ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + // Constants + // 00 = 0x243F6A88UL + // 01 = 0x85A308D3UL + // 02 = 0x13198A2EUL + // 03 = 0x03707344UL + // 04 = 0xA4093822UL + // 05 = 0x299F31D0UL + // 06 = 0x082EFA98UL + // 07 = 0xEC4E6C89UL + // 08 = 0x452821E6UL + // 09 = 0x38D01377UL + // 10 = 0xBE5466CFUL + // 11 = 0x34E90C6CUL + // 12 = 0xC0AC29B7UL + // 13 = 0xC97C50DDUL + // 14 = 0x3F84D5B5UL + // 15 = 0xB5470917UL + // A=10,B=11,C=12,D=13,E=14,F=15 + + // Round 9: + // 6^15 + V0 = (V0 + V4 + (M6 ^ 0xB5470917UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MF ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; + // 14^9 + V1 = (V1 + V5 + (ME ^ 0x38D01377UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M9 ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; + // 11^3 + V2 = (V2 + V6 + (MB ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M3 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; + // 0^8 + V3 = (V3 + V7 + (M0 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M8 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; + // 12^2 + V0 = (V0 + V5 + (MC ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M2 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; + // 13^7 + V1 = (V1 + V6 + (MD ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M7 ^ 0xC97C50DDUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; + // 1^4 + V2 = (V2 + V7 + (M1 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M4 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; + // 10^5 + V3 = (V3 + V4 + (MA ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M5 ^ 0xBE5466CFUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + // Constants + // 00 = 0x243F6A88UL + // 01 = 0x85A308D3UL + // 02 = 0x13198A2EUL + // 03 = 0x03707344UL + // 04 = 0xA4093822UL + // 05 = 0x299F31D0UL + // 06 = 0x082EFA98UL + // 07 = 0xEC4E6C89UL + // 08 = 0x452821E6UL + // 09 = 0x38D01377UL + // 10 = 0xBE5466CFUL + // 11 = 0x34E90C6CUL + // 12 = 0xC0AC29B7UL + // 13 = 0xC97C50DDUL + // 14 = 0x3F84D5B5UL + // 15 = 0xB5470917UL + // A=10,B=11,C=12,D=13,E=14,F=15 + + // Round 10 + // 10^2 + V0 = (V0 + V4 + (MA ^ 0x13198A2EUL)); VC = 
SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M2 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; + // 8^4 + V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; + // 7^6 + V2 = (V2 + V6 + (M7 ^ 0x082EFA98UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M6 ^ 0xEC4E6C89UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; + // 1^5 + V3 = (V3 + V7 + (M1 ^ 0x299F31D0UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M5 ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; + // 15^11 + V0 = (V0 + V5 + (MF ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MB ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; + // 9^14 + V1 = (V1 + V6 + (M9 ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (ME ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; + // 3^12 + V2 = (V2 + V7 + (M3 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MC ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; + // 13^0 + V3 = (V3 + V4 + (MD ^ 0x243F6A88UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M0 ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + // Round 11,12,13,14 repeated from beginning again + V0 = (V0 + V4 + (M0 ^ 0x85A308D3UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M1 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M2 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M3 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M4 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M5 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M6 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M7 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M8 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M9 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MA ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MB ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MC ^ 0xC97C50DDUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MD ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (ME ^ 0xB5470917UL)); VE = 
SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MF ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (ME ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MA ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M9 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MF ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MD ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M6 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M1 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MC ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M0 ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M2 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MB ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M7 ^ 0x34E90C6CUL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M5 ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M3 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MB ^ 0x452821E6UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M8 ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (MC ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M0 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M5 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M2 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MF ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MD ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (MA ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (ME ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M7 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M1 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 
+ VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M9 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M4 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M7 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M9 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M3 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M1 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MD ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MC ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MB ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (ME ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M2 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M6 ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M5 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MA ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M4 ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M0 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (MF ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M8 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + if(pre7 ^ V7 ^ VF) + return; + output[output[0xFF]++] = nonce; +} \ No newline at end of file diff --git a/kernel/blake256r8.cl b/kernel/blake256r8.cl new file mode 100644 index 00000000..fca62fe7 --- /dev/null +++ b/kernel/blake256r8.cl @@ -0,0 +1,77 @@ +// (c) 2013 originally written by smolen, modified by kr105 + +#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n))) + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + volatile __global uint * restrict output, + // precalc hash from first part of message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint M0, M1, M2, M3, M4, M5, M6, M7; + uint M8, M9, MA, MB, MC, MD, ME, MF; + uint V0, V1, V2, V3, V4, V5, V6, V7; + uint V8, V9, VA, VB, VC, VD, VE, VF; + uint pre7; + uint nonce = get_global_id(0); + + V0 = h0; + V1 = h1; + V2 = h2; + V3 = h3; + V4 = h4; + V5 = h5; + V6 = h6; + pre7 = V7 = h7; + M0 = in16; + M1 = in17; + M2 = in18; + M3 = nonce; + + V8 = 0x243F6A88UL; + V9 = 0x85A308D3UL; + VA = 0x13198A2EUL; + VB = 0x03707344UL; + VC = 640 ^ 0xA4093822UL; + VD = 640 ^ 0x299F31D0UL; + VE = 0x082EFA98UL; + VF = 0xEC4E6C89UL; + + M4 = 0x80000000; + M5 = 0; + M6 = 0; + M7 = 0; + M8 = 0; + M9 = 0; + MA = 0; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 640; + + V0 = (V0 + V4 + 
(M0 ^ 0x85A308D3UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M1 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M2 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M3 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M4 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M5 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M6 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M7 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M8 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M9 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MA ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MB ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MC ^ 0xC97C50DDUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MD ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (ME ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MF ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (ME ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MA ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M9 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MF ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MD ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M6 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M1 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MC ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M0 ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M2 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MB ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M7 ^ 0x34E90C6CUL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M5 ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M3 ^ 0x299F31D0UL)); VE = 
SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MB ^ 0x452821E6UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M8 ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (MC ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M0 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M5 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M2 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MF ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MD ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (MA ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (ME ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M7 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M1 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M9 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M4 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M7 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M9 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M3 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M1 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MD ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MC ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MB ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (ME ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M2 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M6 ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M5 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MA ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M4 ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M0 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (MF ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 
+ VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M8 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M9 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M0 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M5 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M7 ^ 0x299F31D0UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M2 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M4 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MA ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MF ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (ME ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M1 ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MB ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MC ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M3 ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MD ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M2 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MC ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M6 ^ 0xBE5466CFUL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MA ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M0 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MB ^ 0x243F6A88UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M8 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M3 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M4 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MD ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M7 ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M5 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MF ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (ME ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 
7);; V3 = (V3 + V4 + (M1 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M9 ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MC ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M5 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M1 ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MF ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (ME ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MD ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M4 ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MA ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M0 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M7 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M9 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M2 ^ 0x38D01377UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M8 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MB ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MD ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MB ^ 0xC97C50DDUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M7 ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (ME ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MC ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M1 ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M3 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M9 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M5 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M0 ^ 0x299F31D0UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MF ^ 0xA4093822UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M4 ^ 0xB5470917UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M6 ^ 
0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M2 ^ 0xBE5466CFUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MA ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + if(pre7 ^ V7 ^ VF) + return; + output[output[0xFF]++] = nonce; +} \ No newline at end of file diff --git a/kernel/lyra2rev2.cl b/kernel/lyra2rev2.cl index 0fe0440d..a165a751 100644 --- a/kernel/lyra2rev2.cl +++ b/kernel/lyra2rev2.cl @@ -31,8 +31,8 @@ // typedef unsigned int uint; #pragma OPENCL EXTENSION cl_amd_printf : enable -#ifndef LYRA2RE_CL -#define LYRA2RE_CL +#ifndef LYRA2REV2_CL +#define LYRA2REV2_CL #if __ENDIAN_LITTLE__ #define SPH_LITTLE_ENDIAN 1 @@ -522,4 +522,4 @@ __kernel void search6(__global uchar* hashes, __global uint* output, const ulong } -#endif // LYRA2RE_CL \ No newline at end of file +#endif // LYRA2REV2_CL \ No newline at end of file diff --git a/kernel/neoscrypt.cl b/kernel/neoscrypt.cl index 7939d7ed..9ffcad4b 100644 --- a/kernel/neoscrypt.cl +++ b/kernel/neoscrypt.cl @@ -1,9 +1,32 @@ -/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */ -/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */ +// NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 +// By Wolf (Wolf0 aka Wolf9466) // Stupid AMD compiler ignores the unroll pragma in these two + +// Tahiti 3/2, +// Hawaii 4/4 + notneededswap +// Pitcairn 3/4 + notneededswap +#if defined(__Tahiti__) +#define SALSA_SMALL_UNROLL 4 +#define CHACHA_SMALL_UNROLL 2 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 +#elif defined(__Pitcairn__) + #define SALSA_SMALL_UNROLL 3 -#define CHACHA_SMALL_UNROLL 3 +#define CHACHA_SMALL_UNROLL 2 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 + +#else +#define SALSA_SMALL_UNROLL 4 +#define CHACHA_SMALL_UNROLL 4 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 +#endif // If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted // as the unroll factor; must divide cleanly into ten. 
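// A note on the per-device tuning block above: the AMD OpenCL compiler
// predefines a macro named after the target GPU family (__Tahiti__,
// __Pitcairn__ and __Hawaii__ are the ones this file keys on), which is what
// the #if/#elif chain tests; the "3/2" / "4/4" remarks record the
// SALSA/CHACHA unroll pairs tried per device. Extending the chain for another
// part would look like this sketch (the device macro and values below are
// hypothetical placeholders, not part of this patch, and would need
// benchmarking):
//
//   #elif defined(__Bonaire__)
//   #define SALSA_SMALL_UNROLL 3
//   #define CHACHA_SMALL_UNROLL 3
//   #endif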
@@ -96,6 +119,28 @@ static const __constant uchar BLAKE2S_SIGMA[10][16] = b = rotate(b ^ c, 25U); \ } while(0) +#define BLAKE_PARALLEL_G1(idx0, a, b, c, d, key) do { \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][0]], key[BLAKE2S_SIGMA[idx0][2]], key[BLAKE2S_SIGMA[idx0][4]], key[BLAKE2S_SIGMA[idx0][6]]); \ + d = rotate(d ^ a, 16U); \ + c += d; \ + b = rotate(b ^ c, 20U); \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][1]], key[BLAKE2S_SIGMA[idx0][3]], key[BLAKE2S_SIGMA[idx0][5]], key[BLAKE2S_SIGMA[idx0][7]]); \ + d = rotate(d ^ a, 24U); \ + c += d; \ + b = rotate(b ^ c, 25U); \ +} while(0) + +#define BLAKE_PARALLEL_G2(idx0, a, b, c, d, key) do { \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][8]], key[BLAKE2S_SIGMA[idx0][10]], key[BLAKE2S_SIGMA[idx0][12]], key[BLAKE2S_SIGMA[idx0][14]]); \ + d = rotate(d ^ a, 16U); \ + c += d; \ + b = rotate(b ^ c, 20U); \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][9]], key[BLAKE2S_SIGMA[idx0][11]], key[BLAKE2S_SIGMA[idx0][13]], key[BLAKE2S_SIGMA[idx0][15]]); \ + d = rotate(d ^ a, 24U); \ + c += d; \ + b = rotate(b ^ c, 25U); \ +} while(0) + void Blake2S(uint *restrict inout, const uint *restrict inkey) { uint16 V; @@ -122,14 +167,17 @@ void Blake2S(uint *restrict inout, const uint *restrict inkey) #endif for(int x = 0; x < 10; ++x) { - BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); + /*BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey); BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey); BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey); BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey); BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey); BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey); - BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey); + BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);*/ + + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); } // XOR low part of state with the high part, @@ -156,14 +204,17 @@ void Blake2S(uint *restrict inout, const uint *restrict inkey) #endif for(int x = 0; x < 10; ++x) { - BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout); + /*BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout); BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout); BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout); BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout); BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout); BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout); BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout); - BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout); + BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);*/ + + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); } // XOR low part of state with high part, then with input block @@ -227,15 +278,73 @@ void fastkdf(const uchar *restrict password, const uchar *restrict salt, const u { // Make the key buffer twice the size of the key so it fits a Blake2S block // This way, we don't need a temp buffer in the Blake2S function. 
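// How BLAKE_PARALLEL_G1/G2 above vectorize Blake2S: the scalar version (the
// commented-out BLAKE_G calls) applies G to the four columns of the 4x4 state
// and then to the four diagonals, eight G calls per round. Packing each state
// row into a uint4 lets one macro invocation run all four column G's at once,
// and the diagonal pass is the same macro applied to lane-rotated rows. The
// lane mapping, spelled out for reference (m stands for the inkey/inout
// message pointer):
//
//   columns:   G(s0,s4,s8,sc) G(s1,s5,s9,sd) G(s2,s6,sa,se) G(s3,s7,sb,sf)
//              == BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, m)
//   diagonals: G(s0,s5,sa,sf) G(s1,s6,sb,sc) G(s2,s7,s8,sd) G(s3,s4,s9,se)
//              == BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, m)
//
// i.e. the b/c/d rows rotated left by 1/2/3 lanes respectively, which is
// exactly what the s5674 / sab89 / sfcde swizzle selectors encode.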
- uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 }; + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; // Copy input and key to their buffers CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); // PRF - Blake2S((uint *)input, (uint *)key); + //Blake2S((uint *)input, (uint *)key); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. + V.lo ^= V.hi ^ tmpblock; + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + + // Calculate the next buffer pointer bufidx = 0; @@ -284,7 +393,475 @@ void fastkdf(const uchar *restrict password, const uchar *restrict salt, const u } } -#define SALSA_CORE(state) do { \ +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +void fastkdf1(const uchar password[80], uchar output[256]) +{ + + /* WARNING! + * This algorithm uses byte-wise addressing for memory blocks. + * In other words, trying to copy an unaligned memory region + * will significantly slow down the algorithm when copying uses + * words or bigger entities. It may even corrupt the data when + * the device does not support it properly. + * Therefore use byte copying, which will not be the fastest but at + * least gets reliable results. 
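 * (Concretely: bufidx below is a byte sum of the 32-byte PRF output, so
 * A + bufidx and B + bufidx carry no alignment guarantee, and word- or
 * vector-wide copies from them would be misaligned on most iterations.)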
*/ + + // BLOCK_SIZE 64U + // FASTKDF_BUFFER_SIZE 256U + // BLAKE2S_BLOCK_SIZE 64U + // BLAKE2S_KEY_SIZE 32U + // BLAKE2S_OUT_SIZE 32U + uchar bufidx = 0; + uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) }; + uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer; + + // Initialize the password buffer + #pragma unroll 1 + for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)A)[i] = ((ulong *)password)[i % 10]; + + ((uint16 *)(B + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0]; + + // The primary iteration + #pragma unroll 1 + for(int i = 0; i < 32; ++i) + { + // Make the key buffer twice the size of the key so it fits a Blake2S block + // This way, we don't need a temp buffer in the Blake2S function. + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; + + // Copy input and key to their buffers + CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); + CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + #ifndef __Hawaii__ + + // PRF + uint4 V[4]; + uint8 tmpblock; + + tmpblock = vload8(0U, BLAKE2S_IV); + + V[0] = V[2] = tmpblock.lo; + V[1] = V[3] = tmpblock.hi; + + V[0].s0 ^= 0x01012020U; + tmpblock.lo = V[0]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inkey); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inkey); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + V[2] = vload4(0U, BLAKE2S_IV); + V[3] = vload4(1U, BLAKE2S_IV); + + tmpblock.lo = V[0]; + tmpblock.hi = V[1]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE << 1; + V[3].s2 ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inout); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inout); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + vstore4(V[0], 0, inout); + vstore4(V[1], 1, inout); + + #else + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. 
+ V.lo ^= V.hi ^ tmpblock; + + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + + #endif + + // Calculate the next buffer pointer + bufidx = 0; + + for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x) + bufidx += input[x]; + + // bufidx is a uchar now - always mod 256 + //bufidx &= (FASTKDF_BUFFER_SIZE - 1); + + // Modify the salt buffer + XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE); + + if(bufidx < BLAKE2S_KEY_SIZE) + { + // Head modified, tail updated + // this was made off the original code... wtf + //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); + CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx); + } + else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) + { + // Tail modified, head updated + CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); + } + } + + // Modify and copy into the output buffer + + // Damned compiler crashes + // Fuck you, AMD + + //for(uint i = 0; i < output_len; ++i, ++bufidx) + // output[i] = B[bufidx] ^ A[i]; + + uint left = FASTKDF_BUFFER_SIZE - bufidx; + //uint left = (~bufidx) + 1 + + if(left < 256) + { + XORBytes(output, B + bufidx, A, left); + XORBytes(output + left, B, A + left, 256 - left); + } + else + { + XORBytes(output, B + bufidx, A, 256); + } +} + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +void fastkdf2(const uchar password[80], const uchar salt[256], __global uint* restrict output, const uint target) +{ + + /* WARNING! + * This algorithm uses byte-wise addressing for memory blocks. + * In other words, trying to copy an unaligned memory region + * will significantly slow down the algorithm when copying uses + * words or bigger entities. It may even corrupt the data when + * the device does not support it properly. + * Therefore use byte copying, which will not be the fastest but at + * least gets reliable results. 
*/ + + // BLOCK_SIZE 64U + // FASTKDF_BUFFER_SIZE 256U + // BLAKE2S_BLOCK_SIZE 64U + // BLAKE2S_KEY_SIZE 32U + // BLAKE2S_OUT_SIZE 32U + // salt_len == 256, output_len == 32 + uchar bufidx = 0; + uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) }; + uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer; + //uchar A[256], B[256]; + + // Initialize the password buffer + #pragma unroll 1 + for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10]; + + ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0]; + + // Initialize the salt buffer + ((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0]; + ((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1]; + + // The primary iteration + #pragma unroll 1 + for(int i = 0; i < 32; ++i) + { + // Make the key buffer twice the size of the key so it fits a Blake2S block + // This way, we don't need a temp buffer in the Blake2S function. + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; + + // Copy input and key to their buffers + CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); + CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + #ifndef __Hawaii__ + + // PRF + uint4 V[4]; + uint8 tmpblock; + + tmpblock = vload8(0U, BLAKE2S_IV); + + V[0] = V[2] = tmpblock.lo; + V[1] = V[3] = tmpblock.hi; + + V[0].s0 ^= 0x01012020U; + tmpblock.lo = V[0]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inkey); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inkey); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + V[2] = vload4(0U, BLAKE2S_IV); + V[3] = vload4(1U, BLAKE2S_IV); + + tmpblock.lo = V[0]; + tmpblock.hi = V[1]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE << 1; + V[3].s2 ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inout); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inout); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + vstore4(V[0], 0, inout); + vstore4(V[1], 1, inout); + + #else + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. 
+ V.lo ^= V.hi ^ tmpblock; + + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + #endif + + // Calculate the next buffer pointer + bufidx = 0; + + for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x) + bufidx += input[x]; + + // bufidx is a uchar now - always mod 256 + //bufidx &= (FASTKDF_BUFFER_SIZE - 1); + + // Modify the salt buffer + XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE); + + if(bufidx < BLAKE2S_KEY_SIZE) + { + // Head modified, tail updated + // this was made off the original code... wtf + //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); + CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx); + } + else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) + { + // Tail modified, head updated + CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); + } + } + + // Modify and copy into the output buffer + + // Damned compiler crashes + // Fuck you, AMD + + uchar outbuf[32]; + + for(uint i = 0; i < 32; ++i, ++bufidx) + outbuf[i] = B[bufidx] ^ A[i]; + + /*uint left = FASTKDF_BUFFER_SIZE - bufidx; + //uint left = (~bufidx) + 1 + uchar outbuf[32]; + + if(left < 32) + { + XORBytes(outbuf, B + bufidx, A, left); + XORBytes(outbuf + left, B, A + left, 32 - left); + } + else + { + XORBytes(outbuf, B + bufidx, A, 32); + }*/ + + if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0); + +} + +/* + s0 s1 s2 s3 + s4 s5 s6 s7 + s8 s9 sa sb + sc sd se sf +shittify: +s0=s4 +s1=s9 +s2=se +s3=s3 +s4=s8 +s5=sd +s6=s2 +s7=s7 +s8=sc +s9=s1 +sa=s6 +sb=sb +sc=s0 +sd=s5 +se=sa +sf=sf +unshittify: +s0=sc +s1=s9 +s2=s6 +s3=s3 +s4=s0 +s5=sd +s6=sa +s7=s7 +s8=s4 +s9=s1 +sa=se +sb=sb +sc=s8 +sd=s5 +se=s2 +sf=sf + +*/ + +#define SALSA_CORE(state) do { \ + state[0] ^= rotate(state[3] + state[2], 7U); \ + state[1] ^= rotate(state[0] + state[3], 9U); \ + state[2] ^= rotate(state[1] + state[0], 13U); \ + state[3] ^= rotate(state[2] + state[1], 18U); \ + state[2] ^= rotate(state[3].wxyz + state[0].zwxy, 7U); \ + state[1] ^= rotate(state[2].wxyz + state[3].zwxy, 9U); \ + state[0] ^= rotate(state[1].wxyz + state[2].zwxy, 13U); \ + state[3] ^= rotate(state[0].wxyz + state[1].zwxy, 18U); \ +} while(0) + +#define SALSA_CORE_SCALAR(state) do { \ state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \ state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \ state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \ @@ -295,10 +872,18 @@ void fastkdf(const uchar 
*restrict password, const uchar *restrict salt, const u state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \ } while(0) -uint16 salsa_small_scalar_rnd(uint16 X) +uint16 salsa_small_parallel_rnd(uint16 X) { - uint16 st = X; - +#ifndef SHITMAIN + uint4 st[4] = { (uint4)(X.s4, X.s9, X.se, X.s3), + (uint4)(X.s8, X.sd, X.s2, X.s7), + (uint4)(X.sc, X.s1, X.s6, X.sb), + (uint4)(X.s0, X.s5, X.sa, X.sf) }; +#else + uint4 st[4]; + ((uint16 *)st)[0] = X; +#endif + #if SALSA_SMALL_UNROLL == 1 for(int i = 0; i < 10; ++i) @@ -335,7 +920,7 @@ uint16 salsa_small_scalar_rnd(uint16 X) SALSA_CORE(st); } - #else + #elif SALSA_SMALL_UNROLL == 5 for(int i = 0; i < 2; ++i) { @@ -346,26 +931,114 @@ uint16 salsa_small_scalar_rnd(uint16 X) SALSA_CORE(st); } + #else + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + #endif +#ifndef SHITMAIN + return(X + (uint16)( + st[3].x, st[2].y, st[1].z, st[0].w, + st[0].x, st[3].y, st[2].z, st[1].w, + st[1].x, st[0].y, st[3].z, st[2].w, + st[2].x, st[1].y, st[0].z, st[3].w)); +#else + return(X + ((uint16 *)st)[0]); +#endif +} + +uint16 salsa_small_scalar_rnd(uint16 X) +{ + uint16 st = X; + + #if SALSA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + SALSA_CORE_SCALAR(st); + if(i == 3) break; + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + if(i == 2) break; + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #else + + for(int i = 0; i < 2; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #endif + + return(X + st); +} + + #define CHACHA_CORE_PARALLEL(state) do { \ - state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ - state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \ - state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ - state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], 16U); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], 12U); \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], 8U); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], 7U); \ \ - state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ - state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \ - state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ - state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \ + state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], 16U); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, 12U); \ + state[0] += state[1].yzwx; state[3].wxyz = 
rotate(state[3].wxyz ^ state[0], 8U); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, 7U); \ +} while(0) + +#define CHACHA_CORE(state) do { \ + state.s0 += state.s4; state.sc = as_uint(as_ushort2(state.sc ^ state.s0).s10); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 12U); state.s0 += state.s4; state.sc = rotate(state.sc ^ state.s0, 8U); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 7U); \ + state.s1 += state.s5; state.sd = as_uint(as_ushort2(state.sd ^ state.s1).s10); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 12U); state.s1 += state.s5; state.sd = rotate(state.sd ^ state.s1, 8U); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 7U); \ + state.s2 += state.s6; state.se = as_uint(as_ushort2(state.se ^ state.s2).s10); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 12U); state.s2 += state.s6; state.se = rotate(state.se ^ state.s2, 8U); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 7U); \ + state.s3 += state.s7; state.sf = as_uint(as_ushort2(state.sf ^ state.s3).s10); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 12U); state.s3 += state.s7; state.sf = rotate(state.sf ^ state.s3, 8U); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 7U); \ + state.s0 += state.s5; state.sf = as_uint(as_ushort2(state.sf ^ state.s0).s10); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 12U); state.s0 += state.s5; state.sf = rotate(state.sf ^ state.s0, 8U); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 7U); \ + state.s1 += state.s6; state.sc = as_uint(as_ushort2(state.sc ^ state.s1).s10); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 12U); state.s1 += state.s6; state.sc = rotate(state.sc ^ state.s1, 8U); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 7U); \ + state.s2 += state.s7; state.sd = as_uint(as_ushort2(state.sd ^ state.s2).s10); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 12U); state.s2 += state.s7; state.sd = rotate(state.sd ^ state.s2, 8U); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 7U); \ + state.s3 += state.s4; state.se = as_uint(as_ushort2(state.se ^ state.s3).s10); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 12U); state.s3 += state.s4; state.se = rotate(state.se ^ state.s3, 8U); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 7U); \ } while(0) uint16 chacha_small_parallel_rnd(uint16 X) { - uint4 t, st[4]; + uint4 st[4]; ((uint16 *)st)[0] = X; @@ -405,7 +1078,7 @@ uint16 chacha_small_parallel_rnd(uint16 X) CHACHA_CORE_PARALLEL(st); } - #else + #elif CHACHA_SMALL_UNROLL == 5 for(int i = 0; i < 2; ++i) { @@ -415,15 +1088,95 @@ uint16 chacha_small_parallel_rnd(uint16 X) CHACHA_CORE_PARALLEL(st); CHACHA_CORE_PARALLEL(st); } + #else + + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); #endif return(X + ((uint16 *)st)[0]); } -void neoscrypt_blkmix(uint16 *XV, bool alg) -{ +uint16 chacha_small_scalar_rnd(uint16 X) +{ + uint16 st = X; + + #if CHACHA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + 
CHACHA_CORE(st); + if(i == 3) break; + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + if(i == 2) break; + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 5 + + for(int i = 0; i < 2; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #else + + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + + #endif + + return(X + st); +} +void neoscrypt_blkmix_salsa(uint16 XV[4]) +{ /* NeoScrypt flow: Scrypt flow: Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; @@ -431,48 +1184,135 @@ void neoscrypt_blkmix(uint16 *XV, bool alg) Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; Xa" = Ya; Xb" = Yc; Xc" = Yb; Xd" = Yd; */ - +#if 0 + for(int i = 0; i < 4; ++i) XV[i] = (uint16)( + XV[i].s4, XV[i].s9, XV[i].se, XV[i].s3, XV[i].s8, XV[i].sd, XV[i].s2, XV[i].s7, + XV[i].sc, XV[i].s1, XV[i].s6, XV[i].sb, XV[i].s0, XV[i].s5, XV[i].sa, XV[i].sf); +#endif XV[0] ^= XV[3]; - if(!alg) - { - XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; - XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; - XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; - XV[3] = salsa_small_scalar_rnd(XV[3]); - } - else - { - XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; - XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; - XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; - XV[3] = chacha_small_parallel_rnd(XV[3]); - } + XV[0] = salsa_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = salsa_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = salsa_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = salsa_small_parallel_rnd(XV[3]); + + //XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + //XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; + //XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; + //XV[3] = salsa_small_scalar_rnd(XV[3]); + + XV[1] ^= XV[2]; + XV[2] ^= XV[1]; + XV[1] ^= XV[2]; +#if 0 + XV[0] = (uint16)(XV[0].sc, XV[0].s9, XV[0].s6, XV[0].s3, XV[0].s0, XV[0].sd, XV[0].sa, XV[0].s7, XV[0].s4, XV[0].s1, XV[0].se, XV[0].sb, XV[0].s8, XV[0].s5, XV[0].s2, XV[0].sf); + XV[1] = (uint16)(XV[1].sc, XV[1].s9, XV[1].s6, XV[1].s3, XV[1].s0, XV[1].sd, XV[1].sa, XV[1].s7, XV[1].s4, XV[1].s1, XV[1].se, XV[1].sb, XV[1].s8, XV[1].s5, XV[1].s2, XV[1].sf); + XV[2] = (uint16)(XV[2].sc, XV[2].s9, XV[2].s6, XV[2].s3, XV[2].s0, XV[2].sd, XV[2].sa, XV[2].s7, XV[2].s4, XV[2].s1, XV[2].se, XV[2].sb, XV[2].s8, XV[2].s5, XV[2].s2, XV[2].sf); + XV[3] = (uint16)(XV[3].sc, XV[3].s9, XV[3].s6, XV[3].s3, XV[3].s0, XV[3].sd, XV[3].sa, XV[3].s7, XV[3].s4, XV[3].s1, XV[3].se, XV[3].sb, XV[3].s8, XV[3].s5, XV[3].s2, XV[3].sf); +#endif +} + +void neoscrypt_blkmix_chacha(uint16 XV[4]) +{ + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + XV[0] ^= XV[3]; + + #if 1 + + XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = chacha_small_parallel_rnd(XV[3]); + + #else + + XV[0] = 
chacha_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = chacha_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = chacha_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = chacha_small_scalar_rnd(XV[3]); + + #endif + XV[1] ^= XV[2]; XV[2] ^= XV[1]; XV[1] ^= XV[2]; } +#ifdef WIDE_STRIPE + +void ScratchpadStore(__global void *V, void *X, uchar idx) +{ + ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))] = ((ulong16 *)X)[0]; + ((__global ulong16 *)V)[mul24((idx << 1), (int)get_global_size(0)) + 1] = ((ulong16 *)X)[1]; + //const uint idx2 = mul24(idx << 2, (int)get_global_size(0)); + //#pragma unroll + //for(int i = 0; i < 4; ++i) ((__global uint16 *)V)[idx2 + i] = ((uint16 *)X)[i]; +} + +void ScratchpadMix(void *X, const __global void *V, uchar idx) +{ + ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))]; + ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[mul24((idx << 1), (int)get_global_size(0)) + 1]; +} + +#else + void ScratchpadStore(__global void *V, void *X, uchar idx) { - ((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0]; - ((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1]; + ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))] = ((ulong16 *)X)[0]; + ((__global ulong16 *)V)[mul24((idx << 1) + 1, (int)get_global_size(0))] = ((ulong16 *)X)[1]; } void ScratchpadMix(void *X, const __global void *V, uchar idx) { - ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1]; - ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[(idx << 1) + 1]; + ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))]; + ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[mul24((idx << 1) + 1, (int)get_global_size(0))]; +} + +#endif + + + +#define SALSA_PERM (uint16)(4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11, 0, 5, 10, 15) +#define SALSA_INV_PERM (uint16)(12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15) + +void SMix_Salsa(uint16 X[4], __global uint16 *V) +{ + #pragma unroll 1 + for(int i = 0; i < 128; ++i) + { + ScratchpadStore(V, X, i); + neoscrypt_blkmix_salsa(X); + } + + #pragma unroll 1 + for(int i = 0; i < 128; ++i) + { + #ifdef SHITMAIN + const uint idx = convert_uchar(((uint *)X)[60] & 0x7F); + #else + const uint idx = convert_uchar(((uint *)X)[48] & 0x7F); + #endif + ScratchpadMix(X, V, idx); + neoscrypt_blkmix_salsa(X); + } } -void SMix(uint16 *X, __global uint16 *V, bool flag) +void SMix_Chacha(uint16 X[4], __global uint16 *V) { #pragma unroll 1 for(int i = 0; i < 128; ++i) { ScratchpadStore(V, X, i); - neoscrypt_blkmix(X, flag); + neoscrypt_blkmix_chacha(X); } #pragma unroll 1 @@ -480,10 +1320,13 @@ void SMix(uint16 *X, __global uint16 *V, bool flag) { const uint idx = convert_uchar(((uint *)X)[48] & 0x7F); ScratchpadMix(X, V, idx); - neoscrypt_blkmix(X, flag); + neoscrypt_blkmix_chacha(X); } } +#define SALSA_PERM (uint16)(4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11, 0, 5, 10, 15) +#define SALSA_INV_PERM (uint16)(12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15) + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target) { @@ -491,9 +1334,12 @@ __kernel void search(__global const uchar* restrict input, __global uint* restri #define CONSTANT_r 2 // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha uint16 X[4], Z[4]; - /* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */ - __global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % 
MAX_GLOBAL_THREADS))); - uchar outbuf[32]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; uchar data[PASSWORD_LEN]; ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; @@ -502,24 +1348,149 @@ __kernel void search(__global const uchar* restrict input, __global uint* restri ((uint *)data)[19] = get_global_id(0); // X = KDF(password, salt) - fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); - + //fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); + fastkdf1(data, (uchar *)X); + + #ifndef SHITMAIN // Process ChaCha 1st, Salsa 2nd and XOR them - run that through PBKDF2 CopyBytes128(Z, X, 2); - + #else + + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)Z)[i] = shuffle(((uint16 *)X)[i], SALSA_PERM); + + #endif + // X = SMix(X); X & Z are swapped, repeat. - for(bool flag = false;; ++flag) + for(int i = 0;; ++i) { - SMix(X, V, flag); - if(flag) break; + #ifdef SWAP + if (i) SMix_Salsa(X,V); else SMix_Chacha(X,V); + if(i) break; SwapBytes128(X, Z, 256); + #else + if (i) SMix_Chacha(X,V); else SMix_Salsa(Z,V); + if(i) break; + #endif } + + #if defined(SWAP) && defined(SHITMAIN) + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)Z)[i] ^= shuffle(((uint16 *)X)[i], SALSA_INV_PERM); + fastkdf2(data, (uchar *)Z, output, target); + #elif defined(SHITMAIN) + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)X)[i] ^= shuffle(((uint16 *)Z)[i], SALSA_INV_PERM); + fastkdf2(data, (uchar *)X, output, target); + #else + // blkxor(X, Z) + ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0]; + ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1]; + + // output = KDF(password, X) + //fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); + fastkdf2(data, (uchar *)X, output, target); + #endif +} + + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint16 *XZOutput) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4]; + XZOutput += (4 * 2 * get_global_id(0)); + + //uchar outbuf[32]; + uchar data[PASSWORD_LEN]; + + ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; + ((ulong *)data)[8] = ((__global const ulong *)input)[8]; + ((uint *)data)[18] = ((__global const uint *)input)[18]; + ((uint *)data)[19] = get_global_id(0); + // X = KDF(password, salt) + //fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); + fastkdf1(data, (uchar *)X); + + for(int i = 0; i < 4; ++i) XZOutput[i] = X[i]; + for(int i = 0; i < 4; ++i) XZOutput[i + 4] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global uint16 *XZOutput, __global uchar *padcache) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4], Z[4]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i]; + + SMix_Salsa(X,V); + + for(int i = 0; i < 4; ++i) XZOutput[i] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + 
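+// search2 below runs the ChaCha SMix pass on the second saved copy of the KDF output; search3 then recombines X and Z and finishes with fastkdf2 (this whole multi-kernel variant is commented out)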
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global uint16 *XZOutput, __global uchar *padcache) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4], Z[4]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i + 4]; + + SMix_Chacha(X,V); + + for(int i = 0; i < 4; ++i) XZOutput[i + 4] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global const uchar* restrict input, __global uint16 *XZOutput, __global uint* restrict output, const uint target) +{ + uint16 X[4], Z[4]; + uchar data[PASSWORD_LEN]; + + ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; + ((ulong *)data)[8] = ((__global const ulong *)input)[8]; + ((uint *)data)[18] = ((__global const uint *)input)[18]; + ((uint *)data)[19] = get_global_id(0); + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i]; + for(int i = 0; i < 4; ++i) Z[i] = XZOutput[i + 4]; + // blkxor(X, Z) ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0]; ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1]; // output = KDF(password, X) - fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); - if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0); -} \ No newline at end of file + //fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); + fastkdf2(data, (uchar *)X, output, target); +} +*/ \ No newline at end of file diff --git a/kernel/vanilla.cl b/kernel/vanilla.cl new file mode 100644 index 00000000..fca62fe7 --- /dev/null +++ b/kernel/vanilla.cl @@ -0,0 +1,77 @@ +// (c) 2013 originally written by smolen, modified by kr105; 8-round BLAKE-256 (Blakecoin/Vanilla) search kernel + +#define SPH_ROTR32(v,n) rotate((uint)(v),(uint)(32-(n))) + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + volatile __global uint * restrict output, + // precalculated hash (midstate) of the first 64-byte block of the message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint M0, M1, M2, M3, M4, M5, M6, M7; + uint M8, M9, MA, MB, MC, MD, ME, MF; + uint V0, V1, V2, V3, V4, V5, V6, V7; + uint V8, V9, VA, VB, VC, VD, VE, VF; + uint pre7; + uint nonce = get_global_id(0); + + V0 = h0; + V1 = h1; + V2 = h2; + V3 = h3; + V4 = h4; + V5 = h5; + V6 = h6; + pre7 = V7 = h7; + M0 = in16; + M1 = in17; + M2 = in18; + M3 = nonce; + + V8 = 0x243F6A88UL; + V9 = 0x85A308D3UL; + VA = 0x13198A2EUL; + VB = 0x03707344UL; + VC = 640 ^ 0xA4093822UL; + VD = 640 ^ 0x299F31D0UL; + VE = 0x082EFA98UL; + VF = 0xEC4E6C89UL; + + M4 = 0x80000000; + M5 = 0; + M6 = 0; + M7 = 0; + M8 = 0; + M9 = 0; + MA = 0; + MB = 0; + MC = 0; + MD = 1; + ME = 0; + MF = 640; + + V0 = (V0 + V4 + (M0 ^ 0x85A308D3UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M1 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M2 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M3 ^ 0x13198A2EUL)); 
VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M4 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M5 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M6 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M7 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M8 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M9 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MA ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MB ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MC ^ 0xC97C50DDUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (MD ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (ME ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MF ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (ME ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MA ^ 0x3F84D5B5UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M4 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M8 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M9 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MF ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MD ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M6 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M1 ^ 0xC0AC29B7UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MC ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M0 ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M2 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MB ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M7 ^ 0x34E90C6CUL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M5 ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M3 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MB ^ 0x452821E6UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M8 ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (MC ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = 
(V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M0 ^ 0xC0AC29B7UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M5 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M2 ^ 0x299F31D0UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MF ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MD ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (MA ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (ME ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M7 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M1 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M9 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M4 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M7 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M9 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M3 ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M1 ^ 0x03707344UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MD ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MC ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MB ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (ME ^ 0x34E90C6CUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M2 ^ 0x082EFA98UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M6 ^ 0x13198A2EUL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M5 ^ 0xBE5466CFUL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MA ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M4 ^ 0x243F6A88UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M0 ^ 0xA4093822UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (MF ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M8 ^ 0xB5470917UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M9 ^ 0x243F6A88UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M0 ^ 0x38D01377UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ 
V8, 7);; V1 = (V1 + V5 + (M5 ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (M7 ^ 0x299F31D0UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M2 ^ 0xA4093822UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M4 ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (MA ^ 0xB5470917UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MF ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (ME ^ 0x85A308D3UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M1 ^ 0x3F84D5B5UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MB ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (MC ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M3 ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MD ^ 0x03707344UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (M2 ^ 0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MC ^ 0x13198A2EUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M6 ^ 0xBE5466CFUL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MA ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (M0 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MB ^ 0x243F6A88UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M8 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M3 ^ 0x452821E6UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M4 ^ 0xC97C50DDUL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (MD ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M7 ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M5 ^ 0xEC4E6C89UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (MF ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (ME ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M1 ^ 0x38D01377UL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (M9 ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MC ^ 0x299F31D0UL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (M5 ^ 
0xC0AC29B7UL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M1 ^ 0xB5470917UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (MF ^ 0x85A308D3UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (ME ^ 0xC97C50DDUL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (MD ^ 0x3F84D5B5UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M4 ^ 0xBE5466CFUL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (MA ^ 0xA4093822UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M0 ^ 0xEC4E6C89UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M7 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (M6 ^ 0x03707344UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M3 ^ 0x082EFA98UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M9 ^ 0x13198A2EUL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M2 ^ 0x38D01377UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M8 ^ 0x34E90C6CUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MB ^ 0x452821E6UL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + V0 = (V0 + V4 + (MD ^ 0x34E90C6CUL)); VC = SPH_ROTR32(VC ^ V0, 16); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 12); V0 = (V0 + V4 + (MB ^ 0xC97C50DDUL)); VC = SPH_ROTR32(VC ^ V0, 8); V8 = (V8 + VC); V4 = SPH_ROTR32(V4 ^ V8, 7);; V1 = (V1 + V5 + (M7 ^ 0x3F84D5B5UL)); VD = SPH_ROTR32(VD ^ V1, 16); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 12); V1 = (V1 + V5 + (ME ^ 0xEC4E6C89UL)); VD = SPH_ROTR32(VD ^ V1, 8); V9 = (V9 + VD); V5 = SPH_ROTR32(V5 ^ V9, 7);; V2 = (V2 + V6 + (MC ^ 0x85A308D3UL)); VE = SPH_ROTR32(VE ^ V2, 16); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 12); V2 = (V2 + V6 + (M1 ^ 0xC0AC29B7UL)); VE = SPH_ROTR32(VE ^ V2, 8); VA = (VA + VE); V6 = SPH_ROTR32(V6 ^ VA, 7);; V3 = (V3 + V7 + (M3 ^ 0x38D01377UL)); VF = SPH_ROTR32(VF ^ V3, 16); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 12); V3 = (V3 + V7 + (M9 ^ 0x03707344UL)); VF = SPH_ROTR32(VF ^ V3, 8); VB = (VB + VF); V7 = SPH_ROTR32(V7 ^ VB, 7);; V0 = (V0 + V5 + (M5 ^ 0x243F6A88UL)); VF = SPH_ROTR32(VF ^ V0, 16); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 12); V0 = (V0 + V5 + (M0 ^ 0x299F31D0UL)); VF = SPH_ROTR32(VF ^ V0, 8); VA = (VA + VF); V5 = SPH_ROTR32(V5 ^ VA, 7);; V1 = (V1 + V6 + (MF ^ 0xA4093822UL)); VC = SPH_ROTR32(VC ^ V1, 16); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 12); V1 = (V1 + V6 + (M4 ^ 0xB5470917UL)); VC = SPH_ROTR32(VC ^ V1, 8); VB = (VB + VC); V6 = SPH_ROTR32(V6 ^ VB, 7);; V2 = (V2 + V7 + (M8 ^ 0x082EFA98UL)); VD = SPH_ROTR32(VD ^ V2, 16); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 12); V2 = (V2 + V7 + (M6 ^ 0x452821E6UL)); VD = SPH_ROTR32(VD ^ V2, 8); V8 = (V8 + VD); V7 = SPH_ROTR32(V7 ^ V8, 7);; V3 = (V3 + V4 + (M2 ^ 0xBE5466CFUL)); VE = SPH_ROTR32(VE ^ V3, 16); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 12); V3 = (V3 + V4 + (MA ^ 0x13198A2EUL)); VE = SPH_ROTR32(VE ^ V3, 8); V9 = (V9 + VE); V4 = SPH_ROTR32(V4 ^ V9, 7); + + if(pre7 ^ V7 ^ VF) + return; + output[output[0xFF]++] 
= nonce; +} \ No newline at end of file diff --git a/ocl.c b/ocl.c index cb00790f..ecc383c8 100644 --- a/ocl.c +++ b/ocl.c @@ -37,7 +37,7 @@ #include "algorithm/neoscrypt.h" #include "algorithm/pluck.h" #include "algorithm/yescrypt.h" -#include "algorithm/lyra2re.h" +#include "algorithm/lyra2rev2.h" /* FIXME: only here for global config vars, replace with configuration.h * or similar as soon as config is in a struct instead of littered all @@ -187,7 +187,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg cl_platform_id platform = NULL; struct cgpu_info *cgpu = &gpus[gpu]; _clState *clState = (_clState *)calloc(1, sizeof(_clState)); - cl_uint preferred_vwidth, slot = 0, cpnd = 0, numDevices = clDevicesNum(); + cl_uint preferred_vwidth, numDevices = clDevicesNum(); cl_device_id *devices = (cl_device_id *)alloca(numDevices * sizeof(cl_device_id)); build_kernel_data *build_data = (build_kernel_data *)alloca(sizeof(struct _build_kernel_data)); char **pbuff = (char **)alloca(sizeof(char *) * numDevices), filename[256]; @@ -586,7 +586,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } // Lyra2re v2 TC - else if (cgpu->algorithm.type == ALGO_LYRA2REv2 && !cgpu->opt_tc) { + else if (cgpu->algorithm.type == ALGO_LYRA2REV2 && !cgpu->opt_tc) { size_t glob_thread_count; long max_int; unsigned char type = 0; @@ -797,7 +797,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_DEBUG, "yescrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); // scrypt/n-scrypt } - else if (algorithm->type == ALGO_LYRA2REv2) { + else if (algorithm->type == ALGO_LYRA2REV2) { /* The scratch/pad-buffer needs 32 kBytes of memory per thread. */ bufsize = LYRA_SCRATCHBUF_SIZE * cgpu->thread_concurrency; buf1size = 4 * 8 * cgpu->thread_concurrency; //matrix @@ -855,7 +855,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg return NULL; } } - else if (algorithm->type == ALGO_LYRA2REv2) { + else if (algorithm->type == ALGO_LYRA2REV2) { // need additional buffers clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); if (status != CL_SUCCESS && !clState->buffer1) { diff --git a/sgminer.c b/sgminer.c index a418e1d8..ca58f701 100644 --- a/sgminer.c +++ b/sgminer.c @@ -2625,7 +2625,7 @@ static void curses_print_devstatus(struct cgpu_info *cgpu, int count) if (devcursor + count > LINES - 2) return; - if (count >= most_devices) + if (count >= (opt_removedisabled ? most_devices : total_devices)) return; if (cgpu->dev_start_tv.tv_sec == 0) @@ -2745,7 +2745,7 @@ static void switch_logsize(bool __maybe_unused newdevs) if (opt_compact) { logstart = devcursor + 1; } else { - logstart = devcursor + most_devices + 1; + logstart = devcursor + (opt_removedisabled ? most_devices : total_devices) + 1; } logcursor = logstart + 1; #ifdef WIN32 @@ -8873,7 +8873,7 @@ int main(int argc, char *argv[]) rd_unlock(&devices_lock); if (!opt_compact) { - logstart += most_devices; + logstart += (opt_removedisabled ? 
most_devices : total_devices); logcursor = logstart + 1; #ifdef HAVE_CURSES check_winsizes(); diff --git a/sph/blake.c b/sph/blake.c index 1c4a479b..c7b7b14a 100644 --- a/sph/blake.c +++ b/sph/blake.c @@ -507,6 +507,55 @@ static const sph_u64 CB[16] = { #if SPH_COMPACT_BLAKE_32 +#define COMPRESS32r8 do { \ + sph_u32 M[16]; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M[0x0] = sph_dec32be_aligned(buf + 0); \ + M[0x1] = sph_dec32be_aligned(buf + 4); \ + M[0x2] = sph_dec32be_aligned(buf + 8); \ + M[0x3] = sph_dec32be_aligned(buf + 12); \ + M[0x4] = sph_dec32be_aligned(buf + 16); \ + M[0x5] = sph_dec32be_aligned(buf + 20); \ + M[0x6] = sph_dec32be_aligned(buf + 24); \ + M[0x7] = sph_dec32be_aligned(buf + 28); \ + M[0x8] = sph_dec32be_aligned(buf + 32); \ + M[0x9] = sph_dec32be_aligned(buf + 36); \ + M[0xA] = sph_dec32be_aligned(buf + 40); \ + M[0xB] = sph_dec32be_aligned(buf + 44); \ + M[0xC] = sph_dec32be_aligned(buf + 48); \ + M[0xD] = sph_dec32be_aligned(buf + 52); \ + M[0xE] = sph_dec32be_aligned(buf + 56); \ + M[0xF] = sph_dec32be_aligned(buf + 60); \ + for (r = 0; r < 8; r ++) \ + ROUND_S(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + #define COMPRESS32 do { \ sph_u32 M[16]; \ sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ @@ -558,6 +607,61 @@ static const sph_u64 CB[16] = { #else +#define COMPRESS32r8 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + M8 = sph_dec32be_aligned(buf + 32); \ + M9 = sph_dec32be_aligned(buf + 36); \ + MA = sph_dec32be_aligned(buf + 40); \ + MB = sph_dec32be_aligned(buf + 44); \ + MC = sph_dec32be_aligned(buf + 48); \ + MD = sph_dec32be_aligned(buf + 52); \ + ME = sph_dec32be_aligned(buf + 56); \ + MF = sph_dec32be_aligned(buf + 60); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + #define COMPRESS32 do { \ sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ @@ -831,6 +935,44 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len) sc->ptr = ptr; } +static void +blake32r8(sph_blake_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t 
ptr; + DECL_STATE32 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE32(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T32(T0 + 512)) < 512) + T1 = SPH_T32(T1 + 1); + COMPRESS32r8; + ptr = 0; + } + } + WRITE_STATE32(sc); + sc->ptr = ptr; +} + static void blake32_close(sph_blake_small_context *sc, unsigned ub, unsigned n, void *dst, size_t out_size_w32) @@ -884,6 +1026,59 @@ blake32_close(sph_blake_small_context *sc, sph_enc32be(out + (k << 2), sc->H[k]); } +static void +blake32r8_close(sph_blake_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + union { + unsigned char buf[64]; + sph_u32 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u32 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C32(0xFFFFFE00) + bit_len; + sc->T1 = SPH_T32(sc->T1 - 1); + } else { + sc->T0 -= 512 - bit_len; + } + if (bit_len <= 446) { + memset(u.buf + ptr + 1, 0, 55 - ptr); + if (out_size_w32 == 8) + u.buf[55] |= 1; + sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32r8(sc, u.buf + ptr, 64 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 63 - ptr); + blake32r8(sc, u.buf + ptr, 64 - ptr); + sc->T0 = SPH_C32(0xFFFFFE00); + sc->T1 = SPH_C32(0xFFFFFFFF); + memset(u.buf, 0, 56); + if (out_size_w32 == 8) + u.buf[55] = 1; + sph_enc32be_aligned(u.buf + 56, th); + sph_enc32be_aligned(u.buf + 60, tl); + blake32r8(sc, u.buf, 64); + } + out = (unsigned char *)dst; + for (k = 0; k < out_size_w32; k ++) + sph_enc32be(out + (k << 2), sc->H[k]); +} + #if SPH_64 static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; @@ -1034,6 +1229,13 @@ sph_blake256(void *cc, const void *data, size_t len) blake32((sph_blake_small_context *)cc, data, len); } +/* see sph_blake.h */ +void +sph_blake256r8(void *cc, const void *data, size_t len) +{ + blake32r8((sph_blake_small_context *)cc, data, len); +} + /* see sph_blake.h */ void sph_blake256_close(void *cc, void *dst) @@ -1041,6 +1243,13 @@ sph_blake256_close(void *cc, void *dst) sph_blake256_addbits_and_close(cc, 0, 0, dst); } +/* see sph_blake.h */ +void +sph_blake256r8_close(void *cc, void *dst) +{ + sph_blake256r8_addbits_and_close(cc, 0, 0, dst); +} + /* see sph_blake.h */ void sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) @@ -1049,6 +1258,14 @@ sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) sph_blake256_init(cc); } +/* see sph_blake.h */ +void +sph_blake256r8_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake32r8_close((sph_blake_small_context *)cc, ub, n, dst, 8); + sph_blake256_init(cc); +} + #if SPH_64 /* see sph_blake.h */ diff --git a/sph/sph_blake.h b/sph/sph_blake.h index c3829cad..8dea65ca 100644 --- a/sph/sph_blake.h +++ b/sph/sph_blake.h @@ -194,6 +194,7 @@ void sph_blake256_init(void *cc); * @param len the input data length (in bytes) */ void sph_blake256(void *cc, const void *data, size_t len); +void 
sph_blake256r8(void *cc, const void *data, size_t len); /** * Terminate the current BLAKE-256 computation and output the result into @@ -205,6 +206,7 @@ void sph_blake256(void *cc, const void *data, size_t len); * @param dst the destination buffer */ void sph_blake256_close(void *cc, void *dst); +void sph_blake256r8_close(void *cc, void *dst); /** * Add a few additional bits (0 to 7) to the current computation, then @@ -221,6 +223,8 @@ void sph_blake256_close(void *cc, void *dst); */ void sph_blake256_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +void sph_blake256r8_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); #if SPH_64 diff --git a/winbuild/sgminer.vcxproj b/winbuild/sgminer.vcxproj index 6ec8a4cd..67679bb1 100644 --- a/winbuild/sgminer.vcxproj +++ b/winbuild/sgminer.vcxproj @@ -263,8 +263,10 @@ + + @@ -328,11 +330,14 @@ + + + @@ -365,6 +370,7 @@ + diff --git a/winbuild/sgminer.vcxproj.filters b/winbuild/sgminer.vcxproj.filters index 02c26210..7866e29d 100644 --- a/winbuild/sgminer.vcxproj.filters +++ b/winbuild/sgminer.vcxproj.filters @@ -218,6 +218,12 @@ Source Files\algorithm + + Source Files\algorithm + + + Source Files\algorithm + @@ -415,6 +421,18 @@ Header Files\algorithm + + Header Files\algorithm + + + Header Files\algorithm + + + Header Files\sph + + + Header Files\algorithm +