From 01f3183c3196281ab35b0ae4fd6589c54b633319 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 22 Aug 2015 15:01:51 +0200 Subject: [PATCH] bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot --- Algo256/bmw.cu | 101 +++++++++++ Algo256/cuda_bmw.cu | 374 ++++++++++++++++++++++++++++++++++++++++ Makefile.am | 4 + README.txt | 5 +- ccminer.cpp | 33 ++-- ccminer.vcxproj | 4 + ccminer.vcxproj.filters | 8 +- cuda_checkhash.cu | 37 ++++ miner.h | 5 + util.cpp | 3 + 10 files changed, 562 insertions(+), 12 deletions(-) create mode 100644 Algo256/bmw.cu create mode 100644 Algo256/cuda_bmw.cu diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu new file mode 100644 index 0000000..1b0d556 --- /dev/null +++ b/Algo256/bmw.cu @@ -0,0 +1,101 @@ +/** + * bmw-256 MDT + * tpruvot - 2015 + */ +extern "C" { +#include "sph/sph_bmw.h" +} + +#include "miner.h" + +#include "cuda_helper.h" + +static uint32_t *d_hash[MAX_GPUS]; + +extern void bmw256_midstate_init(int thr_id, uint32_t threads); +extern void bmw256_setBlock_80(int thr_id, void *pdata); +extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap); + +extern uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); + +// CPU Hash +extern "C" void bmw_hash(void *state, const void *input) +{ + uint32_t _ALIGN(64) hash[16]; + sph_bmw256_context ctx; + + sph_bmw256_init(&ctx); + sph_bmw256(&ctx, input, 80); + sph_bmw256_close(&ctx, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +static __inline uint32_t swab32_if(uint32_t val, bool iftrue) { + return iftrue ? swab32(val) : val; +} + +extern "C" int scanhash_bmw(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + const uint32_t first_nonce = pdata[19]; + bool swapnonce = true; + uint32_t throughput = device_intensity(thr_id, __func__, 1U << 21); + throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x0005; + + if (!init[thr_id]) { + cudaSetDevice(device_map[thr_id]); + + cuda_check_cpu_init(thr_id, throughput); + bmw256_midstate_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) { + be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); + } + + bmw256_setBlock_80(thr_id, (void*)endiandata); + + cuda_check_cpu_setTarget(ptarget); + + do { + bmw256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], (int) swapnonce); + uint32_t foundNonce = cuda_check_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (foundNonce != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash64[8]; + endiandata[19] = swab32_if(foundNonce, swapnonce); + bmw_hash(vhash64, endiandata); + + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { + *hashes_done = foundNonce - first_nonce + 1; + pdata[19] = swab32_if(foundNonce,!swapnonce); + return 1; + } + else { + applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); + } + } + + if ((uint64_t) throughput + pdata[19] > max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} diff --git a/Algo256/cuda_bmw.cu b/Algo256/cuda_bmw.cu new file mode 100644 index 0000000..b7a4f99 --- /dev/null +++ b/Algo256/cuda_bmw.cu @@ -0,0 +1,374 @@ +/** + * BMW-256 CUDA Implementation - tpruvot 2015 + * + * Not optimal but close to the sph version and easier to adapt. + */ + +#include +#include + +#define SPH_64 1 +#define USE_MIDSTATE + +extern "C" { +#include "sph/sph_bmw.h" +} + +#include "cuda_helper.h" + +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) + +#ifndef USE_MIDSTATE +__constant__ static sph_u32 IV256[16] = { + 0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F, + 0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F, + 0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F, + 0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F +}; +#endif + +__constant__ static sph_u32 final_s[16] = { + 0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3, + 0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7, + 0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab, + 0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf +}; + +static sph_bmw_small_context* d_midstate[MAX_GPUS]; + +#define I16_16 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +#define I16_17 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +#define I16_18 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +#define I16_19 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 +#define I16_20 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 +#define I16_21 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 +#define I16_22 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +#define I16_23 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 +#define I16_24 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 +#define I16_25 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 +#define I16_26 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 +#define I16_27 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 +#define I16_28 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 +#define I16_29 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 +#define I16_30 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 +#define I16_31 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + +//#define M16_16 0, 1, 3, 4, 7, 10, 11 +//#define M16_17 1, 2, 4, 5, 8, 11, 12 +#define M16_18 2, 3, 5, 6, 9, 12, 13 +#define M16_19 3, 4, 6, 7, 10, 13, 14 +#define M16_20 4, 5, 7, 8, 11, 14, 15 +#define M16_21 5, 6, 8, 9, 12, 15, 16 +#define M16_22 6, 7, 9, 10, 13, 0, 1 +#define M16_23 7, 8, 10, 11, 14, 1, 2 +#define M16_24 8, 9, 11, 12, 15, 2, 3 +#define M16_25 9, 10, 12, 13, 0, 3, 4 +#define M16_26 10, 11, 13, 14, 1, 4, 5 +#define M16_27 11, 12, 14, 15, 2, 5, 6 +#define M16_28 12, 13, 15, 16, 3, 6, 7 +#define M16_29 13, 14, 0, 1, 4, 7, 8 +#define M16_30 14, 15, 1, 2, 5, 8, 9 +#define M16_31 15, 16, 2, 3, 6, 9, 10 + +#define ss0(x) (((x) >> 1) ^ ((x) << 3) ^ ROTL32(x, 4) ^ ROTL32(x, 19)) +#define ss1(x) (((x) >> 1) ^ ((x) << 2) ^ ROTL32(x, 8) ^ ROTL32(x, 23)) +#define ss2(x) (((x) >> 2) ^ ((x) << 1) ^ ROTL32(x, 12) ^ ROTL32(x, 25)) +#define ss3(x) (((x) >> 2) ^ ((x) << 2) ^ ROTL32(x, 15) ^ ROTL32(x, 29)) +#define ss4(x) (((x) >> 1) ^ (x)) +#define ss5(x) (((x) >> 2) ^ (x)) + +#define rs1(x) ROTL32(x, 3) +#define rs2(x) ROTL32(x, 7) +#define rs3(x) ROTL32(x, 13) +#define rs4(x) ROTL32(x, 16) +#define rs5(x) ROTL32(x, 19) +#define rs6(x) ROTL32(x, 23) +#define rs7(x) ROTL32(x, 27) + +#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \ + tt((data[i0] ^ h[i0]) op01 (data[i1] ^ h[i1]) op12 (data[i2] ^ h[i2]) op23 (data[i3] ^ h[i3]) op34 (data[i4] ^ h[i4])) +//#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) +//#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) +//#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) +//#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) +//#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) +//#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) +//#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) +//#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) +//#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) +//#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 14) +//#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15) +//#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) +//#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) +//#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) +//#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) +//#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) + +__device__ +static void gpu_compress_small(const sph_u32 *data, const sph_u32 h[16], sph_u32 dh[16]) +{ + // FOLD MAKE_Qas; + + sph_u32 dx[16]; + for (int i=0; i<16; i++) + dx[i] = data[i] ^ h[i]; + + sph_u32 qt[32]; + qt[ 0] = dx[ 5] - dx[7] + dx[10] + dx[13] + dx[14]; // Ws0 + qt[ 1] = dx[ 6] - dx[8] + dx[11] + dx[14] - dx[15]; // Ws1 + qt[ 2] = dx[ 0] + dx[7] + dx[ 9] - dx[12] + dx[15]; // Ws2 + qt[ 3] = dx[ 0] - dx[1] + dx[ 8] - dx[10] + dx[13]; // Ws3 + qt[ 4] = dx[ 1] + dx[2] + dx[ 9] - dx[11] - dx[14]; // Ws4; + qt[ 5] = dx[ 3] - dx[2] + dx[10] - dx[12] + dx[15]; // Ws5; + qt[ 6] = dx[ 4] - dx[0] - dx[ 3] - dx[11] + dx[13]; // Ws6; + qt[ 7] = dx[ 1] - dx[4] - dx[ 5] - dx[12] - dx[14]; // Ws7; + qt[ 8] = dx[ 2] - dx[5] - dx[ 6] + dx[13] - dx[15]; // Ws8; + qt[ 9] = dx[ 0] - dx[3] + dx[ 6] - dx[ 7] + dx[14]; // Ws9; + qt[10] = dx[ 8] - dx[1] - dx[ 4] - dx[ 7] + dx[15]; // Ws10; + qt[11] = dx[ 8] - dx[0] - dx[ 2] - dx[ 5] + dx[ 9]; // Ws11; + qt[12] = dx[ 1] + dx[3] - dx[ 6] - dx[ 9] + dx[10]; // Ws12; + qt[13] = dx[ 2] + dx[4] + dx[ 7] + dx[10] + dx[11]; // Ws13; + qt[14] = dx[ 3] - dx[5] + dx[ 8] - dx[11] - dx[12]; // Ws14; + qt[15] = dx[12] - dx[4] - dx[ 6] - dx[ 9] + dx[13]; // Ws15; + + qt[ 0] = ss0(qt[ 0]) + h[ 1]; + qt[ 1] = ss1(qt[ 1]) + h[ 2]; + qt[ 2] = ss2(qt[ 2]) + h[ 3]; + qt[ 3] = ss3(qt[ 3]) + h[ 4]; + qt[ 4] = ss4(qt[ 4]) + h[ 5]; + + qt[ 5] = ss0(qt[ 5]) + h[ 6]; + qt[ 6] = ss1(qt[ 6]) + h[ 7]; + qt[ 7] = ss2(qt[ 7]) + h[ 8]; + qt[ 8] = ss3(qt[ 8]) + h[ 9]; + qt[ 9] = ss4(qt[ 9]) + h[10]; + + qt[10] = ss0(qt[10]) + h[11]; + qt[11] = ss1(qt[11]) + h[12]; + qt[12] = ss2(qt[12]) + h[13]; + qt[13] = ss3(qt[13]) + h[14]; + qt[14] = ss4(qt[14]) + h[15]; + + qt[15] = ss0(qt[15]) + h[ 0]; + + //MAKE_Qbs; + #define Ks(j) ((sph_u32)(0x05555555UL * j)) + #define Qs(j) (qt[j]) + + #define expand1s_in(i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + (ss1(qt[i0]) + ss2(qt[i1]) + ss3(qt[i2]) + ss0(qt[i3]) + ss1(qt[i4]) + ss2(qt[i5]) + ss3(qt[i6]) + ss0(qt[i7]) \ + + ss1(qt[i8]) + ss2(qt[i9]) + ss3(qt[i10]) + ss0(qt[i11]) + ss1(qt[i12]) + ss2(qt[i13]) + ss3(qt[i14]) + ss0(qt[i15]) \ + + ((ROTL32(data[i0m], i1m) + ROTL32(data[i3m], i4m) - ROTL32(data[i10m], i11m) + Ks(i16)) ^ h[i7m])) + + qt[16] = expand1s_in(16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 3, 4, 7, 10, 11); + qt[17] = expand1s_in(17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 4, 5, 8, 11, 12); + + #define expand2s_inner(qf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + (qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ + + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ + + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ + + ((ROTL32(data[i0m], i1m) + ROTL32(data[i3m], i4m) - ROTL32(data[i10m], i11m) + Ks(i16)) ^ h[i7m])) + +#ifdef _MSC_VER + #define LPAR ( + #define expand2s(i16) \ + expand2s_(Qs, i16, I16_ ## i16, M16_ ## i16) + #define expand2s_(qf, i16, ix, iy) \ + expand2s_inner LPAR qf, i16, ix, iy) +#else + #define expand2s_(i16, ix, iy) \ + expand2s_inner(Qs, i16, ix, iy) + #define expand2s(i16) \ + expand2s_(i16, I16_ ## i16, M16_ ## i16) +#endif + + qt[18] = expand2s(18); + qt[19] = expand2s(19); + qt[20] = expand2s(20); + qt[21] = expand2s(21); + qt[22] = expand2s(22); + qt[23] = expand2s(23); + qt[24] = expand2s(24); + qt[25] = expand2s(25); + qt[26] = expand2s(26); + qt[27] = expand2s(27); + qt[28] = expand2s(28); + qt[29] = expand2s(29); + qt[30] = expand2s(30); + qt[31] = expand2s(31); + + sph_u32 xl, xh; + xl = Qs(16) ^ Qs(17) ^ Qs(18) ^ Qs(19) ^ Qs(20) ^ Qs(21) ^ Qs(22) ^ Qs(23); + + xh = xl ^ Qs(24) ^ Qs(25) ^ Qs(26) ^ Qs(27) ^ Qs(28) ^ Qs(29) ^ Qs(30) ^ Qs(31); + + dh[ 0] = ((xh << 5) ^ (Qs(16) >> 5) ^ data[ 0]) + (xl ^ Qs(24) ^ Qs(0)); + dh[ 1] = ((xh >> 7) ^ (Qs(17) << 8) ^ data[ 1]) + (xl ^ Qs(25) ^ Qs(1)); + dh[ 2] = ((xh >> 5) ^ (Qs(18) << 5) ^ data[ 2]) + (xl ^ Qs(26) ^ Qs(2)); + dh[ 3] = ((xh >> 1) ^ (Qs(19) << 5) ^ data[ 3]) + (xl ^ Qs(27) ^ Qs(3)); + dh[ 4] = ((xh >> 3) ^ (Qs(20) << 0) ^ data[ 4]) + (xl ^ Qs(28) ^ Qs(4)); + dh[ 5] = ((xh << 6) ^ (Qs(21) >> 6) ^ data[ 5]) + (xl ^ Qs(29) ^ Qs(5)); + dh[ 6] = ((xh >> 4) ^ (Qs(22) << 6) ^ data[ 6]) + (xl ^ Qs(30) ^ Qs(6)); + dh[ 7] = ((xh >> 11) ^ (Qs(23) << 2) ^ data[ 7]) + (xl ^ Qs(31) ^ Qs(7)); + + dh[ 8] = ROTL32(dh[4], 9) + (xh ^ Qs(24) ^ data[ 8]) + ((xl << 8) ^ Qs(23) ^ Qs( 8)); + dh[ 9] = ROTL32(dh[5], 10) + (xh ^ Qs(25) ^ data[ 9]) + ((xl >> 6) ^ Qs(16) ^ Qs( 9)); + dh[10] = ROTL32(dh[6], 11) + (xh ^ Qs(26) ^ data[10]) + ((xl << 6) ^ Qs(17) ^ Qs(10)); + dh[11] = ROTL32(dh[7], 12) + (xh ^ Qs(27) ^ data[11]) + ((xl << 4) ^ Qs(18) ^ Qs(11)); + dh[12] = ROTL32(dh[0], 13) + (xh ^ Qs(28) ^ data[12]) + ((xl >> 3) ^ Qs(19) ^ Qs(12)); + dh[13] = ROTL32(dh[1], 14) + (xh ^ Qs(29) ^ data[13]) + ((xl >> 4) ^ Qs(20) ^ Qs(13)); + dh[14] = ROTL32(dh[2], 15) + (xh ^ Qs(30) ^ data[14]) + ((xl >> 7) ^ Qs(21) ^ Qs(14)); + dh[15] = ROTL32(dh[3], 16) + (xh ^ Qs(31) ^ data[15]) + ((xl >> 2) ^ Qs(22) ^ Qs(15)); +} + +#ifndef USE_MIDSTATE + +__device__ +static void gpu_bmw256_init(sph_bmw_small_context *sc) +{ + memcpy(sc->H, IV256, sizeof sc->H); + sc->ptr = 0; + sc->bit_count = 0; +} + +__device__ +static void gpu_bmw256(sph_bmw_small_context *sc, const void *data, size_t len) +{ + sph_u32 htmp[16]; + sph_u32 *h1, *h2; + unsigned char *buf = sc->buf; + size_t ptr = sc->ptr; + + sc->bit_count += (sph_u64)len << 3; + + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u32 *ht; + + gpu_compress_small((sph_u32 *) buf, h1, h2); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +#endif + +#define sph_enc64le(ptr, x) \ + *((uint64_t*)(ptr)) = x +#define sph_enc64le_aligned sph_enc64le + +__device__ +static void gpu_bmw256_close(sph_bmw_small_context *sc, uint2 *out) +{ + unsigned char *buf = sc->buf; + size_t ptr = sc->ptr; + + buf[ptr ++] = 0x80; + sph_u32 *h = sc->H; + + sph_u32 h1[16]; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + gpu_compress_small((sph_u32 *) buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, sizeof(sc->buf) - 8 - ptr); + + sph_enc64le_aligned(buf + sizeof(sc->buf) - 8, SPH_T64(sc->bit_count)); + + sph_u32 h2[16]; + gpu_compress_small((sph_u32 *) buf, h, h2); + gpu_compress_small(h2, final_s, h1); + + uint64_t* h64 = (uint64_t*) (&h1[8]); + #pragma unroll + for (int i = 0; i < 4; i++) { + out[i] = vectorize(h64[i]); + } +} + +__global__ /* __launch_bounds__(256, 3) */ +void bmw256_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash, sph_bmw256_context *d_midstate, int swap) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nonce = startNonce + thread; + nonce = swap ? cuda_swab32(nonce): nonce; + +#ifndef USE_MIDSTATE + uint2 hash[10]; + #pragma unroll + for(int i=0;i<9;i++) + hash[i] = vectorize(c_PaddedMessage80[i]); + hash[9] = make_uint2(c_PaddedMessage80[9], nonce); + + sph_bmw256_context ctx; + gpu_bmw256_init(&ctx); + gpu_bmw256(&ctx, (void*) hash, 80); +#else + sph_bmw256_context ctx; + ctx.ptr = 16; ctx.bit_count = 640; + uint2 *buf = (uint2 *) ctx.buf; + buf[0] = vectorize(c_PaddedMessage80[8]); + buf[1] = make_uint2(c_PaddedMessage80[9], nonce); + #pragma unroll + for(int i=0;i<16;i++) + ctx.H[i] = d_midstate->H[i]; +#endif + gpu_bmw256_close(&ctx, (uint2*) &g_hash[thread << 2]); + } +} + +__host__ +void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, int swap) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + bmw256_gpu_hash_80<<>>(threads, startNonce, (uint64_t*)d_outputHash, d_midstate[thr_id], swap); +} + +__host__ +void bmw256_setBlock_80(int thr_id, void *pdata) +{ + uint64_t PaddedMessage[16]; + memcpy(PaddedMessage, pdata, 80); + memset(&PaddedMessage[10], 0, 48); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); + + sph_bmw256_context ctx; + sph_bmw256_init(&ctx); + sph_bmw256(&ctx, (void*) PaddedMessage, 80); + CUDA_SAFE_CALL(cudaMemcpy(d_midstate[thr_id], &ctx, sizeof(sph_bmw256_context), cudaMemcpyHostToDevice)); +} + +__host__ +void bmw256_midstate_init(int thr_id, uint32_t threads) +{ + cudaMalloc(&d_midstate[thr_id], sizeof(sph_bmw256_context)); +} diff --git a/Makefile.am b/Makefile.am index 7b343c1..d086f91 100644 --- a/Makefile.am +++ b/Makefile.am @@ -37,6 +37,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ Algo256/blake256.cu Algo256/keccak256.cu \ + Algo256/bmw.cu Algo256/cuda_bmw.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ @@ -95,6 +96,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v" Algo256/blake256.o: Algo256/blake256.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< +Algo256/cuda_bmw.o: Algo256/cuda_bmw.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $< + heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< diff --git a/README.txt b/README.txt index 27fffea..bc9fed1 100644 --- a/README.txt +++ b/README.txt @@ -34,6 +34,7 @@ DarkCoin and other X11 coins Chaincoin and Flaxscript (C11) Saffroncoin blake (256 14-rounds) BlakeCoin (256 8-rounds) +Midnight (BMW 256) Qubit (Digibyte, ...) Luffa (Joincoin) Keccak (Maxcoin) @@ -66,6 +67,7 @@ its command line interface and options. -a, --algo=ALGO specify the algorithm to use blake use to mine Saffroncoin (Blake 256) blakecoin use to mine Old Blake 256 + bmw use to mine Midnight c11/flax use to mine Chaincoin and Flax deep use to mine Deepcoin dmd-gr use to mine Diamond-Groestl @@ -225,7 +227,8 @@ features. August 2015... Add Lyra2REv2 algo (Vertcoin/Zoom) Restore WhirlpoolX algo (VNL) - Drop animecoin support + Drop Animecoin support + Add bmw (Midnight) algo July 06th 2015 v1.6.5-C11 Nvml api power limits diff --git a/ccminer.cpp b/ccminer.cpp index 429257c..12f704f 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -86,6 +86,7 @@ struct workio_cmd { enum sha_algos { ALGO_BLAKE, ALGO_BLAKECOIN, + ALGO_BMW, ALGO_C11, ALGO_DEEP, ALGO_DMD_GR, @@ -123,6 +124,7 @@ enum sha_algos { static const char *algo_names[] = { "blake", "blakecoin", + "bmw", "c11", "deep", "dmd-gr", @@ -280,6 +282,7 @@ Options:\n\ -a, --algo=ALGO specify the hash algorithm to use\n\ blake Blake 256 (SFR)\n\ blakecoin Fast Blake 256 (8 rounds)\n\ + bmw BMW 256\n\ c11/flax X11 variant\n\ deep Deepcoin\n\ dmd-gr Diamond-Groestl\n\ @@ -848,6 +851,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work) be32enc(&ntime, work->data[17]); be32enc(&nonce, work->data[19]); break; + case ALGO_BLAKE: + case ALGO_BLAKECOIN: + case ALGO_BMW: + // fast algos require that... + check_dups = true; default: le32enc(&ntime, work->data[17]); le32enc(&nonce, work->data[19]); @@ -1758,6 +1766,7 @@ static void *miner_thread(void *userdata) case ALGO_WHIRLPOOLX: minmax = 0x80000000U; break; + case ALGO_BMW: case ALGO_KECCAK: minmax = 0x40000000U; break; @@ -1844,6 +1853,20 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_BLAKECOIN: + rc = scanhash_blake256(thr_id, work.data, work.target, + max_nonce, &hashes_done, 8); + break; + + case ALGO_BLAKE: + rc = scanhash_blake256(thr_id, work.data, work.target, + max_nonce, &hashes_done, 14); + break; + + case ALGO_BMW: + rc = scanhash_bmw(thr_id, work.data, work.target, max_nonce, &hashes_done); + break; + case ALGO_C11: rc = scanhash_c11(thr_id, work.data, work.target, max_nonce, &hashes_done); @@ -1880,16 +1903,6 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; - case ALGO_BLAKECOIN: - rc = scanhash_blake256(thr_id, work.data, work.target, - max_nonce, &hashes_done, 8); - break; - - case ALGO_BLAKE: - rc = scanhash_blake256(thr_id, work.data, work.target, - max_nonce, &hashes_done, 14); - break; - case ALGO_FRESH: rc = scanhash_fresh(thr_id, work.data, work.target, max_nonce, &hashes_done); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 936472b..a163741 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -247,6 +247,10 @@ + + + 76 + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index bb4573d..d4b6802 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -652,6 +652,12 @@ Source Files\neoscrypt + + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 + @@ -668,4 +674,4 @@ Ressources - + diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index c0a3c16..52ba432 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -89,6 +89,21 @@ void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, u } } +__global__ __launch_bounds__(512, 4) +void cuda_checkhash_32(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *inpHash = &hash[thread << 3]; + + if (resNonces[0] == UINT32_MAX) { + if (hashbelowtarget(inpHash, pTarget)) + resNonces[0] = (startNounce + thread); + } + } +} + __host__ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) { @@ -111,6 +126,28 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin return h_resNonces[thr_id][0]; } +__host__ +uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash) +{ + cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); + + const uint32_t threadsperblock = 512; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + if (!init_done) { + applog(LOG_ERR, "missing call to cuda_check_cpu_init"); + return UINT32_MAX; + } + + cuda_checkhash_32 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + return h_resNonces[thr_id][0]; +} + /* --------------------------------------------------------------------------------------------- */ __global__ __launch_bounds__(512, 4) diff --git a/miner.h b/miner.h index 4c5822c..7f1ea7b 100644 --- a/miner.h +++ b/miner.h @@ -303,6 +303,10 @@ extern int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); +extern int scanhash_bmw(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + extern int scanhash_c11(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -769,6 +773,7 @@ void applog_compare_hash(unsigned char *hash, unsigned char *hash2); void print_hash_tests(void); void blake256hash(void *output, const void *input, int8_t rounds); +void bmw_hash(void *state, const void *input); void c11hash(void *output, const void *input); void deephash(void *state, const void *input); void luffa_hash(void *state, const void *input); diff --git a/util.cpp b/util.cpp index a4ddbf7..27e667f 100644 --- a/util.cpp +++ b/util.cpp @@ -1812,6 +1812,9 @@ void print_hash_tests(void) blake256hash(&hash[0], &buf[0], 14); printpfx("blake", hash); + bmw_hash(&hash[0], &buf[0]); + printpfx("bmw", hash); + c11hash(&hash[0], &buf[0]); printpfx("c11", hash);