From 01f3183c3196281ab35b0ae4fd6589c54b633319 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 22 Aug 2015 15:01:51 +0200
Subject: [PATCH] bmw algo for MDT, with midstate

which could be extracted from json too

replace a satcoin by another one ;)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
---
 Algo256/bmw.cu          | 101 +++++++++++
 Algo256/cuda_bmw.cu     | 374 ++++++++++++++++++++++++++++++++++++++++
 Makefile.am             |   4 +
 README.txt              |   5 +-
 ccminer.cpp             |  33 ++--
 ccminer.vcxproj         |   4 +
 ccminer.vcxproj.filters |   8 +-
 cuda_checkhash.cu       |  37 ++++
 miner.h                 |   5 +
 util.cpp                |   3 +
 10 files changed, 562 insertions(+), 12 deletions(-)
 create mode 100644 Algo256/bmw.cu
 create mode 100644 Algo256/cuda_bmw.cu

diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu
new file mode 100644
index 0000000..1b0d556
--- /dev/null
+++ b/Algo256/bmw.cu
@@ -0,0 +1,101 @@
+/**
+ * bmw-256 MDT
+ * tpruvot - 2015
+ */
+extern "C" {
+#include "sph/sph_bmw.h"
+}
+
+#include "miner.h"
+
+#include "cuda_helper.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+extern void bmw256_midstate_init(int thr_id, uint32_t threads);
+extern void bmw256_setBlock_80(int thr_id, void *pdata);
+extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap);
+
+extern uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);
+
+// CPU reference: BMW-256 over an 80-byte input, first 32 digest bytes copied to state
+extern "C" void bmw_hash(void *state, const void *input)
+{
+	sph_bmw256_context bmw_ctx;
+	uint32_t _ALIGN(64) digest[16];
+
+	sph_bmw256_init(&bmw_ctx);
+	sph_bmw256(&bmw_ctx, input, 80);
+	sph_bmw256_close(&bmw_ctx, (void*) digest);
+
+	memcpy(state, digest, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
+	return !iftrue ? val : swab32(val); // byte-swap only when requested
+}
+
+// Scans nonces from pdata[19] on the GPU. Returns 1 with the nonce in pdata[19] once a hit validates on CPU, else 0.
+extern "C" int scanhash_bmw(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	const uint32_t first_nonce = pdata[19];
+	bool swapnonce = true;
+	const uint32_t intensity = device_intensity(thr_id, __func__, 1U << 21); // nonces per launch; sizes the buffers
+	uint32_t throughput = min(intensity, max_nonce - first_nonce); // this call may have fewer nonces left
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0005;
+
+	if (!init[thr_id]) {
+		cudaSetDevice(device_map[thr_id]);
+		// one-time setup: size with the unclamped intensity, a later call may scan a wider range
+		cuda_check_cpu_init(thr_id, intensity);
+		bmw256_midstate_init(thr_id, intensity);
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * intensity));
+
+		init[thr_id] = true;
+	}
+
+	uint32_t endiandata[20];
+	for (int k=0; k < 20; k++) {
+		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+	}
+
+	bmw256_setBlock_80(thr_id, (void*)endiandata);
+
+	cuda_check_cpu_setTarget(ptarget);
+
+	do {
+		bmw256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], (int) swapnonce);
+		uint32_t foundNonce = cuda_check_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (foundNonce != UINT32_MAX)
+		{
+			uint32_t _ALIGN(64) vhash64[8];
+			endiandata[19] = swab32_if(foundNonce, swapnonce);
+			bmw_hash(vhash64, endiandata);
+			// re-check the GPU candidate on the CPU before reporting it
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
+				*hashes_done = foundNonce - first_nonce + 1;
+				pdata[19] = swab32_if(foundNonce,!swapnonce);
+				return 1;
+			}
+			else {
+				applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+		// >= so a range that is exactly used up (or empty, throughput 0) ends here instead of looping again
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}
diff --git a/Algo256/cuda_bmw.cu b/Algo256/cuda_bmw.cu
new file mode 100644
index 0000000..b7a4f99
--- /dev/null
+++ b/Algo256/cuda_bmw.cu
@@ -0,0 +1,374 @@
+/**
+ * BMW-256 CUDA Implementation - tpruvot 2015
+ *
+ * Not optimal but close to the sph version and easier to adapt.
+ */
+
+#include <stdio.h>
+#include <memory.h>
+
+#define SPH_64 1
+#define USE_MIDSTATE
+
+extern "C" {
+#include "sph/sph_bmw.h"
+}
+
+#include "cuda_helper.h"
+
+__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
+
+#ifndef USE_MIDSTATE
+__constant__ static sph_u32 IV256[16] = {
+	0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F,
+	0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F,
+	0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F,
+	0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F
+};
+#endif
+
+__constant__ static sph_u32 final_s[16] = {
+	0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3,
+	0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7,
+	0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab,
+	0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf
+};
+
+static sph_bmw_small_context* d_midstate[MAX_GPUS];
+
+#define I16_16    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+#define I16_17    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+#define I16_18    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17
+#define I16_19    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+#define I16_20    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+#define I16_21    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+#define I16_22    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+#define I16_23    7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+#define I16_24    8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define I16_25    9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+#define I16_26   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+#define I16_27   11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+#define I16_28   12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+#define I16_29   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+#define I16_30   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+#define I16_31   15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+
+//#define M16_16    0,  1,  3,  4,  7, 10, 11
+//#define M16_17    1,  2,  4,  5,  8, 11, 12
+#define M16_18    2,  3,  5,  6,  9, 12, 13
+#define M16_19    3,  4,  6,  7, 10, 13, 14
+#define M16_20    4,  5,  7,  8, 11, 14, 15
+#define M16_21    5,  6,  8,  9, 12, 15, 16
+#define M16_22    6,  7,  9, 10, 13,  0,  1
+#define M16_23    7,  8, 10, 11, 14,  1,  2
+#define M16_24    8,  9, 11, 12, 15,  2,  3
+#define M16_25    9, 10, 12, 13,  0,  3,  4
+#define M16_26   10, 11, 13, 14,  1,  4,  5
+#define M16_27   11, 12, 14, 15,  2,  5,  6
+#define M16_28   12, 13, 15, 16,  3,  6,  7
+#define M16_29   13, 14,  0,  1,  4,  7,  8
+#define M16_30   14, 15,  1,  2,  5,  8,  9
+#define M16_31   15, 16,  2,  3,  6,  9, 10
+
+#define ss0(x)    (((x) >> 1) ^ ((x) << 3) ^ ROTL32(x,  4) ^ ROTL32(x, 19))
+#define ss1(x)    (((x) >> 1) ^ ((x) << 2) ^ ROTL32(x,  8) ^ ROTL32(x, 23))
+#define ss2(x)    (((x) >> 2) ^ ((x) << 1) ^ ROTL32(x, 12) ^ ROTL32(x, 25))
+#define ss3(x)    (((x) >> 2) ^ ((x) << 2) ^ ROTL32(x, 15) ^ ROTL32(x, 29))
+#define ss4(x)    (((x) >> 1) ^ (x))
+#define ss5(x)    (((x) >> 2) ^ (x))
+
+#define rs1(x)    ROTL32(x,  3)
+#define rs2(x)    ROTL32(x,  7)
+#define rs3(x)    ROTL32(x, 13)
+#define rs4(x)    ROTL32(x, 16)
+#define rs5(x)    ROTL32(x, 19)
+#define rs6(x)    ROTL32(x, 23)
+#define rs7(x)    ROTL32(x, 27)
+
+#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \
+	tt((data[i0] ^ h[i0]) op01 (data[i1] ^ h[i1]) op12 (data[i2] ^ h[i2]) op23 (data[i3] ^ h[i3]) op34 (data[i4] ^ h[i4]))
+//#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
+//#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
+//#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
+//#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
+//#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
+//#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
+//#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
+//#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
+//#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
+//#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
+//#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
+//#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
+//#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
+//#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
+//#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
+//#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
+
+__device__
+static void gpu_compress_small(const sph_u32 *data, const sph_u32 h[16], sph_u32 dh[16])
+{
+		// Computes dh[0..15] from data[0..15] and h[0..15]. First stage: FOLD MAKE_Qas;
+
+		sph_u32 dx[16]; // data ^ h, word by word
+		for (int i=0; i<16; i++)
+			dx[i] = data[i] ^ h[i];
+
+		sph_u32 qt[32]; // working words: [0..15] set below, [16..31] by the expand macros
+		qt[ 0] = dx[ 5] - dx[7] + dx[10] + dx[13] + dx[14]; // Ws0
+		qt[ 1] = dx[ 6] - dx[8] + dx[11] + dx[14] - dx[15]; // Ws1
+		qt[ 2] = dx[ 0] + dx[7] + dx[ 9] - dx[12] + dx[15]; // Ws2
+		qt[ 3] = dx[ 0] - dx[1] + dx[ 8] - dx[10] + dx[13]; // Ws3
+		qt[ 4] = dx[ 1] + dx[2] + dx[ 9] - dx[11] - dx[14]; // Ws4;
+		qt[ 5] = dx[ 3] - dx[2] + dx[10] - dx[12] + dx[15]; // Ws5;
+		qt[ 6] = dx[ 4] - dx[0] - dx[ 3] - dx[11] + dx[13]; // Ws6;
+		qt[ 7] = dx[ 1] - dx[4] - dx[ 5] - dx[12] - dx[14]; // Ws7;
+		qt[ 8] = dx[ 2] - dx[5] - dx[ 6] + dx[13] - dx[15]; // Ws8;
+		qt[ 9] = dx[ 0] - dx[3] + dx[ 6] - dx[ 7] + dx[14]; // Ws9;
+		qt[10] = dx[ 8] - dx[1] - dx[ 4] - dx[ 7] + dx[15]; // Ws10;
+		qt[11] = dx[ 8] - dx[0] - dx[ 2] - dx[ 5] + dx[ 9]; // Ws11;
+		qt[12] = dx[ 1] + dx[3] - dx[ 6] - dx[ 9] + dx[10]; // Ws12;
+		qt[13] = dx[ 2] + dx[4] + dx[ 7] + dx[10] + dx[11]; // Ws13;
+		qt[14] = dx[ 3] - dx[5] + dx[ 8] - dx[11] - dx[12]; // Ws14;
+		qt[15] = dx[12] - dx[4] - dx[ 6] - dx[ 9] + dx[13]; // Ws15;
+		// apply the ss0..ss4 mixers, then add the chaining word one index ahead (wrapping at 15)
+		qt[ 0] = ss0(qt[ 0]) + h[ 1];
+		qt[ 1] = ss1(qt[ 1]) + h[ 2];
+		qt[ 2] = ss2(qt[ 2]) + h[ 3];
+		qt[ 3] = ss3(qt[ 3]) + h[ 4];
+		qt[ 4] = ss4(qt[ 4]) + h[ 5];
+
+		qt[ 5] = ss0(qt[ 5]) + h[ 6];
+		qt[ 6] = ss1(qt[ 6]) + h[ 7];
+		qt[ 7] = ss2(qt[ 7]) + h[ 8];
+		qt[ 8] = ss3(qt[ 8]) + h[ 9];
+		qt[ 9] = ss4(qt[ 9]) + h[10];
+
+		qt[10] = ss0(qt[10]) + h[11];
+		qt[11] = ss1(qt[11]) + h[12];
+		qt[12] = ss2(qt[12]) + h[13];
+		qt[13] = ss3(qt[13]) + h[14];
+		qt[14] = ss4(qt[14]) + h[15];
+
+		qt[15] = ss0(qt[15]) + h[ 0];
+
+		// MAKE_Qbs: expand qt[16..31], each from the sixteen preceding words plus rotated message words
+		#define Ks(j)   ((sph_u32)(0x05555555UL * j))
+		#define Qs(j)   (qt[j])
+
+		#define expand1s_in(i16, \
+				i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, \
+				i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+			(ss1(qt[i0]) + ss2(qt[i1]) + ss3(qt[i2]) + ss0(qt[i3]) + ss1(qt[i4]) + ss2(qt[i5]) + ss3(qt[i6]) + ss0(qt[i7]) \
+				+ ss1(qt[i8]) + ss2(qt[i9]) + ss3(qt[i10]) + ss0(qt[i11]) + ss1(qt[i12]) + ss2(qt[i13]) + ss3(qt[i14]) + ss0(qt[i15]) \
+				+ ((ROTL32(data[i0m], i1m) + ROTL32(data[i3m], i4m)  - ROTL32(data[i10m], i11m) + Ks(i16)) ^ h[i7m]))
+		// qt[16] and qt[17] use the ss-based formula; qt[18..31] below use the rs1..rs7 rotation-based one
+		qt[16] = expand1s_in(16,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  0, 1, 3, 4, 7, 10, 11);
+		qt[17] = expand1s_in(17,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,  1, 2, 4, 5, 8, 11, 12);
+
+		#define expand2s_inner(qf, i16, \
+				i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, \
+				i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+			(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+				+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+				+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+				+ ((ROTL32(data[i0m], i1m) + ROTL32(data[i3m], i4m) - ROTL32(data[i10m], i11m) + Ks(i16)) ^ h[i7m]))
+		// NOTE(review): the LPAR indirection looks like a workaround for how MSVC expands the I16_/M16_ lists - confirm
+#ifdef _MSC_VER
+		#define LPAR   (
+		#define expand2s(i16) \
+			expand2s_(Qs, i16, I16_ ## i16, M16_ ## i16)
+		#define expand2s_(qf, i16, ix, iy) \
+			expand2s_inner LPAR qf, i16, ix, iy)
+#else
+		#define expand2s_(i16, ix, iy) \
+			expand2s_inner(Qs, i16, ix, iy)
+		#define expand2s(i16) \
+			expand2s_(i16, I16_ ## i16, M16_ ## i16)
+#endif
+
+		qt[18] = expand2s(18);
+		qt[19] = expand2s(19);
+		qt[20] = expand2s(20);
+		qt[21] = expand2s(21);
+		qt[22] = expand2s(22);
+		qt[23] = expand2s(23);
+		qt[24] = expand2s(24);
+		qt[25] = expand2s(25);
+		qt[26] = expand2s(26);
+		qt[27] = expand2s(27);
+		qt[28] = expand2s(28);
+		qt[29] = expand2s(29);
+		qt[30] = expand2s(30);
+		qt[31] = expand2s(31);
+		// xl folds qt[16..23] with XOR, xh additionally folds qt[24..31]
+		sph_u32 xl, xh;
+		xl = Qs(16) ^ Qs(17) ^ Qs(18) ^ Qs(19) ^ Qs(20) ^ Qs(21) ^ Qs(22) ^ Qs(23);
+
+		xh = xl ^ Qs(24) ^ Qs(25) ^ Qs(26) ^ Qs(27)	^ Qs(28) ^ Qs(29) ^ Qs(30) ^ Qs(31);
+		// output words: dh[0..7] come first because dh[8..15] reuse them rotated
+		dh[ 0] = ((xh <<  5) ^ (Qs(16) >>  5) ^ data[ 0]) + (xl ^ Qs(24) ^ Qs(0));
+		dh[ 1] = ((xh >>  7) ^ (Qs(17) <<  8) ^ data[ 1]) + (xl ^ Qs(25) ^ Qs(1));
+		dh[ 2] = ((xh >>  5) ^ (Qs(18) <<  5) ^ data[ 2]) + (xl ^ Qs(26) ^ Qs(2));
+		dh[ 3] = ((xh >>  1) ^ (Qs(19) <<  5) ^ data[ 3]) + (xl ^ Qs(27) ^ Qs(3));
+		dh[ 4] = ((xh >>  3) ^ (Qs(20) <<  0) ^ data[ 4]) + (xl ^ Qs(28) ^ Qs(4));
+		dh[ 5] = ((xh <<  6) ^ (Qs(21) >>  6) ^ data[ 5]) + (xl ^ Qs(29) ^ Qs(5));
+		dh[ 6] = ((xh >>  4) ^ (Qs(22) <<  6) ^ data[ 6]) + (xl ^ Qs(30) ^ Qs(6));
+		dh[ 7] = ((xh >> 11) ^ (Qs(23) <<  2) ^ data[ 7]) + (xl ^ Qs(31) ^ Qs(7));
+
+		dh[ 8] = ROTL32(dh[4],  9) + (xh ^ Qs(24) ^ data[ 8]) + ((xl << 8) ^ Qs(23) ^ Qs( 8));
+		dh[ 9] = ROTL32(dh[5], 10) + (xh ^ Qs(25) ^ data[ 9]) + ((xl >> 6) ^ Qs(16) ^ Qs( 9));
+		dh[10] = ROTL32(dh[6], 11) + (xh ^ Qs(26) ^ data[10]) + ((xl << 6) ^ Qs(17) ^ Qs(10));
+		dh[11] = ROTL32(dh[7], 12) + (xh ^ Qs(27) ^ data[11]) + ((xl << 4) ^ Qs(18) ^ Qs(11));
+		dh[12] = ROTL32(dh[0], 13) + (xh ^ Qs(28) ^ data[12]) + ((xl >> 3) ^ Qs(19) ^ Qs(12));
+		dh[13] = ROTL32(dh[1], 14) + (xh ^ Qs(29) ^ data[13]) + ((xl >> 4) ^ Qs(20) ^ Qs(13));
+		dh[14] = ROTL32(dh[2], 15) + (xh ^ Qs(30) ^ data[14]) + ((xl >> 7) ^ Qs(21) ^ Qs(14));
+		dh[15] = ROTL32(dh[3], 16) + (xh ^ Qs(31) ^ data[15]) + ((xl >> 2) ^ Qs(22) ^ Qs(15));
+}
+
+#ifndef USE_MIDSTATE
+
+__device__
+static void gpu_bmw256_init(sph_bmw_small_context *sc)
+{
+	sc->bit_count = 0; // no message bits absorbed yet
+	sc->ptr = 0;       // buffer starts empty
+	memcpy(sc->H, IV256, sizeof sc->H); // chaining state starts from IV256
+}
+
+__device__
+static void gpu_bmw256(sph_bmw_small_context *sc, const void *data, size_t len)
+{
+	// Absorbs len bytes into sc: buffers the input and compresses each time sc->buf fills up.
+	const unsigned char *src = (const unsigned char *) data;
+	unsigned char *block = sc->buf;
+	size_t fill = sc->ptr;
+	sph_u32 scratch[16];
+	sph_u32 *cur = sc->H;    // holds the current chaining value
+	sph_u32 *next = scratch; // receives the next one
+
+	sc->bit_count += (sph_u64)len << 3;
+
+	for (; len > 0; ) {
+		const size_t room = (sizeof sc->buf) - fill;
+		const size_t take = (room > len) ? len : room;
+
+		memcpy(block + fill, src, take);
+		src += take;
+		len -= take;
+		fill += take;
+		if (fill != sizeof sc->buf)
+			continue;
+
+		// buffer full: compress, then swap roles so the fresh output becomes the next input
+		gpu_compress_small((sph_u32 *) block, cur, next);
+		sph_u32 *swap_tmp = cur;
+		cur = next;
+		next = swap_tmp;
+		fill = 0;
+	}
+
+	sc->ptr = fill;
+	if (cur != sc->H) // the latest state ended up in scratch: copy it back
+		memcpy(sc->H, cur, sizeof sc->H);
+}
+
+#endif
+
+#define sph_enc64le(ptr, x) \
+	*((uint64_t*)(ptr)) = x
+#define sph_enc64le_aligned sph_enc64le
+
+__device__
+static void gpu_bmw256_close(sph_bmw_small_context *sc, uint2 *out)
+{
+	unsigned char *buf = sc->buf;
+	size_t ptr = sc->ptr;
+	// Finalization: append the 0x80 pad byte, zero-fill, store the bit count in the last 8 bytes of buf.
+	buf[ptr ++] = 0x80;
+	sph_u32 *h = sc->H;
+
+	sph_u32 h1[16];
+	if (ptr > (sizeof sc->buf) - 8) { // no room left for the 8-byte count: flush one extra block first
+		memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+		gpu_compress_small((sph_u32 *) buf, h, h1);
+		ptr = 0;
+		h = h1;
+	}
+	memset(buf + ptr, 0, sizeof(sc->buf) - 8 - ptr);
+
+	sph_enc64le_aligned(buf + sizeof(sc->buf) - 8, SPH_T64(sc->bit_count));
+
+	sph_u32 h2[16];
+	gpu_compress_small((sph_u32 *) buf, h, h2);
+	gpu_compress_small(h2, final_s, h1); // extra pass: h2 is the data input, final_s the chaining input
+	// the result is h1[8..15], written to out as four 64-bit words
+	uint64_t* h64 = (uint64_t*) (&h1[8]);
+	#pragma unroll
+	for (int i = 0; i < 4; i++) {
+		out[i] = vectorize(h64[i]);
+	}
+}
+
+__global__ /* __launch_bounds__(256, 3) */
+void bmw256_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash, sph_bmw256_context *d_midstate, int swap)
+{
+	// One thread per nonce: thread t hashes nonce startNonce + t and stores four 64-bit words at g_hash[4 * t].
+	const uint32_t tid = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (tid >= threads) return;
+
+	const uint32_t nonce = swap ? cuda_swab32(startNonce + tid) : (startNonce + tid);
+	sph_bmw256_context ctx;
+
+#ifndef USE_MIDSTATE
+	uint2 hash[10];
+	#pragma unroll
+	for(int i=0;i<9;i++)
+		hash[i] = vectorize(c_PaddedMessage80[i]);
+	hash[9] = make_uint2(c_PaddedMessage80[9], nonce);
+
+	gpu_bmw256_init(&ctx);
+	gpu_bmw256(&ctx, (void*) hash, 80);
+#else
+	// resume from the host-computed midstate: 16 bytes pending in the buffer, 640 bits counted so far
+	ctx.ptr = 16;
+	ctx.bit_count = 640;
+	uint2 *tail = (uint2 *) ctx.buf;
+	tail[0] = vectorize(c_PaddedMessage80[8]);
+	tail[1] = make_uint2(c_PaddedMessage80[9], nonce);
+	#pragma unroll
+	for(int w=0;w<16;w++)
+		ctx.H[w] = d_midstate->H[w];
+#endif
+	gpu_bmw256_close(&ctx, (uint2*) &g_hash[tid << 2]);
+}
+
+__host__
+void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, int swap)
+{
+	// 256 threads per block, one thread per nonce; the grid is rounded up to cover `threads`
+	const uint32_t tpb = 256;
+	const dim3 block(tpb);
+	const dim3 grid((threads + tpb-1)/tpb);
+	uint64_t *d_hash64 = (uint64_t*)d_outputHash;
+	bmw256_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_hash64, d_midstate[thr_id], swap);
+}
+
+__host__
+void bmw256_setBlock_80(int thr_id, void *pdata)
+{
+	// 80-byte input zero-padded to 128 bytes, uploaded to constant memory for the kernel
+	uint64_t padded[16] = { 0 };
+	memcpy(padded, pdata, 80);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, padded, sizeof(padded), 0, cudaMemcpyHostToDevice));
+	// host-side BMW-256 context after absorbing those 80 bytes, copied to this thread's midstate buffer
+	sph_bmw256_context midstate;
+	sph_bmw256_init(&midstate);
+	sph_bmw256(&midstate, (void*) padded, 80);
+	CUDA_SAFE_CALL(cudaMemcpy(d_midstate[thr_id], &midstate, sizeof(sph_bmw256_context), cudaMemcpyHostToDevice));
+}
+
+__host__
+void bmw256_midstate_init(int thr_id, uint32_t threads)
+{
+	CUDA_SAFE_CALL(cudaMalloc(&d_midstate[thr_id], sizeof(sph_bmw256_context))); // one midstate context per thr_id; `threads` is unused; failure is no longer ignored
+}
diff --git a/Makefile.am b/Makefile.am
index 7b343c1..d086f91 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -37,6 +37,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \
 			  Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
 			  Algo256/blake256.cu Algo256/keccak256.cu \
+			  Algo256/bmw.cu Algo256/cuda_bmw.cu \
 			  JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
 			  JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
 			  quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
@@ -95,6 +96,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
 Algo256/blake256.o: Algo256/blake256.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 
+Algo256/cuda_bmw.o: Algo256/cuda_bmw.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
+
 heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
diff --git a/README.txt b/README.txt
index 27fffea..bc9fed1 100644
--- a/README.txt
+++ b/README.txt
@@ -34,6 +34,7 @@ DarkCoin and other X11 coins
 Chaincoin and Flaxscript (C11)
 Saffroncoin blake (256 14-rounds)
 BlakeCoin (256 8-rounds)
+Midnight (BMW 256)
 Qubit (Digibyte, ...)
 Luffa (Joincoin)
 Keccak (Maxcoin)
@@ -66,6 +67,7 @@ its command line interface and options.
   -a, --algo=ALGO       specify the algorithm to use
                           blake       use to mine Saffroncoin (Blake 256)
                           blakecoin   use to mine Old Blake 256
+                          bmw         use to mine Midnight
                           c11/flax    use to mine Chaincoin and Flax
                           deep        use to mine Deepcoin
                           dmd-gr      use to mine Diamond-Groestl
@@ -225,7 +227,8 @@ features.
   August 2015...
                   Add Lyra2REv2 algo (Vertcoin/Zoom)
                   Restore WhirlpoolX algo (VNL)
-                  Drop animecoin support
+                  Drop Animecoin support
+                  Add bmw (Midnight) algo
 
   July 06th 2015  v1.6.5-C11
                   Nvml api power limits
diff --git a/ccminer.cpp b/ccminer.cpp
index 429257c..12f704f 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -86,6 +86,7 @@ struct workio_cmd {
 enum sha_algos {
 	ALGO_BLAKE,
 	ALGO_BLAKECOIN,
+	ALGO_BMW,
 	ALGO_C11,
 	ALGO_DEEP,
 	ALGO_DMD_GR,
@@ -123,6 +124,7 @@ enum sha_algos {
 static const char *algo_names[] = {
 	"blake",
 	"blakecoin",
+	"bmw",
 	"c11",
 	"deep",
 	"dmd-gr",
@@ -280,6 +282,7 @@ Options:\n\
   -a, --algo=ALGO       specify the hash algorithm to use\n\
 			blake       Blake 256 (SFR)\n\
 			blakecoin   Fast Blake 256 (8 rounds)\n\
+			bmw         BMW 256\n\
 			c11/flax    X11 variant\n\
 			deep        Deepcoin\n\
 			dmd-gr      Diamond-Groestl\n\
@@ -848,6 +851,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 			be32enc(&ntime, work->data[17]);
 			be32enc(&nonce, work->data[19]);
 			break;
+		case ALGO_BLAKE:
+		case ALGO_BLAKECOIN:
+		case ALGO_BMW:
+			// fast algos require that...
+			check_dups = true;
 		default:
 			le32enc(&ntime, work->data[17]);
 			le32enc(&nonce, work->data[19]);
@@ -1758,6 +1766,7 @@ static void *miner_thread(void *userdata)
 			case ALGO_WHIRLPOOLX:
 				minmax = 0x80000000U;
 				break;
+			case ALGO_BMW:
 			case ALGO_KECCAK:
 				minmax = 0x40000000U;
 				break;
@@ -1844,6 +1853,20 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 
+		case ALGO_BLAKECOIN:
+			rc = scanhash_blake256(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done, 8);
+			break;
+
+		case ALGO_BLAKE:
+			rc = scanhash_blake256(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done, 14);
+			break;
+
+		case ALGO_BMW:
+			rc = scanhash_bmw(thr_id, work.data, work.target, max_nonce, &hashes_done);
+			break;
+
 		case ALGO_C11:
 			rc = scanhash_c11(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
@@ -1880,16 +1903,6 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 
-		case ALGO_BLAKECOIN:
-			rc = scanhash_blake256(thr_id, work.data, work.target,
-			                      max_nonce, &hashes_done, 8);
-			break;
-
-		case ALGO_BLAKE:
-			rc = scanhash_blake256(thr_id, work.data, work.target,
-			                      max_nonce, &hashes_done, 14);
-			break;
-
 		case ALGO_FRESH:
 			rc = scanhash_fresh(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 936472b..a163741 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -247,6 +247,10 @@
     <ClCompile Include="neoscrypt\neoscrypt.cpp" />
     <ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
     <ClInclude Include="neoscrypt\cuda_vectors.h" />
+    <CudaCompile Include="Algo256\bmw.cu" />
+    <CudaCompile Include="Algo256\cuda_bmw.cu">
+      <MaxRegCount>76</MaxRegCount>
+    </CudaCompile>
     <CudaCompile Include="neoscrypt\cuda_neoscrypt.cu" />
     <ClCompile Include="scrypt-jane.cpp" />
     <ClCompile Include="scrypt.cpp" />
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index bb4573d..d4b6802 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -652,6 +652,12 @@
     <CudaCompile Include="neoscrypt\cuda_neoscrypt.cu">
       <Filter>Source Files\neoscrypt</Filter>
     </CudaCompile>
+    <CudaCompile Include="Algo256\bmw.cu">
+      <Filter>Source Files\CUDA\Algo256</Filter>
+    </CudaCompile>
+    <CudaCompile Include="Algo256\cuda_bmw.cu">
+      <Filter>Source Files\CUDA\Algo256</Filter>
+    </CudaCompile>
   </ItemGroup>
   <ItemGroup>
     <Image Include="res\ccminer.ico">
@@ -668,4 +674,4 @@
       <Filter>Ressources</Filter>
     </Text>
   </ItemGroup>
-</Project>
+</Project>
diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu
index c0a3c16..52ba432 100644
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@@ -89,6 +89,21 @@ void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, u
 	}
 }
 
+__global__ __launch_bounds__(512, 4)
+void cuda_checkhash_32(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
+{
+	// One thread per candidate: thread t tests the 8-word hash at hash[8 * t] against pTarget.
+	const uint32_t tid = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (tid >= threads)
+		return;
+
+	// skip the comparison once resNonces[0] no longer holds the UINT32_MAX "empty" marker
+	if (resNonces[0] != UINT32_MAX)
+		return;
+
+	if (hashbelowtarget(&hash[tid << 3], pTarget)) resNonces[0] = (startNounce + tid);
+}
+
 __host__
 uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash)
 {
@@ -111,6 +126,28 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin
 	return h_resNonces[thr_id][0];
 }
 
+__host__
+uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash)
+{
+	// Returns a nonce whose 32-byte hash passes hashbelowtarget, or UINT32_MAX if none (or if not initialized).
+	if (!init_done) {
+		// bail out before touching d_resNonces - presumably it is only set up by cuda_check_cpu_init
+		applog(LOG_ERR, "missing call to cuda_check_cpu_init");
+		return UINT32_MAX;
+	}
+
+	const uint32_t threadsperblock = 512;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	// 0xff bytes == UINT32_MAX, the "nothing found" marker tested by the kernel
+	cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
+	cuda_checkhash_32 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);
+	cudaDeviceSynchronize(); // replaces the deprecated cudaThreadSynchronize()
+	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	return h_resNonces[thr_id][0];
+}
+
 /* --------------------------------------------------------------------------------------------- */
 
 __global__ __launch_bounds__(512, 4)
diff --git a/miner.h b/miner.h
index 4c5822c..7f1ea7b 100644
--- a/miner.h
+++ b/miner.h
@@ -303,6 +303,10 @@ extern int scanhash_blake256(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done, int8_t blakerounds);
 
+extern int scanhash_bmw(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 extern int scanhash_c11(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@@ -769,6 +773,7 @@ void applog_compare_hash(unsigned char *hash, unsigned char *hash2);
 
 void print_hash_tests(void);
 void blake256hash(void *output, const void *input, int8_t rounds);
+void bmw_hash(void *state, const void *input);
 void c11hash(void *output, const void *input);
 void deephash(void *state, const void *input);
 void luffa_hash(void *state, const void *input);
diff --git a/util.cpp b/util.cpp
index a4ddbf7..27e667f 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1812,6 +1812,9 @@ void print_hash_tests(void)
 	blake256hash(&hash[0], &buf[0], 14);
 	printpfx("blake", hash);
 
+	bmw_hash(&hash[0], &buf[0]);
+	printpfx("bmw", hash);
+
 	c11hash(&hash[0], &buf[0]);
 	printpfx("c11", hash);