From 933a020205229f5a144e8cc785c7ae43b1a16d71 Mon Sep 17 00:00:00 2001 From: samr7 Date: Sun, 7 Aug 2011 16:13:23 -0700 Subject: [PATCH] Add explicit preprocessor unrolling. Limit use of #pragma unroll to NVIDIA platforms. --- calc_addrs.cl | 1127 +++++++++++++++++++++--------------------------- oclvanitygen.c | 44 +- 2 files changed, 513 insertions(+), 658 deletions(-) diff --git a/calc_addrs.cl b/calc_addrs.cl index 187a6c4..76ab97b 100644 --- a/calc_addrs.cl +++ b/calc_addrs.cl @@ -20,26 +20,64 @@ * This file contains an OpenCL kernel for performing certain parts of * the bitcoin address calculation process. * - * Kernel: calc_addrs + * Kernel: ec_add_grid * * Inputs: - * - Row of (sequential) EC points - * - Array of column increment EC points (= rowsize * Pgenerator) + * - Row: Array of (sequential) EC points + * - Column: Array of column increment EC points (= rowsize * Pgenerator) * * Steps: - * - For each row increment value C: - * - For each row point P: - * - Compute P + C - * - Normalize and hash with SHA256 and RIPEMD160 - * - Store hash value in output array + * - Compute P = Row[x] + Column[y] + * P is computed as numerator/denominator components Pxj, Pyj, Pz + * Final values are: Px = Pxj / (Pz^2), Py = Pyj / (Pz^3) + * + * The modular inverse of Pz is required to compute Px and Py, and + * can be computed more efficiently in large batches. This is done in + * the next kernel heap_invert. + * + * - Store Pxj, Pyj to intermediate point buffer + * - Store Pz to z_heap + * + * Outputs: + * - Intermediate point buffer + * - Denominator buffer (z_heap) + * + * ------------------------------- + * Kernel: heap_invert + * + * Inputs: + * - Denominator buffer (z_heap) + * - N = Batch size (power of 2) + * + * Steps: + * - Compute the product tree for N values in the denominator buffer + * - Compute the modular inverse of the root of the product tree + * - Multiply down the tree to compute the modular inverse of each leaf + * + * Outputs: + * - Modular inverse denominator buffer (z_heap) + * + * ------------------------------- + * Kernel: hash_ec_point_get + * + * Inputs: + * - Intermediate point buffer + * - Modular inverse denominator buffer (z_heap) + * + * Steps: + * - Compute Px = Pxj * (1/Pz)^2 + * - Compute Py = Pyj * (1/Pz)^3 + * - Compute H = RIPEMD160(SHA256(0x04 | Px | Py)) * * Output: * - Array of 20-byte address hash values * - * Each instance of the kernel computes one full row. With a typical - * row size of 256 points, this makes each kernel instance very heavy. - * This tradeoff is chosen in favor of batched modular inversion, which - * substantially reduces the cost of performing modular inversion. + * ------------------------------- + * Kernel: hash_ec_point_search_prefix + * + * Like hash_ec_point_get, but instead of storing the complete hash + * value to an output buffer, it searches a sorted list of ranges, + * and if a match is found, writes a flag to an output buffer. 
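+ *
+ * -------------------------------
+ * Background note: batched modular inversion (illustrative only)
+ *
+ * heap_invert applies Montgomery's trick: one modular inversion plus
+ * O(N) multiplications replaces N inversions.  The kernel arranges
+ * the products as a binary tree so work items can share the load;
+ * the sequential prefix-product form of the same idea, written with
+ * a toy one-word field and assumed mul_mod()/inv_mod() helpers,
+ * looks like this:
+ *
+ *   void batch_invert(uint z[], int n)
+ *   {
+ *       uint prod[256];                  /* assumes n <= 256 */
+ *       uint inv, t;
+ *       int i;
+ *
+ *       prod[0] = z[0];                  /* prefix products */
+ *       for (i = 1; i < n; i++)
+ *           prod[i] = mul_mod(prod[i-1], z[i]);
+ *       inv = inv_mod(prod[n-1]);        /* the one inversion */
+ *       for (i = n - 1; i > 0; i--) {
+ *           t = mul_mod(inv, prod[i-1]); /* t = 1/z[i] */
+ *           inv = mul_mod(inv, z[i]);    /* strip z[i] from inv */
+ *           z[i] = t;
+ *       }
+ *       z[0] = inv;
+ *   }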
*/ @@ -56,6 +94,41 @@ #define load_be32(v) bswap32(v) #endif +/* Explicit unrolling */ +#define unroll_5(a) do { a(0) a(1) a(2) a(3) a(4) } while (0) +#define unroll_8(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0) +#define unroll_8_sf(a) do { a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0) +#define unroll_8_sl(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) } while (0) +#define unroll_8_reverse(a) \ + do { a(7) a(6) a(5) a(4) a(3) a(2) a(1) a(0) } while (0) +#define unroll_8_reverse_sl(a) \ + do { a(7) a(6) a(5) a(4) a(3) a(2) a(1) } while (0) +#define unroll_16(a) do { \ + a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \ + a(8) a(9) a(10) a(11) a(12) a(13) a(14) a(15) \ + } while (0) +#define unroll_64(a) do { \ + a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) \ + a(8) a(9) a(10) a(11) a(12) a(13) a(14) a(15) \ + a(16) a(17) a(18) a(19) a(20) a(21) a(22) a(23) \ + a(24) a(25) a(26) a(27) a(28) a(29) a(30) a(31) \ + a(32) a(33) a(34) a(35) a(36) a(37) a(38) a(39) \ + a(40) a(41) a(42) a(43) a(44) a(45) a(46) a(47) \ + a(48) a(49) a(50) a(51) a(52) a(53) a(54) a(55) \ + a(56) a(57) a(58) a(59) a(60) a(61) a(62) a(63) \ + } while (0) + +#if defined(DEEP_PREPROC_UNROLL) +#define iter_5(a) unroll_5(a) +#define iter_8(a) unroll_8(a) +#define iter_16(a) unroll_16(a) +#define iter_64(a) unroll_64(a) +#else +#define iter_5(a) do {int _i; for (_i = 0; _i < 5; _i++) { a(_i) }} while (0) +#define iter_8(a) do {int _i; for (_i = 0; _i < 8; _i++) { a(_i) }} while (0) +#define iter_16(a) do {int _i; for (_i = 0; _i < 16; _i++) { a(_i) }} while (0) +#define iter_64(a) do {int _i; for (_i = 0; _i < 64; _i++) { a(_i) }} while (0) +#endif /* * BIGNUM mini-library @@ -105,6 +178,21 @@ __constant bn_word mont_n0[2] = { 0xd2253531, 0xd838091d }; #define bn_is_bit_set(bn, n) \ ((((bn_word*)&bn)[n >> BN_WSHIFT]) & (1 << (n & (BN_WBITS-1)))) +#define bn_unroll(e) unroll_8(e) +#define bn_unroll_sf(e) unroll_8_sf(e) +#define bn_unroll_sl(e) unroll_8_sl(e) +#define bn_unroll_reverse(e) unroll_8_reverse(e) +#define bn_unroll_reverse_sl(e) unroll_8_reverse_sl(e) + +#define bn_unroll_arg(e, arg) \ + e(arg, 0) e(arg, 1) e(arg, 2) e(arg, 3) \ + e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7) +#define bn_unroll_arg_sf(e, arg) \ + e(arg, 1) e(arg, 2) e(arg, 3) \ + e(arg, 4) e(arg, 5) e(arg, 6) e(arg, 7) + +#define bn_iter(e) iter_8(e) + /* * Bitwise shift @@ -113,60 +201,53 @@ __constant bn_word mont_n0[2] = { 0xd2253531, 0xd838091d }; void bn_lshift1(bignum *bn) { - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = (BN_NWORDS - 1); i > 0; i--) - bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31); - bn->d[i] <<= 1; +#define bn_lshift1_inner1(i) \ + bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31); + + bn_unroll_reverse_sl(bn_lshift1_inner1); + bn->d[0] <<= 1; } void bn_rshift(bignum *bn, int shift) { - int i, wd, iws, iwr; + int wd, iws, iwr; bn_word ihw, ilw; iws = (shift & (BN_WBITS-1)); iwr = BN_WBITS - iws; wd = (shift >> BN_WSHIFT); ihw = (wd < BN_WBITS) ? bn->d[wd] : 0; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0, wd++; i < (BN_NWORDS-1); i++, wd++) { - ilw = ihw; - ihw = (wd < BN_WBITS) ? bn->d[wd] : 0; - bn->d[i] = (ilw >> iws) | (ihw << iwr); - } - bn->d[i] = (ihw >> iws); + +#define bn_rshift_inner1(i) \ + wd++; \ + ilw = ihw; \ + ihw = (wd < BN_WBITS) ? 
bn->d[wd] : 0; \ + bn->d[i] = (ilw >> iws) | (ihw << iwr); + + bn_unroll_sl(bn_rshift_inner1); + bn->d[BN_NWORDS-1] = (ihw >> iws); } void bn_rshift1(bignum *bn) { - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < (BN_NWORDS - 1); i++) - bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1); - bn->d[i] >>= 1; +#define bn_rshift1_inner1(i) \ + bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1); + + bn_unroll_sl(bn_rshift1_inner1); + bn->d[BN_NWORDS-1] >>= 1; } void bn_rshift1_2(bignum *bna, bignum *bnb) { - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < (BN_NWORDS - 1); i++) { - bna->d[i] = (bna->d[i+1] << 31) | (bna->d[i] >> 1); +#define bn_rshift1_2_inner1(i) \ + bna->d[i] = (bna->d[i+1] << 31) | (bna->d[i] >> 1); \ bnb->d[i] = (bnb->d[i+1] << 31) | (bnb->d[i] >> 1); - } - bna->d[i] >>= 1; - bnb->d[i] >>= 1; + + bn_unroll_sl(bn_rshift1_2_inner1); + bna->d[BN_NWORDS-1] >>= 1; + bnb->d[BN_NWORDS-1] >>= 1; } @@ -177,28 +258,26 @@ bn_rshift1_2(bignum *bna, bignum *bnb) int bn_ucmp_ge(bignum *a, bignum *b) { - int i, l = 0, g = 0; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = (BN_NWORDS - 1); i >= 0; i--) { - if (a->d[i] < b->d[i]) l |= (1 << i); - if (a->d[i] > b->d[i]) g |= (1 << i); - } + int l = 0, g = 0; + +#define bn_ucmp_ge_inner1(i) \ + if (a->d[i] < b->d[i]) l |= (1 << i); \ + if (a->d[i] > b->d[i]) g |= (1 << i); + + bn_unroll_reverse(bn_ucmp_ge_inner1); return (l > g) ? 0 : 1; } int bn_ucmp_ge_c(bignum *a, __constant bn_word *b) { - int i, l = 0, g = 0; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = (BN_NWORDS - 1); i >= 0; i--) { - if (a->d[i] < b[i]) l |= (1 << i); - if (a->d[i] > b[i]) g |= (1 << i); - } + int l = 0, g = 0; + +#define bn_ucmp_ge_c_inner1(i) \ + if (a->d[i] < b[i]) l |= (1 << i); \ + if (a->d[i] > b[i]) g |= (1 << i); + + bn_unroll_reverse(bn_ucmp_ge_c_inner1); return (l > g) ? 0 : 1; } @@ -209,12 +288,12 @@ bn_ucmp_ge_c(bignum *a, __constant bn_word *b) void bn_neg(bignum *n) { - int i, c; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0, c = 1; i < BN_NWORDS; i++) - c = (n->d[i] = (~n->d[i]) + c) ? 0 : c; + int c = 1; + +#define bn_neg_inner1(i) \ + c = (n->d[i] = (~n->d[i]) + c) ? 
0 : c; + + bn_unroll(bn_neg_inner1); } /* @@ -234,30 +313,25 @@ bn_neg(bignum *n) } while (0) bn_word -bn_uadd_words_seq(bn_word *r, bn_word *a, bn_word *b, int count) +bn_uadd_words_seq(bn_word *r, bn_word *a, bn_word *b) { bn_word t, c = 0; - int i; + +#define bn_uadd_words_seq_inner1(i) \ + bn_addc_word(r[i], a[i], b[i], t, c); + bn_add_word(r[0], a[0], b[0], t, c); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < count; i++) - bn_addc_word(r[i], a[i], b[i], t, c); + bn_unroll_sf(bn_uadd_words_seq_inner1); return c; } bn_word -bn_uadd_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b, int count) +bn_uadd_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b) { bn_word t, c = 0; - int i; + bn_add_word(r[0], a[0], b[0], t, c); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < count; i++) - bn_addc_word(r[i], a[i], b[i], t, c); + bn_unroll_sf(bn_uadd_words_seq_inner1); return c; } @@ -275,30 +349,25 @@ bn_uadd_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b, int count) } while (0) bn_word -bn_usub_words_seq(bn_word *r, bn_word *a, bn_word *b, int count) +bn_usub_words_seq(bn_word *r, bn_word *a, bn_word *b) { bn_word t, c = 0; - int i; + +#define bn_usub_words_seq_inner1(i) \ + bn_subb_word(r[i], a[i], b[i], t, c); + bn_sub_word(r[0], a[0], b[0], t, c); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < count; i++) - bn_subb_word(r[i], a[i], b[i], t, c); + bn_unroll_sf(bn_usub_words_seq_inner1); return c; } bn_word -bn_usub_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b, int count) +bn_usub_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b) { bn_word t, c = 0; - int i; + bn_sub_word(r[0], a[0], b[0], t, c); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < count; i++) - bn_subb_word(r[i], a[i], b[i], t, c); + bn_unroll_sf(bn_usub_words_seq_inner1); return c; } @@ -306,115 +375,79 @@ bn_usub_words_c_seq(bn_word *r, bn_word *a, __constant bn_word *b, int count) * Add/subtract better suited for AMD's VLIW architecture */ bn_word -bn_uadd_words_vliw(bn_word *r, bn_word *a, bn_word *b, int count) +bn_uadd_words_vliw(bn_word *r, bn_word *a, bn_word *b) { bignum x; bn_word c = 0, cp = 0; - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < count; i++) - x.d[i] = a[i] + b[i]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < count; i++) { - c |= (a[i] > x.d[i]) ? (1 << i) : 0; - cp |= (!~x.d[i]) ? (1 << i) : 0; - } + +#define bn_uadd_words_vliw_inner1(i) \ + x.d[i] = a[i] + b[i]; + +#define bn_uadd_words_vliw_inner2(i) \ + c |= (a[i] > x.d[i]) ? (1 << i) : 0; \ + cp |= (!~x.d[i]) ? (1 << i) : 0; + +#define bn_uadd_words_vliw_inner3(i) \ + r[i] = x.d[i] + ((c >> i) & 1); + + bn_unroll(bn_uadd_words_vliw_inner1); + bn_unroll(bn_uadd_words_vliw_inner2); c = ((cp + (c << 1)) ^ cp); r[0] = x.d[0]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < count; i++) - r[i] = x.d[i] + ((c >> i) & 1); - return c >> count; + bn_unroll_sf(bn_uadd_words_vliw_inner3); + return c >> BN_NWORDS; } bn_word -bn_uadd_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b, int count) +bn_uadd_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b) { bignum x; bn_word c = 0, cp = 0; - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < count; i++) - x.d[i] = a[i] + b[i]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < count; i++) { - c |= (b[i] > x.d[i]) ? 
(1 << i) : 0; - cp |= (!~x.d[i]) ? (1 << i) : 0; - } + + bn_unroll(bn_uadd_words_vliw_inner1); + bn_unroll(bn_uadd_words_vliw_inner2); c = ((cp + (c << 1)) ^ cp); r[0] = x.d[0]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < count; i++) - r[i] = x.d[i] + ((c >> i) & 1); - return c >> count; + bn_unroll_sf(bn_uadd_words_vliw_inner3); + return c >> BN_NWORDS; } bn_word -bn_usub_words_vliw(bn_word *r, bn_word *a, bn_word *b, int count) +bn_usub_words_vliw(bn_word *r, bn_word *a, bn_word *b) { bignum x; bn_word c = 0, cp = 0; - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < count; i++) - x.d[i] = a[i] - b[i]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < count; i++) { - c |= (a[i] < b[i]) ? (1 << i) : 0; - cp |= (!x.d[i]) ? (1 << i) : 0; - } + +#define bn_usub_words_vliw_inner1(i) \ + x.d[i] = a[i] - b[i]; + +#define bn_usub_words_vliw_inner2(i) \ + c |= (a[i] < b[i]) ? (1 << i) : 0; \ + cp |= (!x.d[i]) ? (1 << i) : 0; + +#define bn_usub_words_vliw_inner3(i) \ + r[i] = x.d[i] - ((c >> i) & 1); + + bn_unroll(bn_usub_words_vliw_inner1); + bn_unroll(bn_usub_words_vliw_inner2); c = ((cp + (c << 1)) ^ cp); r[0] = x.d[0]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < count; i++) - r[i] = x.d[i] - ((c >> i) & 1); - return c >> count; + bn_unroll_sf(bn_usub_words_vliw_inner3); + return c >> BN_NWORDS; } bn_word -bn_usub_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b, int count) +bn_usub_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b) { bignum x; bn_word c = 0, cp = 0; - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < count; i++) - x.d[i] = a[i] - b[i]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < count; i++) { - c |= (a[i] < b[i]) ? (1 << i) : 0; - cp |= (!x.d[i]) ? (1 << i) : 0; - } + + bn_unroll(bn_usub_words_vliw_inner1); + bn_unroll(bn_usub_words_vliw_inner2); c = ((cp + (c << 1)) ^ cp); r[0] = x.d[0]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < count; i++) - r[i] = x.d[i] - ((c >> i) & 1); - return c >> count; + bn_unroll_sf(bn_usub_words_vliw_inner3); + return c >> BN_NWORDS; } @@ -430,10 +463,10 @@ bn_usub_words_c_vliw(bn_word *r, bn_word *a, __constant bn_word *b, int count) #define bn_usub_words_c bn_usub_words_c_seq #endif -#define bn_uadd(r, a, b) bn_uadd_words((r)->d, (a)->d, (b)->d, BN_NWORDS) -#define bn_uadd_c(r, a, b) bn_uadd_words_c((r)->d, (a)->d, b, BN_NWORDS) -#define bn_usub(r, a, b) bn_usub_words((r)->d, (a)->d, (b)->d, BN_NWORDS) -#define bn_usub_c(r, a, b) bn_usub_words_c((r)->d, (a)->d, b, BN_NWORDS) +#define bn_uadd(r, a, b) bn_uadd_words((r)->d, (a)->d, (b)->d) +#define bn_uadd_c(r, a, b) bn_uadd_words_c((r)->d, (a)->d, b) +#define bn_usub(r, a, b) bn_usub_words((r)->d, (a)->d, (b)->d) +#define bn_usub_c(r, a, b) bn_usub_words_c((r)->d, (a)->d, b) /* * Modular add/sub @@ -483,68 +516,60 @@ bn_mod_lshift1(bignum *bn) c = (s < c) ? 
p + 1 : p; \ if (r < s) c++; \ } while (0) - void bn_mul_mont(bignum *r, bignum *a, bignum *b) { bignum t; bn_word tea, teb, c, p, s, m; - int i, j; - c = 0; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX +#if !defined(VERY_EXPENSIVE_BRANCHES) + int q; #endif - for (j = 0; j < BN_NWORDS; j++) + + c = 0; +#define bn_mul_mont_inner1(j) \ bn_mul_word(t.d[j], a->d[j], b->d[0], c, p, s); + bn_unroll(bn_mul_mont_inner1); tea = c; teb = 0; c = 0; m = t.d[0] * mont_n0[0]; bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 1; j < BN_NWORDS; j++) { - bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); +#define bn_mul_mont_inner2(j) \ + bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \ t.d[j-1] = t.d[j]; - } + bn_unroll_sf(bn_mul_mont_inner2); t.d[BN_NWORDS-1] = tea + c; tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0); -#if defined(UNROLL_MAX) && defined(VERY_EXPENSIVE_BRANCHES) -#pragma unroll UNROLL_MAX -#endif - for (i = 1; i < BN_NWORDS; i++) { - c = 0; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s); - tea += c; - teb = ((tea < c) ? 1 : 0); - - c = 0; - m = t.d[0] * mont_n0[0]; - bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 1; j < BN_NWORDS; j++) { - bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); - t.d[j-1] = t.d[j]; - } - t.d[BN_NWORDS-1] = tea + c; - tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0); - } +#define bn_mul_mont_inner3_1(i, j) \ + bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s); +#define bn_mul_mont_inner3_2(i, j) \ + bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); \ + t.d[j-1] = t.d[j]; +#define bn_mul_mont_inner3(i) \ + c = 0; \ + bn_unroll_arg(bn_mul_mont_inner3_1, i); \ + tea += c; \ + teb = ((tea < c) ? 1 : 0); \ + c = 0; \ + m = t.d[0] * mont_n0[0]; \ + bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); \ + bn_unroll_arg_sf(bn_mul_mont_inner3_2, i); \ + t.d[BN_NWORDS-1] = tea + c; \ + tea = teb + ((t.d[BN_NWORDS-1] < c) ? 
1 : 0); #if defined(VERY_EXPENSIVE_BRANCHES) + bn_unroll_sf(bn_mul_mont_inner3); c = tea | !bn_usub_c(r, &t, modulus); if (!c) *r = t; + #else + for (q = 1; q < BN_NWORDS; q++) { + bn_mul_mont_inner3(q); + } c = tea || (t.d[BN_NWORDS-1] >= modulus[BN_NWORDS-1]); if (c) { c = tea | !bn_usub_c(r, &t, modulus); @@ -561,46 +586,54 @@ bn_from_mont(bignum *rb, bignum *b) #define WORKSIZE ((2*BN_NWORDS) + 1) bn_word r[WORKSIZE]; bn_word m, c, p, s; - int i, j; - /* Copy the input to the working area */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX +#if defined(PRAGMA_UNROLL) + int i; #endif - for (i = 0; i < BN_NWORDS; i++) - r[i] = b->d[i]; + + /* Copy the input to the working area */ /* Zero the upper words */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = BN_NWORDS; i < WORKSIZE; i++) - r[i] = 0; +#define bn_from_mont_inner1(i) \ + r[i] = b->d[i]; +#define bn_from_mont_inner2(i) \ + r[BN_NWORDS+i] = 0; + + bn_unroll(bn_from_mont_inner1); + bn_unroll(bn_from_mont_inner2); + r[WORKSIZE-1] = 0; + /* Multiply (long) by modulus */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) { - m = r[i] * mont_n0[0]; - c = 0; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - bn_mul_add_word(r[i+j], modulus[j], m, c, p, s); - r[BN_NWORDS + i] += c; +#define bn_from_mont_inner3_1(i, j) \ + bn_mul_add_word(r[i+j], modulus[j], m, c, p, s); + #if !defined(VERY_EXPENSIVE_BRANCHES) - if (r[BN_NWORDS + i] < c) - r[BN_NWORDS + i + 1] += 1; +#define bn_from_mont_inner3_2(i) \ + if (r[BN_NWORDS + i] < c) \ + r[BN_NWORDS + i + 1] += 1; #else - r[BN_NWORDS + i + 1] += (r[BN_NWORDS + i] < c) ? 1 : 0; +#define bn_from_mont_inner3_2(i) \ + r[BN_NWORDS + i + 1] += (r[BN_NWORDS + i] < c) ? 1 : 0; #endif - } - c = bn_usub_words_c(rb->d, &r[BN_NWORDS], modulus, BN_NWORDS); - if (c) { -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX + +#define bn_from_mont_inner3(i) \ + m = r[i] * mont_n0[0]; \ + c = 0; \ + bn_unroll_arg(bn_from_mont_inner3_1, i); \ + r[BN_NWORDS + i] += c; \ + bn_from_mont_inner3_2(i) + +#if !defined(PRAGMA_UNROLL) + bn_unroll(bn_from_mont_inner3); +#else +#pragma unroll 8 + for (i = 0; i < BN_NWORDS; i++) { bn_from_mont_inner3(i) } #endif - for (j = 0; j < BN_NWORDS; j++) - rb->d[j] = r[BN_NWORDS + j]; + +#define bn_from_mont_inner4(i) \ + rb->d[i] = r[BN_NWORDS + i]; + + c = bn_usub_words_c(rb->d, &r[BN_NWORDS], modulus); + if (c) { + bn_unroll(bn_from_mont_inner4); } } @@ -672,6 +705,12 @@ bn_mod_inverse(bignum *r, bignum *n) * NOTE #2: Endianness of the OpenCL device makes no difference here. 
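+ *
+ * NOTE #3 (illustrative): the hash256_unroll()/hash160_unroll()
+ * macros defined below always paste their argument macro once per
+ * index; the hash256_iter()/hash160_iter() variants do the same
+ * only when DEEP_PREPROC_UNROLL is defined, and otherwise emit a
+ * plain loop.  For example (init_word() is a hypothetical helper):
+ *
+ *   #define init_word(i) out[i] = sha2_init[i];
+ *   hash256_unroll(init_word);
+ *
+ * expands, inside a do { } while (0) wrapper, to
+ *
+ *   out[0] = sha2_init[0]; ... out[7] = sha2_init[7];
+ *
+ * with no loop counter or branch left for the GPU compiler.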
*/ +#define hash256_unroll(a) unroll_8(a) +#define hash160_unroll(a) unroll_5(a) +#define hash256_iter(a) iter_8(a) +#define hash160_iter(a) iter_5(a) + + /* * SHA-2 256 * @@ -708,12 +747,10 @@ __constant uint sha2_k[64] = { void sha2_256_init(uint *out) { - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 8; i++) - out[i] = sha2_init[i]; +#define sha2_256_init_inner_1(i) \ + out[i] = sha2_init[i]; + + hash256_unroll(sha2_256_init_inner_1); } /* The state variable remapping is really contorted */ @@ -732,46 +769,48 @@ sha2_256_init(uint *out) void sha2_256_block(uint *out, uint *in) { - int i; uint state[8], t1, t2; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX +#if defined(PRAGMA_UNROLL) + int i; #endif - for (i = 0; i < 8; i++) - state[i] = out[i]; -#ifdef UNROLL_MAX + +#define sha2_256_block_inner_1(i) \ + state[i] = out[i]; + hash256_unroll(sha2_256_block_inner_1); + +#define sha2_256_block_inner_2(i) \ + if (i >= 16) { \ + t1 = in[(i + 1) % 16]; \ + t2 = in[(i + 14) % 16]; \ + in[i % 16] += (in[(i + 9) % 16] + \ + (rotate(t1, 25U) ^ rotate(t1, 14U) ^ (t1 >> 3)) + \ + (rotate(t2, 15U) ^ rotate(t2, 13U) ^ (t2 >> 10))); \ + } \ + t1 = (sha2_stvar(state, i, 7) + \ + sha2_s1(sha2_stvar(state, i, 4)) + \ + sha2_ch(sha2_stvar(state, i, 4), \ + sha2_stvar(state, i, 5), \ + sha2_stvar(state, i, 6)) + \ + sha2_k[i] + \ + in[i % 16]); \ + t2 = (sha2_s0(sha2_stvar(state, i, 0)) + \ + sha2_ma(sha2_stvar(state, i, 0), \ + sha2_stvar(state, i, 1), \ + sha2_stvar(state, i, 2))); \ + sha2_stvar(state, i, 3) += t1; \ + sha2_stvar(state, i, 7) = t1 + t2; \ + +#if !defined(PRAGMA_UNROLL) + unroll_64(sha2_256_block_inner_2); +#else #pragma unroll 64 + for (i = 0; i < 64; i++) { sha2_256_block_inner_2(i) } #endif - for (i = 0; i < 64; i++) { - if (i >= 16) { - /* Advance the input window */ - t1 = in[(i + 1) % 16]; - t2 = in[(i + 14) % 16]; - in[i % 16] += (in[(i + 9) % 16] + - (rotate(t1, 25U) ^ rotate(t1, 14U) ^ (t1 >> 3)) + - (rotate(t2, 15U) ^ rotate(t2, 13U) ^ (t2 >> 10))); - } - /* Compute the t1, t2 augmentations */ - t1 = (sha2_stvar(state, i, 7) + - sha2_s1(sha2_stvar(state, i, 4)) + - sha2_ch(sha2_stvar(state, i, 4), - sha2_stvar(state, i, 5), - sha2_stvar(state, i, 6)) + - sha2_k[i] + - in[i % 16]); - t2 = (sha2_s0(sha2_stvar(state, i, 0)) + - sha2_ma(sha2_stvar(state, i, 0), - sha2_stvar(state, i, 1), - sha2_stvar(state, i, 2))); - sha2_stvar(state, i, 3) += t1; - sha2_stvar(state, i, 7) = t1 + t2; - } -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 8; i++) - out[i] += state[i]; +#define sha2_256_block_inner_3(i) \ + out[i] += state[i]; + + hash256_unroll(sha2_256_block_inner_3); } @@ -859,54 +898,60 @@ __constant uchar ripemd160_rlp[] = { void ripemd160_init(uint *out) { - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for(i = 0; i < 5; i++) - out[i] = ripemd160_iv[i]; +#define ripemd160_init_inner_1(i) \ + out[i] = ripemd160_iv[i]; + + hash160_unroll(ripemd160_init_inner_1); } void ripemd160_block(uint *out, uint *in) { uint vals[10], t; +#if defined(PRAGMA_UNROLL) int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 5; i++) - vals[i] = vals[i + 5] = out[i]; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 16; i++) - ripemd160_round(i, in, vals, - ripemd160_f0, ripemd160_f4, t); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX #endif - for (i = 16; i < 32; i++) - ripemd160_round(i, in, vals, - ripemd160_f1, ripemd160_f3, t); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif 
- for (i = 32; i < 48; i++) - ripemd160_round(i, in, vals, - ripemd160_f2, ripemd160_f2, t); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 48; i < 64; i++) - ripemd160_round(i, in, vals, - ripemd160_f3, ripemd160_f1, t); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX + +#define ripemd160_block_inner_1(i) \ + vals[i] = vals[i + 5] = out[i]; + + hash160_unroll(ripemd160_block_inner_1); + +#define ripemd160_block_inner_p0(i) \ + ripemd160_round(i, in, vals, \ + ripemd160_f0, ripemd160_f4, t); +#define ripemd160_block_inner_p1(i) \ + ripemd160_round((16 + i), in, vals, \ + ripemd160_f1, ripemd160_f3, t); +#define ripemd160_block_inner_p2(i) \ + ripemd160_round((32 + i), in, vals, \ + ripemd160_f2, ripemd160_f2, t); +#define ripemd160_block_inner_p3(i) \ + ripemd160_round((48 + i), in, vals, \ + ripemd160_f3, ripemd160_f1, t); +#define ripemd160_block_inner_p4(i) \ + ripemd160_round((64 + i), in, vals, \ + ripemd160_f4, ripemd160_f0, t); + +#if !defined(PRAGMA_UNROLL) + unroll_16(ripemd160_block_inner_p0); + unroll_16(ripemd160_block_inner_p1); + unroll_16(ripemd160_block_inner_p2); + unroll_16(ripemd160_block_inner_p3); + unroll_16(ripemd160_block_inner_p4); +#else +#pragma unroll 16 + for (i = 0; i < 16; i++) { ripemd160_block_inner_p0(i); } +#pragma unroll 16 + for (i = 0; i < 16; i++) { ripemd160_block_inner_p1(i); } +#pragma unroll 16 + for (i = 0; i < 16; i++) { ripemd160_block_inner_p2(i); } +#pragma unroll 16 + for (i = 0; i < 16; i++) { ripemd160_block_inner_p3(i); } +#pragma unroll 16 + for (i = 0; i < 16; i++) { ripemd160_block_inner_p4(i); } #endif - for (i = 64; i < 80; i++) - ripemd160_round(i, in, vals, - ripemd160_f4, ripemd160_f0, t); + t = out[1] + vals[2] + vals[8]; out[1] = out[2] + vals[3] + vals[9]; out[2] = out[3] + vals[4] + vals[5]; @@ -953,176 +998,6 @@ test_mod_inverse(__global bignum *inv_out, __global bignum *nums_in, #endif /* TEST_KERNELS */ -#if 0 -__kernel void -calc_addrs(__global uint *hashes_out, - __global bignum *z_heap, __global bignum *point_tmp, - __global bignum *row_in, __global bignum *col_in, int ncols) -{ - uint hash1[16]; - uint hash2[16]; - uint wl, wh; - bignum rx, ry; - bignum x1, y1, a, b, c, d, e, z; - bn_word cy; - int i, o; - - /* Load the row increment point */ - o = get_global_id(0); - rx = col_in[2*o]; - ry = col_in[(2*o) + 1]; - hashes_out += (o * 5 * ncols); - z_heap += (o * 2 * ncols); - point_tmp += (o * 2 * ncols); - - /* - * Perform the EC point add. - * Add the row increment to all row points. - * Save the X,Y in the point temporary space. - * Save the Z in the z_heap for modular inversion. - */ - for (i = 0; i < ncols; i++) { - x1 = row_in[(2*i)]; - y1 = row_in[(2*i) + 1]; - - bn_mod_sub(&z, &x1, &rx); - z_heap[(ncols - 1) + i] = z; - - bn_mod_sub(&b, &y1, &ry); - bn_mod_add(&c, &x1, &rx); - bn_mod_add(&d, &y1, &ry); - bn_mul_mont(&y1, &b, &b); - bn_mul_mont(&x1, &z, &z); - bn_mul_mont(&e, &c, &x1); - bn_mod_sub(&y1, &y1, &e); - point_tmp[2*i] = y1; - bn_mod_lshift1(&y1); - bn_mod_sub(&y1, &e, &y1); - bn_mul_mont(&y1, &y1, &b); - bn_mul_mont(&a, &x1, &z); - bn_mul_mont(&c, &d, &a); - bn_mod_sub(&y1, &y1, &c); - cy = 0; - if (bn_is_odd(y1)) - cy = bn_uadd_c(&y1, &y1, modulus); - bn_rshift1(&y1); - y1.d[BN_NWORDS-1] |= (cy ? 
0x80000000 : 0); - point_tmp[(2*i)+1] = y1; - } - - /* Compute the product hierarchy in z_heap */ - for (i = ncols - 1; i > 0; i--) { - a = z_heap[(i*2) - 1]; - b = z_heap[(i*2)]; - bn_mul_mont(&z, &a, &b); - z_heap[i-1] = z; - } - - /* Invert the root, fix up 1/ZR -> R/Z */ - z = z_heap[0]; - bn_mod_inverse(&z, &z); - - for (i = 0; i < BN_NWORDS; i++) - a.d[i] = mont_rr[i]; - bn_mul_mont(&z, &z, &a); - bn_mul_mont(&z, &z, &a); - z_heap[0] = z; - - for (i = 1; i < ncols; i++) { - a = z_heap[i - 1]; - b = z_heap[(i*2) - 1]; - c = z_heap[i*2]; - bn_mul_mont(&z, &a, &c); - z_heap[(i*2) - 1] = z; - bn_mul_mont(&z, &a, &b); - z_heap[i*2] = z; - } - - for (i = 0; i < ncols; i++) { - /* - * Multiply the coordinates by the inverted Z values. - * Stash the coordinates in the hash buffer. - * SHA-2 requires big endian, and our intended hash input - * is big-endian, so swapping is unnecessary, but - * inserting the format byte in front causes a headache. - */ - a = z_heap[(ncols - 1) + i]; - bn_mul_mont(&b, &a, &a); /* Z^2 */ - x1 = point_tmp[2*i]; - bn_mul_mont(&x1, &x1, &b); /* X / Z^2 */ - bn_from_mont(&x1, &x1); - - wh = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */ - for (o = 0; o < BN_NWORDS; o++) { - wl = wh; - wh = x1.d[(BN_NWORDS - 1) - o]; - hash1[o] = (wl << 24) | (wh >> 8); - } - - bn_mul_mont(&a, &a, &b); /* Z^3 */ - y1 = point_tmp[(2*i)+1]; - bn_mul_mont(&y1, &y1, &a); /* Y / Z^3 */ - bn_from_mont(&y1, &y1); - - for (o = 0; o < BN_NWORDS; o++) { - wl = wh; - wh = y1.d[(BN_NWORDS - 1) - o]; - hash1[BN_NWORDS + o] = (wl << 24) | (wh >> 8); - } - - /* - * Hash the first 64 bytes of the buffer - */ - sha2_256_init(hash2); - sha2_256_block(hash2, hash1); - - /* - * Hash the last byte of the buffer + SHA-2 padding - */ - hash1[0] = wh << 24 | 0x800000; - hash1[1] = 0; - hash1[2] = 0; - hash1[3] = 0; - hash1[4] = 0; - hash1[5] = 0; - hash1[6] = 0; - hash1[7] = 0; - hash1[8] = 0; - hash1[9] = 0; - hash1[10] = 0; - hash1[11] = 0; - hash1[12] = 0; - hash1[13] = 0; - hash1[14] = 0; - hash1[15] = 65 * 8; - sha2_256_block(hash2, hash1); - - /* - * Hash the SHA-2 result with RIPEMD160 - * Unfortunately, SHA-2 outputs big-endian, but - * RIPEMD160 expects little-endian. Need to swap! 
- */ - for (o = 0; o < 8; o++) - hash2[o] = bswap32(hash2[o]); - hash2[8] = bswap32(0x80000000); - hash2[9] = 0; - hash2[10] = 0; - hash2[11] = 0; - hash2[12] = 0; - hash2[13] = 0; - hash2[14] = 32 * 8; - hash2[15] = 0; - ripemd160_init(hash1); - ripemd160_block(hash1, hash2); - - /* Copy the hash to the output buffer */ - for (o = 0; o < 5; o++) - *(hashes_out++) = hash1[o]; - } - -} -#endif - #define ACCESS_BUNDLE 1024 #define ACCESS_STRIDE (ACCESS_BUNDLE/BN_NWORDS) @@ -1144,17 +1019,16 @@ ec_add_grid(__global bn_word *points_out, __global bn_word *z_heap, start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % (ACCESS_STRIDE/2))); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - x1.d[i] = row_in[start + (i*ACCESS_STRIDE)]; +#define ec_add_grid_inner_1(i) \ + x1.d[i] = row_in[start + (i*ACCESS_STRIDE)]; + + bn_unroll(ec_add_grid_inner_1); start += (ACCESS_STRIDE/2); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - y1.d[i] = row_in[start + (i*ACCESS_STRIDE)]; + +#define ec_add_grid_inner_2(i) \ + y1.d[i] = row_in[start + (i*ACCESS_STRIDE)]; + + bn_unroll(ec_add_grid_inner_2); bn_mod_sub(&z, &x1, &rx); @@ -1162,11 +1036,10 @@ ec_add_grid(__global bn_word *points_out, __global bn_word *z_heap, start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - z_heap[start + (i*ACCESS_STRIDE)] = z.d[i]; +#define ec_add_grid_inner_3(i) \ + z_heap[start + (i*ACCESS_STRIDE)] = z.d[i]; + + bn_unroll(ec_add_grid_inner_3); bn_mod_sub(&b, &y1, &ry); bn_mod_add(&c, &x1, &rx); @@ -1184,11 +1057,10 @@ ec_add_grid(__global bn_word *points_out, __global bn_word *z_heap, start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % (ACCESS_STRIDE/2))); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - points_out[start + (i*ACCESS_STRIDE)] = y1.d[i]; +#define ec_add_grid_inner_4(i) \ + points_out[start + (i*ACCESS_STRIDE)] = y1.d[i]; + + bn_unroll(ec_add_grid_inner_4); bn_mod_lshift1(&y1); bn_mod_sub(&y1, &e, &y1); @@ -1203,49 +1075,49 @@ ec_add_grid(__global bn_word *points_out, __global bn_word *z_heap, y1.d[BN_NWORDS-1] |= (cy ? 
0x80000000 : 0); start += (ACCESS_STRIDE/2); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - points_out[start + (i*ACCESS_STRIDE)] = y1.d[i]; + + bn_unroll(ec_add_grid_inner_4); } __kernel void heap_invert(__global bn_word *z_heap, int batch) { bignum a, b, c, z; - int i, j, off, lcell, hcell, start; + int i, off, lcell, hcell, start; + +#define heap_invert_inner_load_a(j) \ + a.d[j] = z_heap[start + j*ACCESS_STRIDE]; +#define heap_invert_inner_load_b(j) \ + b.d[j] = z_heap[start + j*ACCESS_STRIDE]; +#define heap_invert_inner_load_z(j) \ + z.d[j] = z_heap[start + j*ACCESS_STRIDE]; +#define heap_invert_inner_store_z(j) \ + z_heap[start + j*ACCESS_STRIDE] = z.d[j]; +#define heap_invert_inner_store_c(j) \ + z_heap[start + j*ACCESS_STRIDE] = c.d[j]; off = get_global_size(0); lcell = get_global_id(0); hcell = (off * batch) + lcell; for (i = 0; i < (batch-1); i++) { + start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - a.d[j] = z_heap[start + j*ACCESS_STRIDE]; + + bn_unroll(heap_invert_inner_load_a); lcell += off; start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - b.d[j] = z_heap[start + j*ACCESS_STRIDE]; + + bn_unroll(heap_invert_inner_load_b); bn_mul_mont(&z, &a, &b); start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (hcell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - z_heap[start + j*ACCESS_STRIDE] = z.d[j]; + + bn_unroll(heap_invert_inner_store_z); lcell += off; hcell += off; @@ -1254,11 +1126,11 @@ heap_invert(__global bn_word *z_heap, int batch) /* Invert the root, fix up 1/ZR -> R/Z */ bn_mod_inverse(&z, &z); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - a.d[i] = mont_rr[i]; +#define heap_invert_inner_1(i) \ + a.d[i] = mont_rr[i]; + + bn_unroll(heap_invert_inner_1); + bn_mul_mont(&z, &z, &a); bn_mul_mont(&z, &z, &a); @@ -1266,57 +1138,33 @@ heap_invert(__global bn_word *z_heap, int batch) hcell = lcell + (off << 1); start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (hcell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - z_heap[start + j*ACCESS_STRIDE] = z.d[j]; + + bn_unroll(heap_invert_inner_store_z); for (i = 0; i < (batch-1); i++) { start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (hcell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - z.d[j] = z_heap[start + j*ACCESS_STRIDE]; - + bn_unroll(heap_invert_inner_load_z); start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - a.d[j] = z_heap[start + j*ACCESS_STRIDE]; + bn_unroll(heap_invert_inner_load_a); lcell += off; start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - b.d[j] = z_heap[start + j*ACCESS_STRIDE]; + bn_unroll(heap_invert_inner_load_b); bn_mul_mont(&c, &a, &z); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - z_heap[start + j*ACCESS_STRIDE] = c.d[j]; + bn_unroll(heap_invert_inner_store_c); bn_mul_mont(&c, &b, &z); lcell -= off; start = (((lcell / 
ACCESS_STRIDE) * ACCESS_BUNDLE) + (lcell % ACCESS_STRIDE)); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (j = 0; j < BN_NWORDS; j++) - z_heap[start + j*ACCESS_STRIDE] = c.d[j]; + bn_unroll(heap_invert_inner_store_c); lcell -= (off << 1); hcell -= off; @@ -1329,7 +1177,6 @@ hash_ec_point(uint *hash_out, __global bn_word *xy, __global bn_word *zip) uint hash1[16], hash2[16]; bignum c, zi, zzi; bn_word wh, wl; - int i; /* * Multiply the coordinates by the inverted Z values. @@ -1338,50 +1185,46 @@ hash_ec_point(uint *hash_out, __global bn_word *xy, __global bn_word *zip) * is big-endian, so swapping is unnecessary, but * inserting the format byte in front causes a headache. */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - zi.d[i] = zip[i*ACCESS_STRIDE]; +#define hash_ec_point_inner_1(i) \ + zi.d[i] = zip[i*ACCESS_STRIDE]; + + bn_unroll(hash_ec_point_inner_1); bn_mul_mont(&zzi, &zi, &zi); /* 1 / Z^2 */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - c.d[i] = xy[i*ACCESS_STRIDE]; +#define hash_ec_point_inner_2(i) \ + c.d[i] = xy[i*ACCESS_STRIDE]; + + bn_unroll(hash_ec_point_inner_2); bn_mul_mont(&c, &c, &zzi); /* X / Z^2 */ bn_from_mont(&c, &c); wh = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) { - wl = wh; - wh = c.d[(BN_NWORDS - 1) - i]; - hash1[i] = (wl << 24) | (wh >> 8); - } + +#define hash_ec_point_inner_3(i) \ + wl = wh; \ + wh = c.d[(BN_NWORDS - 1) - i]; \ + hash1[i] = (wl << 24) | (wh >> 8); + + bn_unroll(hash_ec_point_inner_3); bn_mul_mont(&zzi, &zzi, &zi); /* 1 / Z^3 */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) - c.d[i] = xy[(ACCESS_STRIDE/2) + i*ACCESS_STRIDE]; + +#define hash_ec_point_inner_4(i) \ + c.d[i] = xy[(ACCESS_STRIDE/2) + i*ACCESS_STRIDE]; + + bn_unroll(hash_ec_point_inner_4); + bn_mul_mont(&c, &c, &zzi); /* Y / Z^3 */ bn_from_mont(&c, &c); -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < BN_NWORDS; i++) { - wl = wh; - wh = c.d[(BN_NWORDS - 1) - i]; - hash1[BN_NWORDS + i] = (wl << 24) | (wh >> 8); - } +#define hash_ec_point_inner_5(i) \ + wl = wh; \ + wh = c.d[(BN_NWORDS - 1) - i]; \ + hash1[BN_NWORDS + i] = (wl << 24) | (wh >> 8); + + bn_unroll(hash_ec_point_inner_5); /* * Hash the first 64 bytes of the buffer @@ -1415,11 +1258,12 @@ hash_ec_point(uint *hash_out, __global bn_word *xy, __global bn_word *zip) * Unfortunately, SHA-2 outputs big-endian, but * RIPEMD160 expects little-endian. Need to swap! 
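+	 * (Illustrative: bswap32(0x01020304) == 0x04030201, i.e. the
+	 * bytes of each 32-bit SHA-256 output word are reversed before
+	 * the words are handed to RIPEMD160.)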
*/ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 8; i++) - hash2[i] = bswap32(hash2[i]); + +#define hash_ec_point_inner_6(i) \ + hash2[i] = bswap32(hash2[i]); + + hash256_unroll(hash_ec_point_inner_6); + hash2[8] = bswap32(0x80000000); hash2[9] = 0; hash2[10] = 0; @@ -1457,11 +1301,10 @@ hash_ec_point_get(__global uint *hashes_out, hashes_out += 5 * (i + get_global_id(0)); /* Output the hash in proper byte-order */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 5; i++) - hashes_out[i] = load_le32(hash[i]); +#define hash_ec_point_get_inner_1(i) \ + hashes_out[i] = load_le32(hash[i]); + + hash160_unroll(hash_ec_point_get_inner_1); } /* @@ -1473,23 +1316,20 @@ int hash160_ucmp_g(uint *a, __global uint *bound) { uint gv; - int i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 5; i++) { - gv = load_be32(bound[i]); - if (a[i] < gv) return -1; + +#define hash160_ucmp_g_inner_1(i) \ + gv = load_be32(bound[i]); \ + if (a[i] < gv) return -1; \ if (a[i] > gv) break; - } -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 5; i++) { - gv = load_be32(bound[5+i]); - if (a[i] < gv) return 0; + + hash160_iter(hash160_ucmp_g_inner_1); + +#define hash160_ucmp_g_inner_2(i) \ + gv = load_be32(bound[5+i]); \ + if (a[i] < gv) return 0; \ if (a[i] > gv) return 1; - } + + hash160_iter(hash160_ucmp_g_inner_2); return 0; } @@ -1519,11 +1359,10 @@ hash_ec_point_search_prefix(__global uint *found, * - The byte-level convention of RIPEMD160 is little-endian * - We are comparing it in big-endian order */ -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (i = 0; i < 5; i++) - hash[i] = bswap32(hash[i]); +#define hash_ec_point_search_prefix_inner_1(i) \ + hash[i] = bswap32(hash[i]); + + hash160_unroll(hash_ec_point_search_prefix_inner_1); /* Binary-search the target table for the hash we just computed */ for (high = ntargets - 1, low = 0, i = high >> 1; @@ -1537,11 +1376,11 @@ hash_ec_point_search_prefix(__global uint *found, found[0] = ((get_global_id(1) * get_global_size(0)) + get_global_id(0)); found[1] = i; -#ifdef UNROLL_MAX -#pragma unroll UNROLL_MAX -#endif - for (p = 0; p < 5; p++) - found[p+2] = load_be32(hash[p]); + +#define hash_ec_point_search_prefix_inner_2(i) \ + found[i+2] = load_be32(hash[i]); + + hash160_unroll(hash_ec_point_search_prefix_inner_2); high = -1; } } diff --git a/oclvanitygen.c b/oclvanitygen.c index 1e42698..55a4455 100644 --- a/oclvanitygen.c +++ b/oclvanitygen.c @@ -384,15 +384,17 @@ vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog) */ enum { - VG_OCL_UNROLL_LOOPS = (1 << 0), - VG_OCL_EXPENSIVE_BRANCHES = (1 << 1), - VG_OCL_DEEP_VLIW = (1 << 2), - VG_OCL_AMD_BFI_INT = (1 << 3), - VG_OCL_NV_VERBOSE = (1 << 4), - VG_OCL_BROKEN = (1 << 5), - VG_OCL_NO_BINARIES = (1 << 6), - - VG_OCL_OPTIMIZATIONS = (VG_OCL_UNROLL_LOOPS | + VG_OCL_DEEP_PREPROC_UNROLL = (1 << 0), + VG_OCL_PRAGMA_UNROLL = (1 << 1), + VG_OCL_EXPENSIVE_BRANCHES = (1 << 2), + VG_OCL_DEEP_VLIW = (1 << 3), + VG_OCL_AMD_BFI_INT = (1 << 4), + VG_OCL_NV_VERBOSE = (1 << 5), + VG_OCL_BROKEN = (1 << 6), + VG_OCL_NO_BINARIES = (1 << 7), + + VG_OCL_OPTIMIZATIONS = (VG_OCL_DEEP_PREPROC_UNROLL | + VG_OCL_PRAGMA_UNROLL | VG_OCL_EXPENSIVE_BRANCHES | VG_OCL_DEEP_VLIW | VG_OCL_AMD_BFI_INT), @@ -406,13 +408,18 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp) const char *dvn; unsigned int quirks = 0; - /* Loop unrolling for devices other than CPUs */ - if (!(vg_ocl_device_gettype(vocp->voc_ocldid) & CL_DEVICE_TYPE_CPU)) - quirks |= 
VG_OCL_UNROLL_LOOPS;
+	quirks |= VG_OCL_DEEP_PREPROC_UNROLL;
 	vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID);
 	switch (vend) {
 	case 0x10de: /* NVIDIA */
+		/*
+		 * NVIDIA's compiler seems to take a very long time when
+		 * using preprocessor unrolling, but works well with
+		 * pragma unroll.
+		 */
+		quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL;
+		quirks |= VG_OCL_PRAGMA_UNROLL;
 		quirks |= VG_OCL_NV_VERBOSE;
 #ifdef WIN32
 		if (strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
@@ -427,6 +434,12 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
 #endif
 		break;
 	case 0x1002: /* AMD/ATI */
+		/*
+		 * AMD's compiler works best with preprocessor unrolling.
+		 * Pragma unroll is unreliable with AMD's compiler and
+		 * seems to crash based on whether the gods were smiling
+		 * when Catalyst was last installed/upgraded.
+		 */
 		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
 		    CL_DEVICE_TYPE_GPU) {
 			quirks |= VG_OCL_EXPENSIVE_BRANCHES;
@@ -896,9 +909,12 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
 	end = 0;
 	optbuf[end] = '\0';
-	if (vocp->voc_quirks & VG_OCL_UNROLL_LOOPS)
+	if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-DDEEP_PREPROC_UNROLL ");
+	if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL)
 		end += snprintf(optbuf + end, sizeof(optbuf) - end,
-			       "-DUNROLL_MAX=16 ");
+				"-DPRAGMA_UNROLL ");
 	if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
 		end += snprintf(optbuf + end, sizeof(optbuf) - end,
 				"-DVERY_EXPENSIVE_BRANCHES ");
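
Illustrative appendix (not part of the patch): a minimal standalone C
program showing the two expansion modes introduced above.  unroll_8,
iter_8, and DEEP_PREPROC_UNROLL are copied from calc_addrs.cl; the
file name and the accumulate() macro are hypothetical.  Build with
"cc -DDEEP_PREPROC_UNROLL unroll_demo.c" to get the pasted expansion,
or without the define to get the plain loop.  (PRAGMA_UNROLL is a
separate knob: it selects #pragma unroll'd loops at a few hot sites
in calc_addrs.cl for the NVIDIA compiler.)

	/* unroll_demo.c -- illustrative only */
	#include <stdio.h>

	#define unroll_8(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)

	#if defined(DEEP_PREPROC_UNROLL)
	#define iter_8(a) unroll_8(a)
	#else
	#define iter_8(a) do {int _i; for (_i = 0; _i < 8; _i++) { a(_i) }} while (0)
	#endif

	/* The argument macro must expand to complete statements, since
	 * the unrolled form pastes the instances back to back. */
	#define accumulate(i) sum += v[i];

	int main(void)
	{
		int sum = 0;
		int v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

		iter_8(accumulate);
		printf("%d\n", sum);	/* prints 36 in either mode */
		return 0;
	}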