Don't force unrolling of specific loops without DEEP_PREPROC_UNROLL.

Clean up formatting.
13 years ago · d481879a7e
1 changed file with 74 additions and 56 deletions
--- a/calc_addrs.cl
+++ b/calc_addrs.cl
@@ -94,7 +94,18 @@
 #define load_be32(v) bswap32(v)
 #endif

-/* Explicit unrolling */
+/*
+ * Loop unrolling macros
+ *
+ * In most cases, preprocessor unrolling works best.
+ * The exception is NVIDIA's compiler, which seems to take unreasonably
+ * long to compile a preprocessor-unrolled loop that has a large
+ * iteration count or a body of more than 50 PTX instructions.
+ * It does not seem to take as long with pragma unroll, and still
+ * produces good output.
+ */
+
+/* Explicit loop unrolling */
 #define unroll_5(a) do { a(0) a(1) a(2) a(3) a(4) } while (0)
 #define unroll_8(a) do { a(0) a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)
 #define unroll_8_sf(a) do { a(1) a(2) a(3) a(4) a(5) a(6) a(7) } while (0)
@@ -118,6 +129,7 @@
 	a(56) a(57) a(58) a(59) a(60) a(61) a(62) a(63) \
 	} while (0)

+/* Conditional loop unrolling */
 #if defined(DEEP_PREPROC_UNROLL)
 #define iter_5(a) unroll_5(a)
 #define iter_8(a) unroll_8(a)
@@ -203,7 +215,6 @@ bn_lshift1(bignum *bn)
 {
 #define bn_lshift1_inner1(i)						\
 		bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31);
-
 	bn_unroll_reverse_sl(bn_lshift1_inner1);
 	bn->d[0] <<= 1;
 }
@@ -223,7 +234,6 @@ bn_rshift(bignum *bn, int shift)
 		ilw = ihw;				\
 		ihw = (wd < BN_WBITS) ? bn->d[wd] : 0;	\
 		bn->d[i] = (ilw >> iws) | (ihw << iwr);
-
 	bn_unroll_sl(bn_rshift_inner1);
 	bn->d[BN_NWORDS-1] = (ihw >> iws);
 }
@@ -233,7 +243,6 @@ bn_rshift1(bignum *bn)
 {
 #define bn_rshift1_inner1(i)						\
 		bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1);
-
 	bn_unroll_sl(bn_rshift1_inner1);
 	bn->d[BN_NWORDS-1] >>= 1;
 }
@@ -244,7 +253,6 @@ bn_rshift1_2(bignum *bna, bignum *bnb)
 #define bn_rshift1_2_inner1(i)						\
 		bna->d[i] = (bna->d[i+1] << 31) | (bna->d[i] >> 1);	\
 		bnb->d[i] = (bnb->d[i+1] << 31) | (bnb->d[i] >> 1);
-
 	bn_unroll_sl(bn_rshift1_2_inner1);
 	bna->d[BN_NWORDS-1] >>= 1;
 	bnb->d[BN_NWORDS-1] >>= 1;
@@ -263,7 +271,6 @@ bn_ucmp_ge(bignum *a, bignum *b)
 #define bn_ucmp_ge_inner1(i)				\
 		if (a->d[i] < b->d[i]) l |= (1 << i);	\
 		if (a->d[i] > b->d[i]) g |= (1 << i);
-
 	bn_unroll_reverse(bn_ucmp_ge_inner1);
 	return (l > g) ? 0 : 1;
 }
@@ -276,7 +283,6 @@ bn_ucmp_ge_c(bignum *a, __constant bn_word *b)
 #define bn_ucmp_ge_c_inner1(i)				\
 		if (a->d[i] < b[i]) l |= (1 << i);	\
 		if (a->d[i] > b[i]) g |= (1 << i);
-
 	bn_unroll_reverse(bn_ucmp_ge_c_inner1);
 	return (l > g) ? 0 : 1;
 }
@@ -292,7 +298,6 @@ bn_neg(bignum *n)

 #define bn_neg_inner1(i)				\
 		c = (n->d[i] = (~n->d[i]) + c) ? 0 : c;
-
 	bn_unroll(bn_neg_inner1);
 }

@@ -319,7 +324,6 @@ bn_uadd_words_seq(bn_word *r, bn_word *a, bn_word *b)

 #define bn_uadd_words_seq_inner1(i)			\
 		bn_addc_word(r[i], a[i], b[i], t, c);
-
 	bn_add_word(r[0], a[0], b[0], t, c);
 	bn_unroll_sf(bn_uadd_words_seq_inner1);
 	return c;
@@ -560,6 +564,10 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
 	t.d[BN_NWORDS-1] = tea + c;			 \
 	tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);

+	/*
+	 * The outer loop here is quite long, and we won't unroll it
+	 * unless VERY_EXPENSIVE_BRANCHES is set.
+	 */
 #if defined(VERY_EXPENSIVE_BRANCHES)
 	bn_unroll_sf(bn_mul_mont_inner3);
 	c = tea | !bn_usub_c(r, &t, modulus);
@@ -621,18 +629,28 @@ bn_from_mont(bignum *rb, bignum *b)
 	r[BN_NWORDS + i] += c;			 \
 	bn_from_mont_inner3_2(i)

+	/*
+	 * The outer loop here is not very long, so we will unroll
+	 * it by default.  However, it's just complicated enough to
+	 * cause NVIDIA's compiler to take unreasonably long to compile
+	 * it, unless we use pragma unroll.
+	 */
 #if !defined(PRAGMA_UNROLL)
-	bn_unroll(bn_from_mont_inner3);
+	bn_iter(bn_from_mont_inner3);
 #else
 #pragma unroll 8
 	for (i = 0; i < BN_NWORDS; i++) { bn_from_mont_inner3(i) }
 #endif

-#define bn_from_mont_inner4(i)			\
-	rb->d[i] = r[BN_NWORDS + i];
-
+	/*
+	 * Make sure the result is less than the modulus.
+	 * Subtracting is not much more expensive than compare, so
+	 * subtract always and assign based on the carry out value.
+	 */
 	c = bn_usub_words_c(rb->d, &r[BN_NWORDS], modulus);
 	if (c) {
+#define bn_from_mont_inner4(i)				\
+			rb->d[i] = r[BN_NWORDS + i];
 		bn_unroll(bn_from_mont_inner4);
 	}
 }
@@ -801,7 +819,7 @@ sha2_256_block(uint *out, uint *in)
 	sha2_stvar(state, i, 7) = t1 + t2;				\

 #if !defined(PRAGMA_UNROLL)
-	unroll_64(sha2_256_block_inner_2);
+	iter_64(sha2_256_block_inner_2);
 #else
 #pragma unroll 64
 	for (i = 0; i < 64; i++) { sha2_256_block_inner_2(i) }
@@ -934,11 +952,11 @@ ripemd160_block(uint *out, uint *in)
 			ripemd160_f4, ripemd160_f0, t);

 #if !defined(PRAGMA_UNROLL)
-	unroll_16(ripemd160_block_inner_p0);
-	unroll_16(ripemd160_block_inner_p1);
-	unroll_16(ripemd160_block_inner_p2);
-	unroll_16(ripemd160_block_inner_p3);
-	unroll_16(ripemd160_block_inner_p4);
+	iter_16(ripemd160_block_inner_p0);
+	iter_16(ripemd160_block_inner_p1);
+	iter_16(ripemd160_block_inner_p2);
+	iter_16(ripemd160_block_inner_p3);
+	iter_16(ripemd160_block_inner_p4);
 #else
 #pragma unroll 16
 	for (i = 0; i < 16; i++) { ripemd160_block_inner_p0(i); }