Apply some optimizations to the OpenCL kernel.

Add a new flag, VERY_EXPENSIVE_BRANCHES, for various Radeon devices.
14 years ago · 31ca88ab40
2 changed files with 67 additions and 35 deletions
--- a/calc_addrs.cl
+++ b/calc_addrs.cl
@@ -154,37 +154,52 @@ bn_rshift1(bignum *bn)
 	bn->d[i] >>= 1;
 }

+void
+bn_rshift1_2(bignum *bna, bignum *bnb)
+{
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < (BN_NWORDS - 1); i++) {
+		bna->d[i] = (bna->d[i+1] << 31) | (bna->d[i] >> 1);
+		bnb->d[i] = (bnb->d[i+1] << 31) | (bnb->d[i] >> 1);
+	}
+	bna->d[i] >>= 1;
+	bnb->d[i] >>= 1;
+}
+

 /*
 * Unsigned comparison
 */

 int
-bn_ucmp(bignum *a, bignum *b)
+bn_ucmp_ge(bignum *a, bignum *b)
 {
-	int i;
+	int i, l = 0, g = 0;
 #ifdef UNROLL_MAX
 #pragma unroll UNROLL_MAX
 #endif
 	for (i = (BN_NWORDS - 1); i >= 0; i--) {
-		if (a->d[i] < b->d[i]) return -1;
-		if (a->d[i] > b->d[i]) return 1;
+		if (a->d[i] < b->d[i]) l |= (1 << i);
+		if (a->d[i] > b->d[i]) g |= (1 << i);
 	}
-	return 0;
+	return (l > g) ? 0 : 1;
 }

 int
-bn_ucmp_c(bignum *a, __constant bn_word *b)
+bn_ucmp_ge_c(bignum *a, __constant bn_word *b)
 {
-	int i;
+	int i, l = 0, g = 0;
 #ifdef UNROLL_MAX
 #pragma unroll UNROLL_MAX
 #endif
 	for (i = (BN_NWORDS - 1); i >= 0; i--) {
-		if (a->d[i] < b[i]) return -1;
-		if (a->d[i] > b[i]) return 1;
+		if (a->d[i] < b[i]) l |= (1 << i);
+		if (a->d[i] > b[i]) g |= (1 << i);
 	}
-	return 0;
+	return (l > g) ? 0 : 1;
 }

 /*
@@ -295,7 +310,7 @@ void
 bn_mod_add(bignum *r, bignum *a, bignum *b)
 {
 	if (bn_uadd(r, a, b) ||
-	    (bn_ucmp_c(r, modulus) >= 0))
+	    (bn_ucmp_ge_c(r, modulus)))
 		bn_usub_c(r, r, modulus);
 }

@@ -311,7 +326,7 @@ bn_mod_lshift1(bignum *bn)
 {
 	bn_word c = (bn->d[BN_NWORDS-1] & 0x80000000);
 	bn_lshift1(bn);
-	if (c || (bn_ucmp_c(bn, modulus) >= 0))
+	if (c || (bn_ucmp_ge_c(bn, modulus)))
 		bn_usub_c(bn, bn, modulus);
 }

@@ -323,14 +338,14 @@ bn_mod_lshift1(bignum *bn)
 */

 #define bn_mul_word(r, a, w, c, p, s) do { \
-		p = mul_hi(a, w);	   \
 		r = (a * w) + c;	   \
+		p = mul_hi(a, w);	   \
 		c = (r < c) ? p + 1 : p;   \
 	} while (0)

 #define bn_mul_add_word(r, a, w, c, p, s) do {	\
-		p = mul_hi(a, w);		\
 		s = r + c;			\
+		p = mul_hi(a, w);		\
 		r = (a * w) + s;		\
 		c = (s < c) ? p + 1 : p;	\
 		if (r < s) c++;			\
@@ -365,6 +380,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
 	t.d[BN_NWORDS-1] = tea + c;
 	tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);

+#if defined(UNROLL_MAX) && defined(VERY_EXPENSIVE_BRANCHES)
+#pragma unroll UNROLL_MAX
+#endif
 	for (i = 1; i < BN_NWORDS; i++) {
 		c = 0;
 #ifdef UNROLL_MAX
@@ -389,12 +407,19 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
 		tea = teb + ((t.d[BN_NWORDS-1] < c) ? 1 : 0);
 	}

-	if (tea || (t.d[BN_NWORDS-1] >= modulus[7])) {
-		c = bn_usub_c(r, &t, modulus);
-		if (tea || !c)
+#if defined(VERY_EXPENSIVE_BRANCHES)
+	c = tea | !bn_usub_c(r, &t, modulus);
+	if (!c)
+		*r = t;
+#else
+	c = tea || (t.d[BN_NWORDS-1] >= modulus[BN_NWORDS-1]);
+	if (c) {
+		c = tea | !bn_usub_c(r, &t, modulus);
+		if (c)
 			return;
 	}
 	*r = t;
+#endif
 }

 void
@@ -478,30 +503,23 @@ bn_mod_inverse(bignum *r, bignum *n)
 	yc = 0;
 	while (!bn_is_zero(b)) {
 		shift = 0;
-		while (!bn_is_bit_set(b, shift)) {
-			shift++;
+		while (!bn_is_odd(b)) {
 			if (bn_is_odd(x))
 				xc += bn_uadd_c(&x, &x, modulus);
-			bn_rshift1(&x);
+			bn_rshift1_2(&x, &b);
 			x.d[7] |= (xc << 31);
 			xc >>= 1;
 		}
-		if (shift)
-			bn_rshift(&b, shift);

-		shift = 0;
-		while (!bn_is_bit_set(a, shift)) {
-			shift++;
+		while (!bn_is_odd(a)) {
 			if (bn_is_odd(y))
 				yc += bn_uadd_c(&y, &y, modulus);
-			bn_rshift1(&y);
+			bn_rshift1_2(&y, &a);
 			y.d[7] |= (yc << 31);
 			yc >>= 1;
 		}
-		if (shift)
-			bn_rshift(&a, shift);

-		if (bn_ucmp(&b, &a) >= 0) {
+		if (bn_ucmp_ge(&b, &a)) {
 			xc += yc + bn_uadd(&x, &x, &y);
 			bn_usub(&b, &b, &a);
 		} else {
@@ -1106,14 +1124,16 @@ heap_invert(__global bn_word *z_heap, int batch)
 	bn_mul_mont(&z, &z, &a);
 	bn_mul_mont(&z, &z, &a);

+	lcell = (off * 2 * (batch - 2)) + get_global_id(0);
+	hcell = lcell + (off << 1);
+	start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
+		 (hcell % ACCESS_STRIDE));
 #ifdef UNROLL_MAX
 #pragma unroll UNROLL_MAX
 #endif
 	for (j = 0; j < BN_NWORDS; j++)
 		z_heap[start + j*ACCESS_STRIDE] = z.d[j];

-	lcell = (off * 2 * (batch - 2)) + get_global_id(0);
-	hcell = lcell + (off << 1);
 	for (i = 0; i < (batch-1); i++) {
 		start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
 			 (hcell % ACCESS_STRIDE));
--- a/oclvanitygen.c
+++ b/oclvanitygen.c
@@ -612,6 +612,7 @@ int
 vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did)
 {
 	cl_int ret;
+	const char *vend, *options;

 	memset(vocp, 0, sizeof(*vocp));
 	vg_exec_context_init(vcp, &vocp->base);
@@ -652,10 +653,21 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did)
 		return 0;
 	}

-	if (!vg_ocl_load_program(vcp, vocp,
-				 "calc_addrs.cl",
-				 //"-cl-nv-verbose "
-				 "-DUNROLL_MAX=16"))
+	options = "-DUNROLL_MAX=16";
+
+	vend = vg_ocl_device_getstr(did, CL_DEVICE_VENDOR);
+	if (!strcmp(vend, "Advanced Micro Devices, Inc.") ||
+	    !strcmp(vend, "AMD")) {
+		/* Radeons do better with less flow control */
+		options = "-DUNROLL_MAX=16 -DVERY_EXPENSIVE_BRANCHES";
+
+	} else if (!strcmp(vend, "NVIDIA Corporation")) {
+		/* NVIDIA has a handy verbose output option */
+		if (vcp->vc_verbose > 1)
+			options = "-DUNROLL_MAX=16 -cl-nv-verbose";
+	}
+
+	if (!vg_ocl_load_program(vcp, vocp, "calc_addrs.cl", options))
 		return 0;
 	return 1;
 }