Add implementation of bignum add/subtract better suited to AMD Radeon.

2025-03-13 05:41:35 +00:00 · 2011-08-03 22:52:13 -07:00 · 2011-08-03 22:52:13 -07:00 · 7ba90d3086
commit 7ba90d3086
parent 04dd595ce2
2 changed files with 147 additions and 12 deletions
--- a/calc_addrs.cl
+++ b/calc_addrs.cl
@ -234,7 +234,7 @@ bn_neg(bignum *n)
 	} while (0)

 bn_word
-bn_uadd(bignum *r, bignum *a, bignum *b)
+bn_uadd_seq(bignum *r, bignum *a, bignum *b)
 {
 	bn_word t, c = 0;
 	int i;
@ -248,7 +248,7 @@ bn_uadd(bignum *r, bignum *a, bignum *b)
 }

 bn_word
-bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
+bn_uadd_c_seq(bignum *r, bignum *a, __constant bn_word *b)
 {
 	bn_word t, c = 0;
 	int i;
@ -275,7 +275,7 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
 	} while (0)

 bn_word
-bn_usub(bignum *r, bignum *a, bignum *b)
+bn_usub_seq(bignum *r, bignum *a, bignum *b)
 {
 	bn_word t, c = 0;
 	int i;
@ -289,7 +289,7 @@ bn_usub(bignum *r, bignum *a, bignum *b)
 }

 bn_word
-bn_usub_c(bignum *r, bignum *a, __constant bn_word *b)
+bn_usub_c_seq(bignum *r, bignum *a, __constant bn_word *b)
 {
 	bn_word t, c = 0;
 	int i;
@ -302,6 +302,135 @@ bn_usub_c(bignum *r, bignum *a, __constant bn_word *b)
 	return c;
 }

+/*
+ * Add/subtract better suited for AMD's VLIW architecture
+ */
+/*
+ * r = a + b over BN_NWORDS words; returns the carry out of the top word.
+ *
+ * Each word is first added on its own, with no dependency between
+ * words.  Bit i of c then records that word i overflowed (it generates
+ * a carry) and bit i of cp records that word i came out all-ones (a
+ * carry entering it would pass straight through).  One integer addition
+ * on those two masks resolves the whole carry chain.
+ *
+ * The partial sums are staged in x, so r is only written after every
+ * read of a and b has been done.
+ */
+bn_word
+bn_uadd_vliw(bignum *r, bignum *a, bignum *b)
+{
+	bignum x;
+	bn_word c = 0, cp = 0;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++)
+		x.d[i] = a->d[i] + b->d[i];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++) {
+		/* A wrapped sum is smaller than either operand. */
+		c |= (a->d[i] > x.d[i]) ? (1 << i) : 0;
+		cp |= (!~x.d[i]) ? (1 << i) : 0;
+	}
+	/*
+	 * Bit i of (c << 1) is a carry entering word i.  Adding cp ripples
+	 * it through runs of all-ones words, and XOR with cp converts the
+	 * sum back into "carry enters word i" bits.  Masking with ~cp here
+	 * would drop the carry into the all-ones words themselves, leaving
+	 * them all-ones when they have to wrap to zero.
+	 */
+	c = ((cp + (c << 1)) ^ cp);
+	r->d[0] = x.d[0];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 1; i < BN_NWORDS; i++)
+		r->d[i] = x.d[i] + ((c >> i) & 1);
+	return c >> BN_NWORDS;
+}
+
+/*
+ * r = a + b over BN_NWORDS words, where b is an array of words in
+ * __constant memory; returns the carry out of the top word.
+ *
+ * Each word is first added on its own.  Bit i of c records that word i
+ * overflowed (it generates a carry) and bit i of cp records that word i
+ * came out all-ones (a carry entering it would pass straight through).
+ * One integer addition on those two masks resolves the carry chain.
+ *
+ * The partial sums are staged in x, so r is only written after every
+ * read of a has been done.
+ */
+bn_word
+bn_uadd_c_vliw(bignum *r, bignum *a, __constant bn_word *b)
+{
+	bignum x;
+	bn_word c = 0, cp = 0;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++)
+		x.d[i] = a->d[i] + b[i];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++) {
+		/* A wrapped sum is smaller than either operand. */
+		c |= (b[i] > x.d[i]) ? (1 << i) : 0;
+		cp |= (!~x.d[i]) ? (1 << i) : 0;
+	}
+	/*
+	 * Bit i of (c << 1) is a carry entering word i.  Adding cp ripples
+	 * it through runs of all-ones words, and XOR with cp converts the
+	 * sum back into "carry enters word i" bits.  Masking with ~cp here
+	 * would drop the carry into the all-ones words themselves, leaving
+	 * them all-ones when they have to wrap to zero.
+	 */
+	c = ((cp + (c << 1)) ^ cp);
+	r->d[0] = x.d[0];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 1; i < BN_NWORDS; i++)
+		r->d[i] = x.d[i] + ((c >> i) & 1);
+	return c >> BN_NWORDS;
+}
+
+/*
+ * r = a - b over BN_NWORDS words; returns the borrow out of the top
+ * word.
+ *
+ * Each word is first subtracted on its own.  Bit i of c records that
+ * word i underflowed (it generates a borrow) and bit i of cp records
+ * that word i came out zero (a borrow entering it would pass straight
+ * through).  One integer addition on those two masks resolves the
+ * whole borrow chain.
+ *
+ * The partial differences are staged in x, so r is only written after
+ * every read of a and b has been done.
+ */
+bn_word
+bn_usub_vliw(bignum *r, bignum *a, bignum *b)
+{
+	bignum x;
+	bn_word c = 0, cp = 0;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++)
+		x.d[i] = a->d[i] - b->d[i];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++) {
+		c |= (a->d[i] < b->d[i]) ? (1 << i) : 0;
+		cp |= (!x.d[i]) ? (1 << i) : 0;
+	}
+	/*
+	 * Bit i of (c << 1) is a borrow entering word i.  Adding cp ripples
+	 * it through runs of zero words, and XOR with cp converts the sum
+	 * back into "borrow enters word i" bits.  Masking with ~cp here
+	 * would drop the borrow into the zero words themselves, leaving
+	 * them zero when they have to wrap to all-ones.
+	 */
+	c = ((cp + (c << 1)) ^ cp);
+	r->d[0] = x.d[0];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 1; i < BN_NWORDS; i++)
+		r->d[i] = x.d[i] - ((c >> i) & 1);
+	return c >> BN_NWORDS;
+}
+
+/*
+ * r = a - b over BN_NWORDS words, where b is an array of words in
+ * __constant memory; returns the borrow out of the top word.
+ *
+ * Each word is first subtracted on its own.  Bit i of c records that
+ * word i underflowed (it generates a borrow) and bit i of cp records
+ * that word i came out zero (a borrow entering it would pass straight
+ * through).  One integer addition on those two masks resolves the
+ * whole borrow chain.
+ *
+ * The partial differences are staged in x, so r is only written after
+ * every read of a has been done.
+ */
+bn_word
+bn_usub_c_vliw(bignum *r, bignum *a, __constant bn_word *b)
+{
+	bignum x;
+	bn_word c = 0, cp = 0;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++)
+		x.d[i] = a->d[i] - b[i];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++) {
+		c |= (a->d[i] < b[i]) ? (1 << i) : 0;
+		cp |= (!x.d[i]) ? (1 << i) : 0;
+	}
+	/*
+	 * Bit i of (c << 1) is a borrow entering word i.  Adding cp ripples
+	 * it through runs of zero words, and XOR with cp converts the sum
+	 * back into "borrow enters word i" bits.  Masking with ~cp here
+	 * would drop the borrow into the zero words themselves, leaving
+	 * them zero when they have to wrap to all-ones.
+	 */
+	c = ((cp + (c << 1)) ^ cp);
+	r->d[0] = x.d[0];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 1; i < BN_NWORDS; i++)
+		r->d[i] = x.d[i] - ((c >> i) & 1);
+	return c >> BN_NWORDS;
+}
+
+
+/*
+ * Bind the generic add/subtract names to one implementation.
+ * Without DEEP_VLIW the *_seq variants are used; defining DEEP_VLIW
+ * selects the *_vliw variants instead.
+ */
+#ifndef DEEP_VLIW
+#define bn_usub_c bn_usub_c_seq
+#define bn_usub bn_usub_seq
+#define bn_uadd_c bn_uadd_c_seq
+#define bn_uadd bn_uadd_seq
+#else
+#define bn_usub_c bn_usub_c_vliw
+#define bn_usub bn_usub_vliw
+#define bn_uadd_c bn_uadd_c_vliw
+#define bn_uadd bn_uadd_vliw
+#endif
+
+
 /*
 * Modular add/sub
 */
--- a/oclvanitygen.c
+++ b/oclvanitygen.c
@ -384,12 +384,14 @@ vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog)
 enum {
 	VG_OCL_UNROLL_LOOPS         = (1 << 0),
 	VG_OCL_EXPENSIVE_BRANCHES   = (1 << 1),
-	VG_OCL_NV_VERBOSE           = (1 << 2),
-	VG_OCL_BROKEN               = (1 << 3),
-	VG_OCL_NO_BINARIES          = (1 << 4),
+	VG_OCL_DEEP_VLIW            = (1 << 2),
+	VG_OCL_NV_VERBOSE           = (1 << 3),
+	VG_OCL_BROKEN               = (1 << 4),
+	VG_OCL_NO_BINARIES          = (1 << 5),

 	VG_OCL_OPTIMIZATIONS        = (VG_OCL_UNROLL_LOOPS |
-				       VG_OCL_EXPENSIVE_BRANCHES),
+				       VG_OCL_EXPENSIVE_BRANCHES |
+				       VG_OCL_DEEP_VLIW),

 };

@ -404,7 +406,8 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
 		quirks |= VG_OCL_UNROLL_LOOPS;

 	vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_VENDOR);
-	if (!strcmp(vend, "NVIDIA Corporation")) {
+	if (!strcmp(vend, "NVIDIA Corporation") ||
+	    !strcmp(vend, "NVIDIA")) {
 		quirks |= VG_OCL_NV_VERBOSE;
 #ifdef WIN32
 		if (strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
@ -420,9 +423,9 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
 	} else if (!strcmp(vend, "Advanced Micro Devices, Inc.") ||
 		   !strcmp(vend, "AMD")) {
 		quirks |= VG_OCL_EXPENSIVE_BRANCHES;
-		if (!strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
-						 CL_DEVICE_NAME),
-			    "ATI RV710")) {
+		quirks |= VG_OCL_DEEP_VLIW;
+		vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
+		if (!strcmp(vend, "ATI RV710")) {
 			quirks &= ~VG_OCL_OPTIMIZATIONS;
 			quirks |= VG_OCL_NO_BINARIES;
 		}
@ -737,6 +740,9 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
 	if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
 		end += snprintf(optbuf + end, sizeof(optbuf) - end,
 				"-DVERY_EXPENSIVE_BRANCHES ");
+	if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-DDEEP_VLIW ");
 	if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
 		end += snprintf(optbuf + end, sizeof(optbuf) - end,
 				"-cl-nv-verbose ");