From 7ba90d3086588b7ba58c185977ce884d41f1beab Mon Sep 17 00:00:00 2001
From: samr7
Date: Wed, 3 Aug 2011 22:52:13 -0700
Subject: [PATCH] Add implementation of bignum add/subtract better suited to AMD Radeon.

---
Review note: the VLIW routines resolve carries with "^ cp" instead of
"& ~cp", so that a word which passes a carry/borrow along is itself
adjusted.  Comments were added to the new routines, and the hunk and
diffstat counts below reflect them.  The blob ids on the index lines are
those of the original commit.

 calc_addrs.cl  | 162 +++++++++++++++++++++++++++++++++++++++++++++++--
 oclvanitygen.c |  22 +++++---
 2 files changed, 172 insertions(+), 12 deletions(-)

diff --git a/calc_addrs.cl b/calc_addrs.cl
index db90325..78a87ba 100644
--- a/calc_addrs.cl
+++ b/calc_addrs.cl
@@ -234,7 +234,7 @@ bn_neg(bignum *n)
 	} while (0)
 
 bn_word
-bn_uadd(bignum *r, bignum *a, bignum *b)
+bn_uadd_seq(bignum *r, bignum *a, bignum *b)
 {
 	bn_word t, c = 0;
 	int i;
@@ -248,7 +248,7 @@ bn_uadd(bignum *r, bignum *a, bignum *b)
 }
 
 bn_word
-bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
+bn_uadd_c_seq(bignum *r, bignum *a, __constant bn_word *b)
 {
 	bn_word t, c = 0;
 	int i;
@@ -275,7 +275,7 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
 	} while (0)
 
 bn_word
-bn_usub(bignum *r, bignum *a, bignum *b)
+bn_usub_seq(bignum *r, bignum *a, bignum *b)
 {
 	bn_word t, c = 0;
 	int i;
@@ -289,7 +289,7 @@ bn_usub(bignum *r, bignum *a, bignum *b)
 }
 
 bn_word
-bn_usub_c(bignum *r, bignum *a, __constant bn_word *b)
+bn_usub_c_seq(bignum *r, bignum *a, __constant bn_word *b)
 {
 	bn_word t, c = 0;
 	int i;
@@ -302,6 +302,160 @@ bn_usub_c(bignum *r, bignum *a, __constant bn_word *b)
 	return c;
 }
 
+/*
+ * Add/subtract better suited for AMD's VLIW architecture
+ *
+ * Rather than rippling a carry from word to word, these compute every
+ * word-wise sum/difference independently, then record one bit per word
+ * in two masks: "c" for words that produce a carry/borrow by
+ * themselves, and "cp" for words that would pass an incoming
+ * carry/borrow along.  Adding (c << 1) to cp ripples each carry through
+ * any run of passing words in a single integer add, and XORing the
+ * result with cp leaves bit i set exactly when word i has a carry/borrow
+ * coming in.  Bit BN_NWORDS of that mask is the carry/borrow out.
+ */
+/* r = a + b; returns the carry out of the last word (0 or 1). */
+bn_word
+bn_uadd_vliw(bignum *r, bignum *a, bignum *b)
+{
+	bignum x;
+	bn_word c = 0, cp = 0;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++)
+		x.d[i] = a->d[i] + b->d[i];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++) {
+		/* generate: the sum wrapped, so it is below its addend */
+		c |= (a->d[i] > x.d[i]) ? (1 << i) : 0;
+		/* propagate: an all-ones sum wraps on any carry in */
+		cp |= (!~x.d[i]) ? (1 << i) : 0;
+	}
+	/* bit i set <=> word i receives a carry in */
+	c = ((cp + (c << 1)) ^ cp);
+	r->d[0] = x.d[0];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 1; i < BN_NWORDS; i++)
+		r->d[i] = x.d[i] + ((c >> i) & 1);
+	return c >> BN_NWORDS;
+}
+
+/* r = a + b, with b in __constant memory; returns the carry out. */
+bn_word
+bn_uadd_c_vliw(bignum *r, bignum *a, __constant bn_word *b)
+{
+	bignum x;
+	bn_word c = 0, cp = 0;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++)
+		x.d[i] = a->d[i] + b[i];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++) {
+		/* generate: the sum wrapped, so it is below its addend */
+		c |= (b[i] > x.d[i]) ? (1 << i) : 0;
+		/* propagate: an all-ones sum wraps on any carry in */
+		cp |= (!~x.d[i]) ? (1 << i) : 0;
+	}
+	/* bit i set <=> word i receives a carry in */
+	c = ((cp + (c << 1)) ^ cp);
+	r->d[0] = x.d[0];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 1; i < BN_NWORDS; i++)
+		r->d[i] = x.d[i] + ((c >> i) & 1);
+	return c >> BN_NWORDS;
+}
+
+/* r = a - b; returns the borrow out of the last word (0 or 1). */
+bn_word
+bn_usub_vliw(bignum *r, bignum *a, bignum *b)
+{
+	bignum x;
+	bn_word c = 0, cp = 0;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++)
+		x.d[i] = a->d[i] - b->d[i];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++) {
+		/* generate: this word needs a borrow by itself */
+		c |= (a->d[i] < b->d[i]) ? (1 << i) : 0;
+		/* propagate: a zero difference wraps on any borrow in */
+		cp |= (!x.d[i]) ? (1 << i) : 0;
+	}
+	/* bit i set <=> word i receives a borrow in */
+	c = ((cp + (c << 1)) ^ cp);
+	r->d[0] = x.d[0];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 1; i < BN_NWORDS; i++)
+		r->d[i] = x.d[i] - ((c >> i) & 1);
+	return c >> BN_NWORDS;
+}
+
+/* r = a - b, with b in __constant memory; returns the borrow out. */
+bn_word
+bn_usub_c_vliw(bignum *r, bignum *a, __constant bn_word *b)
+{
+	bignum x;
+	bn_word c = 0, cp = 0;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++)
+		x.d[i] = a->d[i] - b[i];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < BN_NWORDS; i++) {
+		/* generate: this word needs a borrow by itself */
+		c |= (a->d[i] < b[i]) ? (1 << i) : 0;
+		/* propagate: a zero difference wraps on any borrow in */
+		cp |= (!x.d[i]) ? (1 << i) : 0;
+	}
+	/* bit i set <=> word i receives a borrow in */
+	c = ((cp + (c << 1)) ^ cp);
+	r->d[0] = x.d[0];
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 1; i < BN_NWORDS; i++)
+		r->d[i] = x.d[i] - ((c >> i) & 1);
+	return c >> BN_NWORDS;
+}
+
+
+#if defined(DEEP_VLIW)
+#define bn_uadd bn_uadd_vliw
+#define bn_uadd_c bn_uadd_c_vliw
+#define bn_usub bn_usub_vliw
+#define bn_usub_c bn_usub_c_vliw
+#else
+#define bn_uadd bn_uadd_seq
+#define bn_uadd_c bn_uadd_c_seq
+#define bn_usub bn_usub_seq
+#define bn_usub_c bn_usub_c_seq
+#endif
+
+
 /*
  * Modular add/sub
  */
diff --git a/oclvanitygen.c b/oclvanitygen.c
index 970e541..4b26c5f 100644
--- a/oclvanitygen.c
+++ b/oclvanitygen.c
@@ -384,12 +384,14 @@ vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog)
 enum {
 	VG_OCL_UNROLL_LOOPS = (1 << 0),
 	VG_OCL_EXPENSIVE_BRANCHES = (1 << 1),
-	VG_OCL_NV_VERBOSE = (1 << 2),
-	VG_OCL_BROKEN = (1 << 3),
-	VG_OCL_NO_BINARIES = (1 << 4),
+	VG_OCL_DEEP_VLIW = (1 << 2),
+	VG_OCL_NV_VERBOSE = (1 << 3),
+	VG_OCL_BROKEN = (1 << 4),
+	VG_OCL_NO_BINARIES = (1 << 5),
 
 	VG_OCL_OPTIMIZATIONS = (VG_OCL_UNROLL_LOOPS |
-				VG_OCL_EXPENSIVE_BRANCHES),
+				VG_OCL_EXPENSIVE_BRANCHES |
+				VG_OCL_DEEP_VLIW),
 
 };
 
@@ -404,7 +406,8 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
 	quirks |= VG_OCL_UNROLL_LOOPS;
 
 	vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_VENDOR);
-	if (!strcmp(vend, "NVIDIA Corporation")) {
+	if (!strcmp(vend, "NVIDIA Corporation") ||
+	    !strcmp(vend, "NVIDIA")) {
 		quirks |= VG_OCL_NV_VERBOSE;
 #ifdef WIN32
 		if (strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
@@ -420,9 +423,9 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
 	} else if (!strcmp(vend, "Advanced Micro Devices, Inc.") ||
 		   !strcmp(vend, "AMD")) {
 		quirks |= VG_OCL_EXPENSIVE_BRANCHES;
-		if (!strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
-						 CL_DEVICE_NAME),
-			    "ATI RV710")) {
+		quirks |= VG_OCL_DEEP_VLIW;
+		vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
+		if (!strcmp(vend, "ATI RV710")) {
 			quirks &= ~VG_OCL_OPTIMIZATIONS;
 			quirks |= VG_OCL_NO_BINARIES;
 		}
@@ -737,6 +740,9 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
 	if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
 		end += snprintf(optbuf + end, sizeof(optbuf) - end,
 				"-DVERY_EXPENSIVE_BRANCHES ");
+	if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-DDEEP_VLIW ");
 	if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
 		end += snprintf(optbuf + end, sizeof(optbuf) - end,
 				"-cl-nv-verbose ");