Browse Source

Add implementation of bignum add/subtract better suited to AMD Radeon.

master
samr7 13 years ago
parent
commit
7ba90d3086
  1. 137
      calc_addrs.cl
  2. 22
      oclvanitygen.c

137
calc_addrs.cl

@ -234,7 +234,7 @@ bn_neg(bignum *n) @@ -234,7 +234,7 @@ bn_neg(bignum *n)
} while (0)
bn_word
bn_uadd(bignum *r, bignum *a, bignum *b)
bn_uadd_seq(bignum *r, bignum *a, bignum *b)
{
bn_word t, c = 0;
int i;
@ -248,7 +248,7 @@ bn_uadd(bignum *r, bignum *a, bignum *b) @@ -248,7 +248,7 @@ bn_uadd(bignum *r, bignum *a, bignum *b)
}
bn_word
bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
bn_uadd_c_seq(bignum *r, bignum *a, __constant bn_word *b)
{
bn_word t, c = 0;
int i;
@ -275,7 +275,7 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b) @@ -275,7 +275,7 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
} while (0)
bn_word
bn_usub(bignum *r, bignum *a, bignum *b)
bn_usub_seq(bignum *r, bignum *a, bignum *b)
{
bn_word t, c = 0;
int i;
@ -289,7 +289,7 @@ bn_usub(bignum *r, bignum *a, bignum *b) @@ -289,7 +289,7 @@ bn_usub(bignum *r, bignum *a, bignum *b)
}
bn_word
bn_usub_c(bignum *r, bignum *a, __constant bn_word *b)
bn_usub_c_seq(bignum *r, bignum *a, __constant bn_word *b)
{
bn_word t, c = 0;
int i;
@ -302,6 +302,135 @@ bn_usub_c(bignum *r, bignum *a, __constant bn_word *b) @@ -302,6 +302,135 @@ bn_usub_c(bignum *r, bignum *a, __constant bn_word *b)
return c;
}
/*
* Add/subtract better suited for AMD's VLIW architecture
*/
/*
 * bn_uadd_vliw: r = a + b, word-parallel formulation.
 *
 * d[0] is the least significant word (carries move from index i to i+1).
 * Returns the carry out of the most significant word.
 *
 * The per-word sums are staged in a temporary and every read of a and b
 * happens before the first write to r.
 *
 * NOTE(review): the carry masks hold one bit per word plus a carry-out
 * bit, so this assumes BN_NWORDS is smaller than the bit width of
 * bn_word -- confirm against the bignum definition.
 */
bn_word
bn_uadd_vliw(bignum *r, bignum *a, bignum *b)
{
	bignum x;
	bn_word c = 0, cp = 0;
	int i;

	/* Independent per-word sums, ignoring carries for now. */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 0; i < BN_NWORDS; i++)
		x.d[i] = a->d[i] + b->d[i];

	/*
	 * Bit i of c: word i generated a carry (the sum wrapped).
	 * Bit i of cp: word i is all ones and would pass an incoming carry on.
	 * A word cannot both wrap and end up all ones, so the masks are disjoint.
	 */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 0; i < BN_NWORDS; i++) {
		c |= (a->d[i] > x.d[i]) ? ((bn_word)1 << i) : 0;
		cp |= (!~x.d[i]) ? ((bn_word)1 << i) : 0;
	}

	/*
	 * Resolve all carries with a single add: adding the shifted generate
	 * mask to the propagate mask ripples each carry through any run of
	 * propagating words.  XOR with cp then yields the carry-in of every
	 * word: a propagating word that received a carry shows up as 0 in
	 * the sum and 1 after the XOR.  (Masking with ~cp instead would drop
	 * the carry-in of exactly those words and leave them at all ones.)
	 */
	c = (cp + (c << 1)) ^ cp;

	/* Word 0 never has a carry-in. */
	r->d[0] = x.d[0];
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 1; i < BN_NWORDS; i++)
		r->d[i] = x.d[i] + ((c >> i) & 1);

	/* Bit BN_NWORDS is the carry out of the top word. */
	return c >> BN_NWORDS;
}
/*
 * bn_uadd_c_vliw: r = a + b, where b is an array of BN_NWORDS words in
 * __constant memory; word-parallel formulation.
 *
 * d[0] / b[0] is the least significant word (carries move from index i
 * to i+1).  Returns the carry out of the most significant word.
 *
 * The per-word sums are staged in a temporary and every read of a
 * happens before the first write to r.
 *
 * NOTE(review): the carry masks hold one bit per word plus a carry-out
 * bit, so this assumes BN_NWORDS is smaller than the bit width of
 * bn_word -- confirm against the bignum definition.
 */
bn_word
bn_uadd_c_vliw(bignum *r, bignum *a, __constant bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;
	int i;

	/* Independent per-word sums, ignoring carries for now. */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 0; i < BN_NWORDS; i++)
		x.d[i] = a->d[i] + b[i];

	/*
	 * Bit i of c: word i generated a carry.  A wrapped sum is smaller
	 * than either operand, so comparing against b[i] is sufficient.
	 * Bit i of cp: word i is all ones and would pass an incoming carry on.
	 */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 0; i < BN_NWORDS; i++) {
		c |= (b[i] > x.d[i]) ? ((bn_word)1 << i) : 0;
		cp |= (!~x.d[i]) ? ((bn_word)1 << i) : 0;
	}

	/*
	 * Resolve all carries with a single add, then XOR with cp to obtain
	 * the carry-in of every word.  A propagating word that received a
	 * carry reads 0 in the sum and 1 after the XOR; masking with ~cp
	 * instead would drop that carry-in and leave the word at all ones.
	 */
	c = (cp + (c << 1)) ^ cp;

	/* Word 0 never has a carry-in. */
	r->d[0] = x.d[0];
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 1; i < BN_NWORDS; i++)
		r->d[i] = x.d[i] + ((c >> i) & 1);

	/* Bit BN_NWORDS is the carry out of the top word. */
	return c >> BN_NWORDS;
}
/*
 * bn_usub_vliw: r = a - b, word-parallel formulation.
 *
 * d[0] is the least significant word (borrows move from index i to i+1).
 * Returns the borrow out of the most significant word.
 *
 * The per-word differences are staged in a temporary and every read of
 * a and b happens before the first write to r.
 *
 * NOTE(review): the borrow masks hold one bit per word plus a borrow-out
 * bit, so this assumes BN_NWORDS is smaller than the bit width of
 * bn_word -- confirm against the bignum definition.
 */
bn_word
bn_usub_vliw(bignum *r, bignum *a, bignum *b)
{
	bignum x;
	bn_word c = 0, cp = 0;
	int i;

	/* Independent per-word differences, ignoring borrows for now. */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 0; i < BN_NWORDS; i++)
		x.d[i] = a->d[i] - b->d[i];

	/*
	 * Bit i of c: word i generated a borrow (a word < b word).
	 * Bit i of cp: word i is zero and would pass an incoming borrow on.
	 * The two conditions are mutually exclusive, so the masks are disjoint.
	 */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 0; i < BN_NWORDS; i++) {
		c |= (a->d[i] < b->d[i]) ? ((bn_word)1 << i) : 0;
		cp |= (!x.d[i]) ? ((bn_word)1 << i) : 0;
	}

	/*
	 * Resolve all borrows with a single add: adding the shifted generate
	 * mask to the propagate mask ripples each borrow through any run of
	 * zero words.  XOR with cp then yields the borrow-in of every word.
	 * (Masking with ~cp instead would drop the borrow-in of exactly the
	 * zero words, leaving them at 0 rather than all ones.)
	 */
	c = (cp + (c << 1)) ^ cp;

	/* Word 0 never has a borrow-in. */
	r->d[0] = x.d[0];
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 1; i < BN_NWORDS; i++)
		r->d[i] = x.d[i] - ((c >> i) & 1);

	/* Bit BN_NWORDS is the borrow out of the top word. */
	return c >> BN_NWORDS;
}
/*
 * bn_usub_c_vliw: r = a - b, where b is an array of BN_NWORDS words in
 * __constant memory; word-parallel formulation.
 *
 * d[0] / b[0] is the least significant word (borrows move from index i
 * to i+1).  Returns the borrow out of the most significant word.
 *
 * The per-word differences are staged in a temporary and every read of
 * a happens before the first write to r.
 *
 * NOTE(review): the borrow masks hold one bit per word plus a borrow-out
 * bit, so this assumes BN_NWORDS is smaller than the bit width of
 * bn_word -- confirm against the bignum definition.
 */
bn_word
bn_usub_c_vliw(bignum *r, bignum *a, __constant bn_word *b)
{
	bignum x;
	bn_word c = 0, cp = 0;
	int i;

	/* Independent per-word differences, ignoring borrows for now. */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 0; i < BN_NWORDS; i++)
		x.d[i] = a->d[i] - b[i];

	/*
	 * Bit i of c: word i generated a borrow (a word < b word).
	 * Bit i of cp: word i is zero and would pass an incoming borrow on.
	 */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 0; i < BN_NWORDS; i++) {
		c |= (a->d[i] < b[i]) ? ((bn_word)1 << i) : 0;
		cp |= (!x.d[i]) ? ((bn_word)1 << i) : 0;
	}

	/*
	 * Resolve all borrows with a single add, then XOR with cp to obtain
	 * the borrow-in of every word.  A zero word that received a borrow
	 * reads 0 in the sum and 1 after the XOR; masking with ~cp instead
	 * would drop that borrow-in and leave the word at 0.
	 */
	c = (cp + (c << 1)) ^ cp;

	/* Word 0 never has a borrow-in. */
	r->d[0] = x.d[0];
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (i = 1; i < BN_NWORDS; i++)
		r->d[i] = x.d[i] - ((c >> i) & 1);

	/* Bit BN_NWORDS is the borrow out of the top word. */
	return c >> BN_NWORDS;
}
/*
 * Bind the generic bn_uadd / bn_uadd_c / bn_usub / bn_usub_c names to one
 * implementation family: the *_seq variants by default, or the *_vliw
 * variants when the kernel is built with DEEP_VLIW defined.
 */
#ifndef DEEP_VLIW
#define bn_uadd		bn_uadd_seq
#define bn_uadd_c	bn_uadd_c_seq
#define bn_usub		bn_usub_seq
#define bn_usub_c	bn_usub_c_seq
#else
#define bn_uadd		bn_uadd_vliw
#define bn_uadd_c	bn_uadd_c_vliw
#define bn_usub		bn_usub_vliw
#define bn_usub_c	bn_usub_c_vliw
#endif
/*
* Modular add/sub
*/

22
oclvanitygen.c

@ -384,12 +384,14 @@ vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog) @@ -384,12 +384,14 @@ vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog)
enum {
VG_OCL_UNROLL_LOOPS = (1 << 0),
VG_OCL_EXPENSIVE_BRANCHES = (1 << 1),
VG_OCL_NV_VERBOSE = (1 << 2),
VG_OCL_BROKEN = (1 << 3),
VG_OCL_NO_BINARIES = (1 << 4),
VG_OCL_DEEP_VLIW = (1 << 2),
VG_OCL_NV_VERBOSE = (1 << 3),
VG_OCL_BROKEN = (1 << 4),
VG_OCL_NO_BINARIES = (1 << 5),
VG_OCL_OPTIMIZATIONS = (VG_OCL_UNROLL_LOOPS |
VG_OCL_EXPENSIVE_BRANCHES),
VG_OCL_EXPENSIVE_BRANCHES |
VG_OCL_DEEP_VLIW),
};
@ -404,7 +406,8 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp) @@ -404,7 +406,8 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
quirks |= VG_OCL_UNROLL_LOOPS;
vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_VENDOR);
if (!strcmp(vend, "NVIDIA Corporation")) {
if (!strcmp(vend, "NVIDIA Corporation") ||
!strcmp(vend, "NVIDIA")) {
quirks |= VG_OCL_NV_VERBOSE;
#ifdef WIN32
if (strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
@ -420,9 +423,9 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp) @@ -420,9 +423,9 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
} else if (!strcmp(vend, "Advanced Micro Devices, Inc.") ||
!strcmp(vend, "AMD")) {
quirks |= VG_OCL_EXPENSIVE_BRANCHES;
if (!strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
CL_DEVICE_NAME),
"ATI RV710")) {
quirks |= VG_OCL_DEEP_VLIW;
vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
if (!strcmp(vend, "ATI RV710")) {
quirks &= ~VG_OCL_OPTIMIZATIONS;
quirks |= VG_OCL_NO_BINARIES;
}
@ -737,6 +740,9 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did, @@ -737,6 +740,9 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-DVERY_EXPENSIVE_BRANCHES ");
if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-DDEEP_VLIW ");
if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-cl-nv-verbose ");

Loading…
Cancel
Save