Browse Source

Add explicit preprocessor unrolling.

Limit use of #pragma unroll to NVIDIA platforms.
master
samr7 13 years ago
parent
commit
933a020205
  1. 1127
      calc_addrs.cl
  2. 44
      oclvanitygen.c

1127
calc_addrs.cl

File diff suppressed because it is too large Load Diff

44
oclvanitygen.c

@ -384,15 +384,17 @@ vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog)
*/ */
enum { enum {
VG_OCL_UNROLL_LOOPS = (1 << 0), VG_OCL_DEEP_PREPROC_UNROLL = (1 << 0),
VG_OCL_EXPENSIVE_BRANCHES = (1 << 1), VG_OCL_PRAGMA_UNROLL = (1 << 1),
VG_OCL_DEEP_VLIW = (1 << 2), VG_OCL_EXPENSIVE_BRANCHES = (1 << 2),
VG_OCL_AMD_BFI_INT = (1 << 3), VG_OCL_DEEP_VLIW = (1 << 3),
VG_OCL_NV_VERBOSE = (1 << 4), VG_OCL_AMD_BFI_INT = (1 << 4),
VG_OCL_BROKEN = (1 << 5), VG_OCL_NV_VERBOSE = (1 << 5),
VG_OCL_NO_BINARIES = (1 << 6), VG_OCL_BROKEN = (1 << 6),
VG_OCL_NO_BINARIES = (1 << 7),
VG_OCL_OPTIMIZATIONS = (VG_OCL_UNROLL_LOOPS |
VG_OCL_OPTIMIZATIONS = (VG_OCL_DEEP_PREPROC_UNROLL |
VG_OCL_PRAGMA_UNROLL |
VG_OCL_EXPENSIVE_BRANCHES | VG_OCL_EXPENSIVE_BRANCHES |
VG_OCL_DEEP_VLIW | VG_OCL_DEEP_VLIW |
VG_OCL_AMD_BFI_INT), VG_OCL_AMD_BFI_INT),
@ -406,13 +408,18 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
const char *dvn; const char *dvn;
unsigned int quirks = 0; unsigned int quirks = 0;
/* Loop unrolling for devices other than CPUs */ quirks |= VG_OCL_DEEP_PREPROC_UNROLL;
if (!(vg_ocl_device_gettype(vocp->voc_ocldid) & CL_DEVICE_TYPE_CPU))
quirks |= VG_OCL_UNROLL_LOOPS;
vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID); vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID);
switch (vend) { switch (vend) {
case 0x10de: /* NVIDIA */ case 0x10de: /* NVIDIA */
/*
* NVIDIA's compiler seems to take a really really long
* time when using preprocessor unrolling, but works
* well with pragma unroll.
*/
quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL;
quirks |= VG_OCL_PRAGMA_UNROLL;
quirks |= VG_OCL_NV_VERBOSE; quirks |= VG_OCL_NV_VERBOSE;
#ifdef WIN32 #ifdef WIN32
if (strcmp(vg_ocl_device_getstr(vocp->voc_ocldid, if (strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
@ -427,6 +434,12 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
#endif #endif
break; break;
case 0x1002: /* AMD/ATI */ case 0x1002: /* AMD/ATI */
/*
* AMD's compiler works best with preprocessor unrolling.
* Pragma unroll is unreliable with AMD's compiler and
* seems to crash based on whether the gods were smiling
* when Catalyst was last installed/upgraded.
*/
if (vg_ocl_device_gettype(vocp->voc_ocldid) & if (vg_ocl_device_gettype(vocp->voc_ocldid) &
CL_DEVICE_TYPE_GPU) { CL_DEVICE_TYPE_GPU) {
quirks |= VG_OCL_EXPENSIVE_BRANCHES; quirks |= VG_OCL_EXPENSIVE_BRANCHES;
@ -896,9 +909,12 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
end = 0; end = 0;
optbuf[end] = '\0'; optbuf[end] = '\0';
if (vocp->voc_quirks & VG_OCL_UNROLL_LOOPS) if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL)
end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-DDEEP_PREPROC_UNROLL ");
if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL)
end += snprintf(optbuf + end, sizeof(optbuf) - end, end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-DUNROLL_MAX=16 "); "-DPRAGMA_UNROLL ");
if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES) if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
end += snprintf(optbuf + end, sizeof(optbuf) - end, end += snprintf(optbuf + end, sizeof(optbuf) - end,
"-DVERY_EXPENSIVE_BRANCHES "); "-DVERY_EXPENSIVE_BRANCHES ");

Loading…
Cancel
Save