Browse Source

Add some loop unrolling optimizations to the OpenCL kernel.

Add GPU idle time reporting in verbose mode.
master
samr7 14 years ago
parent
commit
e328e73d6a
  1. 110
      calc_addrs.cl
  2. 57
      oclvanitygen.c

110
calc_addrs.cl

@ -99,6 +99,9 @@ void
bn_lshift1(bignum *bn) bn_lshift1(bignum *bn)
{ {
int i; int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = (BN_NWORDS - 1); i > 0; i--) for (i = (BN_NWORDS - 1); i > 0; i--)
bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31); bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31);
bn->d[i] <<= 1; bn->d[i] <<= 1;
@ -129,6 +132,9 @@ void
bn_rshift1(bignum *bn) bn_rshift1(bignum *bn)
{ {
int i; int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < (BN_NWORDS - 1); i++) for (i = 0; i < (BN_NWORDS - 1); i++)
bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1); bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1);
bn->d[i] >>= 1; bn->d[i] >>= 1;
@ -143,6 +149,9 @@ int
bn_ucmp(bignum *a, bignum *b) bn_ucmp(bignum *a, bignum *b)
{ {
int i; int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = (BN_NWORDS - 1); i >= 0; i--) { for (i = (BN_NWORDS - 1); i >= 0; i--) {
if (a->d[i] < b->d[i]) return -1; if (a->d[i] < b->d[i]) return -1;
if (a->d[i] > b->d[i]) return 1; if (a->d[i] > b->d[i]) return 1;
@ -154,6 +163,9 @@ int
bn_ucmp_c(bignum *a, __constant bn_word *b) bn_ucmp_c(bignum *a, __constant bn_word *b)
{ {
int i; int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = (BN_NWORDS - 1); i >= 0; i--) { for (i = (BN_NWORDS - 1); i >= 0; i--) {
if (a->d[i] < b[i]) return -1; if (a->d[i] < b[i]) return -1;
if (a->d[i] > b[i]) return 1; if (a->d[i] > b[i]) return 1;
@ -169,6 +181,9 @@ void
bn_neg(bignum *n) bn_neg(bignum *n)
{ {
int i, c; int i, c;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0, c = 1; i < BN_NWORDS; i++) for (i = 0, c = 1; i < BN_NWORDS; i++)
c = (n->d[i] = (~n->d[i]) + c) ? 0 : c; c = (n->d[i] = (~n->d[i]) + c) ? 0 : c;
} }
@ -185,7 +200,7 @@ bn_neg(bignum *n)
#define bn_addc_word(r, a, b, t, c) do { \ #define bn_addc_word(r, a, b, t, c) do { \
t = a + b + c; \ t = a + b + c; \
c = (t < a) ? 1 : ((c && (t == a)) ? 1 : 0); \ c = (t < a) ? 1 : ((c & (t == a)) ? 1 : 0); \
r = t; \ r = t; \
} while (0) } while (0)
@ -195,6 +210,9 @@ bn_uadd(bignum *r, bignum *a, bignum *b)
bn_word t, c = 0; bn_word t, c = 0;
int i; int i;
bn_add_word(r->d[0], a->d[0], b->d[0], t, c); bn_add_word(r->d[0], a->d[0], b->d[0], t, c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 1; i < BN_NWORDS; i++) for (i = 1; i < BN_NWORDS; i++)
bn_addc_word(r->d[i], a->d[i], b->d[i], t, c); bn_addc_word(r->d[i], a->d[i], b->d[i], t, c);
return c; return c;
@ -206,6 +224,9 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
bn_word t, c = 0; bn_word t, c = 0;
int i; int i;
bn_add_word(r->d[0], a->d[0], b[0], t, c); bn_add_word(r->d[0], a->d[0], b[0], t, c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 1; i < BN_NWORDS; i++) for (i = 1; i < BN_NWORDS; i++)
bn_addc_word(r->d[i], a->d[i], b[i], t, c); bn_addc_word(r->d[i], a->d[i], b[i], t, c);
return c; return c;
@ -219,7 +240,7 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
#define bn_subb_word(r, a, b, t, c) do { \ #define bn_subb_word(r, a, b, t, c) do { \
t = a - (b + c); \ t = a - (b + c); \
c = ((a < b) || (!a && c)) ? 1 : 0; \ c = (a < b) ? 1 : (((!a) & c) ? 1 : 0); \
r = t; \ r = t; \
} while (0) } while (0)
@ -229,6 +250,9 @@ bn_usub(bignum *r, bignum *a, bignum *b)
bn_word t, c = 0; bn_word t, c = 0;
int i; int i;
bn_sub_word(r->d[0], a->d[0], b->d[0], t, c); bn_sub_word(r->d[0], a->d[0], b->d[0], t, c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 1; i < BN_NWORDS; i++) for (i = 1; i < BN_NWORDS; i++)
bn_subb_word(r->d[i], a->d[i], b->d[i], t, c); bn_subb_word(r->d[i], a->d[i], b->d[i], t, c);
return c; return c;
@ -240,6 +264,9 @@ bn_usub_c(bignum *r, bignum *a, __constant bn_word *b)
bn_word t, c = 0; bn_word t, c = 0;
int i; int i;
bn_sub_word(r->d[0], a->d[0], b[0], t, c); bn_sub_word(r->d[0], a->d[0], b[0], t, c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 1; i < BN_NWORDS; i++) for (i = 1; i < BN_NWORDS; i++)
bn_subb_word(r->d[i], a->d[i], b[i], t, c); bn_subb_word(r->d[i], a->d[i], b[i], t, c);
return c; return c;
@ -302,6 +329,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
int i, j; int i, j;
c = 0; c = 0;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 0; j < BN_NWORDS; j++) for (j = 0; j < BN_NWORDS; j++)
bn_mul_word(t.d[j], a->d[j], b->d[0], c, p, s); bn_mul_word(t.d[j], a->d[j], b->d[0], c, p, s);
tea = c; tea = c;
@ -310,6 +340,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
c = 0; c = 0;
m = t.d[0] * mont_n0[0]; m = t.d[0] * mont_n0[0];
bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); bn_mul_add_word(t.d[0], modulus[0], m, c, p, s);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 1; j < BN_NWORDS; j++) { for (j = 1; j < BN_NWORDS; j++) {
bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); bn_mul_add_word(t.d[j], modulus[j], m, c, p, s);
t.d[j-1] = t.d[j]; t.d[j-1] = t.d[j];
@ -319,6 +352,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
for (i = 1; i < BN_NWORDS; i++) { for (i = 1; i < BN_NWORDS; i++) {
c = 0; c = 0;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 0; j < BN_NWORDS; j++) for (j = 0; j < BN_NWORDS; j++)
bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s); bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s);
tea += c; tea += c;
@ -327,6 +363,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
c = 0; c = 0;
m = t.d[0] * mont_n0[0]; m = t.d[0] * mont_n0[0];
bn_mul_add_word(t.d[0], modulus[0], m, c, p, s); bn_mul_add_word(t.d[0], modulus[0], m, c, p, s);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 1; j < BN_NWORDS; j++) { for (j = 1; j < BN_NWORDS; j++) {
bn_mul_add_word(t.d[j], modulus[j], m, c, p, s); bn_mul_add_word(t.d[j], modulus[j], m, c, p, s);
t.d[j-1] = t.d[j]; t.d[j-1] = t.d[j];
@ -351,15 +390,27 @@ bn_from_mont(bignum *rb, bignum *b)
bn_word m, c, p, s; bn_word m, c, p, s;
int i, j, top; int i, j, top;
/* Copy the input to the working area */ /* Copy the input to the working area */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < BN_NWORDS; i++) for (i = 0; i < BN_NWORDS; i++)
r[i] = b->d[i]; r[i] = b->d[i];
/* Zero the upper words */ /* Zero the upper words */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = BN_NWORDS; i < WORKSIZE; i++) for (i = BN_NWORDS; i < WORKSIZE; i++)
r[i] = 0; r[i] = 0;
/* Multiply (long) by modulus */ /* Multiply (long) by modulus */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < BN_NWORDS; i++) { for (i = 0; i < BN_NWORDS; i++) {
m = r[i] * mont_n0[0]; m = r[i] * mont_n0[0];
c = 0; c = 0;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 0; j < BN_NWORDS; j++) for (j = 0; j < BN_NWORDS; j++)
bn_mul_add_word(r[i+j], modulus[j], m, c, p, s); bn_mul_add_word(r[i+j], modulus[j], m, c, p, s);
r[BN_NWORDS + i] += c; r[BN_NWORDS + i] += c;
@ -368,12 +419,18 @@ bn_from_mont(bignum *rb, bignum *b)
++r[BN_NWORDS + i + 2]; /* The end..? */ ++r[BN_NWORDS + i + 2]; /* The end..? */
} }
} }
for (top = WORKSIZE - 1; (top > BN_NWORDS) && (r[top] == 0); top--); #ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (top = WORKSIZE - 1; ((top > BN_NWORDS) & (r[top] == 0)); top--);
if (top <= BN_NWORDS) { if (top <= BN_NWORDS) {
*rb = bn_zero; *rb = bn_zero;
return; return;
} }
c = 0; c = 0;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 0; j < BN_NWORDS; j++) for (j = 0; j < BN_NWORDS; j++)
bn_subb_word(rb->d[j], r[BN_NWORDS + j], modulus[j], p, c); bn_subb_word(rb->d[j], r[BN_NWORDS + j], modulus[j], p, c);
if (c) { if (c) {
@ -532,6 +589,9 @@ void
sha2_256_init(uint *out) sha2_256_init(uint *out)
{ {
int i; int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 8; i++) for (i = 0; i < 8; i++)
out[i] = sha2_init[i]; out[i] = sha2_init[i];
} }
@ -544,8 +604,14 @@ sha2_256_block(uint *out, uint *in)
{ {
int i; int i;
uint state[8], s0, s1, t1, t2; uint state[8], s0, s1, t1, t2;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 8; i++) for (i = 0; i < 8; i++)
state[7-i] = out[i]; state[7-i] = out[i];
#ifdef UNROLL_MAX
#pragma unroll 64
#endif
for (i = 0; i < 64; i++) { for (i = 0; i < 64; i++) {
if (i >= 16) { if (i >= 16) {
/* Advance the input window */ /* Advance the input window */
@ -572,6 +638,9 @@ sha2_256_block(uint *out, uint *in)
sha2_stvar(state, i, 3) += t1; sha2_stvar(state, i, 3) += t1;
sha2_stvar(state, i, 7) = t1 + t2; sha2_stvar(state, i, 7) = t1 + t2;
} }
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 8; i++) for (i = 0; i < 8; i++)
out[i] += state[7-i]; out[i] += state[7-i];
} }
@ -646,6 +715,9 @@ void
ripemd160_init(uint *out) ripemd160_init(uint *out)
{ {
int i; int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for(i = 0; i < 5; i++) for(i = 0; i < 5; i++)
out[i] = ripemd160_iv[i]; out[i] = ripemd160_iv[i];
} }
@ -655,20 +727,38 @@ ripemd160_block(uint *out, uint *in)
{ {
uint vals[10], t; uint vals[10], t;
int i; int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 5; i++) for (i = 0; i < 5; i++)
vals[i] = vals[i + 5] = out[i]; vals[i] = vals[i + 5] = out[i];
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 16; i++) for (i = 0; i < 16; i++)
ripemd160_round(i, in, vals, ripemd160_round(i, in, vals,
ripemd160_f0, ripemd160_f4, t); ripemd160_f0, ripemd160_f4, t);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 16; i < 32; i++) for (i = 16; i < 32; i++)
ripemd160_round(i, in, vals, ripemd160_round(i, in, vals,
ripemd160_f1, ripemd160_f3, t); ripemd160_f1, ripemd160_f3, t);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 32; i < 48; i++) for (i = 32; i < 48; i++)
ripemd160_round(i, in, vals, ripemd160_round(i, in, vals,
ripemd160_f2, ripemd160_f2, t); ripemd160_f2, ripemd160_f2, t);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 48; i < 64; i++) for (i = 48; i < 64; i++)
ripemd160_round(i, in, vals, ripemd160_round(i, in, vals,
ripemd160_f3, ripemd160_f1, t); ripemd160_f3, ripemd160_f1, t);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 64; i < 80; i++) for (i = 64; i < 80; i++)
ripemd160_round(i, in, vals, ripemd160_round(i, in, vals,
ripemd160_f4, ripemd160_f0, t); ripemd160_f4, ripemd160_f0, t);
@ -686,6 +776,7 @@ ripemd160_block(uint *out, uint *in)
(((v) << 8) & 0xff0000) | ((v) << 24)) (((v) << 8) & 0xff0000) | ((v) << 24))
#if 0
__kernel void __kernel void
calc_addrs(__global uint *hashes_out, calc_addrs(__global uint *hashes_out,
__global bignum *z_heap, __global bignum *point_tmp, __global bignum *z_heap, __global bignum *point_tmp,
@ -853,6 +944,7 @@ calc_addrs(__global uint *hashes_out,
} }
} }
#endif
__kernel void __kernel void
ec_add_grid(__global bignum *points_out, __global bignum *z_heap, ec_add_grid(__global bignum *points_out, __global bignum *z_heap,
@ -969,6 +1061,9 @@ hash_ec_point(__global uint *hashes_out,
bn_from_mont(&p, &p); bn_from_mont(&p, &p);
wh = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */ wh = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < BN_NWORDS; o++) { for (o = 0; o < BN_NWORDS; o++) {
wl = wh; wl = wh;
wh = p.d[(BN_NWORDS - 1) - o]; wh = p.d[(BN_NWORDS - 1) - o];
@ -980,6 +1075,9 @@ hash_ec_point(__global uint *hashes_out,
bn_mul_mont(&p, &p, &a); /* Y / Z^3 */ bn_mul_mont(&p, &p, &a); /* Y / Z^3 */
bn_from_mont(&p, &p); bn_from_mont(&p, &p);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < BN_NWORDS; o++) { for (o = 0; o < BN_NWORDS; o++) {
wl = wh; wl = wh;
wh = p.d[(BN_NWORDS - 1) - o]; wh = p.d[(BN_NWORDS - 1) - o];
@ -1018,6 +1116,9 @@ hash_ec_point(__global uint *hashes_out,
* Unfortunately, SHA-2 outputs big-endian, but * Unfortunately, SHA-2 outputs big-endian, but
* RIPEMD160 expects little-endian. Need to swap! * RIPEMD160 expects little-endian. Need to swap!
*/ */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < 8; o++) for (o = 0; o < 8; o++)
hash2[o] = bswap32(hash2[o]); hash2[o] = bswap32(hash2[o]);
hash2[8] = bswap32(0x80000000); hash2[8] = bswap32(0x80000000);
@ -1031,6 +1132,9 @@ hash_ec_point(__global uint *hashes_out,
ripemd160_init(hash1); ripemd160_init(hash1);
ripemd160_block(hash1, hash2); ripemd160_block(hash1, hash2);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < 5; o++) for (o = 0; o < 5; o++)
hashes_out[o] = hash1[o]; hashes_out[o] = hash1[o];
} }

57
oclvanitygen.c

@ -223,8 +223,8 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did)
if (!vg_ocl_load_program(vcp, vocp, if (!vg_ocl_load_program(vcp, vocp,
"calc_addrs.cl", "calc_addrs.cl",
//"-cl-nv-verbose -cl-nv-maxrregcount=32" //"-cl-nv-verbose -cl-nv-maxrregcount=32 "
NULL)) { "-DUNROLL_MAX=16")) {
printf("Could not load kernel\n"); printf("Could not load kernel\n");
return 0; return 0;
} }
@ -498,13 +498,29 @@ vg_ocl_put_point(unsigned char *buf, EC_POINT *ppnt)
memcpy(buf + 32, ppnt->Y.d, 32); memcpy(buf + 32, ppnt->Y.d, 32);
} }
void
show_elapsed(struct timeval *tv, const char *place)
{
struct timeval now, delta;
gettimeofday(&now, NULL);
timersub(&now, tv, &delta);
printf("%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec);
}
void * void *
vg_opencl_thread(void *arg) vg_opencl_thread(void *arg)
{ {
vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg; vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
vg_context_t *vcp = vocp->base.vxc_vc;
int halt = 0; int halt = 0;
int slot = -1; int slot = -1;
int rows, cols; int rows, cols;
unsigned long long idleu, busyu;
double pidle;
struct timeval tv, tvt, tvd, idle, busy;
memset(&idle, 0, sizeof(idle));
memset(&busy, 0, sizeof(busy));
while (1) { while (1) {
pthread_mutex_lock(&vocp->voc_lock); pthread_mutex_lock(&vocp->voc_lock);
@ -520,10 +536,17 @@ vg_opencl_thread(void *arg)
} }
if (vocp->voc_halt) if (vocp->voc_halt)
break; break;
while (vocp->voc_ocl_slot == -1) { if (vocp->voc_ocl_slot == -1) {
pthread_cond_wait(&vocp->voc_wait, &vocp->voc_lock); gettimeofday(&tv, NULL);
if (vocp->voc_halt) while (vocp->voc_ocl_slot == -1) {
goto out; pthread_cond_wait(&vocp->voc_wait,
&vocp->voc_lock);
if (vocp->voc_halt)
goto out;
}
gettimeofday(&tvt, NULL);
timersub(&tvt, &tv, &tvd);
timeradd(&tvd, &idle, &idle);
} }
assert(!vocp->voc_rekey); assert(!vocp->voc_rekey);
assert(!vocp->voc_halt); assert(!vocp->voc_halt);
@ -532,11 +555,31 @@ vg_opencl_thread(void *arg)
cols = vocp->voc_ocl_cols; cols = vocp->voc_ocl_cols;
pthread_mutex_unlock(&vocp->voc_lock); pthread_mutex_unlock(&vocp->voc_lock);
gettimeofday(&tv, NULL);
if (!vg_ocl_kernel_start(vocp, slot, cols, rows)) if (!vg_ocl_kernel_start(vocp, slot, cols, rows))
halt = 1; halt = 1;
if (!vg_ocl_kernel_wait(vocp, slot)) if (!vg_ocl_kernel_wait(vocp, slot))
halt = 1; halt = 1;
gettimeofday(&tvt, NULL);
timersub(&tvt, &tv, &tvd);
timeradd(&tvd, &busy, &busy);
if ((vcp->vc_verbose > 1) &&
((busy.tv_sec + idle.tv_sec) > 1)) {
idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
pidle = ((double) idleu) / (idleu + busyu);
if (pidle > 0.05) {
printf("\rGPU idle: %.2f%%"
" "
" \n",
100 * pidle);
}
memset(&idle, 0, sizeof(idle));
memset(&busy, 0, sizeof(busy));
}
} }
out: out:
pthread_mutex_unlock(&vocp->voc_lock); pthread_mutex_unlock(&vocp->voc_lock);
@ -590,7 +633,7 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize)
batchsize = 256; batchsize = 256;
if (!worksize) if (!worksize)
worksize = 512; worksize = 4096;
nslots = 2; nslots = 2;
slot = 0; slot = 0;

Loading…
Cancel
Save