Browse Source

Add some loop unrolling optimizations to the OpenCL kernel.

Add GPU idle time reporting in verbose mode.
master
samr7 14 years ago
parent
commit
e328e73d6a
  1. 110
      calc_addrs.cl
  2. 57
      oclvanitygen.c

110
calc_addrs.cl

@ -99,6 +99,9 @@ void @@ -99,6 +99,9 @@ void
bn_lshift1(bignum *bn)
{
int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = (BN_NWORDS - 1); i > 0; i--)
bn->d[i] = (bn->d[i] << 1) | (bn->d[i-1] >> 31);
bn->d[i] <<= 1;
@ -129,6 +132,9 @@ void @@ -129,6 +132,9 @@ void
bn_rshift1(bignum *bn)
{
int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < (BN_NWORDS - 1); i++)
bn->d[i] = (bn->d[i+1] << 31) | (bn->d[i] >> 1);
bn->d[i] >>= 1;
@ -143,6 +149,9 @@ int @@ -143,6 +149,9 @@ int
bn_ucmp(bignum *a, bignum *b)
{
int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = (BN_NWORDS - 1); i >= 0; i--) {
if (a->d[i] < b->d[i]) return -1;
if (a->d[i] > b->d[i]) return 1;
@ -154,6 +163,9 @@ int @@ -154,6 +163,9 @@ int
bn_ucmp_c(bignum *a, __constant bn_word *b)
{
int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = (BN_NWORDS - 1); i >= 0; i--) {
if (a->d[i] < b[i]) return -1;
if (a->d[i] > b[i]) return 1;
@ -169,6 +181,9 @@ void @@ -169,6 +181,9 @@ void
bn_neg(bignum *n)
{
	/*
	 * Negate n in place: every word is bitwise-inverted and the
	 * initial +1 is carried upward for as long as the words written
	 * so far came out as zero.
	 */
	int w;
	int carry = 1;

#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (w = 0; w < BN_NWORDS; w++) {
		n->d[w] = (~n->d[w]) + carry;
		/* A non-zero result absorbs the carry; zero keeps it going. */
		if (n->d[w] != 0)
			carry = 0;
	}
}
@ -185,7 +200,7 @@ bn_neg(bignum *n) @@ -185,7 +200,7 @@ bn_neg(bignum *n)
/*
 * Add with carry, one word: t = a + b + c, r = t, and c is replaced
 * by the carry out (0 or 1).
 *
 * The sum carried when it wrapped below a, or when a carry came in
 * and the sum landed exactly on a.  Written with bitwise operators
 * only, so no short-circuit evaluation is involved.
 */
#define bn_addc_word(r, a, b, t, c) do { \
		t = (a) + (b) + (c); \
		c = (((t) < (a)) | (((c) != 0) & ((t) == (a)))); \
		r = t; \
	} while (0)
@ -195,6 +210,9 @@ bn_uadd(bignum *r, bignum *a, bignum *b) @@ -195,6 +210,9 @@ bn_uadd(bignum *r, bignum *a, bignum *b)
bn_word t, c = 0;
int i;
bn_add_word(r->d[0], a->d[0], b->d[0], t, c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 1; i < BN_NWORDS; i++)
bn_addc_word(r->d[i], a->d[i], b->d[i], t, c);
return c;
@ -206,6 +224,9 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b) @@ -206,6 +224,9 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
bn_word t, c = 0;
int i;
bn_add_word(r->d[0], a->d[0], b[0], t, c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 1; i < BN_NWORDS; i++)
bn_addc_word(r->d[i], a->d[i], b[i], t, c);
return c;
@ -219,7 +240,7 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b) @@ -219,7 +240,7 @@ bn_uadd_c(bignum *r, bignum *a, __constant bn_word *b)
/*
 * Subtract with borrow, one word: t = a - (b + c), r = t, and c is
 * replaced by the borrow out (0 or 1).
 *
 * A borrow is needed when a < b, or when a == b and a borrow came in.
 * Testing only for a == 0 with an incoming borrow is not enough: for
 * equal non-zero words (e.g. a = b = 5, c = 1) the difference wraps
 * to 0xffffffff and the borrow must still propagate.
 *
 * c is computed before r is written, so r may alias a or b.
 * Bitwise operators are used, so no short-circuit evaluation is
 * involved.
 */
#define bn_subb_word(r, a, b, t, c) do { \
		t = (a) - ((b) + (c)); \
		c = (((a) < (b)) | (((a) == (b)) & ((c) != 0))); \
		r = t; \
	} while (0)
@ -229,6 +250,9 @@ bn_usub(bignum *r, bignum *a, bignum *b) @@ -229,6 +250,9 @@ bn_usub(bignum *r, bignum *a, bignum *b)
bn_word t, c = 0;
int i;
bn_sub_word(r->d[0], a->d[0], b->d[0], t, c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 1; i < BN_NWORDS; i++)
bn_subb_word(r->d[i], a->d[i], b->d[i], t, c);
return c;
@ -240,6 +264,9 @@ bn_usub_c(bignum *r, bignum *a, __constant bn_word *b) @@ -240,6 +264,9 @@ bn_usub_c(bignum *r, bignum *a, __constant bn_word *b)
bn_word t, c = 0;
int i;
bn_sub_word(r->d[0], a->d[0], b[0], t, c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 1; i < BN_NWORDS; i++)
bn_subb_word(r->d[i], a->d[i], b[i], t, c);
return c;
@ -302,6 +329,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b) @@ -302,6 +329,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
int i, j;
c = 0;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 0; j < BN_NWORDS; j++)
bn_mul_word(t.d[j], a->d[j], b->d[0], c, p, s);
tea = c;
@ -310,6 +340,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b) @@ -310,6 +340,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
c = 0;
m = t.d[0] * mont_n0[0];
bn_mul_add_word(t.d[0], modulus[0], m, c, p, s);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 1; j < BN_NWORDS; j++) {
bn_mul_add_word(t.d[j], modulus[j], m, c, p, s);
t.d[j-1] = t.d[j];
@ -319,6 +352,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b) @@ -319,6 +352,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
for (i = 1; i < BN_NWORDS; i++) {
c = 0;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 0; j < BN_NWORDS; j++)
bn_mul_add_word(t.d[j], a->d[j], b->d[i], c, p, s);
tea += c;
@ -327,6 +363,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b) @@ -327,6 +363,9 @@ bn_mul_mont(bignum *r, bignum *a, bignum *b)
c = 0;
m = t.d[0] * mont_n0[0];
bn_mul_add_word(t.d[0], modulus[0], m, c, p, s);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 1; j < BN_NWORDS; j++) {
bn_mul_add_word(t.d[j], modulus[j], m, c, p, s);
t.d[j-1] = t.d[j];
@ -351,15 +390,27 @@ bn_from_mont(bignum *rb, bignum *b) @@ -351,15 +390,27 @@ bn_from_mont(bignum *rb, bignum *b)
bn_word m, c, p, s;
int i, j, top;
/* Copy the input to the working area */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < BN_NWORDS; i++)
r[i] = b->d[i];
/* Zero the upper words */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = BN_NWORDS; i < WORKSIZE; i++)
r[i] = 0;
/* Multiply (long) by modulus */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < BN_NWORDS; i++) {
m = r[i] * mont_n0[0];
c = 0;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 0; j < BN_NWORDS; j++)
bn_mul_add_word(r[i+j], modulus[j], m, c, p, s);
r[BN_NWORDS + i] += c;
@ -368,12 +419,18 @@ bn_from_mont(bignum *rb, bignum *b) @@ -368,12 +419,18 @@ bn_from_mont(bignum *rb, bignum *b)
++r[BN_NWORDS + i + 2]; /* The end..? */
}
}
for (top = WORKSIZE - 1; (top > BN_NWORDS) && (r[top] == 0); top--);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (top = WORKSIZE - 1; ((top > BN_NWORDS) & (r[top] == 0)); top--);
if (top <= BN_NWORDS) {
*rb = bn_zero;
return;
}
c = 0;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (j = 0; j < BN_NWORDS; j++)
bn_subb_word(rb->d[j], r[BN_NWORDS + j], modulus[j], p, c);
if (c) {
@ -532,6 +589,9 @@ void @@ -532,6 +589,9 @@ void
sha2_256_init(uint *out)
{
	/* Fill out[0..7] with the eight words of the sha2_init table. */
	int w;

#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (w = 0; w != 8; ++w) {
		out[w] = sha2_init[w];
	}
}
@ -544,8 +604,14 @@ sha2_256_block(uint *out, uint *in) @@ -544,8 +604,14 @@ sha2_256_block(uint *out, uint *in)
{
int i;
uint state[8], s0, s1, t1, t2;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 8; i++)
state[7-i] = out[i];
#ifdef UNROLL_MAX
#pragma unroll 64
#endif
for (i = 0; i < 64; i++) {
if (i >= 16) {
/* Advance the input window */
@ -572,6 +638,9 @@ sha2_256_block(uint *out, uint *in) @@ -572,6 +638,9 @@ sha2_256_block(uint *out, uint *in)
sha2_stvar(state, i, 3) += t1;
sha2_stvar(state, i, 7) = t1 + t2;
}
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 8; i++)
out[i] += state[7-i];
}
@ -646,6 +715,9 @@ void @@ -646,6 +715,9 @@ void
ripemd160_init(uint *out)
{
	/* Fill out[0..4] with the five words of the ripemd160_iv table. */
	int w;

#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
	for (w = 0; w != 5; ++w) {
		out[w] = ripemd160_iv[w];
	}
}
@ -655,20 +727,38 @@ ripemd160_block(uint *out, uint *in) @@ -655,20 +727,38 @@ ripemd160_block(uint *out, uint *in)
{
uint vals[10], t;
int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 5; i++)
vals[i] = vals[i + 5] = out[i];
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 16; i++)
ripemd160_round(i, in, vals,
ripemd160_f0, ripemd160_f4, t);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 16; i < 32; i++)
ripemd160_round(i, in, vals,
ripemd160_f1, ripemd160_f3, t);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 32; i < 48; i++)
ripemd160_round(i, in, vals,
ripemd160_f2, ripemd160_f2, t);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 48; i < 64; i++)
ripemd160_round(i, in, vals,
ripemd160_f3, ripemd160_f1, t);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 64; i < 80; i++)
ripemd160_round(i, in, vals,
ripemd160_f4, ripemd160_f0, t);
@ -686,6 +776,7 @@ ripemd160_block(uint *out, uint *in) @@ -686,6 +776,7 @@ ripemd160_block(uint *out, uint *in)
(((v) << 8) & 0xff0000) | ((v) << 24))
#if 0
__kernel void
calc_addrs(__global uint *hashes_out,
__global bignum *z_heap, __global bignum *point_tmp,
@ -853,6 +944,7 @@ calc_addrs(__global uint *hashes_out, @@ -853,6 +944,7 @@ calc_addrs(__global uint *hashes_out,
}
}
#endif
__kernel void
ec_add_grid(__global bignum *points_out, __global bignum *z_heap,
@ -969,6 +1061,9 @@ hash_ec_point(__global uint *hashes_out, @@ -969,6 +1061,9 @@ hash_ec_point(__global uint *hashes_out,
bn_from_mont(&p, &p);
wh = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < BN_NWORDS; o++) {
wl = wh;
wh = p.d[(BN_NWORDS - 1) - o];
@ -980,6 +1075,9 @@ hash_ec_point(__global uint *hashes_out, @@ -980,6 +1075,9 @@ hash_ec_point(__global uint *hashes_out,
bn_mul_mont(&p, &p, &a); /* Y / Z^3 */
bn_from_mont(&p, &p);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < BN_NWORDS; o++) {
wl = wh;
wh = p.d[(BN_NWORDS - 1) - o];
@ -1018,6 +1116,9 @@ hash_ec_point(__global uint *hashes_out, @@ -1018,6 +1116,9 @@ hash_ec_point(__global uint *hashes_out,
* Unfortunately, SHA-2 outputs big-endian, but
* RIPEMD160 expects little-endian. Need to swap!
*/
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < 8; o++)
hash2[o] = bswap32(hash2[o]);
hash2[8] = bswap32(0x80000000);
@ -1031,6 +1132,9 @@ hash_ec_point(__global uint *hashes_out, @@ -1031,6 +1132,9 @@ hash_ec_point(__global uint *hashes_out,
ripemd160_init(hash1);
ripemd160_block(hash1, hash2);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < 5; o++)
hashes_out[o] = hash1[o];
}

57
oclvanitygen.c

@ -223,8 +223,8 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did) @@ -223,8 +223,8 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did)
if (!vg_ocl_load_program(vcp, vocp,
"calc_addrs.cl",
//"-cl-nv-verbose -cl-nv-maxrregcount=32"
NULL)) {
//"-cl-nv-verbose -cl-nv-maxrregcount=32 "
"-DUNROLL_MAX=16")) {
printf("Could not load kernel\n");
return 0;
}
@ -498,13 +498,29 @@ vg_ocl_put_point(unsigned char *buf, EC_POINT *ppnt) @@ -498,13 +498,29 @@ vg_ocl_put_point(unsigned char *buf, EC_POINT *ppnt)
memcpy(buf + 32, ppnt->Y.d, 32);
}
/*
 * Print how much wall-clock time has passed since *tv.
 *
 * tv:    timestamp to measure from.
 * place: label printed in front of the elapsed time.
 *
 * Writes "<place> spent <seconds>.<microseconds>s" to stdout.
 */
void
show_elapsed(struct timeval *tv, const char *place)
{
	struct timeval now, delta;

	gettimeofday(&now, NULL);
	timersub(&now, tv, &delta);
	/*
	 * tv_sec and tv_usec are time_t and suseconds_t, which are not
	 * guaranteed to be long; cast so the arguments match %ld.
	 */
	printf("%s spent %ld.%06lds\n", place,
	       (long) delta.tv_sec, (long) delta.tv_usec);
}
void *
vg_opencl_thread(void *arg)
{
vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
vg_context_t *vcp = vocp->base.vxc_vc;
int halt = 0;
int slot = -1;
int rows, cols;
unsigned long long idleu, busyu;
double pidle;
struct timeval tv, tvt, tvd, idle, busy;
memset(&idle, 0, sizeof(idle));
memset(&busy, 0, sizeof(busy));
while (1) {
pthread_mutex_lock(&vocp->voc_lock);
@ -520,10 +536,17 @@ vg_opencl_thread(void *arg) @@ -520,10 +536,17 @@ vg_opencl_thread(void *arg)
}
if (vocp->voc_halt)
break;
while (vocp->voc_ocl_slot == -1) {
pthread_cond_wait(&vocp->voc_wait, &vocp->voc_lock);
if (vocp->voc_halt)
goto out;
if (vocp->voc_ocl_slot == -1) {
gettimeofday(&tv, NULL);
while (vocp->voc_ocl_slot == -1) {
pthread_cond_wait(&vocp->voc_wait,
&vocp->voc_lock);
if (vocp->voc_halt)
goto out;
}
gettimeofday(&tvt, NULL);
timersub(&tvt, &tv, &tvd);
timeradd(&tvd, &idle, &idle);
}
assert(!vocp->voc_rekey);
assert(!vocp->voc_halt);
@ -532,11 +555,31 @@ vg_opencl_thread(void *arg) @@ -532,11 +555,31 @@ vg_opencl_thread(void *arg)
cols = vocp->voc_ocl_cols;
pthread_mutex_unlock(&vocp->voc_lock);
gettimeofday(&tv, NULL);
if (!vg_ocl_kernel_start(vocp, slot, cols, rows))
halt = 1;
if (!vg_ocl_kernel_wait(vocp, slot))
halt = 1;
gettimeofday(&tvt, NULL);
timersub(&tvt, &tv, &tvd);
timeradd(&tvd, &busy, &busy);
if ((vcp->vc_verbose > 1) &&
((busy.tv_sec + idle.tv_sec) > 1)) {
idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
pidle = ((double) idleu) / (idleu + busyu);
if (pidle > 0.05) {
printf("\rGPU idle: %.2f%%"
" "
" \n",
100 * pidle);
}
memset(&idle, 0, sizeof(idle));
memset(&busy, 0, sizeof(busy));
}
}
out:
pthread_mutex_unlock(&vocp->voc_lock);
@ -590,7 +633,7 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize) @@ -590,7 +633,7 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize)
batchsize = 256;
if (!worksize)
worksize = 512;
worksize = 4096;
nslots = 2;
slot = 0;

Loading…
Cancel
Save