Browse Source

Checkpoint development of oclvanitygen.

master
samr7 14 years ago
parent
commit
d64d355010
  1. 15
      Makefile.Win32
  2. 293
      calc_addrs.cl
  3. 422
      oclvanitygen.c
  4. 110
      pattern.c
  5. 4
      pattern.h
  6. 11
      winglue.c
  7. 2
      winglue.h

15
Makefile.Win32

@ -2,17 +2,26 @@ CC = cl @@ -2,17 +2,26 @@ CC = cl
OPENSSL_DIR = C:\OpenSSL-Win32
PTHREADS_DIR = C:\pthreads-w32
PCRE_DIR = C:\pcre-7.9-src
OPENCL_DIR = "C:\Program Files (x86)\AMD APP"
OPENCL_INCLUDE = /I$(OPENCL_DIR)\include
OPENCL_LIBS = $(OPENCL_DIR)\lib\x86\OpenCL.lib
CFLAGS = /D_WIN32 /DPTW32_STATIC_LIB /DPCRE_STATIC /I$(OPENSSL_DIR)\include /I$(PTHREADS_DIR) /I$(PCRE_DIR)
LIBS = $(OPENSSL_DIR)\lib\libeay32.lib $(PTHREADS_DIR)\pthread.lib $(PCRE_DIR)\pcre.lib ws2_32.lib
OBJS = vanitygen.obj pattern.obj winglue.obj
OBJS = vanitygen.obj oclvanitygen.obj pattern.obj winglue.obj
all: vanitygen.exe
vanitygen.exe: $(OBJS)
link /nologo /out:vanitygen.exe $(OBJS) $(LIBS)
vanitygen.exe: vanitygen.obj pattern.obj winglue.obj
link /nologo /out:$@ $** $(LIBS)
oclvanitygen.exe: oclvanitygen.obj pattern.obj winglue.obj
link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS)
.c.obj:
$(CC) $(CFLAGS) /c /Tp$< /Fo$@
oclvanitygen.obj: oclvanitygen.c
$(CC) $(CFLAGS) $(OPENCL_INCLUDE) /c /Tpoclvanitygen.c /Fo$@
clean:
del vanitygen.exe $(OBJS)

293
calc_addrs.cl

@ -42,6 +42,21 @@ @@ -42,6 +42,21 @@
* substantially reduces the cost of performing modular inversion.
*/
/* Byte-swapping and endianness */
#define bswap32(v) \
(((v) >> 24) | (((v) >> 8) & 0xff00) | \
(((v) << 8) & 0xff0000) | ((v) << 24))
#if __ENDIAN_LITTLE__ != 1
#define load_le32(v) bswap32(v)
#define load_be32(v) (v)
#else
#define load_le32(v) (v)
#define load_be32(v) bswap32(v)
#endif
/*
* BIGNUM mini-library
* This module deals with fixed-size 256-bit bignums.
@ -439,27 +454,6 @@ bn_from_mont(bignum *rb, bignum *b) @@ -439,27 +454,6 @@ bn_from_mont(bignum *rb, bignum *b)
}
}
/* Montgomery multiplication test kernel */
__kernel void
test_mul_mont(__global bignum *products_out, __global bignum *nums_in,
int count)
{
bignum x, y, tmp;
int i, o, p;
o = get_global_id(0) * count;
p = o * 2;
for (i = 0; i < count; i++) {
x = nums_in[p++];
y = nums_in[p++];
bn_mul_mont(&tmp, &x, &y);
bn_mul_mont(&tmp, &tmp, &x);
bn_mul_mont(&tmp, &tmp, &y);
bn_from_mont(&x, &tmp);
products_out[o++] = x;
}
}
/*
* Modular inversion
*/
@ -526,22 +520,6 @@ bn_mod_inverse(bignum *r, bignum *n) @@ -526,22 +520,6 @@ bn_mod_inverse(bignum *r, bignum *n)
return;
}
/* modular inversion test kernel */
__kernel void
test_mod_inverse(__global bignum *inv_out, __global bignum *nums_in,
int count)
{
bignum x, xp;
int i, o;
o = get_global_id(0) * count;
for (i = 0; i < count; i++) {
x = nums_in[o];
bn_mod_inverse(&xp, &x);
inv_out[o++] = xp;
}
}
/*
* HASH FUNCTIONS
*
@ -597,7 +575,7 @@ sha2_256_init(uint *out) @@ -597,7 +575,7 @@ sha2_256_init(uint *out)
}
/* The state variable remapping is really contorted */
#define sha2_stvar(vals, i, v) vals[(i+(7-v)) % 8]
#define sha2_stvar(vals, i, v) vals[(64+v-i) % 8]
void
sha2_256_block(uint *out, uint *in)
@ -608,7 +586,7 @@ sha2_256_block(uint *out, uint *in) @@ -608,7 +586,7 @@ sha2_256_block(uint *out, uint *in)
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 8; i++)
state[7-i] = out[i];
state[i] = out[i];
#ifdef UNROLL_MAX
#pragma unroll 64
#endif
@ -642,7 +620,7 @@ sha2_256_block(uint *out, uint *in) @@ -642,7 +620,7 @@ sha2_256_block(uint *out, uint *in)
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 8; i++)
out[i] += state[7-i];
out[i] += state[i];
}
@ -687,28 +665,36 @@ __constant uchar ripemd160_rlp[] = { @@ -687,28 +665,36 @@ __constant uchar ripemd160_rlp[] = {
8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11
};
#define ripemd160_val(v, i, n) (v)[(80+(n)-(i)) % 5]
#define ripemd160_valp(v, i, n) (v)[5 + ((80+(n)-(i)) % 5)]
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
#define ripemd160_f1(x, y, z) ((x & y) | (~x & z))
#define ripemd160_f2(x, y, z) ((x | ~y) ^ z)
#define ripemd160_f3(x, y, z) ((x & z) | (y & ~z))
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
#define ripemd160_round(i, in, vals, f, fp, t) do { \
t = rotate(vals[0] + \
f(vals[1], vals[2], vals[3]) + \
ripemd160_val(vals, i, 0) = \
rotate(ripemd160_val(vals, i, 0) + \
f(ripemd160_val(vals, i, 1), \
ripemd160_val(vals, i, 2), \
ripemd160_val(vals, i, 3)) + \
in[ripemd160_ws[i]] + \
ripemd160_k[i / 16], \
(uint)ripemd160_rl[i]) + vals[4]; \
vals[0] = vals[4]; vals[4] = vals[3]; \
vals[3] = rotate(vals[2], 10U); vals[2] = vals[1]; \
vals[1] = t; \
t = rotate(vals[5] + \
fp(vals[6], vals[7], vals[8]) + \
(uint)ripemd160_rl[i]) + \
ripemd160_val(vals, i, 4); \
ripemd160_val(vals, i, 2) = \
rotate(ripemd160_val(vals, i, 2), 10U); \
ripemd160_valp(vals, i, 0) = \
rotate(ripemd160_valp(vals, i, 0) + \
fp(ripemd160_valp(vals, i, 1), \
ripemd160_valp(vals, i, 2), \
ripemd160_valp(vals, i, 3)) + \
in[ripemd160_wsp[i]] + \
ripemd160_kp[i / 16], \
(uint)ripemd160_rlp[i]) + vals[9]; \
vals[5] = vals[9]; vals[9] = vals[8]; \
vals[8] = rotate(vals[7], 10U); vals[7] = vals[6]; \
vals[6] = t; \
(uint)ripemd160_rlp[i]) + \
ripemd160_valp(vals, i, 4); \
ripemd160_valp(vals, i, 2) = \
rotate(ripemd160_valp(vals, i, 2), 10U); \
} while (0)
void
@ -771,9 +757,41 @@ ripemd160_block(uint *out, uint *in) @@ -771,9 +757,41 @@ ripemd160_block(uint *out, uint *in)
}
#define bswap32(v) \
(((v) >> 24) | (((v) >> 8) & 0xff00) | \
(((v) << 8) & 0xff0000) | ((v) << 24))
#ifdef TEST_KERNELS
/*
* Test kernels
*/
/* Montgomery multiplication test kernel */
__kernel void
test_mul_mont(__global bignum *products_out, __global bignum *nums_in)
{
bignum a, b, c;
int o;
o = get_global_id(0);
nums_in += (2*o);
a = nums_in[0];
b = nums_in[1];
bn_mul_mont(&c, &a, &b);
products_out[o] = c;
}
/* modular inversion test kernel */
__kernel void
test_mod_inverse(__global bignum *inv_out, __global bignum *nums_in,
int count)
{
bignum x, xp;
int i, o;
o = get_global_id(0) * count;
for (i = 0; i < count; i++) {
x = nums_in[o];
bn_mod_inverse(&xp, &x);
inv_out[o++] = xp;
}
}
#endif /* TEST_KERNELS */
#if 0
@ -1028,24 +1046,13 @@ heap_invert(__global bignum *z_heap, int ncols) @@ -1028,24 +1046,13 @@ heap_invert(__global bignum *z_heap, int ncols)
}
}
__kernel void
hash_ec_point(__global uint *hashes_out,
__global bignum *points_in, __global bignum *z_heap)
void
hash_ec_point(uint *hash_out, __global bignum *xy, __global bignum *zip)
{
uint hash1[16], hash2[16];
bignum c, zi, zzi;
bn_word wh, wl;
bignum p, a, b;
int i, o;
o = get_global_size(0);
i = o * get_global_id(1);
z_heap += (i * 2);
points_in += (i * 2);
hashes_out += (i * 5);
i = get_global_id(0);
points_in += (i * 2);
hashes_out += (i * 5);
int i;
/*
* Multiply the coordinates by the inverted Z values.
@ -1054,34 +1061,34 @@ hash_ec_point(__global uint *hashes_out, @@ -1054,34 +1061,34 @@ hash_ec_point(__global uint *hashes_out,
* is big-endian, so swapping is unnecessary, but
* inserting the format byte in front causes a headache.
*/
a = z_heap[(o - 1) + i];
bn_mul_mont(&b, &a, &a); /* Z^2 */
p = points_in[0];
bn_mul_mont(&p, &p, &b); /* X / Z^2 */
bn_from_mont(&p, &p);
zi = zip[0];
bn_mul_mont(&zzi, &zi, &zi); /* 1 / Z^2 */
c = xy[0];
bn_mul_mont(&c, &c, &zzi); /* X / Z^2 */
bn_from_mont(&c, &c);
wh = 0x00000004; /* POINT_CONVERSION_UNCOMPRESSED */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < BN_NWORDS; o++) {
for (i = 0; i < BN_NWORDS; i++) {
wl = wh;
wh = p.d[(BN_NWORDS - 1) - o];
hash1[o] = (wl << 24) | (wh >> 8);
wh = c.d[(BN_NWORDS - 1) - i];
hash1[i] = (wl << 24) | (wh >> 8);
}
bn_mul_mont(&a, &a, &b); /* Z^3 */
p = points_in[1];
bn_mul_mont(&p, &p, &a); /* Y / Z^3 */
bn_from_mont(&p, &p);
bn_mul_mont(&zzi, &zzi, &zi); /* 1 / Z^3 */
c = xy[1];
bn_mul_mont(&c, &c, &zzi); /* Y / Z^3 */
bn_from_mont(&c, &c);
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < BN_NWORDS; o++) {
for (i = 0; i < BN_NWORDS; i++) {
wl = wh;
wh = p.d[(BN_NWORDS - 1) - o];
hash1[BN_NWORDS + o] = (wl << 24) | (wh >> 8);
wh = c.d[(BN_NWORDS - 1) - i];
hash1[BN_NWORDS + i] = (wl << 24) | (wh >> 8);
}
/*
@ -1119,8 +1126,8 @@ hash_ec_point(__global uint *hashes_out, @@ -1119,8 +1126,8 @@ hash_ec_point(__global uint *hashes_out,
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < 8; o++)
hash2[o] = bswap32(hash2[o]);
for (i = 0; i < 8; i++)
hash2[i] = bswap32(hash2[i]);
hash2[8] = bswap32(0x80000000);
hash2[9] = 0;
hash2[10] = 0;
@ -1129,12 +1136,116 @@ hash_ec_point(__global uint *hashes_out, @@ -1129,12 +1136,116 @@ hash_ec_point(__global uint *hashes_out,
hash2[13] = 0;
hash2[14] = 32 * 8;
hash2[15] = 0;
ripemd160_init(hash1);
ripemd160_block(hash1, hash2);
ripemd160_init(hash_out);
ripemd160_block(hash_out, hash2);
}
__kernel void
hash_ec_point_get(__global uint *hashes_out,
__global bignum *points_in, __global bignum *z_heap)
{
uint hash[5];
int i, o;
o = get_global_size(0);
i = o * get_global_id(1);
z_heap += (i * 2);
points_in += (i * 2);
hashes_out += (i * 5);
i = get_global_id(0);
points_in += (i * 2);
hashes_out += (i * 5);
/* Complete the coordinates and hash */
hash_ec_point(hash, points_in, &z_heap[i + (o - 1)]);
/* Output the hash in proper byte-order */
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (o = 0; o < 5; o++)
hashes_out[o] = hash1[o];
for (i = 0; i < 5; i++)
hashes_out[i] = load_le32(hash[i]);
}
/*
* Normally this would be one function that compared two hash160s.
* This one compares a hash160 with an upper and lower bound in one
* function to work around a problem with AMD's OpenCL compiler.
*/
int
hash160_ucmp_g(uint *a, __global uint *bound)
{
uint gv;
int i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 5; i++) {
gv = load_be32(bound[i]);
if (a[i] < gv) return -1;
if (a[i] > gv) break;
}
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 5; i++) {
gv = load_be32(bound[5+i]);
if (a[i] < gv) return 0;
if (a[i] > gv) return 1;
}
return 0;
}
__kernel void
hash_ec_point_search_prefix(__global uint *found,
__global bignum *points_in, __global bignum *z_heap,
__global uint *target_table, int ntargets)
{
uint hash[5];
int i, high, low, p;
p = get_global_size(0);
i = p * get_global_id(1);
z_heap += (i * 2);
points_in += (i * 2);
i = get_global_id(0);
points_in += (i * 2);
/* Complete the coordinates and hash */
hash_ec_point(hash, points_in, &z_heap[(p - 1) + i]);
/*
* Unconditionally byteswap the hash result, because:
* - The byte-level convention of RIPEMD160 is little-endian
* - We are comparing it in big-endian order
*/
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (i = 0; i < 5; i++)
hash[i] = bswap32(hash[i]);
/* Binary-search the target table for the hash we just computed */
for (high = ntargets - 1, low = 0, i = high >> 1;
high >= low;
i = low + ((high - low) >> 1)) {
p = hash160_ucmp_g(hash, &target_table[10*i]);
low = (p > 0) ? (i + 1) : low;
high = (p < 0) ? (i - 1) : high;
if (p == 0) {
/* For debugging purposes, write the hash value */
found[0] = ((get_global_id(1) * get_global_size(0)) +
get_global_id(0));
found[1] = i;
#ifdef UNROLL_MAX
#pragma unroll UNROLL_MAX
#endif
for (p = 0; p < 5; p++)
found[p+2] = load_be32(hash[p]);
high = -1;
}
}
}

422
oclvanitygen.c

@ -33,8 +33,16 @@ @@ -33,8 +33,16 @@
const char *version = "0.13";
const int debug = 0;
#define MAX_SLOT 2
#define MAX_ARG 6
#define MAX_KERNEL 3
/* OpenCL address searching mode */
struct _vg_ocl_context_s;
typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *);
typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot);
typedef struct _vg_ocl_context_s {
vg_exec_context_t base;
@ -42,10 +50,17 @@ typedef struct _vg_ocl_context_s { @@ -42,10 +50,17 @@ typedef struct _vg_ocl_context_s {
cl_context voc_oclctx;
cl_command_queue voc_oclcmdq;
cl_program voc_oclprog;
cl_kernel voc_oclkernel[MAX_SLOT][3];
vg_ocl_init_t voc_init_func;
vg_ocl_init_t voc_rekey_func;
vg_ocl_check_t voc_check_func;
int voc_nslots;
cl_kernel voc_oclkernel[MAX_SLOT][MAX_KERNEL];
cl_event voc_oclkrnwait[MAX_SLOT];
cl_mem voc_args[MAX_SLOT][6];
size_t voc_arg_size[MAX_SLOT][6];
cl_mem voc_args[MAX_SLOT][MAX_ARG];
size_t voc_arg_size[MAX_SLOT][MAX_ARG];
int voc_pattern_rewrite;
int voc_pattern_alloc;
pthread_t voc_ocl_thread;
pthread_mutex_t voc_lock;
@ -171,8 +186,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp, @@ -171,8 +186,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
vocp->voc_oclprog = prog;
if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") ||
!vg_ocl_create_kernel(vocp, 1, "heap_invert") ||
!vg_ocl_create_kernel(vocp, 2, "hash_ec_point")) {
!vg_ocl_create_kernel(vocp, 1, "heap_invert")) {
clReleaseProgram(vocp->voc_oclprog);
vocp->voc_oclprog = NULL;
return 0;
@ -181,7 +195,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp, @@ -181,7 +195,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
return 1;
}
void
void CL_CALLBACK
vg_ocl_context_callback(const char *errinfo,
const void *private_info,
size_t cb,
@ -223,7 +237,7 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did) @@ -223,7 +237,7 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did)
if (!vg_ocl_load_program(vcp, vocp,
"calc_addrs.cl",
//"-cl-nv-verbose -cl-nv-maxrregcount=32 "
//"-cl-nv-verbose "
"-DUNROLL_MAX=16")) {
printf("Could not load kernel\n");
return 0;
@ -251,16 +265,8 @@ vg_ocl_del(vg_ocl_context_t *vocp) @@ -251,16 +265,8 @@ vg_ocl_del(vg_ocl_context_t *vocp)
vg_exec_context_del(&vocp->base);
}
int
vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
int arg, size_t size, int host)
{
cl_mem clbuf;
cl_int ret;
int i, j, knum, karg;
static int arg_map[5][8] = {
/* hashes_out */
static int vg_ocl_arg_map[][8] = {
/* hashes_out / found */
{ 2, 0, -1 },
/* z_heap */
{ 0, 1, 1, 0, 2, 2, -1 },
@ -270,8 +276,28 @@ vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot, @@ -270,8 +276,28 @@ vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
{ 0, 2, -1 },
/* col_in */
{ 0, 3, -1 },
/* target_table */
{ 2, 3, -1 },
};
int
vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
int arg, size_t size, int host)
{
cl_mem clbuf;
cl_int ret;
int i, j, knum, karg;
for (i = 0; i < MAX_SLOT; i++) {
if ((i != slot) && (slot >= 0))
continue;
if (vocp->voc_args[i][arg]) {
clReleaseMemObject(vocp->voc_args[i][arg]);
vocp->voc_args[i][arg] = NULL;
vocp->voc_arg_size[i][arg] = 0;
}
}
clbuf = clCreateBuffer(vocp->voc_oclctx,
CL_MEM_READ_WRITE |
(host ? CL_MEM_ALLOC_HOST_PTR : 0),
@ -287,24 +313,54 @@ vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot, @@ -287,24 +313,54 @@ vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
if ((i != slot) && (slot >= 0))
continue;
for (j = 0; arg_map[arg][j] >= 0; j += 2) {
knum = arg_map[arg][j];
karg = arg_map[arg][j+1];
clRetainMemObject(clbuf);
vocp->voc_args[i][arg] = clbuf;
vocp->voc_arg_size[i][arg] = size;
for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
knum = vg_ocl_arg_map[arg][j];
karg = vg_ocl_arg_map[arg][j+1];
ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
karg,
sizeof(clbuf),
&clbuf);
if (ret) {
clReleaseMemObject(clbuf);
printf("Could not set kernel argument: %d\n",
ret);
return 0;
}
}
vocp->voc_args[i][arg] = clbuf;
vocp->voc_arg_size[i][arg] = size;
}
clReleaseMemObject(clbuf);
return 1;
}
int
vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg,
void *buffer, size_t size)
{
cl_int slot, ret;
slot = (wslot < 0) ? 0 : wslot;
assert((slot >= 0) && (slot < MAX_SLOT));
assert(size <= vocp->voc_arg_size[slot][arg]);
ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq,
vocp->voc_args[slot][arg],
CL_TRUE,
0, size,
buffer,
0, NULL,
NULL);
if (ret) {
printf("Could not copyout argument buffer: %d\n", ret);
return 0;
}
return 1;
}
@ -320,7 +376,8 @@ vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot, @@ -320,7 +376,8 @@ vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot,
buf = clEnqueueMapBuffer(vocp->voc_oclcmdq,
vocp->voc_args[slot][arg],
CL_TRUE,
rw ? CL_MAP_WRITE : CL_MAP_READ,
(rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE)
: (rw ? CL_MAP_WRITE : CL_MAP_READ),
0, vocp->voc_arg_size[slot][arg],
0, NULL,
NULL,
@ -368,7 +425,7 @@ vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot, @@ -368,7 +425,7 @@ vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot,
for (i = 0; i < MAX_SLOT; i++) {
if ((i != slot) && (slot >= 0))
continue;
ret = clSetKernelArg(vocp->voc_oclkernel[i][0],
ret = clSetKernelArg(vocp->voc_oclkernel[i][2],
arg,
sizeof(value),
&value);
@ -507,6 +564,200 @@ show_elapsed(struct timeval *tv, const char *place) @@ -507,6 +564,200 @@ show_elapsed(struct timeval *tv, const char *place)
printf("%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec);
}
/*
* GPU address matching methods
*
* gethash: GPU computes and returns all address hashes.
* + Works with any matching method, including regular expressions.
* - The CPU will not be able to keep up with mid- to high-end GPUs.
*
* prefix: GPU computes hash, searches a range list, and discards.
* + Fast, minimal work for CPU.
*/
int
vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot)
{
vg_exec_context_t *vxcp = &vocp->base;
vg_context_t *vcp = vocp->base.vxc_vc;
vg_test_func_t test_func = vcp->vc_test;
unsigned char *ocl_hashes_out;
int i, res = 0, round;
ocl_hashes_out = (unsigned char *)
vg_ocl_map_arg_buffer(vocp, slot, 0, 0);
round = vocp->voc_ocl_cols * vocp->voc_ocl_rows;
for (i = 0; i < round; i++, vxcp->vxc_delta++) {
memcpy(&vxcp->vxc_binres[1],
ocl_hashes_out + (20*i),
20);
res = test_func(vxcp);
if (res)
break;
}
vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out);
return res;
}
int
vg_ocl_gethash_init(vg_ocl_context_t *vocp)
{
int i;
if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get"))
return 0;
for (i = 0; i < vocp->voc_nslots; i++) {
/* Each slot gets its own hash output buffer */
if (!vg_ocl_kernel_arg_alloc(vocp, i, 0,
20 *
vocp->voc_ocl_rows *
vocp->voc_ocl_cols, 1))
return 0;
}
vocp->voc_rekey_func = NULL;
vocp->voc_check_func = vg_ocl_gethash_check;
return 1;
}
static int
vg_ocl_prefix_rekey(vg_ocl_context_t *vocp)
{
vg_context_t *vcp = vocp->base.vxc_vc;
unsigned char *ocl_targets_in;
uint32_t *ocl_found_out;
int i;
/* Set the found indicator for each slot to -1 */
for (i = 0; i < vocp->voc_nslots; i++) {
ocl_found_out = (uint32_t *)
vg_ocl_map_arg_buffer(vocp, i, 0, 1);
ocl_found_out[0] = 0xffffffff;
vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out);
}
if (vocp->voc_pattern_rewrite) {
/* Count number of range records */
i = vg_context_hash160_sort(vcp, NULL);
if (!i) {
printf("No range records available, exiting\n");
return 0;
}
if (i > vocp->voc_pattern_alloc) {
/* (re)allocate target buffer */
if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0))
return 0;
vocp->voc_pattern_alloc = i;
}
/* Write range records */
ocl_targets_in = (unsigned char *)
vg_ocl_map_arg_buffer(vocp, 0, 5, 1);
vg_context_hash160_sort(vcp, ocl_targets_in);
vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in);
vg_ocl_kernel_int_arg(vocp, -1, 4, i);
vocp->voc_pattern_rewrite = 0;
}
return 1;
}
static int
vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot)
{
vg_exec_context_t *vxcp = &vocp->base;
vg_context_t *vcp = vocp->base.vxc_vc;
vg_test_func_t test_func = vcp->vc_test;
uint32_t *ocl_found_out;
uint32_t found_delta;
int orig_delta, tablesize;
int res = 0;
/* Retrieve the found indicator */
ocl_found_out = (uint32_t *)
vg_ocl_map_arg_buffer(vocp, slot, 0, 2);
found_delta = ocl_found_out[0];
if (found_delta != 0xffffffff) {
/* GPU code claims match, verify with CPU version */
orig_delta = vxcp->vxc_delta;
vxcp->vxc_delta += found_delta;
vg_exec_context_calc_address(vxcp);
res = test_func(vxcp);
if (res == 0) {
/*
* The match was not found in
* the pattern list. Hmm.
*/
tablesize = ocl_found_out[2];
printf("Match idx: %d\n", ocl_found_out[1]);
printf("CPU hash: ");
dumphex(vxcp->vxc_binres + 1, 20);
printf("GPU hash: ");
dumphex((unsigned char *) (ocl_found_out + 3), 20);
printf("Table size: %d\n", ocl_found_out[2]);
printf("Found delta: %d "
"Start delta: %d\n",
found_delta, orig_delta);
res = 1;
}
vocp->voc_pattern_rewrite = 1;
} else {
vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows);
}
vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out);
return res;
}
int
vg_ocl_prefix_init(vg_ocl_context_t *vocp)
{
int i;
if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix"))
return 0;
for (i = 0; i < vocp->voc_nslots; i++) {
if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1))
return 0;
}
vocp->voc_rekey_func = vg_ocl_prefix_rekey;
vocp->voc_check_func = vg_ocl_prefix_check;
vocp->voc_pattern_rewrite = 1;
vocp->voc_pattern_alloc = 0;
return 1;
}
int
vg_ocl_config_pattern(vg_ocl_context_t *vocp)
{
vg_context_t *vcp = vocp->base.vxc_vc;
int i;
i = vg_context_hash160_sort(vcp, NULL);
if (i > 0) {
if (vcp->vc_verbose > 1)
printf("Using GPU prefix matcher\n");
/* Configure for prefix matching */
return vg_ocl_prefix_init(vocp);
}
if (vcp->vc_verbose > 0)
printf("WARNING: Using CPU pattern matcher\n");
return vg_ocl_gethash_init(vocp);
}
void *
vg_opencl_thread(void *arg)
{
@ -561,17 +812,17 @@ vg_opencl_thread(void *arg) @@ -561,17 +812,17 @@ vg_opencl_thread(void *arg)
if (!vg_ocl_kernel_wait(vocp, slot))
halt = 1;
if (vcp->vc_verbose > 1) {
gettimeofday(&tvt, NULL);
timersub(&tvt, &tv, &tvd);
timeradd(&tvd, &busy, &busy);
if ((vcp->vc_verbose > 1) &&
((busy.tv_sec + idle.tv_sec) > 1)) {
if ((busy.tv_sec + idle.tv_sec) > 1) {
idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
pidle = ((double) idleu) / (idleu + busyu);
if (pidle > 0.05) {
if (pidle > 0.01) {
printf("\rGPU idle: %.2f%%"
" "
" \n",
@ -581,11 +832,13 @@ vg_opencl_thread(void *arg) @@ -581,11 +832,13 @@ vg_opencl_thread(void *arg)
memset(&busy, 0, sizeof(busy));
}
}
}
out:
pthread_mutex_unlock(&vocp->voc_lock);
return NULL;
}
/*
* Address search thread main loop
*/
@ -605,12 +858,11 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize) @@ -605,12 +858,11 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize)
EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL;
EC_POINT *pseek = NULL;
unsigned char *ocl_points_in, *ocl_strides_in, *ocl_hashes_out;
unsigned char *ocl_points_in, *ocl_strides_in;
vg_ocl_context_t ctx;
vg_ocl_context_t *vocp = &ctx;
vg_exec_context_t *vxcp = &vocp->base;
vg_test_func_t test_func = vcp->vc_test;
int slot, nslots;
int slot_busy = 0, slot_done = 0, halt = 0;
@ -637,6 +889,10 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize) @@ -637,6 +889,10 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize)
nslots = 2;
slot = 0;
vocp->voc_ocl_cols = batchsize;
vocp->voc_ocl_rows = worksize;
vocp->voc_nslots = nslots;
ppbase = (EC_POINT **) malloc((batchsize + worksize) *
sizeof(EC_POINT*));
if (!ppbase)
@ -667,26 +923,28 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize) @@ -667,26 +923,28 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize)
round = batchsize * worksize;
if (!vg_ocl_config_pattern(vocp))
goto enomem;
for (i = 0; i < nslots; i++) {
/*
* Each work group gets its own:
* - Hash output array
* - Point and z_heap scratch spaces
* - Column point array
*/
if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 20 * round, 1) ||
!vg_ocl_kernel_arg_alloc(vocp, i, 1, 32 * 2 * round, 0) ||
!vg_ocl_kernel_arg_alloc(vocp, i, 2, 32 * 2 * round, 0) ||
!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * worksize, 1))
if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * worksize, 1))
goto enomem;
}
/* Same row point array for all instances */
if (!vg_ocl_kernel_arg_alloc(vocp, -1, 3, 32 * 2 * batchsize, 1))
/*
* All instances share:
* - The z_heap and point scratch spaces
* - The row point array
*/
if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1, 32 * 2 * round, 0) ||
!vg_ocl_kernel_arg_alloc(vocp, -1, 2, 32 * 2 * round, 0) ||
!vg_ocl_kernel_arg_alloc(vocp, -1, 3, 32 * 2 * batchsize, 1))
goto enomem;
//vg_ocl_kernel_int_arg(vocp, -1, 5, batchsize);
npoints = 0;
rekey_at = 0;
vxcp->vxc_binres[0] = vcp->vc_addrtype;
@ -698,6 +956,10 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize) @@ -698,6 +956,10 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize)
gettimeofday(&tvstart, NULL);
l_rekey:
if (vocp->voc_rekey_func &&
!vocp->voc_rekey_func(vocp))
goto enomem;
/* Generate a new random private key */
EC_KEY_generate_key(pkey);
npoints = 0;
@ -755,33 +1017,22 @@ l_rekey: @@ -755,33 +1017,22 @@ l_rekey:
while (1) {
if (slot_done) {
assert(rekey_at > 0);
slot_done = 0;
ocl_hashes_out = (unsigned char *)
vg_ocl_map_arg_buffer(vocp, slot, 0, 0);
for (i = 0; i < round; i++, vxcp->vxc_delta++) {
memcpy(&vxcp->vxc_binres[1],
ocl_hashes_out + (20*i),
20);
switch (test_func(vxcp)) {
/* Call the result check function */
switch (vocp->voc_check_func(vocp, slot)) {
case 1:
rekey_at = 0;
i = round;
break;
case 2:
halt = 1;
i = round;
break;
default:
break;
}
}
vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out);
c += (i + 1);
c += round;
if (!halt && (c >= output_interval)) {
output_interval =
vg_output_timing(vcp, c, &tvstart);
@ -789,24 +1040,8 @@ l_rekey: @@ -789,24 +1040,8 @@ l_rekey:
}
}
if (halt) {
if (vcp->vc_verbose > 1)
printf("Halting...");
pthread_mutex_lock(&vocp->voc_lock);
vocp->voc_halt = 1;
pthread_cond_signal(&vocp->voc_wait);
while (vocp->voc_ocl_slot != -1) {
assert(slot_busy);
pthread_cond_wait(&vocp->voc_wait,
&vocp->voc_lock);
}
slot_busy = 0;
pthread_mutex_unlock(&vocp->voc_lock);
pthread_join(vocp->voc_ocl_thread, NULL);
if (vcp->vc_verbose > 1)
printf("done!\n");
if (halt)
break;
}
if ((npoints + round) < rekey_at) {
if (npoints > 1) {
@ -843,34 +1078,35 @@ l_rekey: @@ -843,34 +1078,35 @@ l_rekey:
}
if (vocp->voc_halt) {
pthread_mutex_unlock(&vocp->voc_lock);
halt = 1;
} else {
break;
}
vocp->voc_ocl_slot = slot;
vocp->voc_ocl_cols = batchsize;
vocp->voc_ocl_rows = worksize;
pthread_cond_signal(&vocp->voc_wait);
pthread_mutex_unlock(&vocp->voc_lock);
if (slot_busy)
slot_done = 1;
slot_done = slot_busy;
slot_busy = 1;
slot = (slot + 1) % nslots;
}
pthread_mutex_unlock(&vocp->voc_lock);
}
else if (slot_busy) {
} else {
if (slot_busy) {
pthread_mutex_lock(&vocp->voc_lock);
while (vocp->voc_ocl_slot != -1) {
assert(vocp->voc_ocl_slot ==
((slot + nslots - 1) % nslots));
pthread_cond_wait(&vocp->voc_wait,
&vocp->voc_lock);
}
slot_busy = 0;
pthread_mutex_unlock(&vocp->voc_lock);
slot_busy = 0;
slot_done = 1;
}
else if (!rekey_at || ((npoints + round) >= rekey_at)) {
if (!rekey_at ||
(!slot_done && ((npoints + round) >= rekey_at)))
goto l_rekey;
}
}
@ -880,6 +1116,24 @@ l_rekey: @@ -880,6 +1116,24 @@ l_rekey:
printf("ERROR: allocation failure?\n");
}
if (halt) {
if (vcp->vc_verbose > 1)
printf("Halting...");
pthread_mutex_lock(&vocp->voc_lock);
vocp->voc_halt = 1;
pthread_cond_signal(&vocp->voc_wait);
while (vocp->voc_ocl_slot != -1) {
assert(slot_busy);
pthread_cond_wait(&vocp->voc_wait,
&vocp->voc_lock);
}
slot_busy = 0;
pthread_mutex_unlock(&vocp->voc_lock);
pthread_join(vocp->voc_ocl_thread, NULL);
if (vcp->vc_verbose > 1)
printf("done!\n");
}
if (ppbase) {
for (i = 0; i < (batchsize + worksize); i++)
if (ppbase[i])

110
pattern.c

@ -207,6 +207,27 @@ vg_exec_context_consolidate_key(vg_exec_context_t *vxcp) @@ -207,6 +207,27 @@ vg_exec_context_consolidate_key(vg_exec_context_t *vxcp)
}
}
void
vg_exec_context_calc_address(vg_exec_context_t *vxcp)
{
const EC_GROUP *pgroup;
unsigned char eckey_buf[96], hash1[32], hash2[20];
int len;
vg_exec_context_consolidate_key(vxcp);
pgroup = EC_KEY_get0_group(vxcp->vxc_key);
len = EC_POINT_point2oct(pgroup,
EC_KEY_get0_public_key(vxcp->vxc_key),
POINT_CONVERSION_UNCOMPRESSED,
eckey_buf,
sizeof(eckey_buf),
vxcp->vxc_bnctx);
SHA256(eckey_buf, len, hash1);
RIPEMD160(hash1, sizeof(hash1), hash2);
memcpy(&vxcp->vxc_binres[1],
hash2, 20);
}
typedef struct _timing_info_s {
struct _timing_info_s *ti_next;
@ -437,6 +458,14 @@ vg_context_add_patterns(vg_context_t *vcp, @@ -437,6 +458,14 @@ vg_context_add_patterns(vg_context_t *vcp,
return vcp->vc_add_patterns(vcp, patterns, npatterns);
}
int
vg_context_hash160_sort(vg_context_t *vcp, void *buf)
{
if (!vcp->vc_hash160_sort)
return 0;
return vcp->vc_hash160_sort(vcp, buf);
}
static const signed char b58_reverse_map[256] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@ -1023,6 +1052,17 @@ avl_insert_fix(avl_root_t *rootp, avl_item_t *itemp) @@ -1023,6 +1052,17 @@ avl_insert_fix(avl_root_t *rootp, avl_item_t *itemp)
}
}
static INLINE avl_item_t *
avl_first(avl_root_t *rootp)
{
avl_item_t *itemp = rootp->ar_root;
if (itemp) {
while (itemp->ai_left)
itemp = itemp->ai_left;
}
return itemp;
}
static INLINE avl_item_t *
avl_next(avl_item_t *itemp)
{
@ -1181,6 +1221,26 @@ vg_prefix_avl_insert(avl_root_t *rootp, vg_prefix_t *vpnew) @@ -1181,6 +1221,26 @@ vg_prefix_avl_insert(avl_root_t *rootp, vg_prefix_t *vpnew)
return NULL;
}
static vg_prefix_t *
vg_prefix_first(avl_root_t *rootp)
{
avl_item_t *itemp;
itemp = avl_first(rootp);
if (itemp)
return avl_item_entry(itemp, vg_prefix_t, vp_item);
return NULL;
}
static vg_prefix_t *
vg_prefix_next(vg_prefix_t *vp)
{
avl_item_t *itemp = &vp->vp_item;
itemp = avl_next(itemp);
if (itemp)
return avl_item_entry(itemp, vg_prefix_t, vp_item);
return NULL;
}
static vg_prefix_t *
vg_prefix_add(avl_root_t *rootp, const char *pattern, BIGNUM *low, BIGNUM *high)
{
@ -1568,6 +1628,54 @@ research: @@ -1568,6 +1628,54 @@ research:
return res;
}
int
vg_prefix_hash160_sort(vg_context_t *vcp, void *buf)
{
vg_prefix_context_t *vcpp = (vg_prefix_context_t *) vcp;
vg_prefix_t *vp;
unsigned char *cbuf = (unsigned char *) buf;
unsigned char bnbuf[25];
int nbytes, ncopy, nskip, npfx = 0;
/*
* Walk the prefix tree in order, copy the upper and lower bound
* values into the hash160 buffer. Skip the lower four bytes
* and anything above the 24th byte.
*/
for (vp = vg_prefix_first(&vcpp->vcp_avlroot);
vp != NULL;
vp = vg_prefix_next(vp)) {
npfx++;
if (!buf)
continue;
/* Low */
nbytes = BN_bn2bin(vp->vp_low, bnbuf);
ncopy = ((nbytes >= 24) ? 20 :
((nbytes > 4) ? (nbytes - 4) : 0));
nskip = (nbytes >= 24) ? (nbytes - 24) : 0;
if (ncopy < 20)
memset(cbuf, 0, 20 - ncopy);
memcpy(cbuf + (20 - ncopy),
bnbuf + nskip,
ncopy);
cbuf += 20;
/* High */
nbytes = BN_bn2bin(vp->vp_high, bnbuf);
ncopy = ((nbytes >= 24) ? 20 :
((nbytes > 4) ? (nbytes - 4) : 0));
nskip = (nbytes >= 24) ? (nbytes - 24) : 0;
if (ncopy < 20)
memset(cbuf, 0, 20 - ncopy);
memcpy(cbuf + (20 - ncopy),
bnbuf + nskip,
ncopy);
cbuf += 20;
}
return npfx;
}
vg_context_t *
vg_prefix_context_new(int addrtype, int privtype, int caseinsensitive)
{
@ -1584,6 +1692,7 @@ vg_prefix_context_new(int addrtype, int privtype, int caseinsensitive) @@ -1584,6 +1692,7 @@ vg_prefix_context_new(int addrtype, int privtype, int caseinsensitive)
vcpp->base.vc_free = vg_prefix_context_free;
vcpp->base.vc_add_patterns = vg_prefix_context_add_patterns;
vcpp->base.vc_test = vg_prefix_test;
vcpp->base.vc_hash160_sort = vg_prefix_hash160_sort;
avl_root_init(&vcpp->vcp_avlroot);
BN_init(&vcpp->vcp_difficulty);
vcpp->vcp_caseinsensitive = caseinsensitive;
@ -1811,6 +1920,7 @@ vg_regex_context_new(int addrtype, int privtype) @@ -1811,6 +1920,7 @@ vg_regex_context_new(int addrtype, int privtype)
vcrp->base.vc_free = vg_regex_context_free;
vcrp->base.vc_add_patterns = vg_regex_context_add_patterns;
vcrp->base.vc_test = vg_regex_test;
vcrp->base.vc_hash160_sort = NULL;
vcrp->vcr_regex = NULL;
vcrp->vcr_nalloc = 0;
}

4
pattern.h

@ -59,6 +59,7 @@ typedef struct _vg_exec_context_s { @@ -59,6 +59,7 @@ typedef struct _vg_exec_context_s {
extern int vg_exec_context_init(vg_context_t *vcp, vg_exec_context_t *vxcp);
extern void vg_exec_context_del(vg_exec_context_t *vxcp);
extern void vg_exec_context_consolidate_key(vg_exec_context_t *vxcp);
extern void vg_exec_context_calc_address(vg_exec_context_t *vxcp);
/* Implementation-specific lock/unlock/consolidate */
extern void vg_exec_downgrade_lock(vg_exec_context_t *vxcp);
@ -69,6 +70,7 @@ typedef void (*vg_free_func_t)(vg_context_t *); @@ -69,6 +70,7 @@ typedef void (*vg_free_func_t)(vg_context_t *);
typedef int (*vg_add_pattern_func_t)(vg_context_t *,
char ** const patterns, int npatterns);
typedef int (*vg_test_func_t)(vg_exec_context_t *);
typedef int (*vg_hash160_sort_func_t)(vg_context_t *vcp, void *buf);
/* Application-level context, incl. parameters and global pattern store */
struct _vg_context_s {
@ -84,12 +86,14 @@ struct _vg_context_s { @@ -84,12 +86,14 @@ struct _vg_context_s {
vg_free_func_t vc_free;
vg_add_pattern_func_t vc_add_patterns;
vg_test_func_t vc_test;
vg_hash160_sort_func_t vc_hash160_sort;
};
extern void vg_context_free(vg_context_t *vcp);
extern int vg_context_add_patterns(vg_context_t *vcp,
char ** const patterns, int npatterns);
extern int vg_context_hash160_sort(vg_context_t *vcp, void *buf);
extern vg_context_t *vg_prefix_context_new(int addrtype, int privtype,

11
winglue.c

@ -102,6 +102,17 @@ gettimeofday(struct timeval *tv, struct timezone *tz) @@ -102,6 +102,17 @@ gettimeofday(struct timeval *tv, struct timezone *tz)
return 0;
}
void
timeradd(struct timeval *a, struct timeval *b, struct timeval *result)
{
result->tv_sec = a->tv_sec + b->tv_sec;
result->tv_usec = a->tv_usec + b->tv_usec;
if (result->tv_usec > 10000000) {
result->tv_sec++;
result->tv_usec -= 1000000;
}
}
void
timersub(struct timeval *a, struct timeval *b, struct timeval *result)
{

2
winglue.h

@ -29,6 +29,8 @@ @@ -29,6 +29,8 @@
struct timezone;
extern int gettimeofday(struct timeval *tv, struct timezone *tz);
extern void timeradd(struct timeval *a, struct timeval *b,
struct timeval *result);
extern void timersub(struct timeval *a, struct timeval *b,
struct timeval *result);

Loading…
Cancel
Save