diff --git a/Makefile.Win32 b/Makefile.Win32
index a8c2cdb..a74ab95 100644
--- a/Makefile.Win32
+++ b/Makefile.Win32
@@ -2,17 +2,26 @@ CC = cl
 OPENSSL_DIR = C:\OpenSSL-Win32
 PTHREADS_DIR = C:\pthreads-w32
 PCRE_DIR = C:\pcre-7.9-src
+OPENCL_DIR = "C:\Program Files (x86)\AMD APP"
+OPENCL_INCLUDE = /I$(OPENCL_DIR)\include
+OPENCL_LIBS = $(OPENCL_DIR)\lib\x86\OpenCL.lib
 
 CFLAGS = /D_WIN32 /DPTW32_STATIC_LIB /DPCRE_STATIC /I$(OPENSSL_DIR)\include /I$(PTHREADS_DIR) /I$(PCRE_DIR)
 LIBS = $(OPENSSL_DIR)\lib\libeay32.lib $(PTHREADS_DIR)\pthread.lib $(PCRE_DIR)\pcre.lib ws2_32.lib
 
-OBJS = vanitygen.obj pattern.obj winglue.obj
+OBJS = vanitygen.obj oclvanitygen.obj pattern.obj winglue.obj
 
 all: vanitygen.exe
 
-vanitygen.exe: $(OBJS)
-	link /nologo /out:vanitygen.exe $(OBJS) $(LIBS)
+vanitygen.exe: vanitygen.obj pattern.obj winglue.obj
+	link /nologo /out:$@ $** $(LIBS)
+
+oclvanitygen.exe: oclvanitygen.obj pattern.obj winglue.obj
+	link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS)
 
 .c.obj:
 	$(CC) $(CFLAGS) /c /Tp$< /Fo$@
 
+oclvanitygen.obj: oclvanitygen.c
+	$(CC) $(CFLAGS) $(OPENCL_INCLUDE) /c /Tpoclvanitygen.c /Fo$@
+
 clean:
 	del vanitygen.exe $(OBJS)
diff --git a/calc_addrs.cl b/calc_addrs.cl
index 2526d5d..027a883 100644
--- a/calc_addrs.cl
+++ b/calc_addrs.cl
@@ -42,6 +42,21 @@
  * substantially reduces the cost of performing modular inversion.
  */
 
+
+/* Byte-swapping and endianness */
+#define bswap32(v) \
+	(((v) >> 24) | (((v) >> 8) & 0xff00) | \
+	 (((v) << 8) & 0xff0000) | ((v) << 24))
+
+#if __ENDIAN_LITTLE__ != 1
+#define load_le32(v) bswap32(v)
+#define load_be32(v) (v)
+#else
+#define load_le32(v) (v)
+#define load_be32(v) bswap32(v)
+#endif
+
+
 /*
  * BIGNUM mini-library
  * This module deals with fixed-size 256-bit bignums.
@@ -439,27 +454,6 @@ bn_from_mont(bignum *rb, bignum *b)
 	}
 }
 
-
-/* Montgomery multiplication test kernel */
-__kernel void
-test_mul_mont(__global bignum *products_out, __global bignum *nums_in,
-	      int count)
-{
-	bignum x, y, tmp;
-	int i, o, p;
-	o = get_global_id(0) * count;
-	p = o * 2;
-	for (i = 0; i < count; i++) {
-		x = nums_in[p++];
-		y = nums_in[p++];
-		bn_mul_mont(&tmp, &x, &y);
-		bn_mul_mont(&tmp, &tmp, &x);
-		bn_mul_mont(&tmp, &tmp, &y);
-		bn_from_mont(&x, &tmp);
-		products_out[o++] = x;
-	}
-}
-
 /*
  * Modular inversion
  */
@@ -526,22 +520,6 @@ bn_mod_inverse(bignum *r, bignum *n)
 	return;
 }
 
-/* modular inversion test kernel */
-__kernel void
-test_mod_inverse(__global bignum *inv_out, __global bignum *nums_in,
-		 int count)
-{
-	bignum x, xp;
-	int i, o;
-	o = get_global_id(0) * count;
-	for (i = 0; i < count; i++) {
-		x = nums_in[o];
-		bn_mod_inverse(&xp, &x);
-		inv_out[o++] = xp;
-	}
-}
-
-
 /*
  * HASH FUNCTIONS
  *
@@ -597,7 +575,7 @@ sha2_256_init(uint *out)
 }
 
 /* The state variable remapping is really contorted */
-#define sha2_stvar(vals, i, v) vals[(i+(7-v)) % 8]
+#define sha2_stvar(vals, i, v) vals[(64+v-i) % 8]
 
 void
 sha2_256_block(uint *out, uint *in)
@@ -608,7 +586,7 @@ sha2_256_block(uint *out, uint *in)
 #pragma unroll UNROLL_MAX
 #endif
 	for (i = 0; i < 8; i++)
-		state[7-i] = out[i];
+		state[i] = out[i];
 #ifdef UNROLL_MAX
 #pragma unroll 64
 #endif
@@ -642,7 +620,7 @@ sha2_256_block(uint *out, uint *in)
 #pragma unroll UNROLL_MAX
 #endif
 	for (i = 0; i < 8; i++)
-		out[i] += state[7-i];
+		out[i] += state[i];
 }
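A quick sanity check of the new sha2_stvar() index math (illustration only, not part of the patch): in round i, working variable v (a=0 ... h=7) lives at vals[(64+v-i) % 8], so advancing i by one renames a->b->...->h without copying any words; the +64 merely keeps the left operand of % non-negative for i up to 64.

	#include <assert.h>
	#define sha2_stvar(vals, i, v) vals[(64+(v)-(i)) % 8]
	int main(void)
	{
		unsigned int vals[8] = {0};
		int i, v;
		for (i = 0; i < 63; i++)
			for (v = 1; v < 8; v++)
				/* round i+1's variable v is round i's variable v-1 */
				assert(&sha2_stvar(vals, i + 1, v) ==
				       &sha2_stvar(vals, i, v - 1));
		return 0;
	}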
@@ -687,28 +665,36 @@ __constant uchar ripemd160_rlp[] = {
 	8, 5, 12, 9, 12, 5, 14, 6, 8, 13, 6, 5, 15, 13, 11, 11
 };
 
+#define ripemd160_val(v, i, n) (v)[(80+(n)-(i)) % 5]
+#define ripemd160_valp(v, i, n) (v)[5 + ((80+(n)-(i)) % 5)]
+
 #define ripemd160_f0(x, y, z) (x ^ y ^ z)
 #define ripemd160_f1(x, y, z) ((x & y) | (~x & z))
 #define ripemd160_f2(x, y, z) ((x | ~y) ^ z)
 #define ripemd160_f3(x, y, z) ((x & z) | (y & ~z))
 #define ripemd160_f4(x, y, z) (x ^ (y | ~z))
 
-#define ripemd160_round(i, in, vals, f, fp, t) do { \
-	t = rotate(vals[0] + \
-		   f(vals[1], vals[2], vals[3]) + \
-		   in[ripemd160_ws[i]] + \
-		   ripemd160_k[i / 16], \
-		   (uint)ripemd160_rl[i]) + vals[4]; \
-	vals[0] = vals[4]; vals[4] = vals[3]; \
-	vals[3] = rotate(vals[2], 10U); vals[2] = vals[1]; \
-	vals[1] = t; \
-	t = rotate(vals[5] + \
-		   fp(vals[6], vals[7], vals[8]) + \
-		   in[ripemd160_wsp[i]] + \
-		   ripemd160_kp[i / 16], \
-		   (uint)ripemd160_rlp[i]) + vals[9]; \
-	vals[5] = vals[9]; vals[9] = vals[8]; \
-	vals[8] = rotate(vals[7], 10U); vals[7] = vals[6]; \
-	vals[6] = t; \
+#define ripemd160_round(i, in, vals, f, fp, t) do { \
+	ripemd160_val(vals, i, 0) = \
+		rotate(ripemd160_val(vals, i, 0) + \
+		       f(ripemd160_val(vals, i, 1), \
+			 ripemd160_val(vals, i, 2), \
+			 ripemd160_val(vals, i, 3)) + \
+		       in[ripemd160_ws[i]] + \
+		       ripemd160_k[i / 16], \
+		       (uint)ripemd160_rl[i]) + \
+		ripemd160_val(vals, i, 4); \
+	ripemd160_val(vals, i, 2) = \
+		rotate(ripemd160_val(vals, i, 2), 10U); \
+	ripemd160_valp(vals, i, 0) = \
+		rotate(ripemd160_valp(vals, i, 0) + \
+		       fp(ripemd160_valp(vals, i, 1), \
+			  ripemd160_valp(vals, i, 2), \
+			  ripemd160_valp(vals, i, 3)) + \
+		       in[ripemd160_wsp[i]] + \
+		       ripemd160_kp[i / 16], \
+		       (uint)ripemd160_rlp[i]) + \
+		ripemd160_valp(vals, i, 4); \
+	ripemd160_valp(vals, i, 2) = \
+		rotate(ripemd160_valp(vals, i, 2), 10U); \
 	} while (0)
 
 void
@@ -771,9 +757,41 @@ ripemd160_block(uint *out, uint *in)
 }
 
 
-#define bswap32(v) \
-	(((v) >> 24) | (((v) >> 8) & 0xff00) | \
-	 (((v) << 8) & 0xff0000) | ((v) << 24))
+#ifdef TEST_KERNELS
+/*
+ * Test kernels
+ */
+
+/* Montgomery multiplication test kernel */
+__kernel void
+test_mul_mont(__global bignum *products_out, __global bignum *nums_in)
+{
+	bignum a, b, c;
+	int o;
+	o = get_global_id(0);
+	nums_in += (2*o);
+
+	a = nums_in[0];
+	b = nums_in[1];
+	bn_mul_mont(&c, &a, &b);
+	products_out[o] = c;
+}
+
+/* modular inversion test kernel */
+__kernel void
+test_mod_inverse(__global bignum *inv_out, __global bignum *nums_in,
+		 int count)
+{
+	bignum x, xp;
+	int i, o;
+	o = get_global_id(0) * count;
+	for (i = 0; i < count; i++) {
+		x = nums_in[o];
+		bn_mod_inverse(&xp, &x);
+		inv_out[o++] = xp;
+	}
+}
+#endif /* TEST_KERNELS */
 
 
 #if 0
@@ -1028,24 +1046,13 @@ heap_invert(__global bignum *z_heap, int ncols)
 	}
 }
 
-__kernel void
-hash_ec_point(__global uint *hashes_out,
-	      __global bignum *points_in, __global bignum *z_heap)
+void
+hash_ec_point(uint *hash_out, __global bignum *xy, __global bignum *zip)
 {
 	uint hash1[16], hash2[16];
+	bignum c, zi, zzi;
 	bn_word wh, wl;
-	bignum p, a, b;
-	int i, o;
-
-	o = get_global_size(0);
-	i = o * get_global_id(1);
-	z_heap += (i * 2);
-	points_in += (i * 2);
-	hashes_out += (i * 5);
-
-	i = get_global_id(0);
-	points_in += (i * 2);
-	hashes_out += (i * 5);
+	int i;
 
 	/*
 	 * Multiply the coordinates by the inverted Z values.
@@ -1054,34 +1061,34 @@ hash_ec_point(__global uint *hashes_out,
 	 * is big-endian, so swapping is unnecessary, but
 	 * inserting the format byte in front causes a headache.
 	 */
-	a = z_heap[(o - 1) + i];
-	bn_mul_mont(&b, &a, &a);	/* Z^2 */
-	p = points_in[0];
-	bn_mul_mont(&p, &p, &b);	/* X / Z^2 */
-	bn_from_mont(&p, &p);
+	zi = zip[0];
+	bn_mul_mont(&zzi, &zi, &zi);	/* 1 / Z^2 */
+	c = xy[0];
+	bn_mul_mont(&c, &c, &zzi);	/* X / Z^2 */
+	bn_from_mont(&c, &c);
 
 	wh = 0x00000004;  /* POINT_CONVERSION_UNCOMPRESSED */
 #ifdef UNROLL_MAX
 #pragma unroll UNROLL_MAX
 #endif
-	for (o = 0; o < BN_NWORDS; o++) {
+	for (i = 0; i < BN_NWORDS; i++) {
 		wl = wh;
-		wh = p.d[(BN_NWORDS - 1) - o];
-		hash1[o] = (wl << 24) | (wh >> 8);
+		wh = c.d[(BN_NWORDS - 1) - i];
+		hash1[i] = (wl << 24) | (wh >> 8);
 	}
 
-	bn_mul_mont(&a, &a, &b);	/* Z^3 */
-	p = points_in[1];
-	bn_mul_mont(&p, &p, &a);	/* Y / Z^3 */
-	bn_from_mont(&p, &p);
+	bn_mul_mont(&zzi, &zzi, &zi);	/* 1 / Z^3 */
+	c = xy[1];
+	bn_mul_mont(&c, &c, &zzi);	/* Y / Z^3 */
+	bn_from_mont(&c, &c);
 
 #ifdef UNROLL_MAX
 #pragma unroll UNROLL_MAX
 #endif
-	for (o = 0; o < BN_NWORDS; o++) {
+	for (i = 0; i < BN_NWORDS; i++) {
 		wl = wh;
-		wh = p.d[(BN_NWORDS - 1) - o];
-		hash1[BN_NWORDS + o] = (wl << 24) | (wh >> 8);
+		wh = c.d[(BN_NWORDS - 1) - i];
+		hash1[BN_NWORDS + i] = (wl << 24) | (wh >> 8);
 	}
 
 	/*
@@ -1119,8 +1126,8 @@
 #ifdef UNROLL_MAX
 #pragma unroll UNROLL_MAX
 #endif
-	for (o = 0; o < 8; o++)
-		hash2[o] = bswap32(hash2[o]);
+	for (i = 0; i < 8; i++)
+		hash2[i] = bswap32(hash2[i]);
 	hash2[8] = bswap32(0x80000000);
 	hash2[9] = 0;
 	hash2[10] = 0;
@@ -1129,12 +1136,116 @@
 	hash2[13] = 0;
 	hash2[14] = 32 * 8;
 	hash2[15] = 0;
-	ripemd160_init(hash1);
-	ripemd160_block(hash1, hash2);
+	ripemd160_init(hash_out);
+	ripemd160_block(hash_out, hash2);
+}
+
+
+__kernel void
+hash_ec_point_get(__global uint *hashes_out,
+		  __global bignum *points_in, __global bignum *z_heap)
+{
+	uint hash[5];
+	int i, o;
+
+	o = get_global_size(0);
+	i = o * get_global_id(1);
+	z_heap += (i * 2);
+	points_in += (i * 2);
+	hashes_out += (i * 5);
+
+	i = get_global_id(0);
+	points_in += (i * 2);
+	hashes_out += (i * 5);
+
+	/* Complete the coordinates and hash */
+	hash_ec_point(hash, points_in, &z_heap[i + (o - 1)]);
+
+	/* Output the hash in proper byte-order */
 #ifdef UNROLL_MAX
 #pragma unroll UNROLL_MAX
 #endif
-	for (o = 0; o < 5; o++)
-		hashes_out[o] = hash1[o];
+	for (i = 0; i < 5; i++)
+		hashes_out[i] = load_le32(hash[i]);
+}
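A CPU-side equivalent of the bound check the next kernel performs may make the convention clearer (illustration only, not part of the patch): each record in target_table is 10 uints, a 20-byte big-endian low bound followed by a 20-byte big-endian high bound, and a candidate matches when low <= hash <= high. On big-endian byte strings, memcmp() orders exactly like the numeric comparison.

	#include <string.h>
	/* Hypothetical helper, not in the patch */
	static int hash160_in_range(const unsigned char *hash,
				    const unsigned char *low,
				    const unsigned char *high)
	{
		return (memcmp(hash, low, 20) >= 0) &&
		       (memcmp(hash, high, 20) <= 0);
	}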
+
+/*
+ * Normally this would be one function that compared two hash160s.
+ * This one compares a hash160 with an upper and lower bound in one
+ * function to work around a problem with AMD's OpenCL compiler.
+ */
+int
+hash160_ucmp_g(uint *a, __global uint *bound)
+{
+	uint gv;
+	int i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < 5; i++) {
+		gv = load_be32(bound[i]);
+		if (a[i] < gv) return -1;
+		if (a[i] > gv) break;
+	}
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < 5; i++) {
+		gv = load_be32(bound[5+i]);
+		if (a[i] < gv) return 0;
+		if (a[i] > gv) return 1;
+	}
+	return 0;
+}
+
+__kernel void
+hash_ec_point_search_prefix(__global uint *found,
+			    __global bignum *points_in, __global bignum *z_heap,
+			    __global uint *target_table, int ntargets)
+{
+	uint hash[5];
+	int i, high, low, p;
+
+	p = get_global_size(0);
+	i = p * get_global_id(1);
+	z_heap += (i * 2);
+	points_in += (i * 2);
+
+	i = get_global_id(0);
+	points_in += (i * 2);
+
+	/* Complete the coordinates and hash */
+	hash_ec_point(hash, points_in, &z_heap[(p - 1) + i]);
+
+	/*
+	 * Unconditionally byteswap the hash result, because:
+	 * - The byte-level convention of RIPEMD160 is little-endian
+	 * - We are comparing it in big-endian order
+	 */
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+	for (i = 0; i < 5; i++)
+		hash[i] = bswap32(hash[i]);
+
+	/* Binary-search the target table for the hash we just computed */
+	for (high = ntargets - 1, low = 0, i = high >> 1;
+	     high >= low;
+	     i = low + ((high - low) >> 1)) {
+		p = hash160_ucmp_g(hash, &target_table[10*i]);
+		low = (p > 0) ? (i + 1) : low;
+		high = (p < 0) ? (i - 1) : high;
+		if (p == 0) {
+			/* For debugging purposes, write the hash value */
+			found[0] = ((get_global_id(1) * get_global_size(0)) +
+				    get_global_id(0));
+			found[1] = i;
+#ifdef UNROLL_MAX
+#pragma unroll UNROLL_MAX
+#endif
+			for (p = 0; p < 5; p++)
+				found[p+2] = load_be32(hash[p]);
+			high = -1;
+		}
+	}
 }
diff --git a/oclvanitygen.c b/oclvanitygen.c
index f78b0b1..8062178 100644
--- a/oclvanitygen.c
+++ b/oclvanitygen.c
@@ -33,8 +33,16 @@
 
 const char *version = "0.13";
+const int debug = 0;
 
 #define MAX_SLOT 2
+#define MAX_ARG 6
+#define MAX_KERNEL 3
+
+/* OpenCL address searching mode */
+struct _vg_ocl_context_s;
+typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *);
+typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot);
 
 typedef struct _vg_ocl_context_s {
 	vg_exec_context_t		base;
@@ -42,10 +50,17 @@ typedef struct _vg_ocl_context_s {
 	cl_context			voc_oclctx;
 	cl_command_queue		voc_oclcmdq;
 	cl_program			voc_oclprog;
-	cl_kernel			voc_oclkernel[MAX_SLOT][3];
+	vg_ocl_init_t			voc_init_func;
+	vg_ocl_init_t			voc_rekey_func;
+	vg_ocl_check_t			voc_check_func;
+	int				voc_nslots;
+	cl_kernel			voc_oclkernel[MAX_SLOT][MAX_KERNEL];
 	cl_event			voc_oclkrnwait[MAX_SLOT];
-	cl_mem				voc_args[MAX_SLOT][6];
-	size_t				voc_arg_size[MAX_SLOT][6];
+	cl_mem				voc_args[MAX_SLOT][MAX_ARG];
+	size_t				voc_arg_size[MAX_SLOT][MAX_ARG];
+
+	int				voc_pattern_rewrite;
+	int				voc_pattern_alloc;
 
 	pthread_t			voc_ocl_thread;
 	pthread_mutex_t			voc_lock;
@@ -171,8 +186,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
 	vocp->voc_oclprog = prog;
 
 	if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") ||
-	    !vg_ocl_create_kernel(vocp, 1, "heap_invert") ||
-	    !vg_ocl_create_kernel(vocp, 2, "hash_ec_point")) {
+	    !vg_ocl_create_kernel(vocp, 1, "heap_invert")) {
 		clReleaseProgram(vocp->voc_oclprog);
 		vocp->voc_oclprog = NULL;
 		return 0;
@@ -181,7 +195,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
 	return 1;
 }
 
-void
+void CL_CALLBACK
 vg_ocl_context_callback(const char *errinfo,
 			const void *private_info,
 			size_t cb,
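The CL_CALLBACK annotation matters because clCreateContext() expects its pfn_notify argument to use the OpenCL calling convention, which is not the default on Win32 builds. A minimal usage sketch (illustration only; "my_notify" is hypothetical, not from the patch):

	#include <stdio.h>
	void CL_CALLBACK my_notify(const char *errinfo, const void *private_info,
				   size_t cb, void *user_data)
	{
		fprintf(stderr, "OpenCL context error: %s\n", errinfo);
	}
	/* ... */
	cl_int err;
	cl_context ctx = clCreateContext(NULL, 1, &did, my_notify, NULL, &err);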
@@ -223,7 +237,7 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did)
 
 	if (!vg_ocl_load_program(vcp, vocp,
 				 "calc_addrs.cl",
-				 //"-cl-nv-verbose -cl-nv-maxrregcount=32 "
+				 //"-cl-nv-verbose "
 				 "-DUNROLL_MAX=16")) {
 		printf("Could not load kernel\n");
 		return 0;
@@ -251,6 +265,21 @@ vg_ocl_del(vg_ocl_context_t *vocp)
 	vg_exec_context_del(&vocp->base);
 }
 
+static int vg_ocl_arg_map[][8] = {
+	/* hashes_out / found */
+	{ 2, 0, -1 },
+	/* z_heap */
+	{ 0, 1, 1, 0, 2, 2, -1 },
+	/* point_tmp */
+	{ 0, 0, 2, 1, -1 },
+	/* row_in */
+	{ 0, 2, -1 },
+	/* col_in */
+	{ 0, 3, -1 },
+	/* target_table */
+	{ 2, 3, -1 },
+};
+
 int
 vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
 			int arg, size_t size, int host)
@@ -259,18 +288,15 @@ vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
 	cl_int ret;
 	int i, j, knum, karg;
 
-	static int arg_map[5][8] = {
-		/* hashes_out */
-		{ 2, 0, -1 },
-		/* z_heap */
-		{ 0, 1, 1, 0, 2, 2, -1 },
-		/* point_tmp */
-		{ 0, 0, 2, 1, -1 },
-		/* row_in */
-		{ 0, 2, -1 },
-		/* col_in */
-		{ 0, 3, -1 },
-	};
+	for (i = 0; i < MAX_SLOT; i++) {
+		if ((i != slot) && (slot >= 0))
+			continue;
+		if (vocp->voc_args[i][arg]) {
+			clReleaseMemObject(vocp->voc_args[i][arg]);
+			vocp->voc_args[i][arg] = NULL;
+			vocp->voc_arg_size[i][arg] = 0;
+		}
+	}
 
 	clbuf = clCreateBuffer(vocp->voc_oclctx,
 			       CL_MEM_READ_WRITE |
@@ -287,24 +313,54 @@ vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
 		if ((i != slot) && (slot >= 0))
 			continue;
 
-		for (j = 0; arg_map[arg][j] >= 0; j += 2) {
-			knum = arg_map[arg][j];
-			karg = arg_map[arg][j+1];
+		clRetainMemObject(clbuf);
+		vocp->voc_args[i][arg] = clbuf;
+		vocp->voc_arg_size[i][arg] = size;
+
+		for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
+			knum = vg_ocl_arg_map[arg][j];
+			karg = vg_ocl_arg_map[arg][j+1];
 			ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
 					     karg,
 					     sizeof(clbuf), &clbuf);
 			if (ret) {
-				clReleaseMemObject(clbuf);
 				printf("Could not set kernel argument: %d\n",
 				       ret);
 				return 0;
 			}
 		}
-		vocp->voc_args[i][arg] = clbuf;
-		vocp->voc_arg_size[i][arg] = size;
 	}
+
+	clReleaseMemObject(clbuf);
+	return 1;
+}
+
+int
+vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg,
+		   void *buffer, size_t size)
+{
+	cl_int slot, ret;
+
+	slot = (wslot < 0) ? 0 : wslot;
+
+	assert((slot >= 0) && (slot < MAX_SLOT));
+	assert(size <= vocp->voc_arg_size[slot][arg]);
+
+	ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq,
+				   vocp->voc_args[slot][arg],
+				   CL_TRUE,
+				   0, size,
+				   buffer,
+				   0, NULL,
+				   NULL);
+
+	if (ret) {
+		printf("Could not copyout argument buffer: %d\n", ret);
+		return 0;
+	}
 	return 1;
 }
 
@@ -320,7 +376,8 @@ vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot,
 	buf = clEnqueueMapBuffer(vocp->voc_oclcmdq,
 				 vocp->voc_args[slot][arg],
 				 CL_TRUE,
-				 rw ? CL_MAP_WRITE : CL_MAP_READ,
+				 (rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE)
+					   : (rw ? CL_MAP_WRITE : CL_MAP_READ),
 				 0, vocp->voc_arg_size[slot][arg],
 				 0, NULL,
 				 NULL,
@@ -368,7 +425,7 @@ vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot,
 	for (i = 0; i < MAX_SLOT; i++) {
 		if ((i != slot) && (slot >= 0))
 			continue;
-		ret = clSetKernelArg(vocp->voc_oclkernel[i][0],
+		ret = clSetKernelArg(vocp->voc_oclkernel[i][2],
 				     arg,
 				     sizeof(value), &value);
 
@@ -507,6 +564,200 @@ show_elapsed(struct timeval *tv, const char *place)
 	printf("%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec);
 }
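The extra rw mode added to vg_ocl_map_arg_buffer() maps onto OpenCL's map flags as shown below; this is a condensed sketch of the call above (illustration only, assuming the surrounding variables from that function). vg_ocl_prefix_check() further down maps the per-slot found buffer with rw == 2.

	cl_map_flags flags = (rw == 2) ? (CL_MAP_READ | CL_MAP_WRITE)
				       : (rw ? CL_MAP_WRITE : CL_MAP_READ);
	void *buf = clEnqueueMapBuffer(vocp->voc_oclcmdq,
				       vocp->voc_args[slot][arg],
				       CL_TRUE, flags,
				       0, vocp->voc_arg_size[slot][arg],
				       0, NULL, NULL, &ret);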
+
+/*
+ * GPU address matching methods
+ *
+ * gethash: GPU computes and returns all address hashes.
+ *  + Works with any matching method, including regular expressions.
+ *  - The CPU will not be able to keep up with mid- to high-end GPUs.
+ *
+ * prefix: GPU computes hash, searches a range list, and discards.
+ *  + Fast, minimal work for CPU.
+ */
+
+int
+vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot)
+{
+	vg_exec_context_t *vxcp = &vocp->base;
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	vg_test_func_t test_func = vcp->vc_test;
+	unsigned char *ocl_hashes_out;
+	int i, res = 0, round;
+
+	ocl_hashes_out = (unsigned char *)
+		vg_ocl_map_arg_buffer(vocp, slot, 0, 0);
+
+	round = vocp->voc_ocl_cols * vocp->voc_ocl_rows;
+
+	for (i = 0; i < round; i++, vxcp->vxc_delta++) {
+		memcpy(&vxcp->vxc_binres[1],
+		       ocl_hashes_out + (20*i),
+		       20);
+
+		res = test_func(vxcp);
+		if (res)
+			break;
+	}
+
+	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out);
+	return res;
+}
+
+int
+vg_ocl_gethash_init(vg_ocl_context_t *vocp)
+{
+	int i;
+
+	if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get"))
+		return 0;
+
+	for (i = 0; i < vocp->voc_nslots; i++) {
+		/* Each slot gets its own hash output buffer */
+		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0,
+					     20 *
+					     vocp->voc_ocl_rows *
+					     vocp->voc_ocl_cols, 1))
+			return 0;
+	}
+
+	vocp->voc_rekey_func = NULL;
+	vocp->voc_check_func = vg_ocl_gethash_check;
+	return 1;
+}
+
+
+static int
+vg_ocl_prefix_rekey(vg_ocl_context_t *vocp)
+{
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	unsigned char *ocl_targets_in;
+	uint32_t *ocl_found_out;
+	int i;
+
+	/* Set the found indicator for each slot to -1 */
+	for (i = 0; i < vocp->voc_nslots; i++) {
+		ocl_found_out = (uint32_t *)
+			vg_ocl_map_arg_buffer(vocp, i, 0, 1);
+		ocl_found_out[0] = 0xffffffff;
+		vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out);
+	}
+
+	if (vocp->voc_pattern_rewrite) {
+		/* Count number of range records */
+		i = vg_context_hash160_sort(vcp, NULL);
+		if (!i) {
+			printf("No range records available, exiting\n");
+			return 0;
+		}
+
+		if (i > vocp->voc_pattern_alloc) {
+			/* (re)allocate target buffer */
+			if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0))
+				return 0;
+			vocp->voc_pattern_alloc = i;
+		}
+
+		/* Write range records */
+		ocl_targets_in = (unsigned char *)
+			vg_ocl_map_arg_buffer(vocp, 0, 5, 1);
+		vg_context_hash160_sort(vcp, ocl_targets_in);
+		vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in);
+		vg_ocl_kernel_int_arg(vocp, -1, 4, i);
+
+		vocp->voc_pattern_rewrite = 0;
+	}
+	return 1;
+}
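For reference while reading vg_ocl_prefix_check() below, the 28-byte per-slot "found" buffer is laid out as the search kernel writes it; a hypothetical struct view (illustration only, not a declaration from the patch):

	struct found_record {
		uint32_t work_item;   /* flat global work-item id; 0xffffffff = no match */
		uint32_t target_idx;  /* index of the matching row in the target table */
		uint32_t hash[5];     /* matched hash160, big-endian words, for debugging */
	};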
+
+static int
+vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot)
+{
+	vg_exec_context_t *vxcp = &vocp->base;
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	vg_test_func_t test_func = vcp->vc_test;
+	uint32_t *ocl_found_out;
+	uint32_t found_delta;
+	int orig_delta, tablesize;
+	int res = 0;
+
+	/* Retrieve the found indicator */
+	ocl_found_out = (uint32_t *)
+		vg_ocl_map_arg_buffer(vocp, slot, 0, 2);
+	found_delta = ocl_found_out[0];
+
+	if (found_delta != 0xffffffff) {
+		/* GPU code claims match, verify with CPU version */
+		orig_delta = vxcp->vxc_delta;
+		vxcp->vxc_delta += found_delta;
+		vg_exec_context_calc_address(vxcp);
+		res = test_func(vxcp);
+		if (res == 0) {
+			/*
+			 * The match was not found in
+			 * the pattern list.  Hmm.
+			 */
+			tablesize = ocl_found_out[2];
+			printf("Match idx: %d\n", ocl_found_out[1]);
+			printf("CPU hash: ");
+			dumphex(vxcp->vxc_binres + 1, 20);
+			printf("GPU hash: ");
+			dumphex((unsigned char *) (ocl_found_out + 3), 20);
+			printf("Table size: %d\n", ocl_found_out[2]);
+			printf("Found delta: %d "
+			       "Start delta: %d\n",
			       found_delta, orig_delta);
+			res = 1;
+		}
+		vocp->voc_pattern_rewrite = 1;
+	} else {
+		vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows);
+	}
+
+	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out);
+	return res;
+}
+
+int
+vg_ocl_prefix_init(vg_ocl_context_t *vocp)
+{
+	int i;
+
+	if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix"))
+		return 0;
+
+	for (i = 0; i < vocp->voc_nslots; i++) {
+		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1))
+			return 0;
+	}
+	vocp->voc_rekey_func = vg_ocl_prefix_rekey;
+	vocp->voc_check_func = vg_ocl_prefix_check;
+	vocp->voc_pattern_rewrite = 1;
+	vocp->voc_pattern_alloc = 0;
+	return 1;
+}
+
+
+int
+vg_ocl_config_pattern(vg_ocl_context_t *vocp)
+{
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	int i;
+
+	i = vg_context_hash160_sort(vcp, NULL);
+	if (i > 0) {
+		if (vcp->vc_verbose > 1)
+			printf("Using GPU prefix matcher\n");
+		/* Configure for prefix matching */
+		return vg_ocl_prefix_init(vocp);
+	}
+
+	if (vcp->vc_verbose > 0)
+		printf("WARNING: Using CPU pattern matcher\n");
+	return vg_ocl_gethash_init(vocp);
+}
+
+
 void *
 vg_opencl_thread(void *arg)
 {
@@ -561,24 +812,25 @@
 		if (!vg_ocl_kernel_wait(vocp, slot))
 			halt = 1;
 
-		gettimeofday(&tvt, NULL);
-		timersub(&tvt, &tv, &tvd);
-		timeradd(&tvd, &busy, &busy);
-
-		if ((vcp->vc_verbose > 1) &&
-		    ((busy.tv_sec + idle.tv_sec) > 1)) {
-			idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
-			busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
-			pidle = ((double) idleu) / (idleu + busyu);
-
-			if (pidle > 0.05) {
-				printf("\rGPU idle: %.2f%%"
-				       "                                "
+
+		if (vcp->vc_verbose > 1) {
+			gettimeofday(&tvt, NULL);
+			timersub(&tvt, &tv, &tvd);
+			timeradd(&tvd, &busy, &busy);
+			if ((busy.tv_sec + idle.tv_sec) > 1) {
+				idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
+				busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
+				pidle = ((double) idleu) / (idleu + busyu);
+
+				if (pidle > 0.01) {
+					printf("\rGPU idle: %.2f%%"
+					       "                                "
 				       "                \n",
-				       100 * pidle);
+					       100 * pidle);
+				}
+				memset(&idle, 0, sizeof(idle));
+				memset(&busy, 0, sizeof(busy));
 			}
-			memset(&idle, 0, sizeof(idle));
-			memset(&busy, 0, sizeof(busy));
 		}
 	}
 out:
@@ -586,6 +838,7 @@ out:
 	return NULL;
 }
 
+
 /*
  * Address search thread main loop
  */
@@ -605,12 +858,11 @@
 	EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL;
 	EC_POINT *pseek = NULL;
 
-	unsigned char *ocl_points_in, *ocl_strides_in, *ocl_hashes_out;
+	unsigned char *ocl_points_in, *ocl_strides_in;
 
 	vg_ocl_context_t ctx;
 	vg_ocl_context_t *vocp = &ctx;
 	vg_exec_context_t *vxcp = &vocp->base;
-	vg_test_func_t test_func = vcp->vc_test;
 
 	int slot, nslots;
 	int slot_busy = 0, slot_done = 0, halt = 0;
@@ -637,6 +889,10 @@
 	nslots = 2;
 	slot = 0;
 
+	vocp->voc_ocl_cols = batchsize;
+	vocp->voc_ocl_rows = worksize;
+	vocp->voc_nslots = nslots;
+
 	ppbase = (EC_POINT **) malloc((batchsize + worksize) *
 				      sizeof(EC_POINT*));
 	if (!ppbase)
@@ -667,26 +923,28 @@
 
 	round = batchsize * worksize;
 
+	if (!vg_ocl_config_pattern(vocp))
+		goto enomem;
+
 	for (i = 0; i < nslots; i++) {
 		/*
 		 * Each work group gets its own:
-		 *   - Hash output array
-		 *   - Point and z_heap scratch spaces
 		 *   - Column point array
 		 */
-		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 20 * round, 1) ||
-		    !vg_ocl_kernel_arg_alloc(vocp, i, 1, 32 * 2 * round, 0) ||
-		    !vg_ocl_kernel_arg_alloc(vocp, i, 2, 32 * 2 * round, 0) ||
-		    !vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * worksize, 1))
+		if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * worksize, 1))
 			goto enomem;
 	}
 
-	/* Same row point array for all instances */
-	if (!vg_ocl_kernel_arg_alloc(vocp, -1, 3, 32 * 2 * batchsize, 1))
+	/*
+	 * All instances share:
+	 *   - The z_heap and point scratch spaces
+	 *   - The row point array
+	 */
+	if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1, 32 * 2 * round, 0) ||
+	    !vg_ocl_kernel_arg_alloc(vocp, -1, 2, 32 * 2 * round, 0) ||
+	    !vg_ocl_kernel_arg_alloc(vocp, -1, 3, 32 * 2 * batchsize, 1))
 		goto enomem;
 
-	//vg_ocl_kernel_int_arg(vocp, -1, 5, batchsize);
-
 	npoints = 0;
 	rekey_at = 0;
 	vxcp->vxc_binres[0] = vcp->vc_addrtype;
@@ -698,6 +956,10 @@
 	gettimeofday(&tvstart, NULL);
 
 l_rekey:
+	if (vocp->voc_rekey_func &&
+	    !vocp->voc_rekey_func(vocp))
+		goto enomem;
+
 	/* Generate a new random private key */
 	EC_KEY_generate_key(pkey);
 	npoints = 0;
@@ -755,33 +1017,22 @@
 
 	while (1) {
 		if (slot_done) {
+			assert(rekey_at > 0);
 			slot_done = 0;
 
-			ocl_hashes_out = (unsigned char *)
-				vg_ocl_map_arg_buffer(vocp, slot, 0, 0);
-
-			for (i = 0; i < round; i++, vxcp->vxc_delta++) {
-				memcpy(&vxcp->vxc_binres[1],
-				       ocl_hashes_out + (20*i),
-				       20);
-
-				switch (test_func(vxcp)) {
-				case 1:
-					rekey_at = 0;
-					i = round;
-					break;
-				case 2:
-					halt = 1;
-					i = round;
-					break;
-				default:
-					break;
-				}
+			/* Call the result check function */
+			switch (vocp->voc_check_func(vocp, slot)) {
+			case 1:
+				rekey_at = 0;
+				break;
+			case 2:
+				halt = 1;
+				break;
+			default:
+				break;
 			}
 
-			vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out);
-
-			c += (i + 1);
+			c += round;
 			if (!halt && (c >= output_interval)) {
 				output_interval =
 					vg_output_timing(vcp, c, &tvstart);
 			}
 		}
-
@@ -789,24 +1040,8 @@
-		if (halt) {
-			if (vcp->vc_verbose > 1)
-				printf("Halting...");
-			pthread_mutex_lock(&vocp->voc_lock);
-			vocp->voc_halt = 1;
-			pthread_cond_signal(&vocp->voc_wait);
-			while (vocp->voc_ocl_slot != -1) {
-				assert(slot_busy);
-				pthread_cond_wait(&vocp->voc_wait,
-						  &vocp->voc_lock);
-			}
-			slot_busy = 0;
-			pthread_mutex_unlock(&vocp->voc_lock);
-			pthread_join(vocp->voc_ocl_thread, NULL);
-			if (vcp->vc_verbose > 1)
-				printf("done!\n");
+		if (halt)
 			break;
-		}
 
 		if ((npoints + round) < rekey_at) {
 			if (npoints > 1) {
@@ -843,35 +1078,36 @@
 		}
 
 		if (vocp->voc_halt) {
-			halt = 1;
-		} else {
-			vocp->voc_ocl_slot = slot;
-			vocp->voc_ocl_cols = batchsize;
-			vocp->voc_ocl_rows = worksize;
-			pthread_cond_signal(&vocp->voc_wait);
 			pthread_mutex_unlock(&vocp->voc_lock);
-
-			if (slot_busy)
-				slot_done = 1;
-			slot_busy = 1;
-			slot = (slot + 1) % nslots;
+			halt = 1;
+			break;
 		}
+
+		vocp->voc_ocl_slot = slot;
+		pthread_cond_signal(&vocp->voc_wait);
 		pthread_mutex_unlock(&vocp->voc_lock);
-	}
-	else if (slot_busy) {
-		pthread_mutex_lock(&vocp->voc_lock);
-		while (vocp->voc_ocl_slot != -1) {
-			pthread_cond_wait(&vocp->voc_wait,
-					  &vocp->voc_lock);
+		slot_done = slot_busy;
+		slot_busy = 1;
+		slot = (slot + 1) % nslots;
+
+	} else {
+		if (slot_busy) {
+			pthread_mutex_lock(&vocp->voc_lock);
+			while (vocp->voc_ocl_slot != -1) {
+				assert(vocp->voc_ocl_slot ==
+				       ((slot + nslots - 1) % nslots));
+				pthread_cond_wait(&vocp->voc_wait,
+						  &vocp->voc_lock);
+			}
+			pthread_mutex_unlock(&vocp->voc_lock);
+			slot_busy = 0;
+			slot_done = 1;
 		}
-		slot_busy = 0;
-		pthread_mutex_unlock(&vocp->voc_lock);
-		slot_done = 1;
-	}
-	else if (!rekey_at || ((npoints + round) >= rekey_at)) {
-		goto l_rekey;
+		if (!rekey_at ||
+		    (!slot_done && ((npoints + round) >= rekey_at)))
+			goto l_rekey;
 	}
 }
@@ -880,6 +1116,24 @@
 		printf("ERROR: allocation failure?\n");
 	}
 
+	if (halt) {
+		if (vcp->vc_verbose > 1)
+			printf("Halting...");
+		pthread_mutex_lock(&vocp->voc_lock);
+		vocp->voc_halt = 1;
+		pthread_cond_signal(&vocp->voc_wait);
+		while (vocp->voc_ocl_slot != -1) {
+			assert(slot_busy);
+			pthread_cond_wait(&vocp->voc_wait,
+					  &vocp->voc_lock);
+		}
+		slot_busy = 0;
+		pthread_mutex_unlock(&vocp->voc_lock);
+		pthread_join(vocp->voc_ocl_thread, NULL);
+		if (vcp->vc_verbose > 1)
+			printf("done!\n");
+	}
+
 	if (ppbase) {
 		for (i = 0; i < (batchsize + worksize); i++)
 			if (ppbase[i])
diff --git a/pattern.c b/pattern.c
index 184dc02..005f769 100644
--- a/pattern.c
+++ b/pattern.c
@@ -207,6 +207,27 @@ vg_exec_context_consolidate_key(vg_exec_context_t *vxcp)
 	}
 }
 
+void
+vg_exec_context_calc_address(vg_exec_context_t *vxcp)
+{
+	const EC_GROUP *pgroup;
+	unsigned char eckey_buf[96], hash1[32], hash2[20];
+	int len;
+
+	vg_exec_context_consolidate_key(vxcp);
+	pgroup = EC_KEY_get0_group(vxcp->vxc_key);
+	len = EC_POINT_point2oct(pgroup,
+				 EC_KEY_get0_public_key(vxcp->vxc_key),
+				 POINT_CONVERSION_UNCOMPRESSED,
+				 eckey_buf,
+				 sizeof(eckey_buf),
+				 vxcp->vxc_bnctx);
+	SHA256(eckey_buf, len, hash1);
+	RIPEMD160(hash1, sizeof(hash1), hash2);
+	memcpy(&vxcp->vxc_binres[1],
+	       hash2, 20);
+}
+
 typedef struct _timing_info_s {
 	struct _timing_info_s	*ti_next;
@@ -437,6 +458,14 @@ vg_context_add_patterns(vg_context_t *vcp,
 	return vcp->vc_add_patterns(vcp, patterns, npatterns);
 }
 
+int
+vg_context_hash160_sort(vg_context_t *vcp, void *buf)
+{
+	if (!vcp->vc_hash160_sort)
+		return 0;
+	return vcp->vc_hash160_sort(vcp, buf);
+}
+
 static const signed char b58_reverse_map[256] = {
 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
@@ -1023,6 +1052,17 @@ avl_insert_fix(avl_root_t *rootp, avl_item_t *itemp)
 	}
 }
 
+static INLINE avl_item_t *
+avl_first(avl_root_t *rootp)
+{
+	avl_item_t *itemp = rootp->ar_root;
+	if (itemp) {
+		while (itemp->ai_left)
+			itemp = itemp->ai_left;
+	}
+	return itemp;
+}
+
 static INLINE avl_item_t *
 avl_next(avl_item_t *itemp)
 {
@@ -1181,6 +1221,26 @@ vg_prefix_avl_insert(avl_root_t *rootp, vg_prefix_t *vpnew)
 	return NULL;
 }
 
+static vg_prefix_t *
+vg_prefix_first(avl_root_t *rootp)
+{
+	avl_item_t *itemp;
+	itemp = avl_first(rootp);
+	if (itemp)
+		return avl_item_entry(itemp, vg_prefix_t, vp_item);
+	return NULL;
+}
+
+static vg_prefix_t *
+vg_prefix_next(vg_prefix_t *vp)
+{
+	avl_item_t *itemp = &vp->vp_item;
+	itemp = avl_next(itemp);
+	if (itemp)
+		return avl_item_entry(itemp, vg_prefix_t, vp_item);
+	return NULL;
+}
+
 static vg_prefix_t *
 vg_prefix_add(avl_root_t *rootp, const char *pattern, BIGNUM *low, BIGNUM *high)
 {
@@ -1568,6 +1628,54 @@ research:
 	return res;
 }
 
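The bound values walked by the new function below are full 25-byte Base58Check addresses, [version:1][hash160:20][checksum:4], held in BIGNUMs; extracting the hash160 window therefore means dropping the 4 checksum bytes at the bottom, ignoring anything above the 24th byte, and zero-padding short results. A hypothetical helper showing the same extraction the function performs twice (illustration only, not part of the patch):

	static void copy_hash160_window(unsigned char *out20,
					const unsigned char *bn, int nbytes)
	{
		int ncopy = (nbytes >= 24) ? 20 : ((nbytes > 4) ? (nbytes - 4) : 0);
		int nskip = (nbytes >= 24) ? (nbytes - 24) : 0;
		memset(out20, 0, 20 - ncopy);
		memcpy(out20 + (20 - ncopy), bn + nskip, ncopy);
	}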
+int
+vg_prefix_hash160_sort(vg_context_t *vcp, void *buf)
+{
+	vg_prefix_context_t *vcpp = (vg_prefix_context_t *) vcp;
+	vg_prefix_t *vp;
+	unsigned char *cbuf = (unsigned char *) buf;
+	unsigned char bnbuf[25];
+	int nbytes, ncopy, nskip, npfx = 0;
+
+	/*
+	 * Walk the prefix tree in order, copy the upper and lower bound
+	 * values into the hash160 buffer.  Skip the lower four bytes
+	 * and anything above the 24th byte.
+	 */
+	for (vp = vg_prefix_first(&vcpp->vcp_avlroot);
+	     vp != NULL;
+	     vp = vg_prefix_next(vp)) {
+		npfx++;
+		if (!buf)
+			continue;
+
+		/* Low */
+		nbytes = BN_bn2bin(vp->vp_low, bnbuf);
+		ncopy = ((nbytes >= 24) ? 20 :
+			 ((nbytes > 4) ? (nbytes - 4) : 0));
+		nskip = (nbytes >= 24) ? (nbytes - 24) : 0;
+		if (ncopy < 20)
+			memset(cbuf, 0, 20 - ncopy);
+		memcpy(cbuf + (20 - ncopy),
+		       bnbuf + nskip,
+		       ncopy);
+		cbuf += 20;
+
+		/* High */
+		nbytes = BN_bn2bin(vp->vp_high, bnbuf);
+		ncopy = ((nbytes >= 24) ? 20 :
+			 ((nbytes > 4) ? (nbytes - 4) : 0));
+		nskip = (nbytes >= 24) ? (nbytes - 24) : 0;
+		if (ncopy < 20)
+			memset(cbuf, 0, 20 - ncopy);
+		memcpy(cbuf + (20 - ncopy),
+		       bnbuf + nskip,
+		       ncopy);
+		cbuf += 20;
+	}
+	return npfx;
+}
+
 vg_context_t *
 vg_prefix_context_new(int addrtype, int privtype, int caseinsensitive)
 {
@@ -1584,6 +1692,7 @@ vg_prefix_context_new(int addrtype, int privtype, int caseinsensitive)
 	vcpp->base.vc_free = vg_prefix_context_free;
 	vcpp->base.vc_add_patterns = vg_prefix_context_add_patterns;
 	vcpp->base.vc_test = vg_prefix_test;
+	vcpp->base.vc_hash160_sort = vg_prefix_hash160_sort;
 	avl_root_init(&vcpp->vcp_avlroot);
 	BN_init(&vcpp->vcp_difficulty);
 	vcpp->vcp_caseinsensitive = caseinsensitive;
@@ -1811,6 +1920,7 @@ vg_regex_context_new(int addrtype, int privtype)
 	vcrp->base.vc_free = vg_regex_context_free;
 	vcrp->base.vc_add_patterns = vg_regex_context_add_patterns;
 	vcrp->base.vc_test = vg_regex_test;
+	vcrp->base.vc_hash160_sort = NULL;
 	vcrp->vcr_regex = NULL;
 	vcrp->vcr_nalloc = 0;
 }
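The new vc_hash160_sort hook is intended to be called twice, mirroring vg_ocl_prefix_rekey() above: once with a NULL buffer to count range records, then with a buffer of 40 bytes per record (20-byte low bound followed by 20-byte high bound) to fill in. A usage sketch (illustration only):

	int n = vg_context_hash160_sort(vcp, NULL);
	unsigned char *table = n ? malloc(40 * n) : NULL;
	if (table)
		vg_context_hash160_sort(vcp, table);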
diff --git a/pattern.h b/pattern.h
index aa05033..51ff298 100644
--- a/pattern.h
+++ b/pattern.h
@@ -59,6 +59,7 @@ typedef struct _vg_exec_context_s {
 extern int vg_exec_context_init(vg_context_t *vcp, vg_exec_context_t *vxcp);
 extern void vg_exec_context_del(vg_exec_context_t *vxcp);
 extern void vg_exec_context_consolidate_key(vg_exec_context_t *vxcp);
+extern void vg_exec_context_calc_address(vg_exec_context_t *vxcp);
 
 /* Implementation-specific lock/unlock/consolidate */
 extern void vg_exec_downgrade_lock(vg_exec_context_t *vxcp);
@@ -69,6 +70,7 @@ typedef void (*vg_free_func_t)(vg_context_t *);
 typedef int (*vg_add_pattern_func_t)(vg_context_t *,
 				     char ** const patterns, int npatterns);
 typedef int (*vg_test_func_t)(vg_exec_context_t *);
+typedef int (*vg_hash160_sort_func_t)(vg_context_t *vcp, void *buf);
 
 /* Application-level context, incl. parameters and global pattern store */
 struct _vg_context_s {
@@ -84,12 +86,14 @@ struct _vg_context_s {
 	vg_free_func_t			vc_free;
 	vg_add_pattern_func_t		vc_add_patterns;
 	vg_test_func_t			vc_test;
+	vg_hash160_sort_func_t		vc_hash160_sort;
 };
 
 extern void vg_context_free(vg_context_t *vcp);
 extern int vg_context_add_patterns(vg_context_t *vcp,
 				   char ** const patterns, int npatterns);
+extern int vg_context_hash160_sort(vg_context_t *vcp, void *buf);
 
 extern vg_context_t *vg_prefix_context_new(int addrtype, int privtype,
diff --git a/winglue.c b/winglue.c
index e7bd9f9..6b4f281 100644
--- a/winglue.c
+++ b/winglue.c
@@ -102,6 +102,17 @@ gettimeofday(struct timeval *tv, struct timezone *tz)
 	return 0;
 }
 
+void
+timeradd(struct timeval *a, struct timeval *b, struct timeval *result)
+{
+	result->tv_sec = a->tv_sec + b->tv_sec;
+	result->tv_usec = a->tv_usec + b->tv_usec;
+	if (result->tv_usec > 10000000) {
+		result->tv_sec++;
+		result->tv_usec -= 1000000;
+	}
+}
+
 void
 timersub(struct timeval *a, struct timeval *b, struct timeval *result)
 {
diff --git a/winglue.h b/winglue.h
index f94866d..d499733 100644
--- a/winglue.h
+++ b/winglue.h
@@ -29,6 +29,8 @@ struct timezone;
 
 extern int gettimeofday(struct timeval *tv, struct timezone *tz);
 
+extern void timeradd(struct timeval *a, struct timeval *b,
+		     struct timeval *result);
 extern void timersub(struct timeval *a, struct timeval *b,
 		     struct timeval *result);
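For comparison, the BSD timeradd() macro in <sys/time.h> normalizes whenever tv_usec reaches 1,000,000; the new timeradd() above tests against 10,000,000, which reads like an extra zero. A reference version with the conventional bound (sketch only, not part of the patch):

	void timeradd_ref(struct timeval *a, struct timeval *b,
			  struct timeval *result)
	{
		result->tv_sec = a->tv_sec + b->tv_sec;
		result->tv_usec = a->tv_usec + b->tv_usec;
		if (result->tv_usec >= 1000000) {
			result->tv_sec++;
			result->tv_usec -= 1000000;
		}
	}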