diff --git a/Makefile b/Makefile
index cebe1f9..55c17b8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,21 @@
 LIBS=-lpcre -lcrypto -lm -lpthread
 CFLAGS=-ggdb -O3 -Wall
-OBJS=vanitygen.o oclvanitygen.o keyconv.o pattern.o util.o
-PROGS=vanitygen keyconv
+OBJS=vanitygen.o oclvanitygen.o oclvanityminer.o oclengine.o keyconv.o pattern.o util.o
+PROGS=vanitygen oclvanitygen oclvanityminer keyconv
+
+most: vanitygen keyconv
 
 all: $(PROGS)
 
 vanitygen: vanitygen.o pattern.o util.o
 	$(CC) $^ -o $@ $(CFLAGS) $(LIBS)
 
-oclvanitygen: oclvanitygen.o pattern.o util.o
+oclvanitygen: oclvanitygen.o oclengine.o pattern.o util.o
 	$(CC) $^ -o $@ $(CFLAGS) $(LIBS) -lOpenCL
 
+oclvanityminer: oclvanityminer.o oclengine.o pattern.o util.o
+	$(CC) $^ -o $@ $(CFLAGS) $(LIBS) -lOpenCL -lcurl
+
 keyconv: keyconv.o util.o
 	$(CC) $^ -o $@ $(CFLAGS) $(LIBS)
 
diff --git a/Makefile.Win32 b/Makefile.Win32
index 77b523e..40072ff 100644
--- a/Makefile.Win32
+++ b/Makefile.Win32
@@ -2,29 +2,42 @@ CC = cl
 OPENSSL_DIR = C:\OpenSSL-Win32
 PTHREADS_DIR = C:\pthreads-w32-2-8-0-release
 PCRE_DIR = C:\pcre-7.9-src
+CURL_DIR = C:\curl-7.26.0-x86\builds\libcurl-release-static-ssl-static-ipv6-sspi
 OPENCL_DIR = "C:\Program Files (x86)\AMD APP"
 OPENCL_INCLUDE = /I$(OPENCL_DIR)\include
 OPENCL_LIBS = $(OPENCL_DIR)\lib\x86\OpenCL.lib
-CFLAGS = /D_WIN32 /DPTW32_STATIC_LIB /DPCRE_STATIC /I$(OPENSSL_DIR)\include /I$(PTHREADS_DIR) /I$(PCRE_DIR)
-LIBS = $(OPENSSL_DIR)\lib\libeay32.lib $(PTHREADS_DIR)\pthreadVC2.lib $(PCRE_DIR)\pcre.lib ws2_32.lib
-OBJS = vanitygen.obj oclvanitygen.obj keyconv.obj pattern.obj util.obj winglue.obj
+CURL_INCLUDE = /I$(CURL_DIR)\include /DCURL_STATICLIB
+CURL_LIBS = $(CURL_DIR)\lib\libcurl_a.lib
+CFLAGS_BASE = /D_WIN32 /DPTW32_STATIC_LIB /DPCRE_STATIC /I$(OPENSSL_DIR)\inc32 /I$(PTHREADS_DIR) /I$(PCRE_DIR) /Ox /Zi
+CFLAGS = $(CFLAGS_BASE) /GL
+LIBS = $(OPENSSL_DIR)\out32\libeay32.lib $(PTHREADS_DIR)\pthreadVC2.lib $(PCRE_DIR)\pcre.lib ws2_32.lib user32.lib advapi32.lib gdi32.lib /LTCG /DEBUG
+OBJS = vanitygen.obj oclvanitygen.obj oclengine.obj oclvanityminer.obj keyconv.obj pattern.obj util.obj winglue.obj
 
-all: vanitygen.exe
+all: vanitygen.exe keyconv.exe
 
 vanitygen.exe: vanitygen.obj pattern.obj util.obj winglue.obj
 	link /nologo /out:$@ $** $(LIBS)
 
-oclvanitygen.exe: oclvanitygen.obj pattern.obj util.obj winglue.obj
-	link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS)
+oclvanitygen.exe: oclvanitygen.obj oclengine.obj pattern.obj util.obj winglue.obj
+	link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS) $(CURL_LIBS)
+
+oclvanityminer.exe: oclvanityminer.obj oclengine.obj pattern.obj util.obj winglue.obj
+	link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS) $(CURL_LIBS)
 
 keyconv.exe: keyconv.obj util.obj winglue.obj
-	link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS)
+	link /nologo /out:$@ $** $(LIBS)
 
 .c.obj:
 	@$(CC) /nologo $(CFLAGS) /c /Tp$< /Fo$@
 
+oclengine.obj: oclengine.c
+	@$(CC) /nologo $(CFLAGS_BASE) $(OPENCL_INCLUDE) /c /Tpoclengine.c /Fo$@
+
 oclvanitygen.obj: oclvanitygen.c
-	@$(CC) /nologo $(CFLAGS) $(OPENCL_INCLUDE) /c /Tpoclvanitygen.c /Fo$@
+	@$(CC) /nologo $(CFLAGS_BASE) /c /Tpoclvanitygen.c /Fo$@
+
+oclvanityminer.obj: oclvanityminer.c
+	@$(CC) /nologo $(CFLAGS_BASE) $(CURL_INCLUDE) /c /Tpoclvanityminer.c /Fo$@
 
 clean:
-	del vanitygen.exe $(OBJS)
+	del vanitygen.exe oclvanitygen.exe oclvanityminer.exe keyconv.exe $(OBJS)
diff --git a/keyconv.c b/keyconv.c
index 1283fa1..0fac934 100644
--- a/keyconv.c
+++ b/keyconv.c
@@ -28,7 +28,8 @@ usage(const char *progname)
 "-8            Output key in PKCS#8 form\n"
 "-e            Encrypt output key, prompt for password\n"
 "-E <password> Encrypt output key with <password> (UNSAFE)\n"
-"-c <key>      Combine private key parts to make complete private key",
+"-c <key>      Combine private key parts to make complete private key\n"
+"-v            Verbose output\n",
 		version, progname);
 }
 
@@ -47,10 +48,11 @@ main(int argc, char **argv)
 	int privtype, addrtype;
 	int pkcs8 = 0;
 	int pass_prompt = 0;
+	int verbose = 0;
 	int opt;
 	int res;
 
-	while ((opt = getopt(argc, argv, "8E:ec:")) != -1) {
+	while ((opt = getopt(argc, argv, "8E:ec:v")) != -1) {
 		switch (opt) {
 		case '8':
 			pkcs8 = 1;
@@ -75,6 +77,9 @@ main(int argc, char **argv)
 		case 'c':
 			key2_in = optarg;
 			break;
+		case 'v':
+			verbose = 1;
+			break;
 		default:
 			usage(argv[0]);
 			return 1;
@@ -149,6 +154,15 @@ main(int argc, char **argv)
 	default:  addrtype = 0; break;
 	}
 
+	if (verbose) {
+		unsigned char *pend = (unsigned char *) pbuf;
+		res = i2o_ECPublicKey(pkey, &pend);
+		fprintf(stderr, "Pubkey (hex): ");
+		dumphex((unsigned char *)pbuf, res);
+		fprintf(stderr, "Privkey (hex): ");
+		dumpbn(EC_KEY_get0_private_key(pkey));
+	}
+			
 	if (pkcs8) {
 		res = vg_pkcs8_encode_privkey(pbuf, sizeof(pbuf),
 					      pkey, pass_in);
diff --git a/oclengine.c b/oclengine.c
new file mode 100644
index 0000000..2b59835
--- /dev/null
+++ b/oclengine.c
@@ -0,0 +1,2591 @@
+/*
+ * Vanitygen, vanity bitcoin address generator
+ * Copyright (C) 2011 <samr7@cs.washington.edu>
+ *
+ * Vanitygen is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version. 
+ *
+ * Vanitygen is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Vanitygen.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include <pthread.h>
+
+#include <openssl/ec.h>
+#include <openssl/bn.h>
+#include <openssl/rand.h>
+#include <openssl/md5.h>
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#ifndef CL_CALLBACK
+#define CL_CALLBACK
+#endif
+#else
+#include <CL/cl.h>
+#endif
+
+#include "oclengine.h"
+#include "pattern.h"
+#include "util.h"
+
+
+#define MAX_SLOT 2
+#define MAX_ARG 6
+#define MAX_KERNEL 3
+
+#define is_pow2(v) (!((v) & ((v)-1)))
+#define round_up_pow2(x, a) (((x) + ((a)-1)) & ~((a)-1))
+
+static void vg_ocl_free_args(vg_ocl_context_t *vocp);
+
+
+/* OpenCL address searching mode */
+struct _vg_ocl_context_s;
+typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *);
+typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot);
+
+struct _vg_ocl_context_s {
+	vg_exec_context_t		base;
+	cl_device_id			voc_ocldid;
+	cl_context			voc_oclctx;
+	cl_command_queue		voc_oclcmdq;
+	cl_program			voc_oclprog;
+	vg_ocl_init_t			voc_init_func;
+	vg_ocl_init_t			voc_rekey_func;
+	vg_ocl_check_t			voc_check_func;
+	int				voc_quirks;
+	int				voc_nslots;
+	cl_kernel			voc_oclkernel[MAX_SLOT][MAX_KERNEL];
+	cl_event			voc_oclkrnwait[MAX_SLOT];
+	cl_mem				voc_args[MAX_SLOT][MAX_ARG];
+	size_t				voc_arg_size[MAX_SLOT][MAX_ARG];
+
+	int				voc_pattern_rewrite;
+	int				voc_pattern_alloc;
+
+	vg_ocl_check_t			voc_verify_func[MAX_KERNEL];
+
+	pthread_t			voc_ocl_thread;
+	pthread_mutex_t			voc_lock;
+	pthread_cond_t			voc_wait;
+	int				voc_ocl_slot;
+	int				voc_ocl_rows;
+	int				voc_ocl_cols;
+	int				voc_ocl_invsize;
+	int				voc_halt;
+	int				voc_dump_done;
+};
+
+
+/* Thread synchronization stubs */
+void
+vg_exec_downgrade_lock(vg_exec_context_t *vxcp)
+{
+}
+
+int
+vg_exec_upgrade_lock(vg_exec_context_t *vxcp)
+{
+	return 0;
+}
+
+
+/*
+ * OpenCL debugging and support
+ */
+
+static const char *
+vg_ocl_strerror(cl_int ret)
+{
+#define OCL_STATUS(st) case st: return #st;
+	switch (ret) {
+		OCL_STATUS(CL_SUCCESS);
+		OCL_STATUS(CL_DEVICE_NOT_FOUND);
+		OCL_STATUS(CL_DEVICE_NOT_AVAILABLE);
+		OCL_STATUS(CL_COMPILER_NOT_AVAILABLE);
+		OCL_STATUS(CL_MEM_OBJECT_ALLOCATION_FAILURE);
+		OCL_STATUS(CL_OUT_OF_RESOURCES);
+		OCL_STATUS(CL_OUT_OF_HOST_MEMORY);
+		OCL_STATUS(CL_PROFILING_INFO_NOT_AVAILABLE);
+		OCL_STATUS(CL_MEM_COPY_OVERLAP);
+		OCL_STATUS(CL_IMAGE_FORMAT_MISMATCH);
+		OCL_STATUS(CL_IMAGE_FORMAT_NOT_SUPPORTED);
+		OCL_STATUS(CL_BUILD_PROGRAM_FAILURE);
+		OCL_STATUS(CL_MAP_FAILURE);
+#if defined(CL_MISALIGNED_SUB_BUFFER_OFFSET)
+		OCL_STATUS(CL_MISALIGNED_SUB_BUFFER_OFFSET);
+#endif /* defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) */
+#if defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
+		OCL_STATUS(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
+#endif /* defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) */
+		OCL_STATUS(CL_INVALID_VALUE);
+		OCL_STATUS(CL_INVALID_DEVICE_TYPE);
+		OCL_STATUS(CL_INVALID_PLATFORM);
+		OCL_STATUS(CL_INVALID_DEVICE);
+		OCL_STATUS(CL_INVALID_CONTEXT);
+		OCL_STATUS(CL_INVALID_QUEUE_PROPERTIES);
+		OCL_STATUS(CL_INVALID_COMMAND_QUEUE);
+		OCL_STATUS(CL_INVALID_HOST_PTR);
+		OCL_STATUS(CL_INVALID_MEM_OBJECT);
+		OCL_STATUS(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+		OCL_STATUS(CL_INVALID_IMAGE_SIZE);
+		OCL_STATUS(CL_INVALID_SAMPLER);
+		OCL_STATUS(CL_INVALID_BINARY);
+		OCL_STATUS(CL_INVALID_BUILD_OPTIONS);
+		OCL_STATUS(CL_INVALID_PROGRAM);
+		OCL_STATUS(CL_INVALID_PROGRAM_EXECUTABLE);
+		OCL_STATUS(CL_INVALID_KERNEL_NAME);
+		OCL_STATUS(CL_INVALID_KERNEL_DEFINITION);
+		OCL_STATUS(CL_INVALID_KERNEL);
+		OCL_STATUS(CL_INVALID_ARG_INDEX);
+		OCL_STATUS(CL_INVALID_ARG_VALUE);
+		OCL_STATUS(CL_INVALID_ARG_SIZE);
+		OCL_STATUS(CL_INVALID_KERNEL_ARGS);
+		OCL_STATUS(CL_INVALID_WORK_DIMENSION);
+		OCL_STATUS(CL_INVALID_WORK_GROUP_SIZE);
+		OCL_STATUS(CL_INVALID_WORK_ITEM_SIZE);
+		OCL_STATUS(CL_INVALID_GLOBAL_OFFSET);
+		OCL_STATUS(CL_INVALID_EVENT_WAIT_LIST);
+		OCL_STATUS(CL_INVALID_EVENT);
+		OCL_STATUS(CL_INVALID_OPERATION);
+		OCL_STATUS(CL_INVALID_GL_OBJECT);
+		OCL_STATUS(CL_INVALID_BUFFER_SIZE);
+		OCL_STATUS(CL_INVALID_MIP_LEVEL);
+		OCL_STATUS(CL_INVALID_GLOBAL_WORK_SIZE);
+#if defined(CL_INVALID_PROPERTY)
+		OCL_STATUS(CL_INVALID_PROPERTY);
+#endif /* defined(CL_INVALID_PROPERTY) */
+#undef OCL_STATUS
+	default: {
+		static char tmp[64];
+		snprintf(tmp, sizeof(tmp), "Unknown code %d", ret);
+		return tmp;
+	}
+	}
+}
+
+/* Get device strings, using a static buffer -- caveat emptor */
+static const char *
+vg_ocl_platform_getstr(cl_platform_id pid, cl_platform_info param)
+{
+	static char platform_str[1024];
+	cl_int ret;
+	size_t size_ret;
+	ret = clGetPlatformInfo(pid, param,
+				sizeof(platform_str), platform_str,
+				&size_ret);
+	if (ret != CL_SUCCESS) {
+		snprintf(platform_str, sizeof(platform_str),
+			 "clGetPlatformInfo(%d): %s",
+			 param, vg_ocl_strerror(ret));
+	}
+	return platform_str;
+}
+
+static cl_platform_id
+vg_ocl_device_getplatform(cl_device_id did)
+{
+	cl_int ret;
+	cl_platform_id val;
+	size_t size_ret;
+	ret = clGetDeviceInfo(did, CL_DEVICE_PLATFORM,
+			      sizeof(val), &val, &size_ret);
+	if (ret != CL_SUCCESS) {
+		fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_PLATFORM): %s",
+			vg_ocl_strerror(ret));
+	}
+	return val;
+}
+
+static cl_device_type
+vg_ocl_device_gettype(cl_device_id did)
+{
+	cl_int ret;
+	cl_device_type val;
+	size_t size_ret;
+	ret = clGetDeviceInfo(did, CL_DEVICE_TYPE,
+			      sizeof(val), &val, &size_ret);
+	if (ret != CL_SUCCESS) {
+		fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_TYPE): %s",
+			vg_ocl_strerror(ret));
+	}
+	return val;
+}
+
+static const char *
+vg_ocl_device_getstr(cl_device_id did, cl_device_info param)
+{
+	static char device_str[1024];
+	cl_int ret;
+	size_t size_ret;
+	ret = clGetDeviceInfo(did, param,
+			      sizeof(device_str), device_str,
+			      &size_ret);
+	if (ret != CL_SUCCESS) {
+		snprintf(device_str, sizeof(device_str),
+			 "clGetDeviceInfo(%d): %s",
+			 param, vg_ocl_strerror(ret));
+	}
+	return device_str;
+}
+
+static size_t
+vg_ocl_device_getsizet(cl_device_id did, cl_device_info param)
+{
+	cl_int ret;
+	size_t val;
+	size_t size_ret;
+	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
+	if (ret != CL_SUCCESS) {
+		fprintf(stderr,
+			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
+	}
+	return val;
+}
+
+static cl_ulong
+vg_ocl_device_getulong(cl_device_id did, cl_device_info param)
+{
+	cl_int ret;
+	cl_ulong val;
+	size_t size_ret;
+	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
+	if (ret != CL_SUCCESS) {
+		fprintf(stderr,
+			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
+	}
+	return val;
+}
+
+static cl_uint
+vg_ocl_device_getuint(cl_device_id did, cl_device_info param)
+{
+	cl_int ret;
+        cl_uint val;
+	size_t size_ret;
+	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
+	if (ret != CL_SUCCESS) {
+		fprintf(stderr,
+			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
+	}
+	return val;
+}
+
+void
+vg_ocl_dump_info(vg_ocl_context_t *vocp)
+{
+	cl_device_id did;
+	if (vocp->base.vxc_vc && (vocp->base.vxc_vc->vc_verbose < 1))
+		return;
+	if (vocp->voc_dump_done)
+		return;
+	did = vocp->voc_ocldid;
+	fprintf(stderr, "Device: %s\n",
+	       vg_ocl_device_getstr(did, CL_DEVICE_NAME));
+	fprintf(stderr, "Vendor: %s (%04x)\n",
+	       vg_ocl_device_getstr(did, CL_DEVICE_VENDOR),
+	       vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID));
+	fprintf(stderr, "Driver: %s\n",
+	       vg_ocl_device_getstr(did, CL_DRIVER_VERSION));
+	fprintf(stderr, "Profile: %s\n",
+	       vg_ocl_device_getstr(did, CL_DEVICE_PROFILE));
+	fprintf(stderr, "Version: %s\n",
+	       vg_ocl_device_getstr(did, CL_DEVICE_VERSION));
+	fprintf(stderr, "Max compute units: %"PRSIZET"d\n",
+	       vg_ocl_device_getsizet(did, CL_DEVICE_MAX_COMPUTE_UNITS));
+	fprintf(stderr, "Max workgroup size: %"PRSIZET"d\n",
+	       vg_ocl_device_getsizet(did, CL_DEVICE_MAX_WORK_GROUP_SIZE));
+	fprintf(stderr, "Global memory: %ld\n",
+	       vg_ocl_device_getulong(did, CL_DEVICE_GLOBAL_MEM_SIZE));
+	fprintf(stderr, "Max allocation: %ld\n",
+	       vg_ocl_device_getulong(did, CL_DEVICE_MAX_MEM_ALLOC_SIZE));
+	vocp->voc_dump_done = 1;
+}
+
+void
+vg_ocl_error(vg_ocl_context_t *vocp, int code, const char *desc)
+{
+	const char *err = vg_ocl_strerror(code);
+	if (desc) {
+		fprintf(stderr, "%s: %s\n", desc, err);
+	} else {
+		fprintf(stderr, "%s\n", err);
+	}
+
+	if (vocp && vocp->voc_ocldid)
+		vg_ocl_dump_info(vocp);
+}
+
+static void
+vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog)
+{
+	size_t logbufsize, logsize;
+	char *log;
+	int off = 0;
+	cl_int ret;
+
+	ret = clGetProgramBuildInfo(prog,
+				    vocp->voc_ocldid,
+				    CL_PROGRAM_BUILD_LOG,
+				    0, NULL,
+				    &logbufsize);
+	if (ret != CL_SUCCESS) {
+		vg_ocl_error(NULL, ret, "clGetProgramBuildInfo");
+		return;
+	}
+
+	log = (char *) malloc(logbufsize);
+	if (!log) {
+		fprintf(stderr, "Could not allocate build log buffer\n");
+		return;
+	}
+
+	ret = clGetProgramBuildInfo(prog,
+				    vocp->voc_ocldid,
+				    CL_PROGRAM_BUILD_LOG,
+				    logbufsize,
+				    log,
+				    &logsize);
+	if (ret != CL_SUCCESS) {
+		vg_ocl_error(NULL, ret, "clGetProgramBuildInfo");
+
+	} else {
+		/* Remove leading newlines and trailing newlines/whitespace */
+		log[logbufsize-1] = '\0';
+		for (off = logsize - 1; off >= 0; off--) {
+			if ((log[off] != '\r') &&
+			    (log[off] != '\n') &&
+			    (log[off] != ' ') &&
+			    (log[off] != '\t') &&
+			    (log[off] != '\0'))
+				break;
+			log[off] = '\0';
+		}
+		for (off = 0; off < logbufsize; off++) {
+			if ((log[off] != '\r') &&
+			    (log[off] != '\n'))
+				break;
+		}
+
+		fprintf(stderr, "Build log:\n%s\n", &log[off]);
+	}
+	free(log);
+}
+
+/*
+ * OpenCL per-exec functions
+ */
+
+enum {
+	VG_OCL_DEEP_PREPROC_UNROLL  = (1 << 0),
+	VG_OCL_PRAGMA_UNROLL        = (1 << 1),
+	VG_OCL_EXPENSIVE_BRANCHES   = (1 << 2),
+	VG_OCL_DEEP_VLIW            = (1 << 3),
+	VG_OCL_AMD_BFI_INT          = (1 << 4),
+	VG_OCL_NV_VERBOSE           = (1 << 5),
+	VG_OCL_BROKEN               = (1 << 6),
+	VG_OCL_NO_BINARIES          = (1 << 7),
+
+	VG_OCL_OPTIMIZATIONS        = (VG_OCL_DEEP_PREPROC_UNROLL |
+				       VG_OCL_PRAGMA_UNROLL |
+				       VG_OCL_EXPENSIVE_BRANCHES |
+				       VG_OCL_DEEP_VLIW |
+				       VG_OCL_AMD_BFI_INT),
+
+};
+
+static int
+vg_ocl_get_quirks(vg_ocl_context_t *vocp)
+{
+	uint32_t vend;
+	const char *dvn;
+	unsigned int quirks = 0;
+
+	quirks |= VG_OCL_DEEP_PREPROC_UNROLL;
+
+	vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID);
+	switch (vend) {
+	case 0x10de: /* NVIDIA */
+		/*
+		 * NVIDIA's compiler seems to take a really really long
+		 * time when using preprocessor unrolling, but works
+		 * well with pragma unroll.
+		 */
+		quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL;
+		quirks |= VG_OCL_PRAGMA_UNROLL;
+		quirks |= VG_OCL_NV_VERBOSE;
+		break;
+	case 0x1002: /* AMD/ATI */
+		/*
+		 * AMD's compiler works best with preprocesor unrolling.
+		 * Pragma unroll is unreliable with AMD's compiler and
+		 * seems to crash based on whether the gods were smiling
+		 * when Catalyst was last installed/upgraded.
+		 */
+		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
+		    CL_DEVICE_TYPE_GPU) {
+			quirks |= VG_OCL_EXPENSIVE_BRANCHES;
+			quirks |= VG_OCL_DEEP_VLIW;
+			dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
+						   CL_DEVICE_EXTENSIONS);
+			if (dvn && strstr(dvn, "cl_amd_media_ops"))
+				quirks |= VG_OCL_AMD_BFI_INT;
+
+			dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
+						   CL_DEVICE_NAME);
+			if (!strcmp(dvn, "ATI RV710")) {
+				quirks &= ~VG_OCL_OPTIMIZATIONS;
+				quirks |= VG_OCL_NO_BINARIES;
+			}
+		}
+		break;
+	default:
+		break;
+	}
+	return quirks;
+}
+
+static int
+vg_ocl_create_kernel(vg_ocl_context_t *vocp, int knum, const char *func)
+{
+	int i;
+	cl_kernel krn;
+	cl_int ret;
+
+	for (i = 0; i < MAX_SLOT; i++) {
+		krn = clCreateKernel(vocp->voc_oclprog, func, &ret);
+		if (!krn) {
+			fprintf(stderr, "clCreateKernel(%d): ", i);
+			vg_ocl_error(vocp, ret, NULL);
+			while (--i >= 0) {
+				clReleaseKernel(vocp->voc_oclkernel[i][knum]);
+				vocp->voc_oclkernel[i][knum] = NULL;
+			}
+			return 0;
+		}
+		vocp->voc_oclkernel[i][knum] = krn;
+		vocp->voc_oclkrnwait[i] = NULL;
+	}
+	return 1;
+}
+
+static void
+vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts,
+		    const char *program, size_t size,
+		    unsigned char *hash_out)
+{
+	MD5_CTX ctx;
+	cl_platform_id pid;
+	const char *str;
+
+	MD5_Init(&ctx);
+	pid = vg_ocl_device_getplatform(vocp->voc_ocldid);
+	str = vg_ocl_platform_getstr(pid, CL_PLATFORM_NAME);
+	MD5_Update(&ctx, str, strlen(str) + 1);
+	str = vg_ocl_platform_getstr(pid, CL_PLATFORM_VERSION);
+	MD5_Update(&ctx, str, strlen(str) + 1);
+	str = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
+	MD5_Update(&ctx, str, strlen(str) + 1);
+	if (opts)
+		MD5_Update(&ctx, opts, strlen(opts) + 1);
+	if (size)
+		MD5_Update(&ctx, program, size);
+	MD5_Final(hash_out, &ctx);
+}
+
+typedef struct {
+	unsigned char	e_ident[16];
+	uint16_t	e_type;
+	uint16_t	e_machine;
+	uint32_t	e_version;
+	uint32_t	e_entry;
+	uint32_t	e_phoff;
+	uint32_t	e_shoff;
+	uint32_t	e_flags;
+	uint16_t	e_ehsize;
+	uint16_t	e_phentsize;
+	uint16_t	e_phnum;
+	uint16_t	e_shentsize;
+	uint16_t	e_shnum;
+	uint16_t	e_shstrndx;
+} vg_elf32_header_t;
+
+typedef struct {
+	uint32_t	sh_name;
+	uint32_t	sh_type;
+	uint32_t	sh_flags;
+	uint32_t	sh_addr;
+	uint32_t	sh_offset;
+	uint32_t	sh_size;
+	uint32_t	sh_link;
+	uint32_t	sh_info;
+	uint32_t	sh_addralign;
+	uint32_t	sh_entsize;
+} vg_elf32_shdr_t;
+
+static int
+vg_ocl_amd_patch_inner(unsigned char *binary, size_t size)
+{
+	vg_elf32_header_t *ehp;
+	vg_elf32_shdr_t *shp, *nshp;
+	uint32_t *instr;
+	size_t off;
+	int i, n, txt2idx, patched;
+
+	ehp = (vg_elf32_header_t *) binary;
+	if ((size < sizeof(*ehp)) ||
+	    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
+	    !ehp->e_shoff)
+		return 0;
+
+	off = ehp->e_shoff + (ehp->e_shstrndx * ehp->e_shentsize);
+	nshp = (vg_elf32_shdr_t *) (binary + off);
+	if ((off + sizeof(*nshp)) > size)
+		return 0;
+
+	shp = (vg_elf32_shdr_t *) (binary + ehp->e_shoff);
+	n = 0;
+	txt2idx = 0;
+	for (i = 0; i < ehp->e_shnum; i++) {
+		off = nshp->sh_offset + shp[i].sh_name;
+		if (((off + 6) >= size) ||
+		    memcmp(binary + off, ".text", 6))
+			continue;
+		n++;
+		if (n == 2)
+			txt2idx = i;
+	}
+	if (n != 2)
+		return 0;
+
+	off = shp[txt2idx].sh_offset;
+	instr = (uint32_t *) (binary + off);
+	n = shp[txt2idx].sh_size / 4;
+	patched = 0;
+	for (i = 0; i < n; i += 2) {
+		if (((instr[i] & 0x02001000) == 0) &&
+		    ((instr[i+1] & 0x9003f000) == 0x0001a000)) {
+			instr[i+1] ^= (0x0001a000 ^ 0x0000c000);
+			patched++;
+		}
+	}
+
+	return patched;
+}
+
+static int
+vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size)
+{
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	vg_elf32_header_t *ehp;
+	unsigned char *ptr;
+	size_t offset = 1;
+	int ninner = 0, nrun, npatched = 0;
+
+	ehp = (vg_elf32_header_t *) binary;
+	if ((size < sizeof(*ehp)) ||
+	    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\0", 8) ||
+	    !ehp->e_shoff)
+		return 0;
+
+	offset = 1;
+	while (offset < (size - 8)) {
+		ptr = (unsigned char *) memchr(binary + offset,
+					       0x7f,
+					       size - offset);
+		if (!ptr)
+			return npatched;
+		offset = ptr - binary;
+		ehp = (vg_elf32_header_t *) ptr;
+		if (((size - offset) < sizeof(*ehp)) ||
+		    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
+		    !ehp->e_shoff) {
+			offset += 1;
+			continue;
+		}
+
+		ninner++;
+		nrun = vg_ocl_amd_patch_inner(ptr, size - offset);
+		npatched += nrun;
+		if (vcp->vc_verbose > 1)
+			fprintf(stderr, "AMD BFI_INT: patched %d instructions "
+			       "in kernel %d\n",
+			       nrun, ninner);
+		npatched++;
+		offset += 1;
+	}
+	return npatched;
+}
+
+
+static int
+vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
+		    const char *filename, const char *opts)
+{
+	FILE *kfp;
+	char *buf, *tbuf;
+	int len, fromsource = 0, patched = 0;
+	size_t sz, szr;
+	cl_program prog;
+	cl_int ret, sts;
+	unsigned char prog_hash[16];
+	char bin_name[64];
+
+	if (vcp->vc_verbose > 1)
+		fprintf(stderr,
+			"OpenCL compiler flags: %s\n", opts ? opts : "");
+
+	sz = 128 * 1024;
+	buf = (char *) malloc(sz);
+	if (!buf) {
+		fprintf(stderr, "Could not allocate program buffer\n");
+		return 0;
+	}
+
+	kfp = fopen(filename, "r");
+	if (!kfp) {
+		fprintf(stderr, "Error loading kernel file '%s': %s\n",
+		       filename, strerror(errno));
+		free(buf);
+		return 0;
+	}
+
+	len = fread(buf, 1, sz, kfp);
+	fclose(kfp);
+
+	if (!len) {
+		fprintf(stderr, "Short read on CL kernel\n");
+		free(buf);
+		return 0;
+	}
+
+	vg_ocl_hash_program(vocp, opts, buf, len, prog_hash);
+	snprintf(bin_name, sizeof(bin_name),
+		 "%02x%02x%02x%02x%02x%02x%02x%02x"
+		 "%02x%02x%02x%02x%02x%02x%02x%02x.oclbin",
+		 prog_hash[0], prog_hash[1], prog_hash[2], prog_hash[3],
+		 prog_hash[4], prog_hash[5], prog_hash[6], prog_hash[7],
+		 prog_hash[8], prog_hash[9], prog_hash[10], prog_hash[11],
+		 prog_hash[12], prog_hash[13], prog_hash[14], prog_hash[15]);
+
+	if (vocp->voc_quirks & VG_OCL_NO_BINARIES) {
+		kfp = NULL;
+		if (vcp->vc_verbose > 1)
+			fprintf(stderr, "Binary OpenCL programs disabled\n");
+	} else {
+		kfp = fopen(bin_name, "rb");
+	}
+
+	if (!kfp) {
+		/* No binary available, create with source */
+		fromsource = 1;
+		sz = len;
+		prog = clCreateProgramWithSource(vocp->voc_oclctx,
+						 1, (const char **) &buf, &sz,
+						 &ret);
+	} else {
+		if (vcp->vc_verbose > 1)
+			fprintf(stderr, "Loading kernel binary %s\n", bin_name);
+		szr = 0;
+		while (!feof(kfp)) {
+			len = fread(buf + szr, 1, sz - szr, kfp);
+			if (!len) {
+				fprintf(stderr,
+					"Short read on CL kernel binary\n");
+				fclose(kfp);
+				free(buf);
+				return 0;
+			}
+			szr += len;
+			if (szr == sz) {
+				tbuf = (char *) realloc(buf, sz*2);
+				if (!tbuf) {
+					fprintf(stderr,
+						"Could not expand CL kernel "
+						"binary buffer\n");
+					fclose(kfp);
+					free(buf);
+					return 0;
+				}
+				buf = tbuf;
+				sz *= 2;
+			}
+		}
+		fclose(kfp);
+	rebuild:
+		prog = clCreateProgramWithBinary(vocp->voc_oclctx,
+						 1, &vocp->voc_ocldid,
+						 &szr,
+						 (const unsigned char **) &buf,
+						 &sts,
+						 &ret);
+	}
+	free(buf);
+	if (!prog) {
+		vg_ocl_error(vocp, ret, "clCreateProgramWithSource");
+		return 0;
+	}
+
+	if (vcp->vc_verbose > 0) {
+		if (fromsource && !patched) {
+			fprintf(stderr,
+				"Compiling kernel, can take minutes...");
+			fflush(stderr);
+		}
+	}
+	ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL);
+	if (ret != CL_SUCCESS) {
+		if ((vcp->vc_verbose > 0) && fromsource && !patched)
+			fprintf(stderr, "failure.\n");
+		vg_ocl_error(NULL, ret, "clBuildProgram");
+	} else if ((vcp->vc_verbose > 0) && fromsource && !patched) {
+		fprintf(stderr, "done!\n");
+	}
+	if ((ret != CL_SUCCESS) ||
+	    ((vcp->vc_verbose > 1) && fromsource && !patched)) {
+		vg_ocl_buildlog(vocp, prog);
+	}
+	if (ret != CL_SUCCESS) {
+		vg_ocl_dump_info(vocp);
+		clReleaseProgram(prog);
+		return 0;
+	}
+
+	if (fromsource && !(vocp->voc_quirks & VG_OCL_NO_BINARIES)) {
+		ret = clGetProgramInfo(prog,
+				       CL_PROGRAM_BINARY_SIZES,
+				       sizeof(szr), &szr,
+				       &sz);
+		if (ret != CL_SUCCESS) {
+			vg_ocl_error(vocp, ret,
+				     "WARNING: clGetProgramInfo(BINARY_SIZES)");
+			goto out;
+		}
+		if (sz == 0) {
+			fprintf(stderr,
+				"WARNING: zero-length CL kernel binary\n");
+			goto out;
+		}
+
+		buf = (char *) malloc(szr);
+		if (!buf) {
+			fprintf(stderr,
+				"WARNING: Could not allocate %"PRSIZET"d bytes "
+				"for CL binary\n",
+			       szr);
+			goto out;
+		}
+
+		ret = clGetProgramInfo(prog,
+				       CL_PROGRAM_BINARIES,
+				       sizeof(buf), &buf,
+				       &sz);
+		if (ret != CL_SUCCESS) {
+			vg_ocl_error(vocp, ret,
+				     "WARNING: clGetProgramInfo(BINARIES)");
+			free(buf);
+			goto out;
+		}
+
+		if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) {
+			patched = vg_ocl_amd_patch(vocp,
+						   (unsigned char *) buf, szr);
+			if (patched > 0) {
+				if (vcp->vc_verbose > 1)
+					fprintf(stderr,
+						"AMD BFI_INT patch complete\n");
+				clReleaseProgram(prog);
+				goto rebuild;
+			}
+			fprintf(stderr,
+				"WARNING: AMD BFI_INT patching failed\n");
+			if (patched < 0) {
+				/* Program was incompletely modified */
+				free(buf);
+				goto out;
+			}
+		}
+
+		kfp = fopen(bin_name, "wb");
+		if (!kfp) {
+			fprintf(stderr, "WARNING: "
+				"could not save CL kernel binary: %s\n",
+				strerror(errno));
+		} else {
+			sz = fwrite(buf, 1, szr, kfp);
+			fclose(kfp);
+			if (sz != szr) {
+				fprintf(stderr,
+					"WARNING: short write on CL kernel "
+					"binary file: expected "
+					"%"PRSIZET"d, got %"PRSIZET"d\n",
+					szr, sz);
+				unlink(bin_name);
+			}
+		}
+		free(buf);
+	}
+
+out:
+	vocp->voc_oclprog = prog;
+	if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") ||
+	    !vg_ocl_create_kernel(vocp, 1, "heap_invert")) {
+		clReleaseProgram(vocp->voc_oclprog);
+		vocp->voc_oclprog = NULL;
+		return 0;
+	}
+
+	return 1;
+}
+
+static void CL_CALLBACK
+vg_ocl_context_callback(const char *errinfo,
+			const void *private_info,
+			size_t cb,
+			void *user_data)
+{
+	fprintf(stderr, "vg_ocl_context_callback error: %s\n", errinfo);
+}
+
+static int
+vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
+	    int safe_mode)
+{
+	cl_int ret;
+	char optbuf[128];
+	int end = 0;
+
+	memset(vocp, 0, sizeof(*vocp));
+	vg_exec_context_init(vcp, &vocp->base);
+
+	pthread_mutex_init(&vocp->voc_lock, NULL);
+	pthread_cond_init(&vocp->voc_wait, NULL);
+	vocp->voc_ocl_slot = -1;
+
+	vocp->voc_ocldid = did;
+
+	if (vcp->vc_verbose > 1)
+		vg_ocl_dump_info(vocp);
+
+	vocp->voc_quirks = vg_ocl_get_quirks(vocp);
+
+	if ((vocp->voc_quirks & VG_OCL_BROKEN) && (vcp->vc_verbose > 0)) {
+		char yesbuf[16];
+		printf("Type 'yes' to continue: ");
+		fflush(stdout);
+		if (!fgets(yesbuf, sizeof(yesbuf), stdin) ||
+		    strncmp(yesbuf, "yes", 3))
+			exit(1);
+	}
+
+	vocp->voc_oclctx = clCreateContext(NULL,
+					   1, &did,
+					   vg_ocl_context_callback,
+					   NULL,
+					   &ret);
+	if (!vocp->voc_oclctx) {
+		vg_ocl_error(vocp, ret, "clCreateContext");
+		return 0;
+	}
+
+	vocp->voc_oclcmdq = clCreateCommandQueue(vocp->voc_oclctx,
+						 vocp->voc_ocldid,
+						 0, &ret);
+	if (!vocp->voc_oclcmdq) {
+		vg_ocl_error(vocp, ret, "clCreateCommandQueue");
+		return 0;
+	}
+
+	if (safe_mode)
+		vocp->voc_quirks &= ~VG_OCL_OPTIMIZATIONS;
+
+	end = 0;
+	optbuf[end] = '\0';
+	if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-DDEEP_PREPROC_UNROLL ");
+	if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-DPRAGMA_UNROLL ");
+	if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-DVERY_EXPENSIVE_BRANCHES ");
+	if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-DDEEP_VLIW ");
+	if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-DAMD_BFI_INT ");
+	if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
+		end += snprintf(optbuf + end, sizeof(optbuf) - end,
+				"-cl-nv-verbose ");
+
+	if (!vg_ocl_load_program(vcp, vocp, "calc_addrs.cl", optbuf))
+		return 0;
+	return 1;
+}
+
+static void
+vg_ocl_del(vg_ocl_context_t *vocp)
+{
+	vg_ocl_free_args(vocp);
+	if (vocp->voc_oclprog) {
+		clReleaseProgram(vocp->voc_oclprog);
+		vocp->voc_oclprog = NULL;
+	}
+	if (vocp->voc_oclcmdq) {
+		clReleaseCommandQueue(vocp->voc_oclcmdq);
+		vocp->voc_oclcmdq = NULL;
+	}
+	if (vocp->voc_oclctx) {
+		clReleaseContext(vocp->voc_oclctx);
+		vocp->voc_oclctx = NULL;
+	}
+	pthread_cond_destroy(&vocp->voc_wait);
+	pthread_mutex_destroy(&vocp->voc_lock);
+	vg_exec_context_del(&vocp->base);
+}
+
+static int vg_ocl_arg_map[][8] = {
+	/* hashes_out / found */
+	{ 2, 0, -1 },
+	/* z_heap */
+	{ 0, 1, 1, 0, 2, 2, -1 },
+	/* point_tmp */
+	{ 0, 0, 2, 1, -1 },
+	/* row_in */
+	{ 0, 2, -1 },
+	/* col_in */
+	{ 0, 3, -1 },
+	/* target_table */
+	{ 2, 3, -1 },
+};
+
+static int
+vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
+			int arg, size_t size, int host)
+{
+	cl_mem clbuf;
+	cl_int ret;
+	int i, j, knum, karg;
+
+	for (i = 0; i < MAX_SLOT; i++) {
+		if ((i != slot) && (slot >= 0))
+			continue;
+		if (vocp->voc_args[i][arg]) {
+			clReleaseMemObject(vocp->voc_args[i][arg]);
+			vocp->voc_args[i][arg] = NULL;
+			vocp->voc_arg_size[i][arg] = 0;
+		}
+	}
+
+	clbuf = clCreateBuffer(vocp->voc_oclctx,
+			       CL_MEM_READ_WRITE |
+			       (host ? CL_MEM_ALLOC_HOST_PTR : 0),
+			       size,
+			       NULL,
+			       &ret);
+	if (!clbuf) {
+		fprintf(stderr, "clCreateBuffer(%d,%d): ", slot, arg);
+		vg_ocl_error(vocp, ret, NULL);
+		return 0;
+	}
+
+	for (i = 0; i < MAX_SLOT; i++) {
+		if ((i != slot) && (slot >= 0))
+			continue;
+
+		clRetainMemObject(clbuf);
+		vocp->voc_args[i][arg] = clbuf;
+		vocp->voc_arg_size[i][arg] = size;
+
+		for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
+			knum = vg_ocl_arg_map[arg][j];
+			karg = vg_ocl_arg_map[arg][j+1];
+			ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
+					     karg,
+					     sizeof(clbuf),
+					     &clbuf);
+			
+			if (ret) {
+				fprintf(stderr,
+					"clSetKernelArg(%d,%d): ", knum, karg);
+				vg_ocl_error(vocp, ret, NULL);
+				return 0;
+			}
+		}
+	}
+
+	clReleaseMemObject(clbuf);
+	return 1;
+}
+
+int
+vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg,
+		   void *buffer, size_t size)
+{
+	cl_int slot, ret;
+
+	slot = (wslot < 0) ? 0 : wslot;
+
+	assert((slot >= 0) && (slot < MAX_SLOT));
+	assert(size <= vocp->voc_arg_size[slot][arg]);
+
+	ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq,
+				   vocp->voc_args[slot][arg],
+				   CL_TRUE,
+				   0, size,
+				   buffer,
+				   0, NULL,
+				   NULL);
+			
+	if (ret) {
+		fprintf(stderr, "clEnqueueWriteBuffer(%d): ", arg);
+		vg_ocl_error(vocp, ret, NULL);
+		return 0;
+	}
+
+	return 1;
+}
+
+static void *
+vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot,
+		      int arg, int rw)
+{
+	void *buf;
+	cl_int ret;
+
+	assert((slot >= 0) && (slot < MAX_SLOT));
+
+	buf = clEnqueueMapBuffer(vocp->voc_oclcmdq,
+				 vocp->voc_args[slot][arg],
+				 CL_TRUE,
+				 (rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE)
+				           : (rw ? CL_MAP_WRITE : CL_MAP_READ),
+				 0, vocp->voc_arg_size[slot][arg],
+				 0, NULL,
+				 NULL,
+				 &ret);
+	if (!buf) {
+		fprintf(stderr, "clEnqueueMapBuffer(%d): ", arg);
+		vg_ocl_error(vocp, ret, NULL);
+		return NULL;
+	}
+	return buf;
+}
+
+static void
+vg_ocl_unmap_arg_buffer(vg_ocl_context_t *vocp, int slot,
+			int arg, void *buf)
+{
+	cl_int ret;
+	cl_event ev;
+
+	assert((slot >= 0) && (slot < MAX_SLOT));
+
+	ret = clEnqueueUnmapMemObject(vocp->voc_oclcmdq,
+				      vocp->voc_args[slot][arg],
+				      buf,
+				      0, NULL,
+				      &ev);
+	if (ret != CL_SUCCESS) {
+		fprintf(stderr, "clEnqueueUnmapMemObject(%d): ", arg);
+		vg_ocl_error(vocp, ret, NULL);
+		return;
+	}
+
+	ret = clWaitForEvents(1, &ev);
+	clReleaseEvent(ev);
+	if (ret != CL_SUCCESS) {
+		fprintf(stderr, "clWaitForEvent(clUnmapMemObject,%d): ", arg);
+		vg_ocl_error(vocp, ret, NULL);
+	}
+}
+
+int
+vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot,
+		      int arg, int value)
+{
+	cl_int ret;
+	int i;
+
+	for (i = 0; i < MAX_SLOT; i++) {
+		if ((i != slot) && (slot >= 0))
+			continue;
+		ret = clSetKernelArg(vocp->voc_oclkernel[i][2],
+				     arg,
+				     sizeof(value),
+				     &value);
+		if (ret) {
+			fprintf(stderr, "clSetKernelArg(%d): ", arg);
+			vg_ocl_error(vocp, ret, NULL);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+int
+vg_ocl_kernel_buffer_arg(vg_ocl_context_t *vocp, int slot,
+			 int arg, void *value, size_t size)
+{
+	cl_int ret;
+	int i, j, knum, karg;
+
+	for (i = 0; i < MAX_SLOT; i++) {
+		if ((i != slot) && (slot >= 0))
+			continue;
+		for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
+			knum = vg_ocl_arg_map[arg][j];
+			karg = vg_ocl_arg_map[arg][j+1];
+			ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
+					     karg,
+					     size,
+					     value);
+			if (ret) {
+				fprintf(stderr,
+					"clSetKernelArg(%d,%d): ", knum, karg);
+				vg_ocl_error(vocp, ret, NULL);
+				return 0;
+			}
+		}
+	}
+	return 1;
+}
+
+static void
+vg_ocl_free_args(vg_ocl_context_t *vocp)
+{
+	int i, arg;
+	for (i = 0; i < MAX_SLOT; i++) {
+		for (arg = 0; arg < MAX_ARG; arg++) {
+			if (vocp->voc_args[i][arg]) {
+				clReleaseMemObject(vocp->voc_args[i][arg]);
+				vocp->voc_args[i][arg] = NULL;
+				vocp->voc_arg_size[i][arg] = 0;
+			}
+		}
+	}
+}
+
+int
+vg_ocl_kernel_dead(vg_ocl_context_t *vocp, int slot)
+{
+	return (vocp->voc_oclkrnwait[slot] == NULL);
+}
+
+static int
+vg_ocl_kernel_start(vg_ocl_context_t *vocp, int slot, int ncol, int nrow,
+		    int invsize)
+{
+	cl_int val, ret;
+	cl_event ev;
+	size_t globalws[2] = { ncol, nrow };
+	size_t invws = (ncol * nrow) / invsize;
+
+	assert(!vocp->voc_oclkrnwait[slot]);
+
+	/* heap_invert() preconditions */
+	assert(is_pow2(invsize) && (invsize > 1));
+
+	val = invsize;
+	ret = clSetKernelArg(vocp->voc_oclkernel[slot][1],
+			     1,
+			     sizeof(val),
+			     &val);
+	if (ret != CL_SUCCESS) {
+		vg_ocl_error(vocp, ret, "clSetKernelArg(ncol)");
+		return 0;
+	}
+	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
+				     vocp->voc_oclkernel[slot][0],
+				     2,
+				     NULL, globalws, NULL,
+				     0, NULL,
+				     &ev);
+	if (ret != CL_SUCCESS) {
+		vg_ocl_error(vocp, ret, "clEnqueueNDRange(0)");
+		return 0;
+	}
+
+	ret = clWaitForEvents(1, &ev);
+	clReleaseEvent(ev);
+	if (ret != CL_SUCCESS) {
+		vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,0)");
+		return 0;
+	}
+
+	if (vocp->voc_verify_func[0] &&
+	    !(vocp->voc_verify_func[0])(vocp, slot)) {
+		fprintf(stderr, "ERROR: Kernel 0 failed verification test\n");
+		return 0;
+	}
+
+	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
+				     vocp->voc_oclkernel[slot][1],
+				     1,
+				     NULL, &invws, NULL,
+				     0, NULL,
+				     &ev);
+	if (ret != CL_SUCCESS) {
+		vg_ocl_error(vocp, ret, "clEnqueueNDRange(1)");
+		return 0;
+	}
+
+	ret = clWaitForEvents(1, &ev);
+	clReleaseEvent(ev);
+	if (ret != CL_SUCCESS) {
+		vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,1)");
+		return 0;
+	}
+
+	if (vocp->voc_verify_func[1] &&
+	    !(vocp->voc_verify_func[1])(vocp, slot)) {
+		fprintf(stderr, "ERROR: Kernel 1 failed verification test\n");
+		return 0;
+	}
+
+	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
+				     vocp->voc_oclkernel[slot][2],
+				     2,
+				     NULL, globalws, NULL,
+				     0, NULL,
+				     &ev);
+	if (ret != CL_SUCCESS) {
+		vg_ocl_error(vocp, ret, "clEnqueueNDRange(2)");
+		return 0;
+	}
+
+	vocp->voc_oclkrnwait[slot] = ev;
+	return 1;
+}
+
+static int
+vg_ocl_kernel_wait(vg_ocl_context_t *vocp, int slot)
+{
+	cl_event ev;
+	cl_int ret;
+
+	ev = vocp->voc_oclkrnwait[slot];
+	vocp->voc_oclkrnwait[slot] = NULL;
+	if (ev) {
+		ret = clWaitForEvents(1, &ev);
+		clReleaseEvent(ev);
+		if (ret != CL_SUCCESS) {
+			vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,e)");
+			return 0;
+		}
+	}
+	return 1;
+}
+
+
+static INLINE void
+vg_ocl_get_bignum_raw(BIGNUM *bn, const unsigned char *buf)
+{
+	bn_expand(bn, 256);
+	memcpy(bn->d, buf, 32);
+	bn->top = (32 / sizeof(BN_ULONG));
+}
+
+static INLINE void
+vg_ocl_put_bignum_raw(unsigned char *buf, const BIGNUM *bn)
+{
+	int bnlen = (bn->top * sizeof(BN_ULONG));
+	if (bnlen >= 32) {
+		memcpy(buf, bn->d, 32);
+	} else {
+		memcpy(buf, bn->d, bnlen);
+		memset(buf + bnlen, 0, 32 - bnlen);
+	}
+}
+
+#define ACCESS_BUNDLE 1024
+#define ACCESS_STRIDE (ACCESS_BUNDLE/8)
+
+static void
+vg_ocl_get_bignum_tpa(BIGNUM *bn, const unsigned char *buf, int cell)
+{
+	unsigned char bnbuf[32];
+	int start, i;
+
+	start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
+		 (cell % ACCESS_STRIDE));
+	for (i = 0; i < 8; i++)
+		memcpy(bnbuf+(i*4),
+		       buf + 4*(start + i*ACCESS_STRIDE),
+		       4);
+
+	vg_ocl_get_bignum_raw(bn, bnbuf);
+}
+
+/*
+ * Absolutely disgusting.
+ * We want points in Montgomery form, and it's a lot easier to read the
+ * coordinates from the structure than to export and re-montgomeryize.
+ */
+
+struct ec_point_st {
+	const EC_METHOD *meth;
+	BIGNUM X;
+	BIGNUM Y;
+	BIGNUM Z;
+	int Z_is_one;
+};
+
+static INLINE void
+vg_ocl_get_point(EC_POINT *ppnt, const unsigned char *buf)
+{
+	static const unsigned char mont_one[] = { 0x01,0x00,0x00,0x03,0xd1 };
+	vg_ocl_get_bignum_raw(&ppnt->X, buf);
+	vg_ocl_get_bignum_raw(&ppnt->Y, buf + 32);
+	if (!ppnt->Z_is_one) {
+		ppnt->Z_is_one = 1;
+		BN_bin2bn(mont_one, sizeof(mont_one), &ppnt->Z);
+	}
+}
+
+static INLINE void
+vg_ocl_put_point(unsigned char *buf, const EC_POINT *ppnt)
+{
+	assert(ppnt->Z_is_one);
+	vg_ocl_put_bignum_raw(buf, &ppnt->X);
+	vg_ocl_put_bignum_raw(buf + 32, &ppnt->Y);
+}
+
+static void
+vg_ocl_put_point_tpa(unsigned char *buf, int cell, const EC_POINT *ppnt)
+{
+	unsigned char pntbuf[64];
+	int start, i;
+
+	vg_ocl_put_point(pntbuf, ppnt);
+
+	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
+		 (cell % (ACCESS_STRIDE/2)));
+	for (i = 0; i < 8; i++)
+		memcpy(buf + 4*(start + i*ACCESS_STRIDE),
+		       pntbuf+(i*4),
+		       4);
+	for (i = 0; i < 8; i++)
+		memcpy(buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)),
+		       pntbuf+32+(i*4),
+		       4);
+}
+
+static void
+vg_ocl_get_point_tpa(EC_POINT *ppnt, const unsigned char *buf, int cell)
+{
+	unsigned char pntbuf[64];
+	int start, i;
+
+	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
+		 (cell % (ACCESS_STRIDE/2)));
+	for (i = 0; i < 8; i++)
+		memcpy(pntbuf+(i*4),
+		       buf + 4*(start + i*ACCESS_STRIDE),
+		       4);
+	for (i = 0; i < 8; i++)
+		memcpy(pntbuf+32+(i*4),
+		       buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)),
+		       4);
+
+	vg_ocl_get_point(ppnt, pntbuf);
+}
+
+void
+show_elapsed(struct timeval *tv, const char *place)
+{
+	struct timeval now, delta;
+        gettimeofday(&now, NULL);
+	timersub(&now, tv, &delta);
+	fprintf(stderr,
+		"%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec);
+}
+
+
+/*
+ * GPU address matching methods
+ *
+ * gethash: GPU computes and returns all address hashes.
+ *  + Works with any matching method, including regular expressions.
+ *  - The CPU will not be able to keep up with mid- to high-end GPUs.
+ *
+ * prefix: GPU computes hash, searches a range list, and discards.
+ *  + Fast, minimal work for CPU.
+ */
+
+static int
+vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot)
+{
+	vg_exec_context_t *vxcp = &vocp->base;
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	vg_test_func_t test_func = vcp->vc_test;
+	unsigned char *ocl_hashes_out;
+	int i, res = 0, round;
+
+	ocl_hashes_out = (unsigned char *)
+		vg_ocl_map_arg_buffer(vocp, slot, 0, 0);
+
+	round = vocp->voc_ocl_cols * vocp->voc_ocl_rows;
+
+	for (i = 0; i < round; i++, vxcp->vxc_delta++) {
+		memcpy(&vxcp->vxc_binres[1],
+		       ocl_hashes_out + (20*i),
+		       20);
+
+		res = test_func(vxcp);
+		if (res)
+			break;
+	}
+
+	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out);
+	return res;
+}
+
+static int
+vg_ocl_gethash_init(vg_ocl_context_t *vocp)
+{
+	int i;
+
+	if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get"))
+		return 0;
+
+	for (i = 0; i < vocp->voc_nslots; i++) {
+		/* Each slot gets its own hash output buffer */
+		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0,
+					     20 *
+					     vocp->voc_ocl_rows *
+					     vocp->voc_ocl_cols, 1))
+			return 0;
+	}
+
+	vocp->voc_rekey_func = NULL;
+	vocp->voc_check_func = vg_ocl_gethash_check;
+	return 1;
+}
+
+
+static int
+vg_ocl_prefix_rekey(vg_ocl_context_t *vocp)
+{
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	unsigned char *ocl_targets_in;
+	uint32_t *ocl_found_out;
+	int i;
+
+	/* Set the found indicator for each slot to -1 */
+	for (i = 0; i < vocp->voc_nslots; i++) {
+		ocl_found_out = (uint32_t *)
+			vg_ocl_map_arg_buffer(vocp, i, 0, 1);
+		ocl_found_out[0] = 0xffffffff;
+		vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out);
+	}
+
+	if (vocp->voc_pattern_rewrite) {
+		/* Count number of range records */
+		i = vg_context_hash160_sort(vcp, NULL);
+		if (!i) {
+			fprintf(stderr,
+				"No range records available, exiting\n");
+			return 0;
+		}
+
+		if (i > vocp->voc_pattern_alloc) {
+			/* (re)allocate target buffer */
+			if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0))
+				return 0;
+			vocp->voc_pattern_alloc = i;
+		}
+
+		/* Write range records */
+		ocl_targets_in = (unsigned char *)
+			vg_ocl_map_arg_buffer(vocp, 0, 5, 1);
+		vg_context_hash160_sort(vcp, ocl_targets_in);
+		vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in);
+		vg_ocl_kernel_int_arg(vocp, -1, 4, i);
+
+		vocp->voc_pattern_rewrite = 0;
+	}
+	return 1;
+}
+
+static int
+vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot)
+{
+	vg_exec_context_t *vxcp = &vocp->base;
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	vg_test_func_t test_func = vcp->vc_test;
+	uint32_t *ocl_found_out;
+	uint32_t found_delta;
+	int orig_delta, tablesize;
+	int res = 0;
+
+	/* Retrieve the found indicator */
+	ocl_found_out = (uint32_t *)
+		vg_ocl_map_arg_buffer(vocp, slot, 0, 2);
+	found_delta = ocl_found_out[0];
+
+	if (found_delta != 0xffffffff) {
+		/* GPU code claims match, verify with CPU version */
+		orig_delta = vxcp->vxc_delta;
+		vxcp->vxc_delta += found_delta;
+		vg_exec_context_calc_address(vxcp);
+
+		/* Make sure the GPU produced the expected hash */
+		res = 0;
+		if (!memcmp(vxcp->vxc_binres + 1,
+			    ocl_found_out + 2,
+			    20)) {
+			res = test_func(vxcp);
+		}
+		if (res == 0) {
+			/*
+			 * The match was not found in
+			 * the pattern list.  Hmm.
+			 */
+			tablesize = ocl_found_out[2];
+			fprintf(stderr, "Match idx: %d\n", ocl_found_out[1]);
+			fprintf(stderr, "CPU hash: ");
+			fdumphex(stderr, vxcp->vxc_binres + 1, 20);
+			fprintf(stderr, "GPU hash: ");
+			fdumphex(stderr,
+				 (unsigned char *) (ocl_found_out + 2), 20);
+			fprintf(stderr, "Found delta: %d "
+			       "Start delta: %d\n",
+			       found_delta, orig_delta);
+			res = 1;
+		}
+		vocp->voc_pattern_rewrite = 1;
+	} else {
+		vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows);
+	}
+
+	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out);
+	return res;
+}
+
+static int
+vg_ocl_prefix_init(vg_ocl_context_t *vocp)
+{
+	int i;
+
+	if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix"))
+		return 0;
+
+	for (i = 0; i < vocp->voc_nslots; i++) {
+		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1))
+			return 0;
+	}
+	vocp->voc_rekey_func = vg_ocl_prefix_rekey;
+	vocp->voc_check_func = vg_ocl_prefix_check;
+	vocp->voc_pattern_rewrite = 1;
+	vocp->voc_pattern_alloc = 0;
+	return 1;
+}
+
+
+static int
+vg_ocl_config_pattern(vg_ocl_context_t *vocp)
+{
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	int i;
+
+	i = vg_context_hash160_sort(vcp, NULL);
+	if (i > 0) {
+		if (vcp->vc_verbose > 1)
+			fprintf(stderr, "Using OpenCL prefix matcher\n");
+		/* Configure for prefix matching */
+		return vg_ocl_prefix_init(vocp);
+	}
+
+	if (vcp->vc_verbose > 0)
+		fprintf(stderr, "WARNING: Using CPU pattern matcher\n");
+	return vg_ocl_gethash_init(vocp);
+}
+
+
+/*
+ * Temporary buffer content verification functions
+ * This provides a simple test of the kernel, the OpenCL compiler,
+ * and the hardware.
+ */
+static int
+vg_ocl_verify_temporary(vg_ocl_context_t *vocp, int slot, int z_inverted)
+{
+	vg_exec_context_t *vxcp = &vocp->base;
+	unsigned char *point_tmp = NULL, *z_heap = NULL;
+	unsigned char *ocl_points_in = NULL, *ocl_strides_in = NULL;
+	const EC_GROUP *pgroup;
+	EC_POINT *ppr = NULL, *ppc = NULL, *pps = NULL, *ppt = NULL;
+	BIGNUM bnz, bnez, bnm, *bnzc;
+	BN_CTX *bnctx = NULL;
+	BN_MONT_CTX *bnmont;
+	int ret = 0;
+	int mismatches = 0, mm_r;
+	int x, y, bx;
+	static const unsigned char raw_modulus[] = {
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+		0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
+	};
+
+	BN_init(&bnz);
+	BN_init(&bnez);
+	BN_init(&bnm);
+
+	bnctx = BN_CTX_new();
+	bnmont = BN_MONT_CTX_new();
+	pgroup = EC_KEY_get0_group(vxcp->vxc_key);
+	ppr = EC_POINT_new(pgroup);
+	ppc = EC_POINT_new(pgroup);
+	pps = EC_POINT_new(pgroup);
+	ppt = EC_POINT_new(pgroup);
+
+	if (!bnctx || !bnmont || !ppr || !ppc || !pps || !ppt) {
+		fprintf(stderr, "ERROR: out of memory\n");
+		goto out;
+	}
+
+	BN_bin2bn(raw_modulus, sizeof(raw_modulus), &bnm);
+	BN_MONT_CTX_set(bnmont, &bnm, bnctx);
+
+	if (z_inverted) {
+		bnzc = &bnez;
+	} else {
+		bnzc = &pps->Z;
+	}
+
+	z_heap = (unsigned char *)
+		vg_ocl_map_arg_buffer(vocp, slot, 1, 0);
+	point_tmp = (unsigned char *)
+		vg_ocl_map_arg_buffer(vocp, slot, 2, 0);
+	ocl_points_in = (unsigned char *)
+		vg_ocl_map_arg_buffer(vocp, slot, 3, 0);
+	ocl_strides_in = (unsigned char *)
+		vg_ocl_map_arg_buffer(vocp, slot, 4, 0);
+
+	if (!z_heap || !point_tmp || !ocl_points_in || !ocl_strides_in) {
+		fprintf(stderr, "ERROR: could not map OpenCL point buffers\n");
+		goto out;
+	}
+
+	for (y = 0; y < vocp->voc_ocl_rows; y++) {
+		vg_ocl_get_point(ppr, ocl_strides_in + (64*y));
+		bx = y * vocp->voc_ocl_cols;
+		mm_r = 0;
+
+		for (x = 0; x < vocp->voc_ocl_cols; x++) {
+			vg_ocl_get_point_tpa(ppc, ocl_points_in, x);
+			assert(ppr->Z_is_one && ppc->Z_is_one);
+			EC_POINT_add(pgroup, pps, ppc, ppr, bnctx);
+			assert(!pps->Z_is_one);
+			vg_ocl_get_point_tpa(ppt, point_tmp, bx + x);
+			vg_ocl_get_bignum_tpa(&bnz, z_heap, bx + x);
+			if (z_inverted) {
+				BN_mod_inverse(&bnez, &pps->Z, &bnm, bnctx);
+				BN_to_montgomery(&bnez, &bnez, bnmont, bnctx);
+				BN_to_montgomery(&bnez, &bnez, bnmont, bnctx);
+			}
+			if (BN_cmp(&ppt->X, &pps->X) ||
+			    BN_cmp(&ppt->Y, &pps->Y) ||
+			    BN_cmp(&bnz, bnzc)) {
+				if (!mismatches) {
+					fprintf(stderr, "Base privkey: ");
+					fdumpbn(stderr, EC_KEY_get0_private_key(
+						       vxcp->vxc_key));
+				}
+				mismatches++;
+				fprintf(stderr, "Mismatch for kernel %d, "
+				       "offset %d (%d,%d)\n",
+				       z_inverted, bx + x, y, x);
+				if (!mm_r) {
+					mm_r = 1;
+					fprintf(stderr, "Row X   : ");
+					fdumpbn(stderr, &ppr->X);
+					fprintf(stderr, "Row Y   : ");
+					fdumpbn(stderr, &ppr->Y);
+				}
+
+				fprintf(stderr, "Column X: ");
+				fdumpbn(stderr, &ppc->X);
+				fprintf(stderr, "Column Y: ");
+				fdumpbn(stderr, &ppc->Y);
+
+				if (BN_cmp(&ppt->X, &pps->X)) {
+					fprintf(stderr, "Expect X: ");
+					fdumpbn(stderr, &pps->X);
+					fprintf(stderr, "Device X: ");
+					fdumpbn(stderr, &ppt->X);
+				}
+				if (BN_cmp(&ppt->Y, &pps->Y)) {
+					fprintf(stderr, "Expect Y: ");
+					fdumpbn(stderr, &pps->Y);
+					fprintf(stderr, "Device Y: ");
+					fdumpbn(stderr, &ppt->Y);
+				}
+				if (BN_cmp(&bnz, bnzc)) {
+					fprintf(stderr, "Expect Z: ");
+					fdumpbn(stderr, bnzc);
+					fprintf(stderr, "Device Z: ");
+					fdumpbn(stderr, &bnz);
+				}
+			}
+		}
+	}
+
+	ret = !mismatches;
+
+out:
+	if (z_heap)
+		vg_ocl_unmap_arg_buffer(vocp, slot, 1, z_heap);
+	if (point_tmp)
+		vg_ocl_unmap_arg_buffer(vocp, slot, 2, point_tmp);
+	if (ocl_points_in)
+		vg_ocl_unmap_arg_buffer(vocp, slot, 3, ocl_points_in);
+	if (ocl_strides_in)
+		vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in);
+	if (ppr)
+		EC_POINT_free(ppr);
+	if (ppc)
+		EC_POINT_free(ppc);
+	if (pps)
+		EC_POINT_free(pps);
+	if (ppt)
+		EC_POINT_free(ppt);
+	BN_clear_free(&bnz);
+	BN_clear_free(&bnez);
+	BN_clear_free(&bnm);
+	if (bnmont)
+		BN_MONT_CTX_free(bnmont);
+	if (bnctx)
+		BN_CTX_free(bnctx);
+	return ret;
+}
+
+static int
+vg_ocl_verify_k0(vg_ocl_context_t *vocp, int slot)
+{
+	return vg_ocl_verify_temporary(vocp, slot, 0);
+}
+
+static int
+vg_ocl_verify_k1(vg_ocl_context_t *vocp, int slot)
+{
+	return vg_ocl_verify_temporary(vocp, slot, 1);
+}
+
+static void *
+vg_opencl_thread(void *arg)
+{
+	vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	int halt = 0;
+	int slot = -1;
+	int rows, cols, invsize;
+	unsigned long long idleu, busyu;
+	double pidle;
+	struct timeval tv, tvt, tvd, idle, busy;
+
+	memset(&idle, 0, sizeof(idle));
+	memset(&busy, 0, sizeof(busy));
+
+	while (1) {
+		pthread_mutex_lock(&vocp->voc_lock);
+		if (halt) {
+			halt = 0;
+			vocp->voc_halt = 1;
+		}
+		if (slot != -1) {
+			assert(vocp->voc_ocl_slot == slot);
+			vocp->voc_ocl_slot = -1;
+			slot = -1;
+			pthread_cond_signal(&vocp->voc_wait);
+		}
+		if (vocp->voc_ocl_slot == -1) {
+			gettimeofday(&tv, NULL);
+			while (vocp->voc_ocl_slot == -1) {
+				if (vocp->voc_halt)
+					goto out;
+				pthread_cond_wait(&vocp->voc_wait,
+						  &vocp->voc_lock);
+			}
+			gettimeofday(&tvt, NULL);
+			timersub(&tvt, &tv, &tvd);
+			timeradd(&tvd, &idle, &idle);
+		}
+		slot = vocp->voc_ocl_slot;
+		rows = vocp->voc_ocl_rows;
+		cols = vocp->voc_ocl_cols;
+		invsize = vocp->voc_ocl_invsize;
+		pthread_mutex_unlock(&vocp->voc_lock);
+
+		gettimeofday(&tv, NULL);
+		if (!vg_ocl_kernel_start(vocp, slot, cols, rows, invsize))
+			halt = 1;
+
+		if (!vg_ocl_kernel_wait(vocp, slot))
+			halt = 1;
+
+		if (vcp->vc_verbose > 1) {
+			gettimeofday(&tvt, NULL);
+			timersub(&tvt, &tv, &tvd);
+			timeradd(&tvd, &busy, &busy);
+			if ((busy.tv_sec + idle.tv_sec) > 1) {
+				idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
+				busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
+				pidle = ((double) idleu) / (idleu + busyu);
+
+				if (pidle > 0.01) {
+					fprintf(stderr, "\rGPU idle: %.2f%%"
+					       "                              "
+				       "                                \n",
+					       100 * pidle);
+				}
+				memset(&idle, 0, sizeof(idle));
+				memset(&busy, 0, sizeof(busy));
+			}
+		}
+	}
+out:
+	pthread_mutex_unlock(&vocp->voc_lock);
+	return NULL;
+}
+
+
+/*
+ * Address search thread main loop
+ */
+
+void *
+vg_opencl_loop(void *arg)
+{
+	vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
+	int i;
+	int round, nrows, ncols;
+
+	const BN_ULONG rekey_max = 100000000;
+	BN_ULONG npoints, rekey_at;
+
+	EC_KEY *pkey = NULL;
+	const EC_GROUP *pgroup;
+	const EC_POINT *pgen;
+	EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL;
+	EC_POINT *pseek = NULL;
+
+	unsigned char *ocl_points_in, *ocl_strides_in;
+
+	vg_context_t *vcp = vocp->base.vxc_vc;
+	vg_exec_context_t *vxcp = &vocp->base;
+
+	int slot, nslots;
+	int slot_busy = 0, slot_done = 0, halt = 0;
+	int c = 0, output_interval = 1000;
+
+	struct timeval tvstart;
+
+	pkey = vxcp->vxc_key;
+	pgroup = EC_KEY_get0_group(pkey);
+	pgen = EC_GROUP_get0_generator(pgroup);
+
+	round = vocp->voc_ocl_rows * vocp->voc_ocl_cols;
+
+	if (!vcp->vc_remove_on_match &&
+	    (vcp->vc_chance >= 1.0f) &&
+	    (vcp->vc_chance < round) &&
+	    (vcp->vc_verbose > 0)) {
+		fprintf(stderr, "WARNING: low pattern difficulty\n");
+		fprintf(stderr,
+			"WARNING: better match throughput is possible "
+			"using vanitygen on the CPU\n");
+	}
+
+	slot = 0;
+	nslots = 2;
+	vocp->voc_nslots = nslots;
+
+	nrows = vocp->voc_ocl_rows;
+	ncols = vocp->voc_ocl_cols;
+
+	ppbase = (EC_POINT **) malloc((nrows + ncols) *
+				      sizeof(EC_POINT*));
+	if (!ppbase)
+		goto enomem;
+
+	for (i = 0; i < (nrows + ncols); i++) {
+		ppbase[i] = EC_POINT_new(pgroup);
+		if (!ppbase[i])
+			goto enomem;
+	}
+
+	pprow = ppbase + ncols;
+	pbatchinc = EC_POINT_new(pgroup);
+	poffset = EC_POINT_new(pgroup);
+	pseek = EC_POINT_new(pgroup);
+	if (!pbatchinc || !poffset || !pseek)
+		goto enomem;
+
+	BN_set_word(&vxcp->vxc_bntmp, ncols);
+	EC_POINT_mul(pgroup, pbatchinc, &vxcp->vxc_bntmp, NULL, NULL,
+		     vxcp->vxc_bnctx);
+	EC_POINT_make_affine(pgroup, pbatchinc, vxcp->vxc_bnctx);
+
+	BN_set_word(&vxcp->vxc_bntmp, round);
+	EC_POINT_mul(pgroup, poffset, &vxcp->vxc_bntmp, NULL, NULL,
+		     vxcp->vxc_bnctx);
+	EC_POINT_make_affine(pgroup, poffset, vxcp->vxc_bnctx);
+
+	if (!vg_ocl_config_pattern(vocp))
+		goto enomem;
+
+	for (i = 0; i < nslots; i++) {
+		/*
+		 * Each work group gets its own:
+		 * - Column point array
+		 */
+		if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * nrows, 1))
+			goto enomem;
+	}
+
+	/*
+	 * All instances share:
+	 * - The z_heap and point scratch spaces
+	 * - The row point array
+	 */
+	if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1,
+			     round_up_pow2(32 * 2 * round, 4096), 0) ||
+	    !vg_ocl_kernel_arg_alloc(vocp, -1, 2,
+			     round_up_pow2(32 * 2 * round, 4096), 0) ||
+	    !vg_ocl_kernel_arg_alloc(vocp, -1, 3,
+			     round_up_pow2(32 * 2 * ncols, 4096), 1))
+		goto enomem;
+
+	npoints = 0;
+	rekey_at = 0;
+	vxcp->vxc_binres[0] = vcp->vc_addrtype;
+
+	if (pthread_create(&vocp->voc_ocl_thread, NULL,
+			   vg_opencl_thread, vocp))
+		goto enomem;
+
+	gettimeofday(&tvstart, NULL);
+
+l_rekey:
+	if (vocp->voc_rekey_func &&
+	    !vocp->voc_rekey_func(vocp))
+		goto enomem;
+
+	/* Generate a new random private key */
+	EC_KEY_generate_key(pkey);
+	npoints = 0;
+
+	/* Determine rekey interval */
+	EC_GROUP_get_order(pgroup, &vxcp->vxc_bntmp, vxcp->vxc_bnctx);
+	BN_sub(&vxcp->vxc_bntmp2,
+	       &vxcp->vxc_bntmp,
+	       EC_KEY_get0_private_key(pkey));
+	rekey_at = BN_get_word(&vxcp->vxc_bntmp2);
+	if ((rekey_at == BN_MASK2) || (rekey_at > rekey_max))
+		rekey_at = rekey_max;
+	assert(rekey_at > 0);
+
+	EC_POINT_copy(ppbase[0], EC_KEY_get0_public_key(pkey));
+
+	if (vcp->vc_pubkey_base) {
+		EC_POINT_add(pgroup,
+			     ppbase[0],
+			     ppbase[0],
+			     vcp->vc_pubkey_base,
+			     vxcp->vxc_bnctx);
+	}
+
+	/* Build the base array of sequential points */
+	for (i = 1; i < ncols; i++) {
+		EC_POINT_add(pgroup,
+			     ppbase[i],
+			     ppbase[i-1],
+			     pgen, vxcp->vxc_bnctx);
+	}
+
+	EC_POINTs_make_affine(pgroup, ncols, ppbase, vxcp->vxc_bnctx);
+
+	/* Fill the sequential point array */
+	ocl_points_in = (unsigned char *)
+		vg_ocl_map_arg_buffer(vocp, 0, 3, 1);
+	if (!ocl_points_in)
+		goto enomem;
+	for (i = 0; i < ncols; i++)
+		vg_ocl_put_point_tpa(ocl_points_in, i, ppbase[i]);
+	vg_ocl_unmap_arg_buffer(vocp, 0, 3, ocl_points_in);
+
+	/*
+	 * Set up the initial row increment table.
+	 * Set the first element to pgen -- effectively
+	 * skipping the exact key generated above.
+	 */
+	EC_POINT_copy(pprow[0], pgen);
+	for (i = 1; i < nrows; i++) {
+		EC_POINT_add(pgroup,
+			     pprow[i],
+			     pprow[i-1],
+			     pbatchinc, vxcp->vxc_bnctx);
+	}
+	EC_POINTs_make_affine(pgroup, nrows, pprow, vxcp->vxc_bnctx);
+	vxcp->vxc_delta = 1;
+	npoints = 1;
+	slot = 0;
+	slot_busy = 0;
+	slot_done = 0;
+
+	while (1) {
+		if (slot_done) {
+			assert(rekey_at > 0);
+			slot_done = 0;
+
+			/* Call the result check function */
+			switch (vocp->voc_check_func(vocp, slot)) {
+			case 1:
+				rekey_at = 0;
+				break;
+			case 2:
+				halt = 1;
+				break;
+			default:
+				break;
+			}
+
+			c += round;
+			if (!halt && (c >= output_interval)) {
+				output_interval =
+					vg_output_timing(vcp, c, &tvstart);
+				c = 0;
+			}
+		}
+
+		if (vcp->vc_halt)
+			halt = 1;
+		if (halt)
+			break;
+
+		if ((npoints + round) < rekey_at) {
+			if (npoints > 1) {
+				/* Move the row increments forward */
+				for (i = 0; i < nrows; i++) {
+					EC_POINT_add(pgroup,
+						     pprow[i],
+						     pprow[i],
+						     poffset,
+						     vxcp->vxc_bnctx);
+				}
+
+				EC_POINTs_make_affine(pgroup, nrows, pprow,
+						      vxcp->vxc_bnctx);
+			}
+
+			/* Copy the row stride array to the device */
+			ocl_strides_in = (unsigned char *)
+				vg_ocl_map_arg_buffer(vocp, slot, 4, 1);
+			if (!ocl_strides_in)
+				goto enomem;
+			memset(ocl_strides_in, 0, 64*nrows);
+			for (i = 0; i < nrows; i++)
+				vg_ocl_put_point(ocl_strides_in + (64*i),
+						 pprow[i]);
+			vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in);
+			npoints += round;
+
+			pthread_mutex_lock(&vocp->voc_lock);
+			while (vocp->voc_ocl_slot != -1) {
+				assert(slot_busy);
+				pthread_cond_wait(&vocp->voc_wait,
+						  &vocp->voc_lock);
+			}
+
+			if (vocp->voc_halt) {
+				pthread_mutex_unlock(&vocp->voc_lock);
+				halt = 1;
+				break;
+			}
+
+			vocp->voc_ocl_slot = slot;
+			pthread_cond_signal(&vocp->voc_wait);
+			pthread_mutex_unlock(&vocp->voc_lock);
+
+			slot_done = slot_busy;
+			slot_busy = 1;
+			slot = (slot + 1) % nslots;
+
+		} else { 
+			if (slot_busy) {
+				pthread_mutex_lock(&vocp->voc_lock);
+				while (vocp->voc_ocl_slot != -1) {
+					assert(vocp->voc_ocl_slot ==
+					       ((slot + nslots - 1) % nslots));
+					pthread_cond_wait(&vocp->voc_wait,
+							  &vocp->voc_lock);
+				}
+				pthread_mutex_unlock(&vocp->voc_lock);
+				slot_busy = 0;
+				slot_done = 1;
+			}
+
+			if (!rekey_at ||
+			    (!slot_done && ((npoints + round) >= rekey_at)))
+				goto l_rekey;
+		}
+	}
+
+	if (0) {
+	enomem:
+		fprintf(stderr, "ERROR: allocation failure?\n");
+	}
+
+	if (halt) {
+		if (vcp->vc_verbose > 1) {
+			printf("Halting...");
+			fflush(stdout);
+		}
+		pthread_mutex_lock(&vocp->voc_lock);
+		vocp->voc_halt = 1;
+		pthread_cond_signal(&vocp->voc_wait);
+		while (vocp->voc_ocl_slot != -1) {
+			assert(slot_busy);
+			pthread_cond_wait(&vocp->voc_wait,
+					  &vocp->voc_lock);
+		}
+		slot_busy = 0;
+		pthread_mutex_unlock(&vocp->voc_lock);
+		pthread_join(vocp->voc_ocl_thread, NULL);
+		if (vcp->vc_verbose > 1)
+			printf("done!\n");
+	}
+
+	if (ppbase) {
+		for (i = 0; i < (nrows + ncols); i++)
+			if (ppbase[i])
+				EC_POINT_free(ppbase[i]);
+		free(ppbase);
+	}
+	if (pbatchinc)
+		EC_POINT_free(pbatchinc);
+
+	/* Release the argument buffers */
+	vg_ocl_free_args(vocp);
+	vocp->voc_halt = 0;
+	vocp->voc_ocl_slot = -1;
+	return NULL;
+}
+
+
+
+
+/*
+ * OpenCL platform/device selection junk
+ */
+
+static int
+get_device_list(cl_platform_id pid, cl_device_id **list_out)
+{
+	cl_uint nd;
+	cl_int res;
+	cl_device_id *ids;
+	res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
+	if (res != CL_SUCCESS) {
+		vg_ocl_error(NULL, res, "clGetDeviceIDs(0)");
+		*list_out = NULL;
+		return -1;
+	}
+	if (nd) {
+		ids = (cl_device_id *) malloc(nd * sizeof(*ids));
+		if (ids == NULL) {
+			fprintf(stderr, "Could not allocate device ID list\n");
+			*list_out = NULL;
+			return -1;
+		}
+		res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, nd, ids, NULL);
+		if (res != CL_SUCCESS) {
+			vg_ocl_error(NULL, res, "clGetDeviceIDs(n)");
+			free(ids);
+			*list_out = NULL;
+			return -1;
+		}
+		*list_out = ids;
+	}
+	return nd;
+}
+
+static void
+show_devices(cl_platform_id pid, cl_device_id *ids, int nd, int base)
+{
+	int i;
+	char nbuf[128];
+	char vbuf[128];
+	size_t len;
+	cl_int res;
+
+	for (i = 0; i < nd; i++) {
+		res = clGetDeviceInfo(ids[i], CL_DEVICE_NAME,
+				      sizeof(nbuf), nbuf, &len);
+		if (res != CL_SUCCESS)
+			continue;
+		if (len >= sizeof(nbuf))
+			len = sizeof(nbuf) - 1;
+		nbuf[len] = '\0';
+		res = clGetDeviceInfo(ids[i], CL_DEVICE_VENDOR,
+				      sizeof(vbuf), vbuf, &len);
+		if (res != CL_SUCCESS)
+			continue;
+		if (len >= sizeof(vbuf))
+			len = sizeof(vbuf) - 1;
+		vbuf[len] = '\0';
+		fprintf(stderr, "  %d: [%s] %s\n", i + base, vbuf, nbuf);
+	}
+}
+
+static cl_device_id
+get_device(cl_platform_id pid, int num)
+{
+	int nd;
+	cl_device_id id, *ids;
+
+	nd = get_device_list(pid, &ids);
+	if (nd < 0)
+		return NULL;
+	if (!nd) {
+		fprintf(stderr, "No OpenCL devices found\n");
+		return NULL;
+	}
+	if (num < 0) {
+		if (nd == 1)
+			num = 0;
+		else
+			num = nd;
+	}
+	if (num < nd) {
+		id = ids[num];
+		free(ids);
+		return id;
+	}
+	free(ids);
+	return NULL;
+}
+
+static int
+get_platform_list(cl_platform_id **list_out)
+{
+	cl_uint np;
+	cl_int res;
+	cl_platform_id *ids;
+	res = clGetPlatformIDs(0, NULL, &np);
+	if (res != CL_SUCCESS) {
+		vg_ocl_error(NULL, res, "clGetPlatformIDs(0)");
+		*list_out = NULL;
+		return -1;
+	}
+	if (np) {
+		ids = (cl_platform_id *) malloc(np * sizeof(*ids));
+		if (ids == NULL) {
+			fprintf(stderr,
+				"Could not allocate platform ID list\n");
+			*list_out = NULL;
+			return -1;
+		}
+		res = clGetPlatformIDs(np, ids, NULL);
+		if (res != CL_SUCCESS) {
+			vg_ocl_error(NULL, res, "clGetPlatformIDs(n)");
+			free(ids);
+			*list_out = NULL;
+			return -1;
+		}
+		*list_out = ids;
+	}
+	return np;
+}
+
+void
+show_platforms(cl_platform_id *ids, int np, int base)
+{
+	int i;
+	char nbuf[128];
+	char vbuf[128];
+	size_t len;
+	cl_int res;
+
+	for (i = 0; i < np; i++) {
+		res = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME,
+					sizeof(nbuf), nbuf, &len);
+		if (res != CL_SUCCESS) {
+			vg_ocl_error(NULL, res, "clGetPlatformInfo(NAME)");
+			continue;
+		}
+		if (len >= sizeof(nbuf))
+			len = sizeof(nbuf) - 1;
+		nbuf[len] = '\0';
+		res = clGetPlatformInfo(ids[i], CL_PLATFORM_VENDOR,
+					sizeof(vbuf), vbuf, &len);
+		if (res != CL_SUCCESS) {
+			vg_ocl_error(NULL, res, "clGetPlatformInfo(VENDOR)");
+			continue;
+		}
+		if (len >= sizeof(vbuf))
+			len = sizeof(vbuf) - 1;
+		vbuf[len] = '\0';
+		fprintf(stderr, "%d: [%s] %s\n", i + base, vbuf, nbuf);
+	}
+}
+
+static cl_platform_id
+get_platform(int num)
+{
+	int np;
+	cl_platform_id id, *ids;
+
+	np = get_platform_list(&ids);
+	if (np < 0)
+		return NULL;
+	if (!np) {
+		fprintf(stderr, "No OpenCL platforms available\n");
+		return NULL;
+	}
+	if (num < 0) {
+		if (np == 1)
+			num = 0;
+		else
+			num = np;
+	}
+	if (num < np) {
+		id = ids[num];
+		free(ids);
+		return id;
+	}
+	free(ids);
+	return NULL;
+}
+
+void
+enumerate_opencl(void)
+{
+	cl_platform_id *pids;
+	cl_device_id *dids;
+	int np, nd, i;
+
+	np = get_platform_list(&pids);
+	if (!np) {
+		fprintf(stderr, "No OpenCL platforms available\n");
+		return;
+	}
+	fprintf(stderr, "Available OpenCL platforms:\n");
+	for (i = 0; i < np; i++) {
+		show_platforms(&pids[i], 1, i);
+		nd = get_device_list(pids[i], &dids);
+		if (!nd) {
+			fprintf(stderr, "  -- No devices\n");
+		} else {
+			show_devices(pids[i], dids, nd, 0);
+		}
+	}
+}
+
+cl_device_id
+get_opencl_device(int platformidx, int deviceidx)
+{
+	cl_platform_id pid;
+	cl_device_id did = NULL;
+
+	pid = get_platform(platformidx);
+	if (pid) {
+		did = get_device(pid, deviceidx);
+		if (did)
+			return did;
+	}
+	enumerate_opencl();
+	return NULL;
+}
+
+
+vg_ocl_context_t *
+vg_ocl_context_new(vg_context_t *vcp,
+		   int platformidx, int deviceidx, int safe_mode, int verify,
+		   int worksize, int nthreads, int nrows, int ncols,
+		   int invsize)
+{
+	cl_device_id did;
+	int round, full_threads, wsmult;
+	cl_ulong memsize, allocsize;
+	vg_ocl_context_t *vocp;
+
+	/* Find the device */
+	did = get_opencl_device(platformidx, deviceidx);
+	if (!did) {
+		return 0;
+	}
+
+	vocp = (vg_ocl_context_t *) malloc(sizeof(*vocp));
+	if (!vocp)
+		return NULL;
+
+	/* Open the device and compile the kernel */
+	if (!vg_ocl_init(vcp, vocp, did, safe_mode)) {
+		free(vocp);
+		return NULL;
+	}
+
+	if (verify) {
+		if (vcp->vc_verbose > 0) {
+			fprintf(stderr, "WARNING: "
+				"Hardware verification mode enabled\n");
+		}
+		if (!nthreads)
+			nthreads = 1;
+		vocp->voc_verify_func[0] = vg_ocl_verify_k0;
+		vocp->voc_verify_func[1] = vg_ocl_verify_k1;
+	}
+
+	/*
+	 * nrows: number of point rows per job
+	 * ncols: number of point columns per job
+	 * invsize: number of modular inversion tasks per job
+	 *    (each task performs (nrows*ncols)/invsize inversions)
+	 * nslots: number of kernels
+	 *    (create two, keep one running while we service the other or wait)
+	 */
+
+	if (!nthreads) {
+		/* Pick nthreads sufficient to saturate one compute unit */
+		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
+		    CL_DEVICE_TYPE_CPU)
+			nthreads = 1;
+		else
+			nthreads = vg_ocl_device_getsizet(vocp->voc_ocldid,
+					CL_DEVICE_MAX_WORK_GROUP_SIZE);
+	}
+
+	full_threads = vg_ocl_device_getsizet(vocp->voc_ocldid,
+					      CL_DEVICE_MAX_COMPUTE_UNITS);
+	full_threads *= nthreads;
+
+	/*
+	 * The work size selection is complicated, and the most
+	 * important factor is the batch size of the heap_invert kernel.
+	 * Each value added to the batch trades one complete modular
+	 * inversion for four multiply operations.  Ideally the work
+	 * size would be as large as possible.  The practical limiting
+	 * factors are:
+	 * 1. Available memory
+	 * 2. Responsiveness and operational latency
+	 *
+	 * We take a naive approach and limit batch size to a point of
+	 * sufficiently diminishing returns, hoping that responsiveness
+	 * will be sufficient.
+	 *
+	 * The measured value for the OpenSSL implementations on my CPU
+	 * is 80:1.  This causes heap_invert to get batches of 20 or so
+	 * for free, and receive 10% incremental returns at 200.  The CPU
+	 * work size is therefore set to 256.
+	 *
+	 * The ratio on most GPUs with the oclvanitygen implementations
+	 * is closer to 500:1, and larger batches are required for
+	 * good performance.
+	 */
+	if (!worksize) {
+		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
+		    CL_DEVICE_TYPE_GPU)
+			worksize = 2048;
+		else
+			worksize = 256;
+	}
+
+	if (!ncols) {
+		memsize = vg_ocl_device_getulong(vocp->voc_ocldid,
+					CL_DEVICE_GLOBAL_MEM_SIZE);
+		allocsize = vg_ocl_device_getulong(vocp->voc_ocldid,
+					CL_DEVICE_MAX_MEM_ALLOC_SIZE);
+		memsize /= 2;
+		ncols = full_threads;
+		nrows = 2;
+		/* Find row and column counts close to sqrt(full_threads) */
+		while ((ncols > nrows) && !(ncols & 1)) {
+			ncols /= 2;
+			nrows *= 2;
+		}
+
+		/*
+		 * Increase row & column counts to satisfy work size
+		 * multiplier or fill available memory.
+		 */
+		wsmult = 1;
+		while ((!worksize || ((wsmult * 2) <= worksize)) &&
+		       ((ncols * nrows * 2 * 128) < memsize) &&
+		       ((ncols * nrows * 2 * 64) < allocsize)) {
+			if (ncols > nrows)
+				nrows *= 2;
+			else
+				ncols *= 2;
+			wsmult *= 2;
+		}
+	}
+
+	round = nrows * ncols;
+
+	if (!invsize) {
+		invsize = 2;
+		while (!(round % (invsize << 1)) &&
+		       ((round / invsize) > full_threads))
+			invsize <<= 1;
+	}
+
+	if (vcp->vc_verbose > 1) {
+		fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows);
+		fprintf(stderr, "Modular inverse: %d threads, %d ops each\n",
+			round/invsize, invsize);
+	}
+
+	if ((round % invsize) || !is_pow2(invsize) || (invsize < 2)) {
+		if (vcp->vc_verbose <= 1) {
+			fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows);
+			fprintf(stderr,
+				"Modular inverse: %d threads, %d ops each\n",
+				round/invsize, invsize);
+		}
+		if (round % invsize)
+			fprintf(stderr,
+				"Modular inverse work size must "
+				"evenly divide points\n");
+		else
+			fprintf(stderr,
+				"Modular inverse work per task (%d) "
+				"must be a power of 2\n", invsize);
+		goto out_fail;
+	}
+
+	vocp->voc_ocl_rows = nrows;
+	vocp->voc_ocl_cols = ncols;
+	vocp->voc_ocl_invsize = invsize;
+
+	return vocp;
+
+out_fail:
+	vg_ocl_context_free(vocp);
+	return NULL;
+}
+
+void
+vg_ocl_context_free(vg_ocl_context_t *vocp)
+{
+	vg_ocl_del(vocp);
+	free(vocp);
+}
diff --git a/oclengine.h b/oclengine.h
new file mode 100644
index 0000000..d13c1f0
--- /dev/null
+++ b/oclengine.h
@@ -0,0 +1,35 @@
+/*
+ * Vanitygen, vanity bitcoin address generator
+ * Copyright (C) 2011 <samr7@cs.washington.edu>
+ *
+ * Vanitygen is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version. 
+ *
+ * Vanitygen is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Vanitygen.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if !defined (__VG_OCLENGINE_H__)
+#define __VG_OCLENGINE_H__
+
+#include "pattern.h"
+
+typedef struct _vg_ocl_context_s vg_ocl_context_t;
+
+extern vg_ocl_context_t *vg_ocl_context_new(
+	vg_context_t *vcp, int platformidx, int deviceidx,
+	int safe_mode, int verify,
+	int worksize, int nthreads, int nrows, int ncols,
+	int invsize);
+extern void vg_ocl_context_free(vg_ocl_context_t *vocp);
+
+extern void *vg_opencl_loop(void *vocp);
+
+#endif /* !defined (__VG_OCLENGINE_H__) */
diff --git a/oclvanitygen.c b/oclvanitygen.c
index 7b204fb..0284ea0 100644
--- a/oclvanitygen.c
+++ b/oclvanitygen.c
@@ -21,2517 +21,17 @@
 #include <math.h>
 #include <assert.h>
 
-#include <pthread.h>
-
-#include <openssl/ec.h>
-#include <openssl/bn.h>
-#include <openssl/rand.h>
-#include <openssl/md5.h>
-
-#ifdef __APPLE__
-#include <OpenCL/cl.h>
-#ifndef CL_CALLBACK
-#define CL_CALLBACK
-#endif
-#else
-#include <CL/cl.h>
-#endif
-
-#include "pattern.h"
-#include "util.h"
-
-
-const char *version = "0.17";
-const int debug = 0;
-
-#define MAX_SLOT 2
-#define MAX_ARG 6
-#define MAX_KERNEL 3
-
-#define is_pow2(v) (!((v) & ((v)-1)))
-#define round_up_pow2(x, a) (((x) + ((a)-1)) & ~((a)-1))
-
-/* OpenCL address searching mode */
-struct _vg_ocl_context_s;
-typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *);
-typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot);
-
-typedef struct _vg_ocl_context_s {
-	vg_exec_context_t		base;
-	cl_device_id			voc_ocldid;
-	cl_context			voc_oclctx;
-	cl_command_queue		voc_oclcmdq;
-	cl_program			voc_oclprog;
-	vg_ocl_init_t			voc_init_func;
-	vg_ocl_init_t			voc_rekey_func;
-	vg_ocl_check_t			voc_check_func;
-	int				voc_quirks;
-	int				voc_nslots;
-	cl_kernel			voc_oclkernel[MAX_SLOT][MAX_KERNEL];
-	cl_event			voc_oclkrnwait[MAX_SLOT];
-	cl_mem				voc_args[MAX_SLOT][MAX_ARG];
-	size_t				voc_arg_size[MAX_SLOT][MAX_ARG];
-
-	int				voc_pattern_rewrite;
-	int				voc_pattern_alloc;
-
-	vg_ocl_check_t			voc_verify_func[MAX_KERNEL];
-
-	pthread_t			voc_ocl_thread;
-	pthread_mutex_t			voc_lock;
-	pthread_cond_t			voc_wait;
-	int				voc_ocl_slot;
-	int				voc_ocl_rows;
-	int				voc_ocl_cols;
-	int				voc_ocl_invsize;
-	int				voc_halt;
-	int				voc_rekey;
-	int				voc_dump_done;
-} vg_ocl_context_t;
-
-
-/* Thread synchronization stubs */
-void
-vg_exec_downgrade_lock(vg_exec_context_t *vxcp)
-{
-}
-
-int
-vg_exec_upgrade_lock(vg_exec_context_t *vxcp)
-{
-	return 0;
-}
-
-
-/*
- * OpenCL debugging and support
- */
-
-const char *
-vg_ocl_strerror(cl_int ret)
-{
-#define OCL_STATUS(st) case st: return #st;
-	switch (ret) {
-		OCL_STATUS(CL_SUCCESS);
-		OCL_STATUS(CL_DEVICE_NOT_FOUND);
-		OCL_STATUS(CL_DEVICE_NOT_AVAILABLE);
-		OCL_STATUS(CL_COMPILER_NOT_AVAILABLE);
-		OCL_STATUS(CL_MEM_OBJECT_ALLOCATION_FAILURE);
-		OCL_STATUS(CL_OUT_OF_RESOURCES);
-		OCL_STATUS(CL_OUT_OF_HOST_MEMORY);
-		OCL_STATUS(CL_PROFILING_INFO_NOT_AVAILABLE);
-		OCL_STATUS(CL_MEM_COPY_OVERLAP);
-		OCL_STATUS(CL_IMAGE_FORMAT_MISMATCH);
-		OCL_STATUS(CL_IMAGE_FORMAT_NOT_SUPPORTED);
-		OCL_STATUS(CL_BUILD_PROGRAM_FAILURE);
-		OCL_STATUS(CL_MAP_FAILURE);
-#if defined(CL_MISALIGNED_SUB_BUFFER_OFFSET)
-		OCL_STATUS(CL_MISALIGNED_SUB_BUFFER_OFFSET);
-#endif /* defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) */
-#if defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
-		OCL_STATUS(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
-#endif /* defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) */
-		OCL_STATUS(CL_INVALID_VALUE);
-		OCL_STATUS(CL_INVALID_DEVICE_TYPE);
-		OCL_STATUS(CL_INVALID_PLATFORM);
-		OCL_STATUS(CL_INVALID_DEVICE);
-		OCL_STATUS(CL_INVALID_CONTEXT);
-		OCL_STATUS(CL_INVALID_QUEUE_PROPERTIES);
-		OCL_STATUS(CL_INVALID_COMMAND_QUEUE);
-		OCL_STATUS(CL_INVALID_HOST_PTR);
-		OCL_STATUS(CL_INVALID_MEM_OBJECT);
-		OCL_STATUS(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
-		OCL_STATUS(CL_INVALID_IMAGE_SIZE);
-		OCL_STATUS(CL_INVALID_SAMPLER);
-		OCL_STATUS(CL_INVALID_BINARY);
-		OCL_STATUS(CL_INVALID_BUILD_OPTIONS);
-		OCL_STATUS(CL_INVALID_PROGRAM);
-		OCL_STATUS(CL_INVALID_PROGRAM_EXECUTABLE);
-		OCL_STATUS(CL_INVALID_KERNEL_NAME);
-		OCL_STATUS(CL_INVALID_KERNEL_DEFINITION);
-		OCL_STATUS(CL_INVALID_KERNEL);
-		OCL_STATUS(CL_INVALID_ARG_INDEX);
-		OCL_STATUS(CL_INVALID_ARG_VALUE);
-		OCL_STATUS(CL_INVALID_ARG_SIZE);
-		OCL_STATUS(CL_INVALID_KERNEL_ARGS);
-		OCL_STATUS(CL_INVALID_WORK_DIMENSION);
-		OCL_STATUS(CL_INVALID_WORK_GROUP_SIZE);
-		OCL_STATUS(CL_INVALID_WORK_ITEM_SIZE);
-		OCL_STATUS(CL_INVALID_GLOBAL_OFFSET);
-		OCL_STATUS(CL_INVALID_EVENT_WAIT_LIST);
-		OCL_STATUS(CL_INVALID_EVENT);
-		OCL_STATUS(CL_INVALID_OPERATION);
-		OCL_STATUS(CL_INVALID_GL_OBJECT);
-		OCL_STATUS(CL_INVALID_BUFFER_SIZE);
-		OCL_STATUS(CL_INVALID_MIP_LEVEL);
-		OCL_STATUS(CL_INVALID_GLOBAL_WORK_SIZE);
-#if defined(CL_INVALID_PROPERTY)
-		OCL_STATUS(CL_INVALID_PROPERTY);
-#endif /* defined(CL_INVALID_PROPERTY) */
-#undef OCL_STATUS
-	default: {
-		static char tmp[64];
-		snprintf(tmp, sizeof(tmp), "Unknown code %d", ret);
-		return tmp;
-	}
-	}
-}
-
-/* Get device strings, using a static buffer -- caveat emptor */
-const char *
-vg_ocl_platform_getstr(cl_platform_id pid, cl_platform_info param)
-{
-	static char platform_str[1024];
-	cl_int ret;
-	size_t size_ret;
-	ret = clGetPlatformInfo(pid, param,
-				sizeof(platform_str), platform_str,
-				&size_ret);
-	if (ret != CL_SUCCESS) {
-		snprintf(platform_str, sizeof(platform_str),
-			 "clGetPlatformInfo(%d): %s",
-			 param, vg_ocl_strerror(ret));
-	}
-	return platform_str;
-}
-
-cl_platform_id
-vg_ocl_device_getplatform(cl_device_id did)
-{
-	cl_int ret;
-	cl_platform_id val;
-	size_t size_ret;
-	ret = clGetDeviceInfo(did, CL_DEVICE_PLATFORM,
-			      sizeof(val), &val, &size_ret);
-	if (ret != CL_SUCCESS) {
-		fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_PLATFORM): %s",
-			vg_ocl_strerror(ret));
-	}
-	return val;
-}
-
-cl_device_type
-vg_ocl_device_gettype(cl_device_id did)
-{
-	cl_int ret;
-	cl_device_type val;
-	size_t size_ret;
-	ret = clGetDeviceInfo(did, CL_DEVICE_TYPE,
-			      sizeof(val), &val, &size_ret);
-	if (ret != CL_SUCCESS) {
-		fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_TYPE): %s",
-			vg_ocl_strerror(ret));
-	}
-	return val;
-}
-
-const char *
-vg_ocl_device_getstr(cl_device_id did, cl_device_info param)
-{
-	static char device_str[1024];
-	cl_int ret;
-	size_t size_ret;
-	ret = clGetDeviceInfo(did, param,
-			      sizeof(device_str), device_str,
-			      &size_ret);
-	if (ret != CL_SUCCESS) {
-		snprintf(device_str, sizeof(device_str),
-			 "clGetDeviceInfo(%d): %s",
-			 param, vg_ocl_strerror(ret));
-	}
-	return device_str;
-}
-
-size_t
-vg_ocl_device_getsizet(cl_device_id did, cl_device_info param)
-{
-	cl_int ret;
-	size_t val;
-	size_t size_ret;
-	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
-	if (ret != CL_SUCCESS) {
-		fprintf(stderr,
-			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
-	}
-	return val;
-}
-
-cl_ulong
-vg_ocl_device_getulong(cl_device_id did, cl_device_info param)
-{
-	cl_int ret;
-	cl_ulong val;
-	size_t size_ret;
-	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
-	if (ret != CL_SUCCESS) {
-		fprintf(stderr,
-			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
-	}
-	return val;
-}
-
-cl_uint
-vg_ocl_device_getuint(cl_device_id did, cl_device_info param)
-{
-	cl_int ret;
-        cl_uint val;
-	size_t size_ret;
-	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
-	if (ret != CL_SUCCESS) {
-		fprintf(stderr,
-			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
-	}
-	return val;
-}
-
-void
-vg_ocl_dump_info(vg_ocl_context_t *vocp)
-{
-	cl_device_id did;
-	if (vocp->base.vxc_vc && (vocp->base.vxc_vc->vc_verbose < 1))
-		return;
-	if (vocp->voc_dump_done)
-		return;
-	did = vocp->voc_ocldid;
-	fprintf(stderr, "Device: %s\n",
-	       vg_ocl_device_getstr(did, CL_DEVICE_NAME));
-	fprintf(stderr, "Vendor: %s (%04x)\n",
-	       vg_ocl_device_getstr(did, CL_DEVICE_VENDOR),
-	       vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID));
-	fprintf(stderr, "Driver: %s\n",
-	       vg_ocl_device_getstr(did, CL_DRIVER_VERSION));
-	fprintf(stderr, "Profile: %s\n",
-	       vg_ocl_device_getstr(did, CL_DEVICE_PROFILE));
-	fprintf(stderr, "Version: %s\n",
-	       vg_ocl_device_getstr(did, CL_DEVICE_VERSION));
-	fprintf(stderr, "Max compute units: %"PRSIZET"d\n",
-	       vg_ocl_device_getsizet(did, CL_DEVICE_MAX_COMPUTE_UNITS));
-	fprintf(stderr, "Max workgroup size: %"PRSIZET"d\n",
-	       vg_ocl_device_getsizet(did, CL_DEVICE_MAX_WORK_GROUP_SIZE));
-	fprintf(stderr, "Global memory: %ld\n",
-	       vg_ocl_device_getulong(did, CL_DEVICE_GLOBAL_MEM_SIZE));
-	fprintf(stderr, "Max allocation: %ld\n",
-	       vg_ocl_device_getulong(did, CL_DEVICE_MAX_MEM_ALLOC_SIZE));
-	vocp->voc_dump_done = 1;
-}
-
-void
-vg_ocl_error(vg_ocl_context_t *vocp, int code, const char *desc)
-{
-	const char *err = vg_ocl_strerror(code);
-	if (desc) {
-		fprintf(stderr, "%s: %s\n", desc, err);
-	} else {
-		fprintf(stderr, "%s\n", err);
-	}
-
-	if (vocp && vocp->voc_ocldid)
-		vg_ocl_dump_info(vocp);
-}
-
-void
-vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog)
-{
-	size_t logbufsize, logsize;
-	char *log;
-	int off = 0;
-	cl_int ret;
-
-	ret = clGetProgramBuildInfo(prog,
-				    vocp->voc_ocldid,
-				    CL_PROGRAM_BUILD_LOG,
-				    0, NULL,
-				    &logbufsize);
-	if (ret != CL_SUCCESS) {
-		vg_ocl_error(NULL, ret, "clGetProgramBuildInfo");
-		return;
-	}
-
-	log = (char *) malloc(logbufsize);
-	if (!log) {
-		fprintf(stderr, "Could not allocate build log buffer\n");
-		return;
-	}
-
-	ret = clGetProgramBuildInfo(prog,
-				    vocp->voc_ocldid,
-				    CL_PROGRAM_BUILD_LOG,
-				    logbufsize,
-				    log,
-				    &logsize);
-	if (ret != CL_SUCCESS) {
-		vg_ocl_error(NULL, ret, "clGetProgramBuildInfo");
-
-	} else {
-		/* Remove leading newlines and trailing newlines/whitespace */
-		log[logbufsize-1] = '\0';
-		for (off = logsize - 1; off >= 0; off--) {
-			if ((log[off] != '\r') &&
-			    (log[off] != '\n') &&
-			    (log[off] != ' ') &&
-			    (log[off] != '\t') &&
-			    (log[off] != '\0'))
-				break;
-			log[off] = '\0';
-		}
-		for (off = 0; off < logbufsize; off++) {
-			if ((log[off] != '\r') &&
-			    (log[off] != '\n'))
-				break;
-		}
-
-		fprintf(stderr, "Build log:\n%s\n", &log[off]);
-	}
-	free(log);
-}
-
-/*
- * OpenCL per-exec functions
- */
-
-enum {
-	VG_OCL_DEEP_PREPROC_UNROLL  = (1 << 0),
-	VG_OCL_PRAGMA_UNROLL        = (1 << 1),
-	VG_OCL_EXPENSIVE_BRANCHES   = (1 << 2),
-	VG_OCL_DEEP_VLIW            = (1 << 3),
-	VG_OCL_AMD_BFI_INT          = (1 << 4),
-	VG_OCL_NV_VERBOSE           = (1 << 5),
-	VG_OCL_BROKEN               = (1 << 6),
-	VG_OCL_NO_BINARIES          = (1 << 7),
-
-	VG_OCL_OPTIMIZATIONS        = (VG_OCL_DEEP_PREPROC_UNROLL |
-				       VG_OCL_PRAGMA_UNROLL |
-				       VG_OCL_EXPENSIVE_BRANCHES |
-				       VG_OCL_DEEP_VLIW |
-				       VG_OCL_AMD_BFI_INT),
-
-};
-
-int
-vg_ocl_get_quirks(vg_ocl_context_t *vocp)
-{
-	uint32_t vend;
-	const char *dvn;
-	unsigned int quirks = 0;
-
-	quirks |= VG_OCL_DEEP_PREPROC_UNROLL;
-
-	vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID);
-	switch (vend) {
-	case 0x10de: /* NVIDIA */
-		/*
-		 * NVIDIA's compiler seems to take a really really long
-		 * time when using preprocessor unrolling, but works
-		 * well with pragma unroll.
-		 */
-		quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL;
-		quirks |= VG_OCL_PRAGMA_UNROLL;
-		quirks |= VG_OCL_NV_VERBOSE;
-		break;
-	case 0x1002: /* AMD/ATI */
-		/*
-		 * AMD's compiler works best with preprocesor unrolling.
-		 * Pragma unroll is unreliable with AMD's compiler and
-		 * seems to crash based on whether the gods were smiling
-		 * when Catalyst was last installed/upgraded.
-		 */
-		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
-		    CL_DEVICE_TYPE_GPU) {
-			quirks |= VG_OCL_EXPENSIVE_BRANCHES;
-			quirks |= VG_OCL_DEEP_VLIW;
-			dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
-						   CL_DEVICE_EXTENSIONS);
-			if (dvn && strstr(dvn, "cl_amd_media_ops"))
-				quirks |= VG_OCL_AMD_BFI_INT;
-
-			dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
-						   CL_DEVICE_NAME);
-			if (!strcmp(dvn, "ATI RV710")) {
-				quirks &= ~VG_OCL_OPTIMIZATIONS;
-				quirks |= VG_OCL_NO_BINARIES;
-			}
-		}
-		break;
-	default:
-		break;
-	}
-	return quirks;
-}
-
-int
-vg_ocl_create_kernel(vg_ocl_context_t *vocp, int knum, const char *func)
-{
-	int i;
-	cl_kernel krn;
-	cl_int ret;
-
-	for (i = 0; i < MAX_SLOT; i++) {
-		krn = clCreateKernel(vocp->voc_oclprog, func, &ret);
-		if (!krn) {
-			fprintf(stderr, "clCreateKernel(%d): ", i);
-			vg_ocl_error(vocp, ret, NULL);
-			while (--i >= 0) {
-				clReleaseKernel(vocp->voc_oclkernel[i][knum]);
-				vocp->voc_oclkernel[i][knum] = NULL;
-			}
-			return 0;
-		}
-		vocp->voc_oclkernel[i][knum] = krn;
-		vocp->voc_oclkrnwait[i] = NULL;
-	}
-	return 1;
-}
-
-void
-vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts,
-		    const char *program, size_t size,
-		    unsigned char *hash_out)
-{
-	MD5_CTX ctx;
-	cl_platform_id pid;
-	const char *str;
-
-	MD5_Init(&ctx);
-	pid = vg_ocl_device_getplatform(vocp->voc_ocldid);
-	str = vg_ocl_platform_getstr(pid, CL_PLATFORM_NAME);
-	MD5_Update(&ctx, str, strlen(str) + 1);
-	str = vg_ocl_platform_getstr(pid, CL_PLATFORM_VERSION);
-	MD5_Update(&ctx, str, strlen(str) + 1);
-	str = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
-	MD5_Update(&ctx, str, strlen(str) + 1);
-	if (opts)
-		MD5_Update(&ctx, opts, strlen(opts) + 1);
-	if (size)
-		MD5_Update(&ctx, program, size);
-	MD5_Final(hash_out, &ctx);
-}
-
-typedef struct {
-	unsigned char	e_ident[16];
-	uint16_t	e_type;
-	uint16_t	e_machine;
-	uint32_t	e_version;
-	uint32_t	e_entry;
-	uint32_t	e_phoff;
-	uint32_t	e_shoff;
-	uint32_t	e_flags;
-	uint16_t	e_ehsize;
-	uint16_t	e_phentsize;
-	uint16_t	e_phnum;
-	uint16_t	e_shentsize;
-	uint16_t	e_shnum;
-	uint16_t	e_shstrndx;
-} vg_elf32_header_t;
-
-typedef struct {
-	uint32_t	sh_name;
-	uint32_t	sh_type;
-	uint32_t	sh_flags;
-	uint32_t	sh_addr;
-	uint32_t	sh_offset;
-	uint32_t	sh_size;
-	uint32_t	sh_link;
-	uint32_t	sh_info;
-	uint32_t	sh_addralign;
-	uint32_t	sh_entsize;
-} vg_elf32_shdr_t;
-
-int
-vg_ocl_amd_patch_inner(unsigned char *binary, size_t size)
-{
-	vg_elf32_header_t *ehp;
-	vg_elf32_shdr_t *shp, *nshp;
-	uint32_t *instr;
-	size_t off;
-	int i, n, txt2idx, patched;
-
-	ehp = (vg_elf32_header_t *) binary;
-	if ((size < sizeof(*ehp)) ||
-	    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
-	    !ehp->e_shoff)
-		return 0;
-
-	off = ehp->e_shoff + (ehp->e_shstrndx * ehp->e_shentsize);
-	nshp = (vg_elf32_shdr_t *) (binary + off);
-	if ((off + sizeof(*nshp)) > size)
-		return 0;
-
-	shp = (vg_elf32_shdr_t *) (binary + ehp->e_shoff);
-	n = 0;
-	txt2idx = 0;
-	for (i = 0; i < ehp->e_shnum; i++) {
-		off = nshp->sh_offset + shp[i].sh_name;
-		if (((off + 6) >= size) ||
-		    memcmp(binary + off, ".text", 6))
-			continue;
-		n++;
-		if (n == 2)
-			txt2idx = i;
-	}
-	if (n != 2)
-		return 0;
-
-	off = shp[txt2idx].sh_offset;
-	instr = (uint32_t *) (binary + off);
-	n = shp[txt2idx].sh_size / 4;
-	patched = 0;
-	for (i = 0; i < n; i += 2) {
-		if (((instr[i] & 0x02001000) == 0) &&
-		    ((instr[i+1] & 0x9003f000) == 0x0001a000)) {
-			instr[i+1] ^= (0x0001a000 ^ 0x0000c000);
-			patched++;
-		}
-	}
-
-	return patched;
-}
-
-int
-vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size)
-{
-	vg_context_t *vcp = vocp->base.vxc_vc;
-	vg_elf32_header_t *ehp;
-	unsigned char *ptr;
-	size_t offset = 1;
-	int ninner = 0, nrun, npatched = 0;
-
-	ehp = (vg_elf32_header_t *) binary;
-	if ((size < sizeof(*ehp)) ||
-	    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\0", 8) ||
-	    !ehp->e_shoff)
-		return 0;
-
-	offset = 1;
-	while (offset < (size - 8)) {
-		ptr = (unsigned char *) memchr(binary + offset,
-					       0x7f,
-					       size - offset);
-		if (!ptr)
-			return npatched;
-		offset = ptr - binary;
-		ehp = (vg_elf32_header_t *) ptr;
-		if (((size - offset) < sizeof(*ehp)) ||
-		    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
-		    !ehp->e_shoff) {
-			offset += 1;
-			continue;
-		}
-
-		ninner++;
-		nrun = vg_ocl_amd_patch_inner(ptr, size - offset);
-		npatched += nrun;
-		if (vcp->vc_verbose > 1)
-			fprintf(stderr, "AMD BFI_INT: patched %d instructions "
-			       "in kernel %d\n",
-			       nrun, ninner);
-		npatched++;
-		offset += 1;
-	}
-	return npatched;
-}
-
-
-int
-vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
-		    const char *filename, const char *opts)
-{
-	FILE *kfp;
-	char *buf, *tbuf;
-	int len, fromsource = 0, patched = 0;
-	size_t sz, szr;
-	cl_program prog;
-	cl_int ret, sts;
-	unsigned char prog_hash[16];
-	char bin_name[64];
-
-	if (vcp->vc_verbose > 1)
-		fprintf(stderr,
-			"OpenCL compiler flags: %s\n", opts ? opts : "");
-
-	sz = 128 * 1024;
-	buf = (char *) malloc(sz);
-	if (!buf) {
-		fprintf(stderr, "Could not allocate program buffer\n");
-		return 0;
-	}
-
-	kfp = fopen(filename, "r");
-	if (!kfp) {
-		fprintf(stderr, "Error loading kernel file '%s': %s\n",
-		       filename, strerror(errno));
-		free(buf);
-		return 0;
-	}
-
-	len = fread(buf, 1, sz, kfp);
-	fclose(kfp);
-
-	if (!len) {
-		fprintf(stderr, "Short read on CL kernel\n");
-		free(buf);
-		return 0;
-	}
-
-	vg_ocl_hash_program(vocp, opts, buf, len, prog_hash);
-	snprintf(bin_name, sizeof(bin_name),
-		 "%02x%02x%02x%02x%02x%02x%02x%02x"
-		 "%02x%02x%02x%02x%02x%02x%02x%02x.oclbin",
-		 prog_hash[0], prog_hash[1], prog_hash[2], prog_hash[3],
-		 prog_hash[4], prog_hash[5], prog_hash[6], prog_hash[7],
-		 prog_hash[8], prog_hash[9], prog_hash[10], prog_hash[11],
-		 prog_hash[12], prog_hash[13], prog_hash[14], prog_hash[15]);
-
-	if (vocp->voc_quirks & VG_OCL_NO_BINARIES) {
-		kfp = NULL;
-		if (vcp->vc_verbose > 1)
-			fprintf(stderr, "Binary OpenCL programs disabled\n");
-	} else {
-		kfp = fopen(bin_name, "rb");
-	}
-
-	if (!kfp) {
-		/* No binary available, create with source */
-		fromsource = 1;
-		sz = len;
-		prog = clCreateProgramWithSource(vocp->voc_oclctx,
-						 1, (const char **) &buf, &sz,
-						 &ret);
-	} else {
-		if (vcp->vc_verbose > 1)
-			fprintf(stderr, "Loading kernel binary %s\n", bin_name);
-		szr = 0;
-		while (!feof(kfp)) {
-			len = fread(buf + szr, 1, sz - szr, kfp);
-			if (!len) {
-				fprintf(stderr,
-					"Short read on CL kernel binary\n");
-				fclose(kfp);
-				free(buf);
-				return 0;
-			}
-			szr += len;
-			if (szr == sz) {
-				tbuf = (char *) realloc(buf, sz*2);
-				if (!tbuf) {
-					fprintf(stderr,
-						"Could not expand CL kernel "
-						"binary buffer\n");
-					fclose(kfp);
-					free(buf);
-					return 0;
-				}
-				buf = tbuf;
-				sz *= 2;
-			}
-		}
-		fclose(kfp);
-	rebuild:
-		prog = clCreateProgramWithBinary(vocp->voc_oclctx,
-						 1, &vocp->voc_ocldid,
-						 &szr,
-						 (const unsigned char **) &buf,
-						 &sts,
-						 &ret);
-	}
-	free(buf);
-	if (!prog) {
-		vg_ocl_error(vocp, ret, "clCreateProgramWithSource");
-		return 0;
-	}
-
-	if (vcp->vc_verbose > 0) {
-		if (fromsource && !patched) {
-			fprintf(stderr,
-				"Compiling kernel, can take minutes...");
-			fflush(stderr);
-		}
-	}
-	ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL);
-	if (ret != CL_SUCCESS) {
-		if ((vcp->vc_verbose > 0) && fromsource && !patched)
-			fprintf(stderr, "failure.\n");
-		vg_ocl_error(NULL, ret, "clBuildProgram");
-	} else if ((vcp->vc_verbose > 0) && fromsource && !patched) {
-		fprintf(stderr, "done!\n");
-	}
-	if ((ret != CL_SUCCESS) ||
-	    ((vcp->vc_verbose > 1) && fromsource && !patched)) {
-		vg_ocl_buildlog(vocp, prog);
-	}
-	if (ret != CL_SUCCESS) {
-		vg_ocl_dump_info(vocp);
-		clReleaseProgram(prog);
-		return 0;
-	}
-
-	if (fromsource && !(vocp->voc_quirks & VG_OCL_NO_BINARIES)) {
-		ret = clGetProgramInfo(prog,
-				       CL_PROGRAM_BINARY_SIZES,
-				       sizeof(szr), &szr,
-				       &sz);
-		if (ret != CL_SUCCESS) {
-			vg_ocl_error(vocp, ret,
-				     "WARNING: clGetProgramInfo(BINARY_SIZES)");
-			goto out;
-		}
-		if (sz == 0) {
-			fprintf(stderr,
-				"WARNING: zero-length CL kernel binary\n");
-			goto out;
-		}
-
-		buf = (char *) malloc(szr);
-		if (!buf) {
-			fprintf(stderr,
-				"WARNING: Could not allocate %"PRSIZET"d bytes "
-				"for CL binary\n",
-			       szr);
-			goto out;
-		}
-
-		ret = clGetProgramInfo(prog,
-				       CL_PROGRAM_BINARIES,
-				       sizeof(buf), &buf,
-				       &sz);
-		if (ret != CL_SUCCESS) {
-			vg_ocl_error(vocp, ret,
-				     "WARNING: clGetProgramInfo(BINARIES)");
-			free(buf);
-			goto out;
-		}
-
-		if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) {
-			patched = vg_ocl_amd_patch(vocp,
-						   (unsigned char *) buf, szr);
-			if (patched > 0) {
-				if (vcp->vc_verbose > 1)
-					fprintf(stderr,
-						"AMD BFI_INT patch complete\n");
-				clReleaseProgram(prog);
-				goto rebuild;
-			}
-			fprintf(stderr,
-				"WARNING: AMD BFI_INT patching failed\n");
-			if (patched < 0) {
-				/* Program was incompletely modified */
-				free(buf);
-				goto out;
-			}
-		}
-
-		kfp = fopen(bin_name, "wb");
-		if (!kfp) {
-			fprintf(stderr, "WARNING: "
-				"could not save CL kernel binary: %s\n",
-				strerror(errno));
-		} else {
-			sz = fwrite(buf, 1, szr, kfp);
-			fclose(kfp);
-			if (sz != szr) {
-				fprintf(stderr,
-					"WARNING: short write on CL kernel "
-					"binary file: expected "
-					"%"PRSIZET"d, got %"PRSIZET"d\n",
-					szr, sz);
-				unlink(bin_name);
-			}
-		}
-		free(buf);
-	}
-
-out:
-	vocp->voc_oclprog = prog;
-	if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") ||
-	    !vg_ocl_create_kernel(vocp, 1, "heap_invert")) {
-		clReleaseProgram(vocp->voc_oclprog);
-		vocp->voc_oclprog = NULL;
-		return 0;
-	}
-
-	return 1;
-}
-
-void CL_CALLBACK
-vg_ocl_context_callback(const char *errinfo,
-			const void *private_info,
-			size_t cb,
-			void *user_data)
-{
-	fprintf(stderr, "vg_ocl_context_callback error: %s\n", errinfo);
-}
-
-int
-vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
-	    int safe_mode)
-{
-	cl_int ret;
-	char optbuf[128];
-	int end = 0;
-
-	memset(vocp, 0, sizeof(*vocp));
-	vg_exec_context_init(vcp, &vocp->base);
-
-	pthread_mutex_init(&vocp->voc_lock, NULL);
-	pthread_cond_init(&vocp->voc_wait, NULL);
-	vocp->voc_ocl_slot = -1;
-
-	vocp->voc_ocldid = did;
-
-	if (vcp->vc_verbose > 1)
-		vg_ocl_dump_info(vocp);
-
-	vocp->voc_quirks = vg_ocl_get_quirks(vocp);
-
-	if ((vocp->voc_quirks & VG_OCL_BROKEN) && (vcp->vc_verbose > 0)) {
-		char yesbuf[16];
-		printf("Type 'yes' to continue: ");
-		fflush(stdout);
-		if (!fgets(yesbuf, sizeof(yesbuf), stdin) ||
-		    strncmp(yesbuf, "yes", 3))
-			exit(1);
-	}
-
-	vocp->voc_oclctx = clCreateContext(NULL,
-					   1, &did,
-					   vg_ocl_context_callback,
-					   NULL,
-					   &ret);
-	if (!vocp->voc_oclctx) {
-		vg_ocl_error(vocp, ret, "clCreateContext");
-		return 0;
-	}
-
-	vocp->voc_oclcmdq = clCreateCommandQueue(vocp->voc_oclctx,
-						 vocp->voc_ocldid,
-						 0, &ret);
-	if (!vocp->voc_oclcmdq) {
-		vg_ocl_error(vocp, ret, "clCreateCommandQueue");
-		return 0;
-	}
-
-	if (safe_mode)
-		vocp->voc_quirks &= ~VG_OCL_OPTIMIZATIONS;
-
-	end = 0;
-	optbuf[end] = '\0';
-	if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL)
-		end += snprintf(optbuf + end, sizeof(optbuf) - end,
-				"-DDEEP_PREPROC_UNROLL ");
-	if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL)
-		end += snprintf(optbuf + end, sizeof(optbuf) - end,
-				"-DPRAGMA_UNROLL ");
-	if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
-		end += snprintf(optbuf + end, sizeof(optbuf) - end,
-				"-DVERY_EXPENSIVE_BRANCHES ");
-	if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
-		end += snprintf(optbuf + end, sizeof(optbuf) - end,
-				"-DDEEP_VLIW ");
-	if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT)
-		end += snprintf(optbuf + end, sizeof(optbuf) - end,
-				"-DAMD_BFI_INT ");
-	if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
-		end += snprintf(optbuf + end, sizeof(optbuf) - end,
-				"-cl-nv-verbose ");
-
-	if (!vg_ocl_load_program(vcp, vocp, "calc_addrs.cl", optbuf))
-		return 0;
-	return 1;
-}
-
-void
-vg_ocl_del(vg_ocl_context_t *vocp)
-{
-	if (vocp->voc_oclprog) {
-		clReleaseProgram(vocp->voc_oclprog);
-		vocp->voc_oclprog = NULL;
-	}
-	if (vocp->voc_oclcmdq) {
-		clReleaseCommandQueue(vocp->voc_oclcmdq);
-		vocp->voc_oclcmdq = NULL;
-	}
-	if (vocp->voc_oclctx) {
-		clReleaseContext(vocp->voc_oclctx);
-		vocp->voc_oclctx = NULL;
-	}
-	pthread_cond_destroy(&vocp->voc_wait);
-	pthread_mutex_destroy(&vocp->voc_lock);
-	vg_exec_context_del(&vocp->base);
-}
-
-static int vg_ocl_arg_map[][8] = {
-	/* hashes_out / found */
-	{ 2, 0, -1 },
-	/* z_heap */
-	{ 0, 1, 1, 0, 2, 2, -1 },
-	/* point_tmp */
-	{ 0, 0, 2, 1, -1 },
-	/* row_in */
-	{ 0, 2, -1 },
-	/* col_in */
-	{ 0, 3, -1 },
-	/* target_table */
-	{ 2, 3, -1 },
-};
-
-int
-vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
-			int arg, size_t size, int host)
-{
-	cl_mem clbuf;
-	cl_int ret;
-	int i, j, knum, karg;
-
-	for (i = 0; i < MAX_SLOT; i++) {
-		if ((i != slot) && (slot >= 0))
-			continue;
-		if (vocp->voc_args[i][arg]) {
-			clReleaseMemObject(vocp->voc_args[i][arg]);
-			vocp->voc_args[i][arg] = NULL;
-			vocp->voc_arg_size[i][arg] = 0;
-		}
-	}
-
-	clbuf = clCreateBuffer(vocp->voc_oclctx,
-			       CL_MEM_READ_WRITE |
-			       (host ? CL_MEM_ALLOC_HOST_PTR : 0),
-			       size,
-			       NULL,
-			       &ret);
-	if (!clbuf) {
-		fprintf(stderr, "clCreateBuffer(%d,%d): ", slot, arg);
-		vg_ocl_error(vocp, ret, NULL);
-		return 0;
-	}
-
-	for (i = 0; i < MAX_SLOT; i++) {
-		if ((i != slot) && (slot >= 0))
-			continue;
-
-		clRetainMemObject(clbuf);
-		vocp->voc_args[i][arg] = clbuf;
-		vocp->voc_arg_size[i][arg] = size;
-
-		for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
-			knum = vg_ocl_arg_map[arg][j];
-			karg = vg_ocl_arg_map[arg][j+1];
-			ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
-					     karg,
-					     sizeof(clbuf),
-					     &clbuf);
-			
-			if (ret) {
-				fprintf(stderr,
-					"clSetKernelArg(%d,%d): ", knum, karg);
-				vg_ocl_error(vocp, ret, NULL);
-				return 0;
-			}
-		}
-	}
-
-	clReleaseMemObject(clbuf);
-	return 1;
-}
-
-int
-vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg,
-		   void *buffer, size_t size)
-{
-	cl_int slot, ret;
-
-	slot = (wslot < 0) ? 0 : wslot;
-
-	assert((slot >= 0) && (slot < MAX_SLOT));
-	assert(size <= vocp->voc_arg_size[slot][arg]);
-
-	ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq,
-				   vocp->voc_args[slot][arg],
-				   CL_TRUE,
-				   0, size,
-				   buffer,
-				   0, NULL,
-				   NULL);
-			
-	if (ret) {
-		fprintf(stderr, "clEnqueueWriteBuffer(%d): ", arg);
-		vg_ocl_error(vocp, ret, NULL);
-		return 0;
-	}
-
-	return 1;
-}
-
-void *
-vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot,
-		      int arg, int rw)
-{
-	void *buf;
-	cl_int ret;
-
-	assert((slot >= 0) && (slot < MAX_SLOT));
-
-	buf = clEnqueueMapBuffer(vocp->voc_oclcmdq,
-				 vocp->voc_args[slot][arg],
-				 CL_TRUE,
-				 (rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE)
-				           : (rw ? CL_MAP_WRITE : CL_MAP_READ),
-				 0, vocp->voc_arg_size[slot][arg],
-				 0, NULL,
-				 NULL,
-				 &ret);
-	if (!buf) {
-		fprintf(stderr, "clEnqueueMapBuffer(%d): ", arg);
-		vg_ocl_error(vocp, ret, NULL);
-		return NULL;
-	}
-	return buf;
-}
-
-void
-vg_ocl_unmap_arg_buffer(vg_ocl_context_t *vocp, int slot,
-			int arg, void *buf)
-{
-	cl_int ret;
-	cl_event ev;
-
-	assert((slot >= 0) && (slot < MAX_SLOT));
-
-	ret = clEnqueueUnmapMemObject(vocp->voc_oclcmdq,
-				      vocp->voc_args[slot][arg],
-				      buf,
-				      0, NULL,
-				      &ev);
-	if (ret != CL_SUCCESS) {
-		fprintf(stderr, "clEnqueueUnmapMemObject(%d): ", arg);
-		vg_ocl_error(vocp, ret, NULL);
-		return;
-	}
-
-	ret = clWaitForEvents(1, &ev);
-	clReleaseEvent(ev);
-	if (ret != CL_SUCCESS) {
-		fprintf(stderr, "clWaitForEvent(clUnmapMemObject,%d): ", arg);
-		vg_ocl_error(vocp, ret, NULL);
-	}
-}
-
-int
-vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot,
-		      int arg, int value)
-{
-	cl_int ret;
-	int i;
-
-	for (i = 0; i < MAX_SLOT; i++) {
-		if ((i != slot) && (slot >= 0))
-			continue;
-		ret = clSetKernelArg(vocp->voc_oclkernel[i][2],
-				     arg,
-				     sizeof(value),
-				     &value);
-		if (ret) {
-			fprintf(stderr, "clSetKernelArg(%d): ", arg);
-			vg_ocl_error(vocp, ret, NULL);
-			return 0;
-		}
-	}
-	return 1;
-}
-
-int
-vg_ocl_kernel_buffer_arg(vg_ocl_context_t *vocp, int slot,
-			 int arg, void *value, size_t size)
-{
-	cl_int ret;
-	int i, j, knum, karg;
-
-	for (i = 0; i < MAX_SLOT; i++) {
-		if ((i != slot) && (slot >= 0))
-			continue;
-		for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
-			knum = vg_ocl_arg_map[arg][j];
-			karg = vg_ocl_arg_map[arg][j+1];
-			ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
-					     karg,
-					     size,
-					     value);
-			if (ret) {
-				fprintf(stderr,
-					"clSetKernelArg(%d,%d): ", knum, karg);
-				vg_ocl_error(vocp, ret, NULL);
-				return 0;
-			}
-		}
-	}
-	return 1;
-}
-
-int
-vg_ocl_kernel_dead(vg_ocl_context_t *vocp, int slot)
-{
-	return (vocp->voc_oclkrnwait[slot] == NULL);
-}
-
-int
-vg_ocl_kernel_start(vg_ocl_context_t *vocp, int slot, int ncol, int nrow,
-		    int invsize)
-{
-	cl_int val, ret;
-	cl_event ev;
-	size_t globalws[2] = { ncol, nrow };
-	size_t invws = (ncol * nrow) / invsize;
-
-	assert(!vocp->voc_oclkrnwait[slot]);
-
-	/* heap_invert() preconditions */
-	assert(is_pow2(invsize) && (invsize > 1));
-
-	val = invsize;
-	ret = clSetKernelArg(vocp->voc_oclkernel[slot][1],
-			     1,
-			     sizeof(val),
-			     &val);
-	if (ret != CL_SUCCESS) {
-		vg_ocl_error(vocp, ret, "clSetKernelArg(ncol)");
-		return 0;
-	}
-	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
-				     vocp->voc_oclkernel[slot][0],
-				     2,
-				     NULL, globalws, NULL,
-				     0, NULL,
-				     &ev);
-	if (ret != CL_SUCCESS) {
-		vg_ocl_error(vocp, ret, "clEnqueueNDRange(0)");
-		return 0;
-	}
-
-	ret = clWaitForEvents(1, &ev);
-	clReleaseEvent(ev);
-	if (ret != CL_SUCCESS) {
-		vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,0)");
-		return 0;
-	}
-
-	if (vocp->voc_verify_func[0] &&
-	    !(vocp->voc_verify_func[0])(vocp, slot)) {
-		fprintf(stderr, "ERROR: Kernel 0 failed verification test\n");
-		return 0;
-	}
-
-	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
-				     vocp->voc_oclkernel[slot][1],
-				     1,
-				     NULL, &invws, NULL,
-				     0, NULL,
-				     &ev);
-	if (ret != CL_SUCCESS) {
-		vg_ocl_error(vocp, ret, "clEnqueueNDRange(1)");
-		return 0;
-	}
-
-	ret = clWaitForEvents(1, &ev);
-	clReleaseEvent(ev);
-	if (ret != CL_SUCCESS) {
-		vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,1)");
-		return 0;
-	}
-
-	if (vocp->voc_verify_func[1] &&
-	    !(vocp->voc_verify_func[1])(vocp, slot)) {
-		fprintf(stderr, "ERROR: Kernel 1 failed verification test\n");
-		return 0;
-	}
-
-	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
-				     vocp->voc_oclkernel[slot][2],
-				     2,
-				     NULL, globalws, NULL,
-				     0, NULL,
-				     &ev);
-	if (ret != CL_SUCCESS) {
-		vg_ocl_error(vocp, ret, "clEnqueueNDRange(2)");
-		return 0;
-	}
-
-	vocp->voc_oclkrnwait[slot] = ev;
-	return 1;
-}
-
-int
-vg_ocl_kernel_wait(vg_ocl_context_t *vocp, int slot)
-{
-	cl_event ev;
-	cl_int ret;
-
-	ev = vocp->voc_oclkrnwait[slot];
-	vocp->voc_oclkrnwait[slot] = NULL;
-	if (ev) {
-		ret = clWaitForEvents(1, &ev);
-		clReleaseEvent(ev);
-		if (ret != CL_SUCCESS) {
-			vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,e)");
-			return 0;
-		}
-	}
-	return 1;
-}
-
-
-INLINE void
-vg_ocl_get_bignum_raw(BIGNUM *bn, const unsigned char *buf)
-{
-	bn_expand(bn, 256);
-	memcpy(bn->d, buf, 32);
-	bn->top = (32 / sizeof(BN_ULONG));
-}
-
-INLINE void
-vg_ocl_put_bignum_raw(unsigned char *buf, const BIGNUM *bn)
-{
-	int bnlen = (bn->top * sizeof(BN_ULONG));
-	if (bnlen >= 32) {
-		memcpy(buf, bn->d, 32);
-	} else {
-		memcpy(buf, bn->d, bnlen);
-		memset(buf + bnlen, 0, 32 - bnlen);
-	}
-}
-
-#define ACCESS_BUNDLE 1024
-#define ACCESS_STRIDE (ACCESS_BUNDLE/8)
-
-void
-vg_ocl_get_bignum_tpa(BIGNUM *bn, const unsigned char *buf, int cell)
-{
-	unsigned char bnbuf[32];
-	int start, i;
-
-	start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
-		 (cell % ACCESS_STRIDE));
-	for (i = 0; i < 8; i++)
-		memcpy(bnbuf+(i*4),
-		       buf + 4*(start + i*ACCESS_STRIDE),
-		       4);
-
-	vg_ocl_get_bignum_raw(bn, bnbuf);
-}
-
-/*
- * Absolutely disgusting.
- * We want points in Montgomery form, and it's a lot easier to read the
- * coordinates from the structure than to export and re-montgomeryize.
- */
-
-struct ec_point_st {
-	const EC_METHOD *meth;
-	BIGNUM X;
-	BIGNUM Y;
-	BIGNUM Z;
-	int Z_is_one;
-};
-
-INLINE void
-vg_ocl_get_point(EC_POINT *ppnt, const unsigned char *buf)
-{
-	static const unsigned char mont_one[] = { 0x01,0x00,0x00,0x03,0xd1 };
-	vg_ocl_get_bignum_raw(&ppnt->X, buf);
-	vg_ocl_get_bignum_raw(&ppnt->Y, buf + 32);
-	if (!ppnt->Z_is_one) {
-		ppnt->Z_is_one = 1;
-		BN_bin2bn(mont_one, sizeof(mont_one), &ppnt->Z);
-	}
-}
-
-INLINE void
-vg_ocl_put_point(unsigned char *buf, const EC_POINT *ppnt)
-{
-	assert(ppnt->Z_is_one);
-	vg_ocl_put_bignum_raw(buf, &ppnt->X);
-	vg_ocl_put_bignum_raw(buf + 32, &ppnt->Y);
-}
-
-void
-vg_ocl_put_point_tpa(unsigned char *buf, int cell, const EC_POINT *ppnt)
-{
-	unsigned char pntbuf[64];
-	int start, i;
-
-	vg_ocl_put_point(pntbuf, ppnt);
-
-	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
-		 (cell % (ACCESS_STRIDE/2)));
-	for (i = 0; i < 8; i++)
-		memcpy(buf + 4*(start + i*ACCESS_STRIDE),
-		       pntbuf+(i*4),
-		       4);
-	for (i = 0; i < 8; i++)
-		memcpy(buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)),
-		       pntbuf+32+(i*4),
-		       4);
-}
-
-void
-vg_ocl_get_point_tpa(EC_POINT *ppnt, const unsigned char *buf, int cell)
-{
-	unsigned char pntbuf[64];
-	int start, i;
-
-	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
-		 (cell % (ACCESS_STRIDE/2)));
-	for (i = 0; i < 8; i++)
-		memcpy(pntbuf+(i*4),
-		       buf + 4*(start + i*ACCESS_STRIDE),
-		       4);
-	for (i = 0; i < 8; i++)
-		memcpy(pntbuf+32+(i*4),
-		       buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)),
-		       4);
-
-	vg_ocl_get_point(ppnt, pntbuf);
-}
-
-void
-show_elapsed(struct timeval *tv, const char *place)
-{
-	struct timeval now, delta;
-        gettimeofday(&now, NULL);
-	timersub(&now, tv, &delta);
-	fprintf(stderr,
-		"%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec);
-}
-
-
-/*
- * GPU address matching methods
- *
- * gethash: GPU computes and returns all address hashes.
- *  + Works with any matching method, including regular expressions.
- *  - The CPU will not be able to keep up with mid- to high-end GPUs.
- *
- * prefix: GPU computes hash, searches a range list, and discards.
- *  + Fast, minimal work for CPU.
- */
-
-int
-vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot)
-{
-	vg_exec_context_t *vxcp = &vocp->base;
-	vg_context_t *vcp = vocp->base.vxc_vc;
-	vg_test_func_t test_func = vcp->vc_test;
-	unsigned char *ocl_hashes_out;
-	int i, res = 0, round;
-
-	ocl_hashes_out = (unsigned char *)
-		vg_ocl_map_arg_buffer(vocp, slot, 0, 0);
-
-	round = vocp->voc_ocl_cols * vocp->voc_ocl_rows;
-
-	for (i = 0; i < round; i++, vxcp->vxc_delta++) {
-		memcpy(&vxcp->vxc_binres[1],
-		       ocl_hashes_out + (20*i),
-		       20);
-
-		res = test_func(vxcp);
-		if (res)
-			break;
-	}
-
-	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out);
-	return res;
-}
-
-int
-vg_ocl_gethash_init(vg_ocl_context_t *vocp)
-{
-	int i;
-
-	if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get"))
-		return 0;
-
-	for (i = 0; i < vocp->voc_nslots; i++) {
-		/* Each slot gets its own hash output buffer */
-		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0,
-					     20 *
-					     vocp->voc_ocl_rows *
-					     vocp->voc_ocl_cols, 1))
-			return 0;
-	}
-
-	vocp->voc_rekey_func = NULL;
-	vocp->voc_check_func = vg_ocl_gethash_check;
-	return 1;
-}
-
-
-static int
-vg_ocl_prefix_rekey(vg_ocl_context_t *vocp)
-{
-	vg_context_t *vcp = vocp->base.vxc_vc;
-	unsigned char *ocl_targets_in;
-	uint32_t *ocl_found_out;
-	int i;
-
-	/* Set the found indicator for each slot to -1 */
-	for (i = 0; i < vocp->voc_nslots; i++) {
-		ocl_found_out = (uint32_t *)
-			vg_ocl_map_arg_buffer(vocp, i, 0, 1);
-		ocl_found_out[0] = 0xffffffff;
-		vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out);
-	}
-
-	if (vocp->voc_pattern_rewrite) {
-		/* Count number of range records */
-		i = vg_context_hash160_sort(vcp, NULL);
-		if (!i) {
-			fprintf(stderr,
-				"No range records available, exiting\n");
-			return 0;
-		}
-
-		if (i > vocp->voc_pattern_alloc) {
-			/* (re)allocate target buffer */
-			if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0))
-				return 0;
-			vocp->voc_pattern_alloc = i;
-		}
-
-		/* Write range records */
-		ocl_targets_in = (unsigned char *)
-			vg_ocl_map_arg_buffer(vocp, 0, 5, 1);
-		vg_context_hash160_sort(vcp, ocl_targets_in);
-		vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in);
-		vg_ocl_kernel_int_arg(vocp, -1, 4, i);
-
-		vocp->voc_pattern_rewrite = 0;
-	}
-	return 1;
-}
-
-static int
-vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot)
-{
-	vg_exec_context_t *vxcp = &vocp->base;
-	vg_context_t *vcp = vocp->base.vxc_vc;
-	vg_test_func_t test_func = vcp->vc_test;
-	uint32_t *ocl_found_out;
-	uint32_t found_delta;
-	int orig_delta, tablesize;
-	int res = 0;
-
-	/* Retrieve the found indicator */
-	ocl_found_out = (uint32_t *)
-		vg_ocl_map_arg_buffer(vocp, slot, 0, 2);
-	found_delta = ocl_found_out[0];
-
-	if (found_delta != 0xffffffff) {
-		/* GPU code claims match, verify with CPU version */
-		orig_delta = vxcp->vxc_delta;
-		vxcp->vxc_delta += found_delta;
-		vg_exec_context_calc_address(vxcp);
-
-		/* Make sure the GPU produced the expected hash */
-		res = 0;
-		if (!memcmp(vxcp->vxc_binres + 1,
-			    ocl_found_out + 2,
-			    20)) {
-			res = test_func(vxcp);
-		}
-		if (res == 0) {
-			/*
-			 * The match was not found in
-			 * the pattern list.  Hmm.
-			 */
-			tablesize = ocl_found_out[2];
-			fprintf(stderr, "Match idx: %d\n", ocl_found_out[1]);
-			fprintf(stderr, "CPU hash: ");
-			fdumphex(stderr, vxcp->vxc_binres + 1, 20);
-			fprintf(stderr, "GPU hash: ");
-			fdumphex(stderr,
-				 (unsigned char *) (ocl_found_out + 2), 20);
-			fprintf(stderr, "Found delta: %d "
-			       "Start delta: %d\n",
-			       found_delta, orig_delta);
-			res = 1;
-		}
-		vocp->voc_pattern_rewrite = 1;
-	} else {
-		vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows);
-	}
-
-	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out);
-	return res;
-}
-
-int
-vg_ocl_prefix_init(vg_ocl_context_t *vocp)
-{
-	int i;
-
-	if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix"))
-		return 0;
-
-	for (i = 0; i < vocp->voc_nslots; i++) {
-		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1))
-			return 0;
-	}
-	vocp->voc_rekey_func = vg_ocl_prefix_rekey;
-	vocp->voc_check_func = vg_ocl_prefix_check;
-	vocp->voc_pattern_rewrite = 1;
-	vocp->voc_pattern_alloc = 0;
-	return 1;
-}
-
-
-int
-vg_ocl_config_pattern(vg_ocl_context_t *vocp)
-{
-	vg_context_t *vcp = vocp->base.vxc_vc;
-	int i;
-
-	i = vg_context_hash160_sort(vcp, NULL);
-	if (i > 0) {
-		if (vcp->vc_verbose > 1)
-			fprintf(stderr, "Using OpenCL prefix matcher\n");
-		/* Configure for prefix matching */
-		return vg_ocl_prefix_init(vocp);
-	}
-
-	if (vcp->vc_verbose > 0)
-		fprintf(stderr, "WARNING: Using CPU pattern matcher\n");
-	return vg_ocl_gethash_init(vocp);
-}
-
-
-/*
- * Temporary buffer content verification functions
- * This provides a simple test of the kernel, the OpenCL compiler,
- * and the hardware.
- */
-int
-vg_ocl_verify_temporary(vg_ocl_context_t *vocp, int slot, int z_inverted)
-{
-	vg_exec_context_t *vxcp = &vocp->base;
-	unsigned char *point_tmp = NULL, *z_heap = NULL;
-	unsigned char *ocl_points_in = NULL, *ocl_strides_in = NULL;
-	const EC_GROUP *pgroup;
-	EC_POINT *ppr = NULL, *ppc = NULL, *pps = NULL, *ppt = NULL;
-	BIGNUM bnz, bnez, bnm, *bnzc;
-	BN_CTX *bnctx = NULL;
-	BN_MONT_CTX *bnmont;
-	int ret = 0;
-	int mismatches = 0, mm_r;
-	int x, y, bx;
-	static const unsigned char raw_modulus[] = {
-		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-		0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
-	};
-
-	BN_init(&bnz);
-	BN_init(&bnez);
-	BN_init(&bnm);
-
-	bnctx = BN_CTX_new();
-	bnmont = BN_MONT_CTX_new();
-	pgroup = EC_KEY_get0_group(vxcp->vxc_key);
-	ppr = EC_POINT_new(pgroup);
-	ppc = EC_POINT_new(pgroup);
-	pps = EC_POINT_new(pgroup);
-	ppt = EC_POINT_new(pgroup);
-
-	if (!bnctx || !bnmont || !ppr || !ppc || !pps || !ppt) {
-		fprintf(stderr, "ERROR: out of memory\n");
-		goto out;
-	}
-
-	BN_bin2bn(raw_modulus, sizeof(raw_modulus), &bnm);
-	BN_MONT_CTX_set(bnmont, &bnm, bnctx);
-
-	if (z_inverted) {
-		bnzc = &bnez;
-	} else {
-		bnzc = &pps->Z;
-	}
-
-	z_heap = (unsigned char *)
-		vg_ocl_map_arg_buffer(vocp, slot, 1, 0);
-	point_tmp = (unsigned char *)
-		vg_ocl_map_arg_buffer(vocp, slot, 2, 0);
-	ocl_points_in = (unsigned char *)
-		vg_ocl_map_arg_buffer(vocp, slot, 3, 0);
-	ocl_strides_in = (unsigned char *)
-		vg_ocl_map_arg_buffer(vocp, slot, 4, 0);
-
-	if (!z_heap || !point_tmp || !ocl_points_in || !ocl_strides_in) {
-		fprintf(stderr, "ERROR: could not map OpenCL point buffers\n");
-		goto out;
-	}
-
-	for (y = 0; y < vocp->voc_ocl_rows; y++) {
-		vg_ocl_get_point(ppr, ocl_strides_in + (64*y));
-		bx = y * vocp->voc_ocl_cols;
-		mm_r = 0;
-
-		for (x = 0; x < vocp->voc_ocl_cols; x++) {
-			vg_ocl_get_point_tpa(ppc, ocl_points_in, x);
-			assert(ppr->Z_is_one && ppc->Z_is_one);
-			EC_POINT_add(pgroup, pps, ppc, ppr, bnctx);
-			assert(!pps->Z_is_one);
-			vg_ocl_get_point_tpa(ppt, point_tmp, bx + x);
-			vg_ocl_get_bignum_tpa(&bnz, z_heap, bx + x);
-			if (z_inverted) {
-				BN_mod_inverse(&bnez, &pps->Z, &bnm, bnctx);
-				BN_to_montgomery(&bnez, &bnez, bnmont, bnctx);
-				BN_to_montgomery(&bnez, &bnez, bnmont, bnctx);
-			}
-			if (BN_cmp(&ppt->X, &pps->X) ||
-			    BN_cmp(&ppt->Y, &pps->Y) ||
-			    BN_cmp(&bnz, bnzc)) {
-				if (!mismatches) {
-					fprintf(stderr, "Base privkey: ");
-					fdumpbn(stderr, EC_KEY_get0_private_key(
-						       vxcp->vxc_key));
-				}
-				mismatches++;
-				fprintf(stderr, "Mismatch for kernel %d, "
-				       "offset %d (%d,%d)\n",
-				       z_inverted, bx + x, y, x);
-				if (!mm_r) {
-					mm_r = 1;
-					fprintf(stderr, "Row X   : ");
-					fdumpbn(stderr, &ppr->X);
-					fprintf(stderr, "Row Y   : ");
-					fdumpbn(stderr, &ppr->Y);
-				}
-
-				fprintf(stderr, "Column X: ");
-				fdumpbn(stderr, &ppc->X);
-				fprintf(stderr, "Column Y: ");
-				fdumpbn(stderr, &ppc->Y);
-
-				if (BN_cmp(&ppt->X, &pps->X)) {
-					fprintf(stderr, "Expect X: ");
-					fdumpbn(stderr, &pps->X);
-					fprintf(stderr, "Device X: ");
-					fdumpbn(stderr, &ppt->X);
-				}
-				if (BN_cmp(&ppt->Y, &pps->Y)) {
-					fprintf(stderr, "Expect Y: ");
-					fdumpbn(stderr, &pps->Y);
-					fprintf(stderr, "Device Y: ");
-					fdumpbn(stderr, &ppt->Y);
-				}
-				if (BN_cmp(&bnz, bnzc)) {
-					fprintf(stderr, "Expect Z: ");
-					fdumpbn(stderr, bnzc);
-					fprintf(stderr, "Device Z: ");
-					fdumpbn(stderr, &bnz);
-				}
-			}
-		}
-	}
-
-	ret = !mismatches;
-
-out:
-	if (z_heap)
-		vg_ocl_unmap_arg_buffer(vocp, slot, 1, z_heap);
-	if (point_tmp)
-		vg_ocl_unmap_arg_buffer(vocp, slot, 2, point_tmp);
-	if (ocl_points_in)
-		vg_ocl_unmap_arg_buffer(vocp, slot, 3, ocl_points_in);
-	if (ocl_strides_in)
-		vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in);
-	if (ppr)
-		EC_POINT_free(ppr);
-	if (ppc)
-		EC_POINT_free(ppc);
-	if (pps)
-		EC_POINT_free(pps);
-	if (ppt)
-		EC_POINT_free(ppt);
-	BN_clear_free(&bnz);
-	BN_clear_free(&bnez);
-	BN_clear_free(&bnm);
-	if (bnmont)
-		BN_MONT_CTX_free(bnmont);
-	if (bnctx)
-		BN_CTX_free(bnctx);
-	return ret;
-}
-
-int
-vg_ocl_verify_k0(vg_ocl_context_t *vocp, int slot)
-{
-	return vg_ocl_verify_temporary(vocp, slot, 0);
-}
-
-int
-vg_ocl_verify_k1(vg_ocl_context_t *vocp, int slot)
-{
-	return vg_ocl_verify_temporary(vocp, slot, 1);
-}
-
-void *
-vg_opencl_thread(void *arg)
-{
-	vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
-	vg_context_t *vcp = vocp->base.vxc_vc;
-	int halt = 0;
-	int slot = -1;
-	int rows, cols, invsize;
-	unsigned long long idleu, busyu;
-	double pidle;
-	struct timeval tv, tvt, tvd, idle, busy;
-
-	memset(&idle, 0, sizeof(idle));
-	memset(&busy, 0, sizeof(busy));
-
-	while (1) {
-		pthread_mutex_lock(&vocp->voc_lock);
-		if (halt) {
-			halt = 0;
-			vocp->voc_halt = 1;
-		}
-		if (slot != -1) {
-			assert(vocp->voc_ocl_slot == slot);
-			vocp->voc_ocl_slot = -1;
-			slot = -1;
-			pthread_cond_signal(&vocp->voc_wait);
-		}
-		if (vocp->voc_halt)
-			break;
-		if (vocp->voc_ocl_slot == -1) {
-			gettimeofday(&tv, NULL);
-			while (vocp->voc_ocl_slot == -1) {
-				pthread_cond_wait(&vocp->voc_wait,
-						  &vocp->voc_lock);
-				if (vocp->voc_halt)
-					goto out;
-			}
-			gettimeofday(&tvt, NULL);
-			timersub(&tvt, &tv, &tvd);
-			timeradd(&tvd, &idle, &idle);
-		}
-		assert(!vocp->voc_rekey);
-		assert(!vocp->voc_halt);
-		slot = vocp->voc_ocl_slot;
-		rows = vocp->voc_ocl_rows;
-		cols = vocp->voc_ocl_cols;
-		invsize = vocp->voc_ocl_invsize;
-		pthread_mutex_unlock(&vocp->voc_lock);
-
-		gettimeofday(&tv, NULL);
-		if (!vg_ocl_kernel_start(vocp, slot, cols, rows, invsize))
-			halt = 1;
-
-		if (!vg_ocl_kernel_wait(vocp, slot))
-			halt = 1;
-
-		if (vcp->vc_verbose > 1) {
-			gettimeofday(&tvt, NULL);
-			timersub(&tvt, &tv, &tvd);
-			timeradd(&tvd, &busy, &busy);
-			if ((busy.tv_sec + idle.tv_sec) > 1) {
-				idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
-				busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
-				pidle = ((double) idleu) / (idleu + busyu);
-
-				if (pidle > 0.01) {
-					fprintf(stderr, "\rGPU idle: %.2f%%"
-					       "                              "
-				       "                                \n",
-					       100 * pidle);
-				}
-				memset(&idle, 0, sizeof(idle));
-				memset(&busy, 0, sizeof(busy));
-			}
-		}
-	}
-out:
-	pthread_mutex_unlock(&vocp->voc_lock);
-	return NULL;
-}
-
-
-/*
- * Address search thread main loop
- */
-
-void *
-vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode, int verify,
-	       int worksize, int nthreads, int nrows, int ncols, int invsize)
-{
-	int i;
-	int round, full_threads, wsmult;
-	cl_ulong memsize, allocsize;
-
-	const BN_ULONG rekey_max = 100000000;
-	BN_ULONG npoints, rekey_at;
-
-	EC_KEY *pkey = NULL;
-	const EC_GROUP *pgroup;
-	const EC_POINT *pgen;
-	EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL;
-	EC_POINT *pseek = NULL;
-
-	unsigned char *ocl_points_in, *ocl_strides_in;
-
-	vg_ocl_context_t ctx;
-	vg_ocl_context_t *vocp = &ctx;
-	vg_exec_context_t *vxcp = &vocp->base;
-
-	int slot, nslots;
-	int slot_busy = 0, slot_done = 0, halt = 0;
-	int c = 0, output_interval = 1000;
-
-	struct timeval tvstart;
-
-	if (!vg_ocl_init(vcp, &ctx, did, safe_mode))
-		return NULL;
-
-	if (verify) {
-		if (vcp->vc_verbose > 0) {
-			fprintf(stderr, "WARNING: "
-				"Hardware verification mode enabled\n");
-		}
-		if (!nthreads)
-			nthreads = 1;
-		vocp->voc_verify_func[0] = vg_ocl_verify_k0;
-		vocp->voc_verify_func[1] = vg_ocl_verify_k1;
-	}
-
-	pkey = vxcp->vxc_key;
-	pgroup = EC_KEY_get0_group(pkey);
-	pgen = EC_GROUP_get0_generator(pgroup);
-
-	/*
-	 * nrows: number of point rows per job
-	 * ncols: number of point columns per job
-	 * invsize: number of modular inversion tasks per job
-	 *    (each task performs (nrows*ncols)/invsize inversions)
-	 * nslots: number of kernels
-	 *    (create two, keep one running while we service the other or wait)
-	 */
-
-	if (!nthreads) {
-		/* Pick nthreads sufficient to saturate one compute unit */
-		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
-		    CL_DEVICE_TYPE_CPU)
-			nthreads = 1;
-		else
-			nthreads = vg_ocl_device_getsizet(vocp->voc_ocldid,
-					CL_DEVICE_MAX_WORK_GROUP_SIZE);
-	}
-
-	full_threads = vg_ocl_device_getsizet(vocp->voc_ocldid,
-					      CL_DEVICE_MAX_COMPUTE_UNITS);
-	full_threads *= nthreads;
-
-	/*
-	 * The work size selection is complicated, and the most
-	 * important factor is the batch size of the heap_invert kernel.
-	 * Each value added to the batch trades one complete modular
-	 * inversion for four multiply operations.  Ideally the work
-	 * size would be as large as possible.  The practical limiting
-	 * factors are:
-	 * 1. Available memory
-	 * 2. Responsiveness and operational latency
-	 *
-	 * We take a naive approach and limit batch size to a point of
-	 * sufficiently diminishing returns, hoping that responsiveness
-	 * will be sufficient.
-	 *
-	 * The measured value for the OpenSSL implementations on my CPU
-	 * is 80:1.  This causes heap_invert to get batches of 20 or so
-	 * for free, and receive 10% incremental returns at 200.  The CPU
-	 * work size is therefore set to 256.
-	 *
-	 * The ratio on most GPUs with the oclvanitygen implementations
-	 * is closer to 500:1, and larger batches are required for
-	 * good performance.
-	 */
-	if (!worksize) {
-		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
-		    CL_DEVICE_TYPE_GPU)
-			worksize = 2048;
-		else
-			worksize = 256;
-	}
-
-	if (!ncols) {
-		memsize = vg_ocl_device_getulong(vocp->voc_ocldid,
-					CL_DEVICE_GLOBAL_MEM_SIZE);
-		allocsize = vg_ocl_device_getulong(vocp->voc_ocldid,
-					CL_DEVICE_MAX_MEM_ALLOC_SIZE);
-		memsize /= 2;
-		ncols = full_threads;
-		nrows = 2;
-		/* Find row and column counts close to sqrt(full_threads) */
-		while ((ncols > nrows) && !(ncols & 1)) {
-			ncols /= 2;
-			nrows *= 2;
-		}
-
-		/*
-		 * Increase row & column counts to satisfy work size
-		 * multiplier or fill available memory.
-		 */
-		wsmult = 1;
-		while ((!worksize || ((wsmult * 2) <= worksize)) &&
-		       ((ncols * nrows * 2 * 128) < memsize) &&
-		       ((ncols * nrows * 2 * 64) < allocsize)) {
-			if (ncols > nrows)
-				nrows *= 2;
-			else
-				ncols *= 2;
-			wsmult *= 2;
-		}
-	}
-
-	round = nrows * ncols;
-
-	if (!invsize) {
-		invsize = 2;
-		while (!(round % (invsize << 1)) &&
-		       ((round / invsize) > full_threads))
-			invsize <<= 1;
-	}
-
-	if (vcp->vc_verbose > 1) {
-		fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows);
-		fprintf(stderr, "Modular inverse: %d threads, %d ops each\n",
-			round/invsize, invsize);
-	}
-
-	if ((round % invsize) || !is_pow2(invsize) || (invsize < 2)) {
-		if (vcp->vc_verbose <= 1) {
-			fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows);
-			fprintf(stderr,
-				"Modular inverse: %d threads, %d ops each\n",
-				round/invsize, invsize);
-		}
-		if (round % invsize)
-			fprintf(stderr,
-				"Modular inverse work size must "
-				"evenly divide points\n");
-		else
-			fprintf(stderr,
-				"Modular inverse work per task (%d) "
-				"must be a power of 2\n", invsize);
-		goto out;
-	}
-
-	if (!vcp->vc_remove_on_match &&
-	    (vcp->vc_chance >= 1.0f) &&
-	    (vcp->vc_chance < round) &&
-	    (vcp->vc_verbose > 0)) {
-		fprintf(stderr, "WARNING: low pattern difficulty\n");
-		fprintf(stderr,
-			"WARNING: better match throughput is possible "
-			"using vanitygen on the CPU\n");
-	}
-
-	nslots = 2;
-	slot = 0;
-	vocp->voc_ocl_rows = nrows;
-	vocp->voc_ocl_cols = ncols;
-	vocp->voc_ocl_invsize = invsize;
-	vocp->voc_nslots = nslots;
-
-	ppbase = (EC_POINT **) malloc((nrows + ncols) *
-				      sizeof(EC_POINT*));
-	if (!ppbase)
-		goto enomem;
-
-	for (i = 0; i < (nrows + ncols); i++) {
-		ppbase[i] = EC_POINT_new(pgroup);
-		if (!ppbase[i])
-			goto enomem;
-	}
-
-	pprow = ppbase + ncols;
-	pbatchinc = EC_POINT_new(pgroup);
-	poffset = EC_POINT_new(pgroup);
-	pseek = EC_POINT_new(pgroup);
-	if (!pbatchinc || !poffset || !pseek)
-		goto enomem;
-
-	BN_set_word(&vxcp->vxc_bntmp, ncols);
-	EC_POINT_mul(pgroup, pbatchinc, &vxcp->vxc_bntmp, NULL, NULL,
-		     vxcp->vxc_bnctx);
-	EC_POINT_make_affine(pgroup, pbatchinc, vxcp->vxc_bnctx);
-
-	BN_set_word(&vxcp->vxc_bntmp, round);
-	EC_POINT_mul(pgroup, poffset, &vxcp->vxc_bntmp, NULL, NULL,
-		     vxcp->vxc_bnctx);
-	EC_POINT_make_affine(pgroup, poffset, vxcp->vxc_bnctx);
-
-	if (!vg_ocl_config_pattern(vocp))
-		goto enomem;
-
-	for (i = 0; i < nslots; i++) {
-		/*
-		 * Each work group gets its own:
-		 * - Column point array
-		 */
-		if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * nrows, 1))
-			goto enomem;
-	}
-
-	/*
-	 * All instances share:
-	 * - The z_heap and point scratch spaces
-	 * - The row point array
-	 */
-	if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1,
-			     round_up_pow2(32 * 2 * round, 4096), 0) ||
-	    !vg_ocl_kernel_arg_alloc(vocp, -1, 2,
-			     round_up_pow2(32 * 2 * round, 4096), 0) ||
-	    !vg_ocl_kernel_arg_alloc(vocp, -1, 3,
-			     round_up_pow2(32 * 2 * ncols, 4096), 1))
-		goto enomem;
-
-	npoints = 0;
-	rekey_at = 0;
-	vxcp->vxc_binres[0] = vcp->vc_addrtype;
-
-	if (pthread_create(&vocp->voc_ocl_thread, NULL,
-			   vg_opencl_thread, vocp))
-		goto enomem;
-
-	gettimeofday(&tvstart, NULL);
-
-l_rekey:
-	if (vocp->voc_rekey_func &&
-	    !vocp->voc_rekey_func(vocp))
-		goto enomem;
-
-	/* Generate a new random private key */
-	EC_KEY_generate_key(pkey);
-	npoints = 0;
-
-	/* Determine rekey interval */
-	EC_GROUP_get_order(pgroup, &vxcp->vxc_bntmp, vxcp->vxc_bnctx);
-	BN_sub(&vxcp->vxc_bntmp2,
-	       &vxcp->vxc_bntmp,
-	       EC_KEY_get0_private_key(pkey));
-	rekey_at = BN_get_word(&vxcp->vxc_bntmp2);
-	if ((rekey_at == BN_MASK2) || (rekey_at > rekey_max))
-		rekey_at = rekey_max;
-	assert(rekey_at > 0);
-
-	EC_POINT_copy(ppbase[0], EC_KEY_get0_public_key(pkey));
-
-	if (vcp->vc_pubkey_base) {
-		EC_POINT_add(pgroup,
-			     ppbase[0],
-			     ppbase[0],
-			     vcp->vc_pubkey_base,
-			     vxcp->vxc_bnctx);
-	}
-
-	/* Build the base array of sequential points */
-	for (i = 1; i < ncols; i++) {
-		EC_POINT_add(pgroup,
-			     ppbase[i],
-			     ppbase[i-1],
-			     pgen, vxcp->vxc_bnctx);
-	}
-
-	EC_POINTs_make_affine(pgroup, ncols, ppbase, vxcp->vxc_bnctx);
-
-	/* Fill the sequential point array */
-	ocl_points_in = (unsigned char *)
-		vg_ocl_map_arg_buffer(vocp, 0, 3, 1);
-	if (!ocl_points_in)
-		goto enomem;
-	for (i = 0; i < ncols; i++)
-		vg_ocl_put_point_tpa(ocl_points_in, i, ppbase[i]);
-	vg_ocl_unmap_arg_buffer(vocp, 0, 3, ocl_points_in);
-
-	/*
-	 * Set up the initial row increment table.
-	 * Set the first element to pgen -- effectively
-	 * skipping the exact key generated above.
-	 */
-	EC_POINT_copy(pprow[0], pgen);
-	for (i = 1; i < nrows; i++) {
-		EC_POINT_add(pgroup,
-			     pprow[i],
-			     pprow[i-1],
-			     pbatchinc, vxcp->vxc_bnctx);
-	}
-	EC_POINTs_make_affine(pgroup, nrows, pprow, vxcp->vxc_bnctx);
-	vxcp->vxc_delta = 1;
-	npoints = 1;
-	slot = 0;
-	slot_busy = 0;
-	slot_done = 0;
-
-	while (1) {
-		if (slot_done) {
-			assert(rekey_at > 0);
-			slot_done = 0;
-
-			/* Call the result check function */
-			switch (vocp->voc_check_func(vocp, slot)) {
-			case 1:
-				rekey_at = 0;
-				break;
-			case 2:
-				halt = 1;
-				break;
-			default:
-				break;
-			}
-
-			c += round;
-			if (!halt && (c >= output_interval)) {
-				output_interval =
-					vg_output_timing(vcp, c, &tvstart);
-				c = 0;
-			}
-		}
-
-		if (halt)
-			break;
-
-		if ((npoints + round) < rekey_at) {
-			if (npoints > 1) {
-				/* Move the row increments forward */
-				for (i = 0; i < nrows; i++) {
-					EC_POINT_add(pgroup,
-						     pprow[i],
-						     pprow[i],
-						     poffset,
-						     vxcp->vxc_bnctx);
-				}
-
-				EC_POINTs_make_affine(pgroup, nrows, pprow,
-						      vxcp->vxc_bnctx);
-			}
-
-			/* Copy the row stride array to the device */
-			ocl_strides_in = (unsigned char *)
-				vg_ocl_map_arg_buffer(vocp, slot, 4, 1);
-			if (!ocl_strides_in)
-				goto enomem;
-			memset(ocl_strides_in, 0, 64*nrows);
-			for (i = 0; i < nrows; i++)
-				vg_ocl_put_point(ocl_strides_in + (64*i),
-						 pprow[i]);
-			vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in);
-			npoints += round;
-
-			pthread_mutex_lock(&vocp->voc_lock);
-			while (vocp->voc_ocl_slot != -1) {
-				assert(slot_busy);
-				pthread_cond_wait(&vocp->voc_wait,
-						  &vocp->voc_lock);
-			}
-
-			if (vocp->voc_halt) {
-				pthread_mutex_unlock(&vocp->voc_lock);
-				halt = 1;
-				break;
-			}
-
-			vocp->voc_ocl_slot = slot;
-			pthread_cond_signal(&vocp->voc_wait);
-			pthread_mutex_unlock(&vocp->voc_lock);
-
-			slot_done = slot_busy;
-			slot_busy = 1;
-			slot = (slot + 1) % nslots;
-
-		} else { 
-			if (slot_busy) {
-				pthread_mutex_lock(&vocp->voc_lock);
-				while (vocp->voc_ocl_slot != -1) {
-					assert(vocp->voc_ocl_slot ==
-					       ((slot + nslots - 1) % nslots));
-					pthread_cond_wait(&vocp->voc_wait,
-							  &vocp->voc_lock);
-				}
-				pthread_mutex_unlock(&vocp->voc_lock);
-				slot_busy = 0;
-				slot_done = 1;
-			}
-
-			if (!rekey_at ||
-			    (!slot_done && ((npoints + round) >= rekey_at)))
-				goto l_rekey;
-		}
-	}
-
-	if (0) {
-	enomem:
-		fprintf(stderr, "ERROR: allocation failure?\n");
-	}
-
-out:
-	if (halt) {
-		if (vcp->vc_verbose > 1) {
-			printf("Halting...");
-			fflush(stdout);
-		}
-		pthread_mutex_lock(&vocp->voc_lock);
-		vocp->voc_halt = 1;
-		pthread_cond_signal(&vocp->voc_wait);
-		while (vocp->voc_ocl_slot != -1) {
-			assert(slot_busy);
-			pthread_cond_wait(&vocp->voc_wait,
-					  &vocp->voc_lock);
-		}
-		slot_busy = 0;
-		pthread_mutex_unlock(&vocp->voc_lock);
-		pthread_join(vocp->voc_ocl_thread, NULL);
-		if (vcp->vc_verbose > 1)
-			printf("done!\n");
-	}
-
-	if (ppbase) {
-		for (i = 0; i < (nrows + ncols); i++)
-			if (ppbase[i])
-				EC_POINT_free(ppbase[i]);
-		free(ppbase);
-	}
-	if (pbatchinc)
-		EC_POINT_free(pbatchinc);
-
-	vg_ocl_del(vocp);
-
-	return NULL;
-}
-
-
-
-
-/*
- * OpenCL platform/device selection junk
- */
-
-int
-get_device_list(cl_platform_id pid, cl_device_id **list_out)
-{
-	cl_uint nd;
-	cl_int res;
-	cl_device_id *ids;
-	res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
-	if (res != CL_SUCCESS) {
-		vg_ocl_error(NULL, res, "clGetDeviceIDs(0)");
-		*list_out = NULL;
-		return -1;
-	}
-	if (nd) {
-		ids = (cl_device_id *) malloc(nd * sizeof(*ids));
-		if (ids == NULL) {
-			fprintf(stderr, "Could not allocate device ID list\n");
-			*list_out = NULL;
-			return -1;
-		}
-		res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, nd, ids, NULL);
-		if (res != CL_SUCCESS) {
-			vg_ocl_error(NULL, res, "clGetDeviceIDs(n)");
-			free(ids);
-			*list_out = NULL;
-			return -1;
-		}
-		*list_out = ids;
-	}
-	return nd;
-}
-
-void
-show_devices(cl_platform_id pid, cl_device_id *ids, int nd, int base)
-{
-	int i;
-	char nbuf[128];
-	char vbuf[128];
-	size_t len;
-	cl_int res;
-
-	for (i = 0; i < nd; i++) {
-		res = clGetDeviceInfo(ids[i], CL_DEVICE_NAME,
-				      sizeof(nbuf), nbuf, &len);
-		if (res != CL_SUCCESS)
-			continue;
-		if (len >= sizeof(nbuf))
-			len = sizeof(nbuf) - 1;
-		nbuf[len] = '\0';
-		res = clGetDeviceInfo(ids[i], CL_DEVICE_VENDOR,
-				      sizeof(vbuf), vbuf, &len);
-		if (res != CL_SUCCESS)
-			continue;
-		if (len >= sizeof(vbuf))
-			len = sizeof(vbuf) - 1;
-		vbuf[len] = '\0';
-		fprintf(stderr, "  %d: [%s] %s\n", i + base, vbuf, nbuf);
-	}
-}
-
-cl_device_id
-get_device(cl_platform_id pid, int num)
-{
-	int nd;
-	cl_device_id id, *ids;
-
-	nd = get_device_list(pid, &ids);
-	if (nd < 0)
-		return NULL;
-	if (!nd) {
-		fprintf(stderr, "No OpenCL devices found\n");
-		return NULL;
-	}
-	if (num < 0) {
-		if (nd == 1)
-			num = 0;
-		else
-			num = nd;
-	}
-	if (num < nd) {
-		id = ids[num];
-		free(ids);
-		return id;
-	}
-	free(ids);
-	return NULL;
-}
-
-int
-get_platform_list(cl_platform_id **list_out)
-{
-	cl_uint np;
-	cl_int res;
-	cl_platform_id *ids;
-	res = clGetPlatformIDs(0, NULL, &np);
-	if (res != CL_SUCCESS) {
-		vg_ocl_error(NULL, res, "clGetPlatformIDs(0)");
-		*list_out = NULL;
-		return -1;
-	}
-	if (np) {
-		ids = (cl_platform_id *) malloc(np * sizeof(*ids));
-		if (ids == NULL) {
-			fprintf(stderr,
-				"Could not allocate platform ID list\n");
-			*list_out = NULL;
-			return -1;
-		}
-		res = clGetPlatformIDs(np, ids, NULL);
-		if (res != CL_SUCCESS) {
-			vg_ocl_error(NULL, res, "clGetPlatformIDs(n)");
-			free(ids);
-			*list_out = NULL;
-			return -1;
-		}
-		*list_out = ids;
-	}
-	return np;
-}
-
-void
-show_platforms(cl_platform_id *ids, int np, int base)
-{
-	int i;
-	char nbuf[128];
-	char vbuf[128];
-	size_t len;
-	cl_int res;
-
-	for (i = 0; i < np; i++) {
-		res = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME,
-					sizeof(nbuf), nbuf, &len);
-		if (res != CL_SUCCESS) {
-			vg_ocl_error(NULL, res, "clGetPlatformInfo(NAME)");
-			continue;
-		}
-		if (len >= sizeof(nbuf))
-			len = sizeof(nbuf) - 1;
-		nbuf[len] = '\0';
-		res = clGetPlatformInfo(ids[i], CL_PLATFORM_VENDOR,
-					sizeof(vbuf), vbuf, &len);
-		if (res != CL_SUCCESS) {
-			vg_ocl_error(NULL, res, "clGetPlatformInfo(VENDOR)");
-			continue;
-		}
-		if (len >= sizeof(vbuf))
-			len = sizeof(vbuf) - 1;
-		vbuf[len] = '\0';
-		fprintf(stderr, "%d: [%s] %s\n", i + base, vbuf, nbuf);
-	}
-}
-
-cl_platform_id
-get_platform(int num)
-{
-	int np;
-	cl_platform_id id, *ids;
-
-	np = get_platform_list(&ids);
-	if (np < 0)
-		return NULL;
-	if (!np) {
-		fprintf(stderr, "No OpenCL platforms available\n");
-		return NULL;
-	}
-	if (num < 0) {
-		if (np == 1)
-			num = 0;
-		else
-			num = np;
-	}
-	if (num < np) {
-		id = ids[num];
-		free(ids);
-		return id;
-	}
-	free(ids);
-	return NULL;
-}
-
-void
-enumerate_opencl(void)
-{
-	cl_platform_id *pids;
-	cl_device_id *dids;
-	int np, nd, i;
-
-	np = get_platform_list(&pids);
-	if (!np) {
-		fprintf(stderr, "No OpenCL platforms available\n");
-		return;
-	}
-	fprintf(stderr, "Available OpenCL platforms:\n");
-	for (i = 0; i < np; i++) {
-		show_platforms(&pids[i], 1, i);
-		nd = get_device_list(pids[i], &dids);
-		if (!nd) {
-			fprintf(stderr, "  -- No devices\n");
-		} else {
-			show_devices(pids[i], dids, nd, 0);
-		}
-	}
-}
+#include <openssl/ec.h>
+#include <openssl/bn.h>
+#include <openssl/rand.h>
 
-cl_device_id
-get_opencl_device(int platformidx, int deviceidx)
-{
-	cl_platform_id pid;
-	cl_device_id did = NULL;
+#include "oclengine.h"
+#include "pattern.h"
+#include "util.h"
 
-	pid = get_platform(platformidx);
-	if (pid) {
-		did = get_device(pid, deviceidx);
-		if (did)
-			return did;
-	}
-	enumerate_opencl();
-	return NULL;
-}
 
+const char *version = "0.17";
+const int debug = 0;
 
 
 void
@@ -2597,7 +97,7 @@ main(int argc, char **argv)
 	int verify_mode = 0;
 	int safe_mode = 0;
 	vg_context_t *vcp = NULL;
-	cl_device_id did;
+	vg_ocl_context_t *vocp = NULL;
 	EC_POINT *pubkey_base = NULL;
 	const char *result_file = NULL;
 	const char *key_password = NULL;
@@ -2814,7 +314,11 @@ main(int argc, char **argv)
 	vcp->vc_remove_on_match = remove_on_match;
 	vcp->vc_pubkey_base = pubkey_base;
 
-	if (!vg_context_add_patterns(vcp, patterns, npatterns))
+	vcp->vc_output_match = vg_output_match_console;
+	vcp->vc_output_timing = vg_output_timing_console;
+
+	if (!vg_context_add_patterns(vcp, (const char ** const)patterns,
+				     npatterns))
 		return 1;
 
 	if (!vcp->vc_npatterns) {
@@ -2839,12 +343,14 @@ main(int argc, char **argv)
 		fprintf(stderr,
 			"Regular expressions: %ld\n", vcp->vc_npatterns);
 
-	did = get_opencl_device(platformidx, deviceidx);
-	if (!did) {
+	vocp = vg_ocl_context_new(vcp, platformidx, deviceidx,
+				  safe_mode, verify_mode,
+				  worksize, nthreads, nrows, ncols, invsize);
+	if (!vocp) {
 		return 1;
 	}
 
-	vg_opencl_loop(vcp, did, safe_mode, verify_mode,
-		       worksize, nthreads, nrows, ncols, invsize);
+	vg_opencl_loop(vocp);
+	vg_ocl_context_free(vocp);
 	return 0;
 }
diff --git a/oclvanityminer.c b/oclvanityminer.c
new file mode 100644
index 0000000..67b6058
--- /dev/null
+++ b/oclvanityminer.c
@@ -0,0 +1,748 @@
+/*
+ * Vanitygen, vanity bitcoin address generator
+ * Copyright (C) 2011 <samr7@cs.washington.edu>
+ *
+ * Vanitygen is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version. 
+ *
+ * Vanitygen is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with Vanitygen.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include <pthread.h>
+
+#include <openssl/ec.h>
+#include <openssl/bn.h>
+#include <openssl/rand.h>
+
+#include <curl/curl.h>
+
+#include "oclengine.h"
+#include "pattern.h"
+#include "util.h"
+
+
+const char *version = "0.17";
+const int debug = 0;
+
+
+typedef struct workitem_s {
+	const char *pattern;
+	const char *comment;
+	EC_POINT *pubkey;
+	int addrtype;
+	double difficulty;
+	double reward;
+	double value;
+} workitem_t;
+
+typedef struct server_request_s {
+	int request_status;
+	const EC_GROUP *group;
+
+	char *part_buf;
+	size_t part_off;
+	size_t part_end;
+	size_t part_size;
+
+	workitem_t **items;
+	int nitems;
+	int nalloc;
+} server_request_t;
+
+void
+server_workitem_free(workitem_t *wip)
+{
+	if (wip->pubkey)
+		EC_POINT_free(wip->pubkey);
+	free(wip);
+}
+
+static workitem_t *
+server_workitem_new(server_request_t *reqp,
+		    const char *pfx, const char *pubkey_s,
+		    const char *addrtype_s, const char *reward_s,
+		    const char *comment)
+{
+	workitem_t *wip;
+	EC_POINT *pubkey;
+	int addrtype;
+	double reward;
+	double difficulty;
+
+	addrtype = atoi(addrtype_s);
+	if ((addrtype < 0) || (addrtype > 255))
+		return NULL;
+
+	reward = strtod(reward_s, NULL);
+	if (reward < 0.0)
+		return NULL;
+
+	difficulty = vg_prefix_get_difficulty(addrtype, pfx);
+	if (difficulty == 0.0)
+		return NULL;
+
+	pubkey = EC_POINT_hex2point(reqp->group, pubkey_s, NULL, NULL);
+	if (pubkey == NULL)
+		return NULL;
+
+
+	wip = (workitem_t *) malloc(sizeof(*wip) +
+				    strlen(pfx) +
+				    strlen(comment) + 2);
+	memset(wip, 0, sizeof(*wip));
+	wip->pattern = (char *) (wip + 1);
+	strcpy((char *)wip->pattern, pfx);
+	wip->comment = wip->pattern + (strlen(wip->pattern) + 1);
+	strcpy((char *) wip->comment, comment);
+	wip->pubkey = pubkey;
+	wip->addrtype = addrtype;
+	wip->difficulty = difficulty;
+	wip->reward = reward;
+	wip->value = (reward * 1000000.0 * 3600.0) / difficulty;
+
+	return wip;
+}
+
+static int
+server_workitem_ptr_comp(const void *pa, const void *pb)
+{
+	workitem_t *a = *(workitem_t **) pa, *b = *(workitem_t **) pb;
+	return (a->value > b->value) ? -1 : ((a->value < b->value) ? 1 : 0);
+}
+
+
+typedef struct server_context_s {
+	EC_KEY *dummy_key;
+	const char *url;
+	const char *credit_addr;
+	char *getwork;
+	char *submit;
+	int verbose;
+} server_context_t;
+
+static int
+server_workitem_equal(server_context_t *ctxp, workitem_t *a, workitem_t *b)
+{
+	if (strcmp(a->pattern, b->pattern))
+		return 0;
+	if (EC_POINT_cmp(EC_KEY_get0_group(ctxp->dummy_key),
+			 a->pubkey, b->pubkey, NULL))
+		return 0;
+	return 1;
+}
+
+
+void
+server_context_free(server_context_t *ctxp)
+{
+	if (ctxp->dummy_key)
+		EC_KEY_free(ctxp->dummy_key);
+	if (ctxp->getwork)
+		free(ctxp->getwork);
+	if (ctxp->submit)
+		free(ctxp->submit);
+	free(ctxp);
+}
+
+server_context_t *
+server_context_new(const char *url, const char *credit_addr)
+{
+	server_context_t *ctxp;
+	int urllen = strlen(url);
+	int addrlen = strlen(credit_addr);
+	ctxp = (server_context_t *)
+		malloc(sizeof(*ctxp) + urllen + addrlen + 2);
+	memset(ctxp, 0, sizeof(*ctxp));
+	ctxp->url = (const char *) (ctxp + 1);
+	ctxp->credit_addr = (const char *) (ctxp->url + urllen + 1);
+	strcpy((char *) ctxp->url, url);
+	strcpy((char *) ctxp->credit_addr, credit_addr);
+
+	ctxp->dummy_key = vg_exec_context_new_key();
+	ctxp->getwork = (char *) malloc(urllen + 9);
+	ctxp->submit = (char *) malloc(urllen + 7);
+	if (url[urllen - 1] == '/') {
+		snprintf(ctxp->getwork, urllen + 9, "%sgetWork", url);
+		snprintf(ctxp->submit, urllen + 7, "%ssolve", url);
+	} else {
+		snprintf(ctxp->getwork, urllen + 9, "%s/getWork", url);
+		snprintf(ctxp->submit, urllen + 7, "%s/solve", url);
+	}
+
+	return ctxp;
+}
+
+
+int
+server_workitem_add(server_request_t *reqp, workitem_t *wip)
+{
+	int nalloc;
+
+	if ((reqp->nitems + 1) >= reqp->nalloc) {
+		nalloc = reqp->nalloc * 2;
+		if (nalloc == 0)
+			nalloc = 16;
+		if (nalloc > 65536)
+			return -1;
+		reqp->items = (workitem_t **)
+			realloc(reqp->items, nalloc * sizeof(*reqp->items));
+		if (reqp->items == NULL)
+			return -1;
+		reqp->nalloc = nalloc;
+	}
+	reqp->items[reqp->nitems++] = wip;
+	return 0;
+}
+
+
+static int
+server_body_reader(const char *buf, size_t elemsize, size_t len, void *param)
+{
+	server_request_t *reqp = (server_request_t *) param;
+	char *line, *sep, *pfx, *pubkey_s, *addrtype_s, *reward_s, *comment;
+	workitem_t *wip;
+
+	if (!len)
+		return 0;
+
+	if ((reqp->part_size < (reqp->part_end + len)) &&
+	    (reqp->part_off > 0)) {
+		memmove(reqp->part_buf,
+			reqp->part_buf + reqp->part_off,
+			reqp->part_end - reqp->part_off);
+		reqp->part_end -= reqp->part_off;
+		reqp->part_off = 0;
+	}
+
+	if (reqp->part_size < (reqp->part_end + len)) {
+		if (reqp->part_size == 0)
+			reqp->part_size = 4096;
+		while (reqp->part_size < (reqp->part_end + len)) {
+			reqp->part_size *= 2;
+			if (reqp->part_size > (1024*1024)) {
+				fprintf(stderr, "Line too long from server");
+				reqp->request_status = 0;
+				return -1;
+			}
+		}
+		reqp->part_buf = (char *) realloc(reqp->part_buf,
+						  reqp->part_size);
+		if (!reqp->part_buf) {
+			fprintf(stderr, "Out of memory");
+			return -1;
+		}
+	}
+
+	memcpy(reqp->part_buf + reqp->part_end, buf, len);
+	reqp->part_end += len;
+
+	line = reqp->part_buf;
+	while (1) {
+		sep = strchr(line, '\n');
+		if (!sep)
+			break;
+		pfx = line;
+		*sep = '\0';
+		line = sep + 1;
+		sep = strchr(pfx, ':');
+		if (!sep)
+			goto bad_line;
+	        *sep = '\0'; sep += 1;
+		pubkey_s = sep;
+		sep = strchr(sep, ':');
+		if (!sep)
+			goto bad_line;
+	        *sep = '\0'; sep += 1;
+		addrtype_s = sep;
+		sep = strchr(sep, ':');
+		if (!sep)
+			goto bad_line;
+	        *sep = '\0'; sep += 1;
+		reward_s = sep;
+		sep = strchr(sep, ';');
+		if (!sep)
+			goto bad_line;
+	        *sep = '\0'; sep += 1;
+		comment = sep;
+
+		wip = server_workitem_new(reqp, pfx, pubkey_s, addrtype_s,
+					  reward_s, comment);
+		if (!wip)
+			goto bad_line;
+		if (server_workitem_add(reqp, wip)) {
+			server_workitem_free(wip);
+			goto bad_line;
+		}
+		continue;
+
+	bad_line:
+		;
+	}
+
+	reqp->part_off = line - reqp->part_buf;
+	if (reqp->part_off == reqp->part_end) {
+		reqp->part_off = 0;
+		reqp->part_end = 0;
+	}
+
+	return len;
+}
+
+void
+dump_work(workitem_t **workarray)
+{
+	workitem_t *wip;
+	int i;
+	printf("Available bounties:\n");
+	for (i = 0; workarray[i] != NULL; i++) {
+		wip = workarray[i];
+		printf("Pattern: \"%s\" Reward: %f Value: %f BTC/MkeyHr\n",
+		       wip->pattern,
+		       wip->reward,
+		       wip->value);
+	}
+}
+
+void
+free_work_array(workitem_t **workarray, workitem_t *except)
+{
+	int i;
+	if (workarray) {
+		for (i = 0; workarray[i] != NULL; i++) {
+			if (workarray[i] != except)
+				server_workitem_free(workarray[i]);
+		}
+		free(workarray);
+	}
+}
+
+workitem_t **
+server_context_getwork(server_context_t *ctxp)
+{
+	CURLcode res;
+	server_request_t *reqp;
+	CURL *creq;
+
+	reqp = (server_request_t *) malloc(sizeof(*reqp));
+	memset(reqp, 0, sizeof(*reqp));
+
+	reqp->group = EC_KEY_get0_group(ctxp->dummy_key);
+
+	creq = curl_easy_init();
+	if (curl_easy_setopt(creq, CURLOPT_URL, ctxp->getwork) ||
+	    curl_easy_setopt(creq, CURLOPT_VERBOSE, ctxp->verbose > 1) ||
+	    curl_easy_setopt(creq, CURLOPT_WRITEFUNCTION,
+			     server_body_reader) ||
+	    curl_easy_setopt(creq, CURLOPT_WRITEDATA, reqp)) {
+		fprintf(stderr, "Failed to set up libcurl\n");
+		exit(1);
+	}
+
+	res = curl_easy_perform(creq);
+	if (res != CURLE_OK) {
+		fprintf(stderr, "Get work request failed: %s\n",
+			curl_easy_strerror(res));
+		curl_easy_cleanup(creq);
+		free_work_array(reqp->items, NULL);
+		return NULL;
+	}
+
+	if (reqp->items) {
+		qsort(reqp->items, reqp->nitems, sizeof(*(reqp->items)),
+		      server_workitem_ptr_comp);
+		reqp->items[reqp->nitems] = NULL;
+	}
+
+	curl_easy_cleanup(creq);
+	return reqp->items;
+}
+
+
+int
+server_context_submit_solution(server_context_t *ctxp,
+			       workitem_t *work,
+			       const char *privkey)
+{
+	char urlbuf[8192];
+	char *pubhex;
+	CURL *creq;
+	CURLcode res;
+
+	pubhex = EC_POINT_point2hex(EC_KEY_get0_group(ctxp->dummy_key),
+				    work->pubkey,
+				    POINT_CONVERSION_UNCOMPRESSED,
+				    NULL);
+	snprintf(urlbuf, sizeof(urlbuf),
+		 "%s?key=%s%%3A%s&privateKey=%s&bitcoinAddress=%s",
+		 ctxp->submit,
+		 work->pattern,
+		 pubhex,
+		 privkey,
+		 ctxp->credit_addr);
+	OPENSSL_free(pubhex);
+	creq = curl_easy_init();
+	if (curl_easy_setopt(creq, CURLOPT_URL, urlbuf) ||
+	    curl_easy_setopt(creq, CURLOPT_VERBOSE, ctxp->verbose > 1) ||
+	    curl_easy_setopt(creq, CURLOPT_POST, 1)) {
+		fprintf(stderr, "Failed to set up libcurl\n");
+		exit(1);
+	}
+
+	res = curl_easy_perform(creq);
+	if (res != CURLE_OK) {
+		fprintf(stderr, "Submission failed: %s\n",
+			curl_easy_strerror(res));
+		curl_easy_cleanup(creq);
+		return -1;
+	}
+
+	curl_easy_cleanup(creq);
+	return 0;
+}
+
+static pthread_mutex_t soln_lock;
+static pthread_cond_t soln_cond;
+static char *soln_pattern = NULL;
+static char *soln_private_key = NULL;
+
+void
+free_soln()
+{
+	if (soln_pattern) {
+		free(soln_pattern);
+		soln_pattern = NULL;
+	}
+	if (soln_private_key) {
+		OPENSSL_free(soln_private_key);
+		soln_private_key = NULL;
+	}
+}
+
+void
+output_match_work_complete(vg_context_t *vcp, EC_KEY *pkey, const char *pattern)
+{
+	vg_output_match_console(vcp, pkey, pattern);
+	pthread_mutex_lock(&soln_lock);
+	free_soln();
+	soln_pattern = strdup(pattern);
+	soln_private_key = BN_bn2hex(EC_KEY_get0_private_key(pkey));
+
+	/* Signal the generator to stop */
+	vcp->vc_halt = 1;
+
+	/* Wake up the main thread, if it's sleeping */
+	pthread_cond_broadcast(&soln_cond);
+	pthread_mutex_unlock(&soln_lock);
+}
+
+int
+check_solution(server_context_t *scp, workitem_t *wip)
+{
+	int res = 0;
+	pthread_mutex_lock(&soln_lock);
+	if (soln_private_key != NULL) {
+		assert(!strcmp(soln_pattern, wip->pattern));
+		server_context_submit_solution(scp, wip, soln_private_key);
+		free_soln();
+		res = 1;
+	}
+	pthread_mutex_unlock(&soln_lock);
+	return res;
+}
+
+void
+usage(const char *name)
+{
+	fprintf(stderr,
+"oclVanityMiner %s (" OPENSSL_VERSION_TEXT ")\n"
+"Usage: %s -u <URL> -a <credit address>\n"
+"Organized vanity address mining client using OpenCL.  Contacts the specified\n"
+"bounty pool server, downloads a list of active bounties, and attempts to\n"
+"generate the address with the best difficulty to reward ratio.  Maintains\n"
+"contact with the bounty pool server and periodically refreshes the bounty\n"
+"list.\n"
+"\n"
+"Options:\n"
+"-u <URL>      Bounty pool URL\n"
+"-a <address>  Credit address for completed work\n"
+"-i <interval> Set server polling interval in seconds (default 90)\n"
+"-v            Verbose output\n"
+"-q            Quiet output\n"
+"-p <platform> Select OpenCL platform\n"
+"-d <device>   Select OpenCL device\n"
+"-S            Safe mode, disable OpenCL loop unrolling optimizations\n"
+"-w <worksize> Set work items per thread in a work unit\n"
+"-t <threads>  Set target thread count per multiprocessor\n"
+"-g <x>x<y>    Set grid size\n"
+"-b <invsize>  Set modular inverse ops per thread\n"
+"-V            Enable kernel/OpenCL/hardware verification (SLOW)\n",
+version, name);
+}
+
+
+int
+main(int argc, char **argv)
+{
+	const char *url = NULL;
+	const char *credit_addr = NULL;
+	int opt;
+	int platformidx = -1, deviceidx = -1;
+	char *pend;
+	int verbose = 1;
+	int interval = 90;
+	int nthreads = 0;
+	int worksize = 0;
+	int nrows = 0, ncols = 0;
+	int invsize = 0;
+	int verify_mode = 0;
+	int safe_mode = 0;
+	vg_context_t *vcp = NULL;
+	vg_ocl_context_t *vocp = NULL;
+
+	int res;
+	int thread_started = 0;
+	pthread_t thread;
+	workitem_t *active_wip = NULL;
+
+	server_context_t *scp = NULL;
+	workitem_t *wip = NULL, **wipa;
+	int wip_index;
+	int was_sleeping = 0;
+
+	struct timeval tv;
+	struct timespec sleepy;
+
+	pthread_mutex_init(&soln_lock, NULL);
+	pthread_cond_init(&soln_cond, NULL);
+
+	if (argc == 1) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	while ((opt = getopt(argc, argv,
+			     "u:a:vqp:d:w:t:g:b:VSh?i:")) != -1) {
+		switch (opt) {
+		case 'u':
+			url = optarg;
+			break;
+		case 'a':
+			credit_addr = optarg;
+			break;
+		case 'v':
+			verbose = 2;
+			break;
+		case 'q':
+			verbose = 0;
+			break;
+		case 'i':
+			interval = atoi(optarg);
+			if (interval < 60) {
+				fprintf(stderr,
+					"Invalid interval '%s'\n", optarg);
+				return 1;
+			}
+		case 'p':
+			platformidx = atoi(optarg);
+			break;
+		case 'd':
+			deviceidx = atoi(optarg);
+			break;
+		case 'w':
+			worksize = atoi(optarg);
+			if (worksize == 0) {
+				fprintf(stderr,
+					"Invalid work size '%s'\n", optarg);
+				return 1;
+			}
+			break;
+		case 't':
+			nthreads = atoi(optarg);
+			if (nthreads == 0) {
+				fprintf(stderr,
+					"Invalid thread count '%s'\n", optarg);
+				return 1;
+			}
+			break;
+		case 'g':
+			nrows = 0;
+			ncols = strtol(optarg, &pend, 0);
+			if (pend && *pend == 'x') {
+				nrows = strtol(pend+1, NULL, 0);
+			}
+			if (!nrows || !ncols) {
+				fprintf(stderr,
+					"Invalid grid size '%s'\n", optarg);
+				return 1;
+			}
+			break;
+		case 'b':
+			invsize = atoi(optarg);
+			if (!invsize) {
+				fprintf(stderr,
+					"Invalid modular inverse size '%s'\n",
+					optarg);
+				return 1;
+			}
+			if (invsize & (invsize - 1)) {
+				fprintf(stderr,
+					"Modular inverse size must be "
+					"a power of 2\n");
+				return 1;
+			}
+			break;
+		case 'V':
+			verify_mode = 1;
+			break;
+		case 'S':
+			safe_mode = 1;
+			break;
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+#if OPENSSL_VERSION_NUMBER < 0x10000000L
+	/* Complain about older versions of OpenSSL */
+	if (verbose > 0) {
+		fprintf(stderr,
+			"WARNING: Built with " OPENSSL_VERSION_TEXT "\n"
+			"WARNING: Use OpenSSL 1.0.0d+ for best performance\n");
+	}
+#endif
+	curl_easy_init();
+
+	vcp = vg_prefix_context_new(0, 128, 0);
+
+	vcp->vc_verbose = verbose;
+
+	vcp->vc_output_match = output_match_work_complete;
+	vcp->vc_output_timing = vg_output_timing_console;
+
+
+	if (!url) {
+		fprintf(stderr, "ERROR: No server URL specified\n");
+		return 1;
+	}
+	if (!credit_addr) {
+		fprintf(stderr, "ERROR: No reward address specified\n");
+		return 1;
+	}
+	if (!vg_b58_decode_check(credit_addr, NULL, 0)) {
+		fprintf(stderr, "ERROR: Invalid reward address specified\n");
+		return 1;
+	}
+
+	scp = server_context_new(url, credit_addr);
+	scp->verbose = verbose;
+	wipa = NULL;
+
+	while (1) {
+		if (!wipa) {
+			wipa = server_context_getwork(scp);
+			wip_index = 0;
+		}
+
+		if (wipa) {
+			wip = wipa[wip_index];
+			if (wip)
+				wip_index += 1;
+		} else
+			wip = NULL;
+
+		/* If the work item is the same as the one we're executing,
+		   keep it */
+		if (wip && active_wip &&
+		    server_workitem_equal(scp, active_wip, wip))
+			wip = active_wip;
+
+		if (thread_started && (!active_wip || (wip != active_wip))) {
+			/* If a thread is running, stop it */
+			vcp->vc_halt = 1;
+			pthread_join(thread, NULL);
+			thread_started = 0;
+			vcp->vc_halt = 0;
+			if (active_wip) {
+				check_solution(scp, active_wip);
+				active_wip = NULL;
+			}
+			vg_context_clear_all_patterns(vcp);
+		}
+
+		if (!wip) {
+			if (!was_sleeping) {
+				fprintf(stderr,
+					"No work available, sleeping\n");
+				was_sleeping = 1;
+			}
+
+		} else if (!active_wip) {
+			was_sleeping = 0;
+			fprintf(stderr,
+				"Searching for pattern: \"%s\" "
+				"Reward: %f Value: %f BTC/MkeyHr\n",
+				wip->pattern,
+				wip->reward,
+				wip->value);
+			vcp->vc_addrtype = wip->addrtype;
+			vcp->vc_pubkey_base = wip->pubkey;
+			if (!vg_context_add_patterns(vcp, &wip->pattern, 1))
+				return 1;
+			assert(vcp->vc_npatterns);
+
+			if (!vocp) {
+				vocp = vg_ocl_context_new(vcp,
+						  platformidx, deviceidx,
+						  safe_mode, verify_mode,
+						  worksize, nthreads, nrows,
+						  ncols, invsize);
+				if (!vocp)
+					return 1;
+			}
+
+			res = pthread_create(&thread, NULL,
+					     vg_opencl_loop, vocp);
+			thread_started = 1;
+			active_wip = wip;
+		}
+
+		/* Wait for something to happen */
+		gettimeofday(&tv, NULL);
+		sleepy.tv_sec = tv.tv_sec;
+		sleepy.tv_nsec = tv.tv_usec * 1000;
+		sleepy.tv_sec += interval;
+
+		pthread_mutex_lock(&soln_lock);
+		if (!soln_private_key)
+			res = pthread_cond_timedwait(&soln_cond,
+						     &soln_lock, &sleepy);
+		pthread_mutex_unlock(&soln_lock);
+
+		if (res == 0) {
+			if (check_solution(scp, active_wip))
+				active_wip = NULL;
+		}
+		else if (res == ETIMEDOUT) {
+			if (wipa) {
+				free_work_array(wipa, active_wip);
+				wipa = NULL;
+			}
+		}
+	}
+
+	return 0;
+}
diff --git a/pattern.c b/pattern.c
index 9f874d6..384b0ae 100644
--- a/pattern.c
+++ b/pattern.c
@@ -141,20 +141,13 @@ typedef struct _timing_info_s {
 int
 vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last)
 {
-	static unsigned long long total = 0, prevfound = 0, sincelast = 0;
 	static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-	static timing_info_t *timing_head = NULL;
 
 	pthread_t me;
 	struct timeval tvnow, tv;
 	timing_info_t *tip, *mytip;
-	unsigned long long rate, myrate = 0, mytime;
-	double count, prob, time, targ;
-	char linebuf[80];
-	char *unit;
-	int rem, p, i;
-
-	const double targs[] = { 0.5, 0.75, 0.8, 0.9, 0.95, 1.0 };
+	unsigned long long rate, myrate = 0, mytime, total, sincelast;
+	int p, i;
 
 	/* Compute the rate */
 	gettimeofday(&tvnow, NULL);
@@ -167,7 +160,8 @@ vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last)
 
 	pthread_mutex_lock(&mutex);
 	me = pthread_self();
-	for (tip = timing_head, mytip = NULL; tip != NULL; tip = tip->ti_next) {
+	for (tip = vcp->vc_timing_head, mytip = NULL;
+	     tip != NULL; tip = tip->ti_next) {
 		if (pthread_equal(tip->ti_thread, me)) {
 			mytip = tip;
 			p = ((tip->ti_hist_last + 1) % timing_hist_size);
@@ -190,9 +184,9 @@ vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last)
 	}
 	if (!mytip) {
 		mytip = (timing_info_t *) malloc(sizeof(*tip));
-		mytip->ti_next = timing_head;
+		mytip->ti_next = vcp->vc_timing_head;
 		mytip->ti_thread = me;
-		timing_head = mytip;
+		vcp->vc_timing_head = mytip;
 		mytip->ti_hist_last = 0;
 		mytip->ti_hist_time[0] = mytime;
 		mytip->ti_hist_work[0] = cycle;
@@ -205,20 +199,47 @@ vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last)
 		rate += myrate;
 	}
 
-	total += cycle;
-	if (prevfound != vcp->vc_found) {
-		prevfound = vcp->vc_found;
-		sincelast = 0;
+	vcp->vc_timing_total += cycle;
+	if (vcp->vc_timing_prevfound != vcp->vc_found) {
+		vcp->vc_timing_prevfound = vcp->vc_found;
+		vcp->vc_timing_sincelast = 0;
 	}
-	sincelast += cycle;
-	count = sincelast;
+	vcp->vc_timing_sincelast += cycle;
 
-	if (mytip != timing_head) {
+	if (mytip != vcp->vc_timing_head) {
 		pthread_mutex_unlock(&mutex);
 		return myrate;
 	}
+	total = vcp->vc_timing_total;
+	sincelast = vcp->vc_timing_sincelast;
 	pthread_mutex_unlock(&mutex);
 
+	vcp->vc_output_timing(vcp, sincelast, rate, total);
+	return myrate;
+}
+
+static void
+vg_timing_info_free(vg_context_t *vcp)
+{
+	timing_info_t *tp;
+	while (vcp->vc_timing_head != NULL) {
+		tp = vcp->vc_timing_head;
+		vcp->vc_timing_head = tp->ti_next;
+		free(tp);
+	}
+}
+
+void
+vg_output_timing_console(vg_context_t *vcp, double count,
+			 unsigned long long rate, unsigned long long total)
+{
+	double prob, time, targ;
+	char *unit;
+	char linebuf[80];
+	int rem, p, i;
+
+	const double targs[] = { 0.5, 0.75, 0.8, 0.9, 0.95, 1.0 };
+
 	targ = rate;
 	unit = "key/s";
 	if (targ > 1000) {
@@ -314,11 +335,10 @@ vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last)
 	}
 	printf("\r%s", linebuf);
 	fflush(stdout);
-	return myrate;
 }
 
 void
-vg_output_match(vg_context_t *vcp, EC_KEY *pkey, const char *pattern)
+vg_output_match_console(vg_context_t *vcp, EC_KEY *pkey, const char *pattern)
 {
 	unsigned char key_buf[512], *pend;
 	char addr_buf[64], addr2_buf[64];
@@ -386,6 +406,7 @@ vg_output_match(vg_context_t *vcp, EC_KEY *pkey, const char *pattern)
 			printf("Privkey (ASN1): ");
 			dumphex(key_buf, len);
 		}
+
 	}
 
 	if (!vcp->vc_result_file || (vcp->vc_verbose > 0)) {
@@ -420,20 +441,26 @@ vg_output_match(vg_context_t *vcp, EC_KEY *pkey, const char *pattern)
 }
 
 
-
 void
 vg_context_free(vg_context_t *vcp)
 {
+	vg_timing_info_free(vcp);
 	vcp->vc_free(vcp);
 }
 
 int
 vg_context_add_patterns(vg_context_t *vcp,
-			char ** const patterns, int npatterns)
+			const char ** const patterns, int npatterns)
 {
 	return vcp->vc_add_patterns(vcp, patterns, npatterns);
 }
 
+void
+vg_context_clear_all_patterns(vg_context_t *vcp)
+{
+	vcp->vc_clear_all_patterns(vcp);
+}
+
 int
 vg_context_hash160_sort(vg_context_t *vcp, void *buf)
 {
@@ -699,6 +726,21 @@ out:
 	return ret;
 }
 
+static void
+free_ranges(BIGNUM **ranges)
+{
+	BN_free(ranges[0]);
+	BN_free(ranges[1]);
+	ranges[0] = NULL;
+	ranges[1] = NULL;
+	if (ranges[2]) {
+		BN_free(ranges[2]);
+		BN_free(ranges[3]);
+		ranges[2] = NULL;
+		ranges[3] = NULL;
+	}
+}
+
 /*
  * AVL tree implementation
  */
@@ -1364,7 +1406,7 @@ typedef struct _vg_prefix_context_s {
 } vg_prefix_context_t;
 
 static void
-vg_prefix_context_free(vg_context_t *vcp)
+vg_prefix_context_clear_all_patterns(vg_context_t *vcp)
 {
 	vg_prefix_context_t *vcpp = (vg_prefix_context_t *) vcp;
 	vg_prefix_t *vp;
@@ -1378,6 +1420,17 @@ vg_prefix_context_free(vg_context_t *vcp)
 	}
 
 	assert(npfx_left == vcpp->base.vc_npatterns);
+	vcpp->base.vc_npatterns = 0;
+	vcpp->base.vc_npatterns_start = 0;
+	vcpp->base.vc_found = 0;
+	BN_clear(&vcpp->vcp_difficulty);
+}
+
+static void
+vg_prefix_context_free(vg_context_t *vcp)
+{
+	vg_prefix_context_t *vcpp = (vg_prefix_context_t *) vcp;
+	vg_prefix_context_clear_all_patterns(vcp);
 	BN_clear_free(&vcpp->vcp_difficulty);
 	free(vcpp);
 }
@@ -1407,7 +1460,7 @@ vg_prefix_context_next_difficulty(vg_prefix_context_t *vcpp,
 
 static int
 vg_prefix_context_add_patterns(vg_context_t *vcp,
-			       char ** const patterns, int npatterns)
+			       const char ** const patterns, int npatterns)
 {
 	vg_prefix_context_t *vcpp = (vg_prefix_context_t *) vcp;
 	prefix_case_iter_t caseiter;
@@ -1556,6 +1609,47 @@ vg_prefix_context_add_patterns(vg_context_t *vcp,
 	return ret;
 }
 
+double
+vg_prefix_get_difficulty(int addrtype, const char *pattern)
+{
+	BN_CTX *bnctx;
+	BIGNUM result, bntmp;
+	BIGNUM *ranges[4];
+	char *dbuf;
+	int ret;
+	double diffret = 0.0;
+
+	bnctx = BN_CTX_new();
+	BN_init(&result);
+	BN_init(&bntmp);
+
+	ret = get_prefix_ranges(addrtype,
+				pattern, ranges, bnctx);
+
+	if (ret == 0) {
+		BN_sub(&bntmp, ranges[1], ranges[0]);
+		BN_add(&result, &result, &bntmp);
+		if (ranges[2]) {
+			BN_sub(&bntmp, ranges[3], ranges[2]);
+			BN_add(&result, &result, &bntmp);
+		}
+		free_ranges(ranges);
+
+		BN_clear(&bntmp);
+		BN_set_bit(&bntmp, 192);
+		BN_div(&result, NULL, &bntmp, &result, bnctx);
+
+		dbuf = BN_bn2dec(&result);
+		diffret = strtod(dbuf, NULL);
+		OPENSSL_free(dbuf);
+	}
+
+	BN_clear_free(&result);
+	BN_clear_free(&bntmp);
+	BN_CTX_free(bnctx);
+	return diffret;
+}
+
 
 static int
 vg_prefix_test(vg_exec_context_t *vxcp)
@@ -1579,7 +1673,8 @@ research:
 			goto research;
 
 		vg_exec_context_consolidate_key(vxcp);
-		vg_output_match(&vcpp->base, vxcp->vxc_key, vp->vp_pattern);
+		vcpp->base.vc_output_match(&vcpp->base, vxcp->vxc_key,
+					   vp->vp_pattern);
 
 		vcpp->base.vc_found++;
 
@@ -1673,6 +1768,8 @@ vg_prefix_context_new(int addrtype, int privtype, int caseinsensitive)
 		vcpp->base.vc_chance = 0.0;
 		vcpp->base.vc_free = vg_prefix_context_free;
 		vcpp->base.vc_add_patterns = vg_prefix_context_add_patterns;
+		vcpp->base.vc_clear_all_patterns =
+			vg_prefix_context_clear_all_patterns;
 		vcpp->base.vc_test = vg_prefix_test;
 		vcpp->base.vc_hash160_sort = vg_prefix_hash160_sort;
 		avl_root_init(&vcpp->vcp_avlroot);
@@ -1695,7 +1792,7 @@ typedef struct _vg_regex_context_s {
 
 static int
 vg_regex_context_add_patterns(vg_context_t *vcp,
-			      char ** const patterns, int npatterns)
+			      const char ** const patterns, int npatterns)
 {
 	vg_regex_context_t *vcrp = (vg_regex_context_t *) vcp;
 	const char *pcre_errptr;
@@ -1770,7 +1867,7 @@ vg_regex_context_add_patterns(vg_context_t *vcp,
 }
 
 static void
-vg_regex_context_free(vg_context_t *vcp)
+vg_regex_context_clear_all_patterns(vg_context_t *vcp)
 {
 	vg_regex_context_t *vcrp = (vg_regex_context_t *) vcp;
 	int i;
@@ -1779,6 +1876,16 @@ vg_regex_context_free(vg_context_t *vcp)
 			pcre_free(vcrp->vcr_regex_extra[i]);
 		pcre_free(vcrp->vcr_regex[i]);
 	}
+	vcrp->base.vc_npatterns = 0;
+	vcrp->base.vc_npatterns_start = 0;
+	vcrp->base.vc_found = 0;
+}
+
+static void
+vg_regex_context_free(vg_context_t *vcp)
+{
+	vg_regex_context_t *vcrp = (vg_regex_context_t *) vcp;
+	vg_regex_context_clear_all_patterns(vcp);
 	if (vcrp->vcr_nalloc)
 		free(vcrp->vcr_regex);
 	free(vcrp);
@@ -1860,8 +1967,8 @@ restart_loop:
 			goto restart_loop;
 
 		vg_exec_context_consolidate_key(vxcp);
-		vg_output_match(&vcrp->base, vxcp->vxc_key,
-				vcrp->vcr_regex_pat[i]);
+		vcrp->base.vc_output_match(&vcrp->base, vxcp->vxc_key,
+					   vcrp->vcr_regex_pat[i]);
 		vcrp->base.vc_found++;
 
 		if (vcrp->base.vc_remove_on_match) {
@@ -1902,6 +2009,8 @@ vg_regex_context_new(int addrtype, int privtype)
 		vcrp->base.vc_chance = 0.0;
 		vcrp->base.vc_free = vg_regex_context_free;
 		vcrp->base.vc_add_patterns = vg_regex_context_add_patterns;
+		vcrp->base.vc_clear_all_patterns =
+			vg_regex_context_clear_all_patterns;
 		vcrp->base.vc_test = vg_regex_test;
 		vcrp->base.vc_hash160_sort = NULL;
 		vcrp->vcr_regex = NULL;
diff --git a/pattern.h b/pattern.h
index 70f86d6..a775937 100644
--- a/pattern.h
+++ b/pattern.h
@@ -56,16 +56,24 @@ extern void vg_exec_context_consolidate_key(vg_exec_context_t *vxcp);
 extern void vg_exec_context_calc_address(vg_exec_context_t *vxcp);
 extern EC_KEY *vg_exec_context_new_key(void);
 
-
 /* Implementation-specific lock/unlock/consolidate */
 extern void vg_exec_downgrade_lock(vg_exec_context_t *vxcp);
 extern int vg_exec_upgrade_lock(vg_exec_context_t *vxcp);
 
+
 typedef void (*vg_free_func_t)(vg_context_t *);
 typedef int (*vg_add_pattern_func_t)(vg_context_t *,
-				     char ** const patterns, int npatterns);
+				     const char ** const patterns,
+				     int npatterns);
+typedef void (*vg_clear_all_patterns_func_t)(vg_context_t *);
 typedef int (*vg_test_func_t)(vg_exec_context_t *);
 typedef int (*vg_hash160_sort_func_t)(vg_context_t *vcp, void *buf);
+typedef void (*vg_output_error_func_t)(vg_context_t *vcp, const char *info);
+typedef void (*vg_output_match_func_t)(vg_context_t *vcp, EC_KEY *pkey,
+				       const char *pattern);
+typedef void (*vg_output_timing_func_t)(vg_context_t *vcp, double count,
+					unsigned long long rate,
+					unsigned long long total);
 
 enum vg_format {
 	VCF_PUBKEY,
@@ -84,28 +92,50 @@ struct _vg_context_s {
 	const char		*vc_key_protect_pass;
 	int			vc_remove_on_match;
 	int			vc_verbose;
-	vg_free_func_t		vc_free;
-	vg_add_pattern_func_t	vc_add_patterns;
-	vg_test_func_t		vc_test;
-	vg_hash160_sort_func_t	vc_hash160_sort;
 	enum vg_format		vc_format;
 	int			vc_pubkeytype;
 	EC_POINT		*vc_pubkey_base;
+	int			vc_halt;
+
+	/* Internal methods */
+	vg_free_func_t			vc_free;
+	vg_add_pattern_func_t		vc_add_patterns;
+	vg_clear_all_patterns_func_t	vc_clear_all_patterns;
+	vg_test_func_t			vc_test;
+	vg_hash160_sort_func_t		vc_hash160_sort;
+
+	/* Performance related members */
+	unsigned long long		vc_timing_total;
+	unsigned long long		vc_timing_prevfound;
+	unsigned long long		vc_timing_sincelast;
+	struct _timing_info_s		*vc_timing_head;
+
+	/* External methods */
+	vg_output_error_func_t		vc_output_error;
+	vg_output_match_func_t		vc_output_match;
+	vg_output_timing_func_t		vc_output_timing;
 };
 
 
 extern void vg_context_free(vg_context_t *vcp);
 extern int vg_context_add_patterns(vg_context_t *vcp,
-				   char ** const patterns, int npatterns);
+				   const char ** const patterns, int npatterns);
+extern void vg_context_clear_all_patterns(vg_context_t *vcp);
 extern int vg_context_hash160_sort(vg_context_t *vcp, void *buf);
 
 
 extern vg_context_t *vg_prefix_context_new(int addrtype, int privtype,
 					   int caseinsensitive);
+extern double vg_prefix_get_difficulty(int addrtype, const char *pattern);
+
 extern vg_context_t *vg_regex_context_new(int addrtype, int privtype);
 
 extern int vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last);
-extern void vg_output_match(vg_context_t *vcp, EC_KEY *pkey,
-			    const char *pattern);
+
+extern void vg_output_match_console(vg_context_t *vcp, EC_KEY *pkey,
+				    const char *pattern);
+extern void vg_output_timing_console(vg_context_t *vcp, double count,
+				     unsigned long long rate,
+				     unsigned long long total);
 
 #endif /* !defined (__VG_PATTERN_H__) */
diff --git a/vanitygen.c b/vanitygen.c
index d11dd04..e368729 100644
--- a/vanitygen.c
+++ b/vanitygen.c
@@ -280,7 +280,7 @@ vg_thread_loop(void *arg)
 		hash_len = 65;
 	}
 
-	while (1) {
+	while (!vcp->vc_halt) {
 		if (++npoints >= rekey_at) {
 			pthread_mutex_lock(&vg_thread_lock);
 			/* Generate a new random private key */
@@ -704,7 +704,11 @@ main(int argc, char **argv)
 	vcp->vc_pubkeytype = pubkeytype;
 	vcp->vc_pubkey_base = pubkey_base;
 
-	if (!vg_context_add_patterns(vcp, patterns, npatterns))
+	vcp->vc_output_match = vg_output_match_console;
+	vcp->vc_output_timing = vg_output_timing_console;
+
+	if (!vg_context_add_patterns(vcp, (const char ** const) patterns,
+				     npatterns))
 		return 1;
 
 	if (!vcp->vc_npatterns) {
diff --git a/winglue.c b/winglue.c
index 10c0d02..b249e6a 100644
--- a/winglue.c
+++ b/winglue.c
@@ -73,11 +73,9 @@ count_processors(void)
  * struct timeval compatibility for Win32
  */
 
-#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
-#define DELTA_EPOCH_IN_MICROSECS  11644473600000000Ui64
-#else
-#define DELTA_EPOCH_IN_MICROSECS  11644473600000000ULL
-#endif
+#define TIMESPEC_TO_FILETIME_OFFSET \
+	  ( ((unsigned __int64) 27111902 << 32) + \
+	    (unsigned __int64) 3577643008 )
 
 int
 gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -88,14 +86,13 @@ gettimeofday(struct timeval *tv, struct timezone *tz)
 	if (NULL != tv) {
 		GetSystemTimeAsFileTime(&ft);
 
-		tmpres |= ft.dwHighDateTime;
-		tmpres <<= 32;
-		tmpres |= ft.dwLowDateTime;
-
-		tmpres -= DELTA_EPOCH_IN_MICROSECS; 
-		tmpres /= 10;
-		tv->tv_sec = (long)(tmpres / 1000000UL);
-		tv->tv_usec = (long)(tmpres % 1000000UL);
+		tv->tv_sec = (int) ((*(unsigned __int64 *) &ft -
+				     TIMESPEC_TO_FILETIME_OFFSET) /
+				    10000000);
+		tv->tv_usec = (int) ((*(unsigned __int64 *) &ft - 
+				      TIMESPEC_TO_FILETIME_OFFSET -
+				      ((unsigned __int64) tv->tv_sec *
+				       (unsigned __int64) 10000000)) / 10);
 	}
 
 	return 0;