vanitygen/oclengine.c

/*
 * Vanitygen, vanity bitcoin address generator
 * Copyright (C) 2011 <samr7@cs.washington.edu>
 *
 * Vanitygen is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * any later version.
 *
 * Vanitygen is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with Vanitygen.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include <pthread.h>

#include <openssl/ec.h>
#include <openssl/bn.h>
#include <openssl/rand.h>
#include <openssl/evp.h>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#ifndef CL_CALLBACK
#define CL_CALLBACK
#endif
#else
#include <CL/cl.h>
#endif

#include "oclengine.h"
#include "pattern.h"
#include "util.h"


#define MAX_SLOT 2
#define MAX_ARG 6
#define MAX_KERNEL 3

#define is_pow2(v) (!((v) & ((v)-1)))
#define round_up_pow2(x, a) (((x) + ((a)-1)) & ~((a)-1))

static void vg_ocl_free_args(vg_ocl_context_t *vocp);
static void *vg_opencl_loop(vg_exec_context_t *arg);


/* OpenCL address searching mode */
struct _vg_ocl_context_s;
typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *);
typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot);

struct _vg_ocl_context_s {
	vg_exec_context_t		base;
	cl_device_id			voc_ocldid;
	cl_context			voc_oclctx;
	cl_command_queue		voc_oclcmdq;
	cl_program			voc_oclprog;
	vg_ocl_init_t			voc_init_func;
	vg_ocl_init_t			voc_rekey_func;
	vg_ocl_check_t			voc_check_func;
	int				voc_quirks;
	int				voc_nslots;
	cl_kernel			voc_oclkernel[MAX_SLOT][MAX_KERNEL];
	cl_event			voc_oclkrnwait[MAX_SLOT];
	cl_mem				voc_args[MAX_SLOT][MAX_ARG];
	size_t				voc_arg_size[MAX_SLOT][MAX_ARG];

	int				voc_pattern_rewrite;
	int				voc_pattern_alloc;

	vg_ocl_check_t			voc_verify_func[MAX_KERNEL];

	pthread_t			voc_ocl_thread;
	pthread_mutex_t			voc_lock;
	pthread_cond_t			voc_wait;
	int				voc_ocl_slot;
	int				voc_ocl_rows;
	int				voc_ocl_cols;
	int				voc_ocl_invsize;
	int				voc_halt;
	int				voc_dump_done;
};


/* Thread synchronization stubs */
void
vg_exec_downgrade_lock(vg_exec_context_t *vxcp)
{
}

int
vg_exec_upgrade_lock(vg_exec_context_t *vxcp)
{
	return 0;
}


/*
 * OpenCL debugging and support
 */

static const char *
vg_ocl_strerror(cl_int ret)
{
#define OCL_STATUS(st) case st: return #st;
	switch (ret) {
		OCL_STATUS(CL_SUCCESS);
		OCL_STATUS(CL_DEVICE_NOT_FOUND);
		OCL_STATUS(CL_DEVICE_NOT_AVAILABLE);
		OCL_STATUS(CL_COMPILER_NOT_AVAILABLE);
		OCL_STATUS(CL_MEM_OBJECT_ALLOCATION_FAILURE);
		OCL_STATUS(CL_OUT_OF_RESOURCES);
		OCL_STATUS(CL_OUT_OF_HOST_MEMORY);
		OCL_STATUS(CL_PROFILING_INFO_NOT_AVAILABLE);
		OCL_STATUS(CL_MEM_COPY_OVERLAP);
		OCL_STATUS(CL_IMAGE_FORMAT_MISMATCH);
		OCL_STATUS(CL_IMAGE_FORMAT_NOT_SUPPORTED);
		OCL_STATUS(CL_BUILD_PROGRAM_FAILURE);
		OCL_STATUS(CL_MAP_FAILURE);
#if defined(CL_MISALIGNED_SUB_BUFFER_OFFSET)
		OCL_STATUS(CL_MISALIGNED_SUB_BUFFER_OFFSET);
#endif /* defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) */
#if defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
		OCL_STATUS(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
#endif /* defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) */
		OCL_STATUS(CL_INVALID_VALUE);
		OCL_STATUS(CL_INVALID_DEVICE_TYPE);
		OCL_STATUS(CL_INVALID_PLATFORM);
		OCL_STATUS(CL_INVALID_DEVICE);
		OCL_STATUS(CL_INVALID_CONTEXT);
		OCL_STATUS(CL_INVALID_QUEUE_PROPERTIES);
		OCL_STATUS(CL_INVALID_COMMAND_QUEUE);
		OCL_STATUS(CL_INVALID_HOST_PTR);
		OCL_STATUS(CL_INVALID_MEM_OBJECT);
		OCL_STATUS(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
		OCL_STATUS(CL_INVALID_IMAGE_SIZE);
		OCL_STATUS(CL_INVALID_SAMPLER);
		OCL_STATUS(CL_INVALID_BINARY);
		OCL_STATUS(CL_INVALID_BUILD_OPTIONS);
		OCL_STATUS(CL_INVALID_PROGRAM);
		OCL_STATUS(CL_INVALID_PROGRAM_EXECUTABLE);
		OCL_STATUS(CL_INVALID_KERNEL_NAME);
		OCL_STATUS(CL_INVALID_KERNEL_DEFINITION);
		OCL_STATUS(CL_INVALID_KERNEL);
		OCL_STATUS(CL_INVALID_ARG_INDEX);
		OCL_STATUS(CL_INVALID_ARG_VALUE);
		OCL_STATUS(CL_INVALID_ARG_SIZE);
		OCL_STATUS(CL_INVALID_KERNEL_ARGS);
		OCL_STATUS(CL_INVALID_WORK_DIMENSION);
		OCL_STATUS(CL_INVALID_WORK_GROUP_SIZE);
		OCL_STATUS(CL_INVALID_WORK_ITEM_SIZE);
		OCL_STATUS(CL_INVALID_GLOBAL_OFFSET);
		OCL_STATUS(CL_INVALID_EVENT_WAIT_LIST);
		OCL_STATUS(CL_INVALID_EVENT);
		OCL_STATUS(CL_INVALID_OPERATION);
		OCL_STATUS(CL_INVALID_GL_OBJECT);
		OCL_STATUS(CL_INVALID_BUFFER_SIZE);
		OCL_STATUS(CL_INVALID_MIP_LEVEL);
		OCL_STATUS(CL_INVALID_GLOBAL_WORK_SIZE);
#if defined(CL_INVALID_PROPERTY)
		OCL_STATUS(CL_INVALID_PROPERTY);
#endif /* defined(CL_INVALID_PROPERTY) */
#undef OCL_STATUS
	default: {
		static char tmp[64];
		snprintf(tmp, sizeof(tmp), "Unknown code %d", ret);
		return tmp;
	}
	}
}

/* Get device strings, using a static buffer -- caveat emptor */
static const char *
vg_ocl_platform_getstr(cl_platform_id pid, cl_platform_info param)
{
	static char platform_str[1024];
	cl_int ret;
	size_t size_ret;
	ret = clGetPlatformInfo(pid, param,
				sizeof(platform_str), platform_str,
				&size_ret);
	if (ret != CL_SUCCESS) {
		snprintf(platform_str, sizeof(platform_str),
			 "clGetPlatformInfo(%d): %s",
			 param, vg_ocl_strerror(ret));
	}
	return platform_str;
}

static cl_platform_id
vg_ocl_device_getplatform(cl_device_id did)
{
	cl_int ret;
	cl_platform_id val;
	size_t size_ret;
	ret = clGetDeviceInfo(did, CL_DEVICE_PLATFORM,
			      sizeof(val), &val, &size_ret);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_PLATFORM): %s",
			vg_ocl_strerror(ret));
	}
	return val;
}

static cl_device_type
vg_ocl_device_gettype(cl_device_id did)
{
	cl_int ret;
	cl_device_type val;
	size_t size_ret;
	ret = clGetDeviceInfo(did, CL_DEVICE_TYPE,
			      sizeof(val), &val, &size_ret);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_TYPE): %s",
			vg_ocl_strerror(ret));
	}
	return val;
}

static const char *
vg_ocl_device_getstr(cl_device_id did, cl_device_info param)
{
	static char device_str[1024];
	cl_int ret;
	size_t size_ret;
	ret = clGetDeviceInfo(did, param,
			      sizeof(device_str), device_str,
			      &size_ret);
	if (ret != CL_SUCCESS) {
		snprintf(device_str, sizeof(device_str),
			 "clGetDeviceInfo(%d): %s",
			 param, vg_ocl_strerror(ret));
	}
	return device_str;
}

static size_t
vg_ocl_device_getsizet(cl_device_id did, cl_device_info param)
{
	cl_int ret;
	size_t val;
	size_t size_ret;
	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
	if (ret != CL_SUCCESS) {
		fprintf(stderr,
			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
	}
	return val;
}

static cl_ulong
vg_ocl_device_getulong(cl_device_id did, cl_device_info param)
{
	cl_int ret;
	cl_ulong val;
	size_t size_ret;
	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
	if (ret != CL_SUCCESS) {
		fprintf(stderr,
			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
	}
	return val;
}

static cl_uint
vg_ocl_device_getuint(cl_device_id did, cl_device_info param)
{
	cl_int ret;
        cl_uint val;
	size_t size_ret;
	ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
	if (ret != CL_SUCCESS) {
		fprintf(stderr,
			"clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret));
	}
	return val;
}

void
vg_ocl_dump_info(vg_ocl_context_t *vocp)
{
	cl_device_id did;
	if (vocp->base.vxc_vc && (vocp->base.vxc_vc->vc_verbose < 1))
		return;
	if (vocp->voc_dump_done)
		return;
	did = vocp->voc_ocldid;
	fprintf(stderr, "Device: %s\n",
	       vg_ocl_device_getstr(did, CL_DEVICE_NAME));
	fprintf(stderr, "Vendor: %s (%04x)\n",
	       vg_ocl_device_getstr(did, CL_DEVICE_VENDOR),
	       vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID));
	fprintf(stderr, "Driver: %s\n",
	       vg_ocl_device_getstr(did, CL_DRIVER_VERSION));
	fprintf(stderr, "Profile: %s\n",
	       vg_ocl_device_getstr(did, CL_DEVICE_PROFILE));
	fprintf(stderr, "Version: %s\n",
	       vg_ocl_device_getstr(did, CL_DEVICE_VERSION));
	fprintf(stderr, "Max compute units: %"PRSIZET"d\n",
	       vg_ocl_device_getsizet(did, CL_DEVICE_MAX_COMPUTE_UNITS));
	fprintf(stderr, "Max workgroup size: %"PRSIZET"d\n",
	       vg_ocl_device_getsizet(did, CL_DEVICE_MAX_WORK_GROUP_SIZE));
	fprintf(stderr, "Global memory: %ld\n",
	       vg_ocl_device_getulong(did, CL_DEVICE_GLOBAL_MEM_SIZE));
	fprintf(stderr, "Max allocation: %ld\n",
	       vg_ocl_device_getulong(did, CL_DEVICE_MAX_MEM_ALLOC_SIZE));
	vocp->voc_dump_done = 1;
}

void
vg_ocl_error(vg_ocl_context_t *vocp, int code, const char *desc)
{
	const char *err = vg_ocl_strerror(code);
	if (desc) {
		fprintf(stderr, "%s: %s\n", desc, err);
	} else {
		fprintf(stderr, "%s\n", err);
	}

	if (vocp && vocp->voc_ocldid)
		vg_ocl_dump_info(vocp);
}

static void
vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog)
{
	size_t logbufsize, logsize;
	char *log;
	int off = 0;
	cl_int ret;

	ret = clGetProgramBuildInfo(prog,
				    vocp->voc_ocldid,
				    CL_PROGRAM_BUILD_LOG,
				    0, NULL,
				    &logbufsize);
	if (ret != CL_SUCCESS) {
		vg_ocl_error(NULL, ret, "clGetProgramBuildInfo");
		return;
	}

	log = (char *) malloc(logbufsize);
	if (!log) {
		fprintf(stderr, "Could not allocate build log buffer\n");
		return;
	}

	ret = clGetProgramBuildInfo(prog,
				    vocp->voc_ocldid,
				    CL_PROGRAM_BUILD_LOG,
				    logbufsize,
				    log,
				    &logsize);
	if (ret != CL_SUCCESS) {
		vg_ocl_error(NULL, ret, "clGetProgramBuildInfo");

	} else {
		/* Remove leading newlines and trailing newlines/whitespace */
		log[logbufsize-1] = '\0';
		for (off = logsize - 1; off >= 0; off--) {
			if ((log[off] != '\r') &&
			    (log[off] != '\n') &&
			    (log[off] != ' ') &&
			    (log[off] != '\t') &&
			    (log[off] != '\0'))
				break;
			log[off] = '\0';
		}
		for (off = 0; off < logbufsize; off++) {
			if ((log[off] != '\r') &&
			    (log[off] != '\n'))
				break;
		}

		fprintf(stderr, "Build log:\n%s\n", &log[off]);
	}
	free(log);
}

/*
 * OpenCL per-exec functions
 */

enum {
	VG_OCL_DEEP_PREPROC_UNROLL  = (1 << 0),
	VG_OCL_PRAGMA_UNROLL        = (1 << 1),
	VG_OCL_EXPENSIVE_BRANCHES   = (1 << 2),
	VG_OCL_DEEP_VLIW            = (1 << 3),
	VG_OCL_AMD_BFI_INT          = (1 << 4),
	VG_OCL_NV_VERBOSE           = (1 << 5),
	VG_OCL_BROKEN               = (1 << 6),
	VG_OCL_NO_BINARIES          = (1 << 7),

	VG_OCL_OPTIMIZATIONS        = (VG_OCL_DEEP_PREPROC_UNROLL |
				       VG_OCL_PRAGMA_UNROLL |
				       VG_OCL_EXPENSIVE_BRANCHES |
				       VG_OCL_DEEP_VLIW |
				       VG_OCL_AMD_BFI_INT),

};

static int
vg_ocl_get_quirks(vg_ocl_context_t *vocp)
{
	uint32_t vend;
	const char *dvn;
	unsigned int quirks = 0;

	quirks |= VG_OCL_DEEP_PREPROC_UNROLL;

	vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID);
	switch (vend) {
	case 0x10de: /* NVIDIA */
		/*
		 * NVIDIA's compiler seems to take a really really long
		 * time when using preprocessor unrolling, but works
		 * well with pragma unroll.
		 */
		quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL;
		quirks |= VG_OCL_PRAGMA_UNROLL;
		quirks |= VG_OCL_NV_VERBOSE;
		break;
	case 0x1002: /* AMD/ATI */
		/*
		 * AMD's compiler works best with preprocesor unrolling.
		 * Pragma unroll is unreliable with AMD's compiler and
		 * seems to crash based on whether the gods were smiling
		 * when Catalyst was last installed/upgraded.
		 */
		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
		    CL_DEVICE_TYPE_GPU) {
			quirks |= VG_OCL_EXPENSIVE_BRANCHES;
			quirks |= VG_OCL_DEEP_VLIW;
			dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
						   CL_DEVICE_EXTENSIONS);
			if (dvn && strstr(dvn, "cl_amd_media_ops"))
				quirks |= VG_OCL_AMD_BFI_INT;

			dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
						   CL_DEVICE_NAME);
			if (!strcmp(dvn, "ATI RV710")) {
				quirks &= ~VG_OCL_OPTIMIZATIONS;
				quirks |= VG_OCL_NO_BINARIES;
			}
		}
		break;
	default:
		break;
	}
	return quirks;
}

static int
vg_ocl_create_kernel(vg_ocl_context_t *vocp, int knum, const char *func)
{
	int i;
	cl_kernel krn;
	cl_int ret;

	for (i = 0; i < MAX_SLOT; i++) {
		krn = clCreateKernel(vocp->voc_oclprog, func, &ret);
		if (!krn) {
			fprintf(stderr, "clCreateKernel(%d): ", i);
			vg_ocl_error(vocp, ret, NULL);
			while (--i >= 0) {
				clReleaseKernel(vocp->voc_oclkernel[i][knum]);
				vocp->voc_oclkernel[i][knum] = NULL;
			}
			return 0;
		}
		vocp->voc_oclkernel[i][knum] = krn;
		vocp->voc_oclkrnwait[i] = NULL;
	}
	return 1;
}

static void
vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts,
		    const char *program, size_t size,
		    unsigned char *hash_out)
{
	EVP_MD_CTX *mdctx;
	cl_platform_id pid;
	const char *str;

	mdctx = EVP_MD_CTX_create();
	EVP_DigestInit_ex(mdctx, EVP_md5(), NULL);
	pid = vg_ocl_device_getplatform(vocp->voc_ocldid);
	str = vg_ocl_platform_getstr(pid, CL_PLATFORM_NAME);
	EVP_DigestUpdate(mdctx, str, strlen(str) + 1);
	str = vg_ocl_platform_getstr(pid, CL_PLATFORM_VERSION);
	EVP_DigestUpdate(mdctx, str, strlen(str) + 1);
	str = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
	EVP_DigestUpdate(mdctx, str, strlen(str) + 1);
	if (opts)
		EVP_DigestUpdate(mdctx, opts, strlen(opts) + 1);
	if (size)
		EVP_DigestUpdate(mdctx, program, size);
	EVP_DigestFinal_ex(mdctx, hash_out, NULL);
	EVP_MD_CTX_destroy(mdctx);
}

typedef struct {
	unsigned char	e_ident[16];
	uint16_t	e_type;
	uint16_t	e_machine;
	uint32_t	e_version;
	uint32_t	e_entry;
	uint32_t	e_phoff;
	uint32_t	e_shoff;
	uint32_t	e_flags;
	uint16_t	e_ehsize;
	uint16_t	e_phentsize;
	uint16_t	e_phnum;
	uint16_t	e_shentsize;
	uint16_t	e_shnum;
	uint16_t	e_shstrndx;
} vg_elf32_header_t;

typedef struct {
	uint32_t	sh_name;
	uint32_t	sh_type;
	uint32_t	sh_flags;
	uint32_t	sh_addr;
	uint32_t	sh_offset;
	uint32_t	sh_size;
	uint32_t	sh_link;
	uint32_t	sh_info;
	uint32_t	sh_addralign;
	uint32_t	sh_entsize;
} vg_elf32_shdr_t;

static int
vg_ocl_amd_patch_inner(unsigned char *binary, size_t size)
{
	vg_elf32_header_t *ehp;
	vg_elf32_shdr_t *shp, *nshp;
	uint32_t *instr;
	size_t off;
	int i, n, txt2idx, patched;

	ehp = (vg_elf32_header_t *) binary;
	if ((size < sizeof(*ehp)) ||
	    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
	    !ehp->e_shoff)
		return 0;

	off = ehp->e_shoff + (ehp->e_shstrndx * ehp->e_shentsize);
	nshp = (vg_elf32_shdr_t *) (binary + off);
	if ((off + sizeof(*nshp)) > size)
		return 0;

	shp = (vg_elf32_shdr_t *) (binary + ehp->e_shoff);
	n = 0;
	txt2idx = 0;
	for (i = 0; i < ehp->e_shnum; i++) {
		off = nshp->sh_offset + shp[i].sh_name;
		if (((off + 6) >= size) ||
		    memcmp(binary + off, ".text", 6))
			continue;
		n++;
		if (n == 2)
			txt2idx = i;
	}
	if (n != 2)
		return 0;

	off = shp[txt2idx].sh_offset;
	instr = (uint32_t *) (binary + off);
	n = shp[txt2idx].sh_size / 4;
	patched = 0;
	for (i = 0; i < n; i += 2) {
		if (((instr[i] & 0x02001000) == 0) &&
		    ((instr[i+1] & 0x9003f000) == 0x0001a000)) {
			instr[i+1] ^= (0x0001a000 ^ 0x0000c000);
			patched++;
		}
	}

	return patched;
}

static int
vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size)
{
	vg_context_t *vcp = vocp->base.vxc_vc;
	vg_elf32_header_t *ehp;
	unsigned char *ptr;
	size_t offset = 1;
	int ninner = 0, nrun, npatched = 0;

	ehp = (vg_elf32_header_t *) binary;
	if ((size < sizeof(*ehp)) ||
	    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\0", 8) ||
	    !ehp->e_shoff)
		return 0;

	offset = 1;
	while (offset < (size - 8)) {
		ptr = (unsigned char *) memchr(binary + offset,
					       0x7f,
					       size - offset);
		if (!ptr)
			return npatched;
		offset = ptr - binary;
		ehp = (vg_elf32_header_t *) ptr;
		if (((size - offset) < sizeof(*ehp)) ||
		    memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
		    !ehp->e_shoff) {
			offset += 1;
			continue;
		}

		ninner++;
		nrun = vg_ocl_amd_patch_inner(ptr, size - offset);
		npatched += nrun;
		if (vcp->vc_verbose > 1)
			fprintf(stderr, "AMD BFI_INT: patched %d instructions "
			       "in kernel %d\n",
			       nrun, ninner);
		npatched++;
		offset += 1;
	}
	return npatched;
}


static int
vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
		    const char *filename, const char *opts)
{
	FILE *kfp;
	char *buf, *tbuf;
	int len, fromsource = 0, patched = 0;
	size_t sz, szr;
	cl_program prog;
	cl_int ret, sts;
	unsigned char prog_hash[16];
	char bin_name[64];

	if (vcp->vc_verbose > 1)
		fprintf(stderr,
			"OpenCL compiler flags: %s\n", opts ? opts : "");

	sz = 128 * 1024;
	buf = (char *) malloc(sz);
	if (!buf) {
		fprintf(stderr, "Could not allocate program buffer\n");
		return 0;
	}

	kfp = fopen(filename, "r");
	if (!kfp) {
		fprintf(stderr, "Error loading kernel file '%s': %s\n",
		       filename, strerror(errno));
		free(buf);
		return 0;
	}

	len = fread(buf, 1, sz, kfp);
	fclose(kfp);

	if (!len) {
		fprintf(stderr, "Short read on CL kernel\n");
		free(buf);
		return 0;
	}

	vg_ocl_hash_program(vocp, opts, buf, len, prog_hash);
	snprintf(bin_name, sizeof(bin_name),
		 "%02x%02x%02x%02x%02x%02x%02x%02x"
		 "%02x%02x%02x%02x%02x%02x%02x%02x.oclbin",
		 prog_hash[0], prog_hash[1], prog_hash[2], prog_hash[3],
		 prog_hash[4], prog_hash[5], prog_hash[6], prog_hash[7],
		 prog_hash[8], prog_hash[9], prog_hash[10], prog_hash[11],
		 prog_hash[12], prog_hash[13], prog_hash[14], prog_hash[15]);

	if (vocp->voc_quirks & VG_OCL_NO_BINARIES) {
		kfp = NULL;
		if (vcp->vc_verbose > 1)
			fprintf(stderr, "Binary OpenCL programs disabled\n");
	} else {
		kfp = fopen(bin_name, "rb");
	}

	if (!kfp) {
		/* No binary available, create with source */
		fromsource = 1;
		sz = len;
		prog = clCreateProgramWithSource(vocp->voc_oclctx,
						 1, (const char **) &buf, &sz,
						 &ret);
	} else {
		if (vcp->vc_verbose > 1)
			fprintf(stderr, "Loading kernel binary %s\n", bin_name);
		szr = 0;
		while (!feof(kfp)) {
			len = fread(buf + szr, 1, sz - szr, kfp);
			if (!len) {
				fprintf(stderr,
					"Short read on CL kernel binary\n");
				fclose(kfp);
				free(buf);
				return 0;
			}
			szr += len;
			if (szr == sz) {
				tbuf = (char *) realloc(buf, sz*2);
				if (!tbuf) {
					fprintf(stderr,
						"Could not expand CL kernel "
						"binary buffer\n");
					fclose(kfp);
					free(buf);
					return 0;
				}
				buf = tbuf;
				sz *= 2;
			}
		}
		fclose(kfp);
	rebuild:
		prog = clCreateProgramWithBinary(vocp->voc_oclctx,
						 1, &vocp->voc_ocldid,
						 &szr,
						 (const unsigned char **) &buf,
						 &sts,
						 &ret);
	}
	free(buf);
	if (!prog) {
		vg_ocl_error(vocp, ret, "clCreateProgramWithSource");
		return 0;
	}

	if (vcp->vc_verbose > 0) {
		if (fromsource && !patched) {
			fprintf(stderr,
				"Compiling kernel, can take minutes...");
			fflush(stderr);
		}
	}
	ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL);
	if (ret != CL_SUCCESS) {
		if ((vcp->vc_verbose > 0) && fromsource && !patched)
			fprintf(stderr, "failure.\n");
		vg_ocl_error(NULL, ret, "clBuildProgram");
	} else if ((vcp->vc_verbose > 0) && fromsource && !patched) {
		fprintf(stderr, "done!\n");
	}
	if ((ret != CL_SUCCESS) ||
	    ((vcp->vc_verbose > 1) && fromsource && !patched)) {
		vg_ocl_buildlog(vocp, prog);
	}
	if (ret != CL_SUCCESS) {
		vg_ocl_dump_info(vocp);
		clReleaseProgram(prog);
		return 0;
	}

	if (fromsource && !(vocp->voc_quirks & VG_OCL_NO_BINARIES)) {
		ret = clGetProgramInfo(prog,
				       CL_PROGRAM_BINARY_SIZES,
				       sizeof(szr), &szr,
				       &sz);
		if (ret != CL_SUCCESS) {
			vg_ocl_error(vocp, ret,
				     "WARNING: clGetProgramInfo(BINARY_SIZES)");
			goto out;
		}
		if (sz == 0) {
			fprintf(stderr,
				"WARNING: zero-length CL kernel binary\n");
			goto out;
		}

		buf = (char *) malloc(szr);
		if (!buf) {
			fprintf(stderr,
				"WARNING: Could not allocate %"PRSIZET"d bytes "
				"for CL binary\n",
			       szr);
			goto out;
		}

		ret = clGetProgramInfo(prog,
				       CL_PROGRAM_BINARIES,
				       sizeof(buf), &buf,
				       &sz);
		if (ret != CL_SUCCESS) {
			vg_ocl_error(vocp, ret,
				     "WARNING: clGetProgramInfo(BINARIES)");
			free(buf);
			goto out;
		}

		if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) {
			patched = vg_ocl_amd_patch(vocp,
						   (unsigned char *) buf, szr);
			if (patched > 0) {
				if (vcp->vc_verbose > 1)
					fprintf(stderr,
						"AMD BFI_INT patch complete\n");
				clReleaseProgram(prog);
				goto rebuild;
			}
			fprintf(stderr,
				"WARNING: AMD BFI_INT patching failed\n");
			if (patched < 0) {
				/* Program was incompletely modified */
				free(buf);
				goto out;
			}
		}

		kfp = fopen(bin_name, "wb");
		if (!kfp) {
			fprintf(stderr, "WARNING: "
				"could not save CL kernel binary: %s\n",
				strerror(errno));
		} else {
			sz = fwrite(buf, 1, szr, kfp);
			fclose(kfp);
			if (sz != szr) {
				fprintf(stderr,
					"WARNING: short write on CL kernel "
					"binary file: expected "
					"%"PRSIZET"d, got %"PRSIZET"d\n",
					szr, sz);
				unlink(bin_name);
			}
		}
		free(buf);
	}

out:
	vocp->voc_oclprog = prog;
	if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") ||
	    !vg_ocl_create_kernel(vocp, 1, "heap_invert")) {
		clReleaseProgram(vocp->voc_oclprog);
		vocp->voc_oclprog = NULL;
		return 0;
	}

	return 1;
}

static void CL_CALLBACK
vg_ocl_context_callback(const char *errinfo,
			const void *private_info,
			size_t cb,
			void *user_data)
{
	fprintf(stderr, "vg_ocl_context_callback error: %s\n", errinfo);
}

static int
vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
	    int safe_mode)
{
	cl_int ret;
	char optbuf[128];
	int end = 0;

	memset(vocp, 0, sizeof(*vocp));
	vg_exec_context_init(vcp, &vocp->base);
	vocp->base.vxc_threadfunc = vg_opencl_loop;

	pthread_mutex_init(&vocp->voc_lock, NULL);
	pthread_cond_init(&vocp->voc_wait, NULL);
	vocp->voc_ocl_slot = -1;

	vocp->voc_ocldid = did;

	if (vcp->vc_verbose > 1)
		vg_ocl_dump_info(vocp);

	vocp->voc_quirks = vg_ocl_get_quirks(vocp);

	if ((vocp->voc_quirks & VG_OCL_BROKEN) && (vcp->vc_verbose > 0)) {
		char yesbuf[16];
		printf("Type 'yes' to continue: ");
		fflush(stdout);
		if (!fgets(yesbuf, sizeof(yesbuf), stdin) ||
		    strncmp(yesbuf, "yes", 3))
			exit(1);
	}

	vocp->voc_oclctx = clCreateContext(NULL,
					   1, &did,
					   vg_ocl_context_callback,
					   NULL,
					   &ret);
	if (!vocp->voc_oclctx) {
		vg_ocl_error(vocp, ret, "clCreateContext");
		return 0;
	}

	vocp->voc_oclcmdq = clCreateCommandQueue(vocp->voc_oclctx,
						 vocp->voc_ocldid,
						 0, &ret);
	if (!vocp->voc_oclcmdq) {
		vg_ocl_error(vocp, ret, "clCreateCommandQueue");
		return 0;
	}

	if (safe_mode)
		vocp->voc_quirks &= ~VG_OCL_OPTIMIZATIONS;

	end = 0;
	optbuf[end] = '\0';
	if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL)
		end += snprintf(optbuf + end, sizeof(optbuf) - end,
				"-DDEEP_PREPROC_UNROLL ");
	if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL)
		end += snprintf(optbuf + end, sizeof(optbuf) - end,
				"-DPRAGMA_UNROLL ");
	if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES)
		end += snprintf(optbuf + end, sizeof(optbuf) - end,
				"-DVERY_EXPENSIVE_BRANCHES ");
	if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
		end += snprintf(optbuf + end, sizeof(optbuf) - end,
				"-DDEEP_VLIW ");
	if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT)
		end += snprintf(optbuf + end, sizeof(optbuf) - end,
				"-DAMD_BFI_INT ");
	if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
		end += snprintf(optbuf + end, sizeof(optbuf) - end,
				"-cl-nv-verbose ");

	if (!vg_ocl_load_program(vcp, vocp, "calc_addrs.cl", optbuf))
		return 0;
	return 1;
}

static void
vg_ocl_del(vg_ocl_context_t *vocp)
{
	vg_ocl_free_args(vocp);
	if (vocp->voc_oclprog) {
		clReleaseProgram(vocp->voc_oclprog);
		vocp->voc_oclprog = NULL;
	}
	if (vocp->voc_oclcmdq) {
		clReleaseCommandQueue(vocp->voc_oclcmdq);
		vocp->voc_oclcmdq = NULL;
	}
	if (vocp->voc_oclctx) {
		clReleaseContext(vocp->voc_oclctx);
		vocp->voc_oclctx = NULL;
	}
	pthread_cond_destroy(&vocp->voc_wait);
	pthread_mutex_destroy(&vocp->voc_lock);
	vg_exec_context_del(&vocp->base);
}

static int vg_ocl_arg_map[][8] = {
	/* hashes_out / found */
	{ 2, 0, -1 },
	/* z_heap */
	{ 0, 1, 1, 0, 2, 2, -1 },
	/* point_tmp */
	{ 0, 0, 2, 1, -1 },
	/* row_in */
	{ 0, 2, -1 },
	/* col_in */
	{ 0, 3, -1 },
	/* target_table */
	{ 2, 3, -1 },
};

static int
vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot,
			int arg, size_t size, int host)
{
	cl_mem clbuf;
	cl_int ret;
	int i, j, knum, karg;

	for (i = 0; i < MAX_SLOT; i++) {
		if ((i != slot) && (slot >= 0))
			continue;
		if (vocp->voc_args[i][arg]) {
			clReleaseMemObject(vocp->voc_args[i][arg]);
			vocp->voc_args[i][arg] = NULL;
			vocp->voc_arg_size[i][arg] = 0;
		}
	}

	clbuf = clCreateBuffer(vocp->voc_oclctx,
			       CL_MEM_READ_WRITE |
			       (host ? CL_MEM_ALLOC_HOST_PTR : 0),
			       size,
			       NULL,
			       &ret);
	if (!clbuf) {
		fprintf(stderr, "clCreateBuffer(%d,%d): ", slot, arg);
		vg_ocl_error(vocp, ret, NULL);
		return 0;
	}

	for (i = 0; i < MAX_SLOT; i++) {
		if ((i != slot) && (slot >= 0))
			continue;

		clRetainMemObject(clbuf);
		vocp->voc_args[i][arg] = clbuf;
		vocp->voc_arg_size[i][arg] = size;

		for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
			knum = vg_ocl_arg_map[arg][j];
			karg = vg_ocl_arg_map[arg][j+1];
			ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
					     karg,
					     sizeof(clbuf),
					     &clbuf);

			if (ret) {
				fprintf(stderr,
					"clSetKernelArg(%d,%d): ", knum, karg);
				vg_ocl_error(vocp, ret, NULL);
				return 0;
			}
		}
	}

	clReleaseMemObject(clbuf);
	return 1;
}

int
vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg,
		   void *buffer, size_t size)
{
	cl_int slot, ret;

	slot = (wslot < 0) ? 0 : wslot;

	assert((slot >= 0) && (slot < MAX_SLOT));
	assert(size <= vocp->voc_arg_size[slot][arg]);

	ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq,
				   vocp->voc_args[slot][arg],
				   CL_TRUE,
				   0, size,
				   buffer,
				   0, NULL,
				   NULL);

	if (ret) {
		fprintf(stderr, "clEnqueueWriteBuffer(%d): ", arg);
		vg_ocl_error(vocp, ret, NULL);
		return 0;
	}

	return 1;
}

static void *
vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot,
		      int arg, int rw)
{
	void *buf;
	cl_int ret;

	assert((slot >= 0) && (slot < MAX_SLOT));

	buf = clEnqueueMapBuffer(vocp->voc_oclcmdq,
				 vocp->voc_args[slot][arg],
				 CL_TRUE,
				 (rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE)
				           : (rw ? CL_MAP_WRITE : CL_MAP_READ),
				 0, vocp->voc_arg_size[slot][arg],
				 0, NULL,
				 NULL,
				 &ret);
	if (!buf) {
		fprintf(stderr, "clEnqueueMapBuffer(%d): ", arg);
		vg_ocl_error(vocp, ret, NULL);
		return NULL;
	}
	return buf;
}

static void
vg_ocl_unmap_arg_buffer(vg_ocl_context_t *vocp, int slot,
			int arg, void *buf)
{
	cl_int ret;
	cl_event ev;

	assert((slot >= 0) && (slot < MAX_SLOT));

	ret = clEnqueueUnmapMemObject(vocp->voc_oclcmdq,
				      vocp->voc_args[slot][arg],
				      buf,
				      0, NULL,
				      &ev);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clEnqueueUnmapMemObject(%d): ", arg);
		vg_ocl_error(vocp, ret, NULL);
		return;
	}

	ret = clWaitForEvents(1, &ev);
	clReleaseEvent(ev);
	if (ret != CL_SUCCESS) {
		fprintf(stderr, "clWaitForEvent(clUnmapMemObject,%d): ", arg);
		vg_ocl_error(vocp, ret, NULL);
	}
}

int
vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot,
		      int arg, int value)
{
	cl_int ret;
	int i;

	for (i = 0; i < MAX_SLOT; i++) {
		if ((i != slot) && (slot >= 0))
			continue;
		ret = clSetKernelArg(vocp->voc_oclkernel[i][2],
				     arg,
				     sizeof(value),
				     &value);
		if (ret) {
			fprintf(stderr, "clSetKernelArg(%d): ", arg);
			vg_ocl_error(vocp, ret, NULL);
			return 0;
		}
	}
	return 1;
}

int
vg_ocl_kernel_buffer_arg(vg_ocl_context_t *vocp, int slot,
			 int arg, void *value, size_t size)
{
	cl_int ret;
	int i, j, knum, karg;

	for (i = 0; i < MAX_SLOT; i++) {
		if ((i != slot) && (slot >= 0))
			continue;
		for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) {
			knum = vg_ocl_arg_map[arg][j];
			karg = vg_ocl_arg_map[arg][j+1];
			ret = clSetKernelArg(vocp->voc_oclkernel[i][knum],
					     karg,
					     size,
					     value);
			if (ret) {
				fprintf(stderr,
					"clSetKernelArg(%d,%d): ", knum, karg);
				vg_ocl_error(vocp, ret, NULL);
				return 0;
			}
		}
	}
	return 1;
}

static void
vg_ocl_free_args(vg_ocl_context_t *vocp)
{
	int i, arg;
	for (i = 0; i < MAX_SLOT; i++) {
		for (arg = 0; arg < MAX_ARG; arg++) {
			if (vocp->voc_args[i][arg]) {
				clReleaseMemObject(vocp->voc_args[i][arg]);
				vocp->voc_args[i][arg] = NULL;
				vocp->voc_arg_size[i][arg] = 0;
			}
		}
	}
}

int
vg_ocl_kernel_dead(vg_ocl_context_t *vocp, int slot)
{
	return (vocp->voc_oclkrnwait[slot] == NULL);
}

static int
vg_ocl_kernel_start(vg_ocl_context_t *vocp, int slot, int ncol, int nrow,
		    int invsize)
{
	cl_int val, ret;
	cl_event ev;
	size_t globalws[2] = { ncol, nrow };
	size_t invws = (ncol * nrow) / invsize;

	assert(!vocp->voc_oclkrnwait[slot]);

	/* heap_invert() preconditions */
	assert(is_pow2(invsize) && (invsize > 1));

	val = invsize;
	ret = clSetKernelArg(vocp->voc_oclkernel[slot][1],
			     1,
			     sizeof(val),
			     &val);
	if (ret != CL_SUCCESS) {
		vg_ocl_error(vocp, ret, "clSetKernelArg(ncol)");
		return 0;
	}
	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
				     vocp->voc_oclkernel[slot][0],
				     2,
				     NULL, globalws, NULL,
				     0, NULL,
				     &ev);
	if (ret != CL_SUCCESS) {
		vg_ocl_error(vocp, ret, "clEnqueueNDRange(0)");
		return 0;
	}

	ret = clWaitForEvents(1, &ev);
	clReleaseEvent(ev);
	if (ret != CL_SUCCESS) {
		vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,0)");
		return 0;
	}

	if (vocp->voc_verify_func[0] &&
	    !(vocp->voc_verify_func[0])(vocp, slot)) {
		fprintf(stderr, "ERROR: Kernel 0 failed verification test\n");
		return 0;
	}

	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
				     vocp->voc_oclkernel[slot][1],
				     1,
				     NULL, &invws, NULL,
				     0, NULL,
				     &ev);
	if (ret != CL_SUCCESS) {
		vg_ocl_error(vocp, ret, "clEnqueueNDRange(1)");
		return 0;
	}

	ret = clWaitForEvents(1, &ev);
	clReleaseEvent(ev);
	if (ret != CL_SUCCESS) {
		vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,1)");
		return 0;
	}

	if (vocp->voc_verify_func[1] &&
	    !(vocp->voc_verify_func[1])(vocp, slot)) {
		fprintf(stderr, "ERROR: Kernel 1 failed verification test\n");
		return 0;
	}

	ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq,
				     vocp->voc_oclkernel[slot][2],
				     2,
				     NULL, globalws, NULL,
				     0, NULL,
				     &ev);
	if (ret != CL_SUCCESS) {
		vg_ocl_error(vocp, ret, "clEnqueueNDRange(2)");
		return 0;
	}

	vocp->voc_oclkrnwait[slot] = ev;
	return 1;
}

static int
vg_ocl_kernel_wait(vg_ocl_context_t *vocp, int slot)
{
	cl_event ev;
	cl_int ret;

	ev = vocp->voc_oclkrnwait[slot];
	vocp->voc_oclkrnwait[slot] = NULL;
	if (ev) {
		ret = clWaitForEvents(1, &ev);
		clReleaseEvent(ev);
		if (ret != CL_SUCCESS) {
			vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,e)");
			return 0;
		}
	}
	return 1;
}


static INLINE void
vg_ocl_get_bignum_raw(BIGNUM *bn, const unsigned char *buf)
{
	bn_expand(bn, 256);
	memcpy(bn->d, buf, 32);
	bn->top = (32 / sizeof(BN_ULONG));
}

static INLINE void
vg_ocl_put_bignum_raw(unsigned char *buf, const BIGNUM *bn)
{
	int bnlen = (bn->top * sizeof(BN_ULONG));
	if (bnlen >= 32) {
		memcpy(buf, bn->d, 32);
	} else {
		memcpy(buf, bn->d, bnlen);
		memset(buf + bnlen, 0, 32 - bnlen);
	}
}

#define ACCESS_BUNDLE 1024
#define ACCESS_STRIDE (ACCESS_BUNDLE/8)

static void
vg_ocl_get_bignum_tpa(BIGNUM *bn, const unsigned char *buf, int cell)
{
	unsigned char bnbuf[32];
	int start, i;

	start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % ACCESS_STRIDE));
	for (i = 0; i < 8; i++)
		memcpy(bnbuf+(i*4),
		       buf + 4*(start + i*ACCESS_STRIDE),
		       4);

	vg_ocl_get_bignum_raw(bn, bnbuf);
}

/*
 * Absolutely disgusting.
 * We want points in Montgomery form, and it's a lot easier to read the
 * coordinates from the structure than to export and re-montgomeryize.
 */

struct ec_point_st {
	const EC_METHOD *meth;
	BIGNUM X;
	BIGNUM Y;
	BIGNUM Z;
	int Z_is_one;
};

static INLINE void
vg_ocl_get_point(EC_POINT *ppnt, const unsigned char *buf)
{
	static const unsigned char mont_one[] = { 0x01,0x00,0x00,0x03,0xd1 };
	vg_ocl_get_bignum_raw(&ppnt->X, buf);
	vg_ocl_get_bignum_raw(&ppnt->Y, buf + 32);
	if (!ppnt->Z_is_one) {
		ppnt->Z_is_one = 1;
		BN_bin2bn(mont_one, sizeof(mont_one), &ppnt->Z);
	}
}

static INLINE void
vg_ocl_put_point(unsigned char *buf, const EC_POINT *ppnt)
{
	assert(ppnt->Z_is_one);
	vg_ocl_put_bignum_raw(buf, &ppnt->X);
	vg_ocl_put_bignum_raw(buf + 32, &ppnt->Y);
}

static void
vg_ocl_put_point_tpa(unsigned char *buf, int cell, const EC_POINT *ppnt)
{
	unsigned char pntbuf[64];
	int start, i;

	vg_ocl_put_point(pntbuf, ppnt);

	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % (ACCESS_STRIDE/2)));
	for (i = 0; i < 8; i++)
		memcpy(buf + 4*(start + i*ACCESS_STRIDE),
		       pntbuf+(i*4),
		       4);
	for (i = 0; i < 8; i++)
		memcpy(buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)),
		       pntbuf+32+(i*4),
		       4);
}

static void
vg_ocl_get_point_tpa(EC_POINT *ppnt, const unsigned char *buf, int cell)
{
	unsigned char pntbuf[64];
	int start, i;

	start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) +
		 (cell % (ACCESS_STRIDE/2)));
	for (i = 0; i < 8; i++)
		memcpy(pntbuf+(i*4),
		       buf + 4*(start + i*ACCESS_STRIDE),
		       4);
	for (i = 0; i < 8; i++)
		memcpy(pntbuf+32+(i*4),
		       buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)),
		       4);

	vg_ocl_get_point(ppnt, pntbuf);
}

void
show_elapsed(struct timeval *tv, const char *place)
{
	struct timeval now, delta;
        gettimeofday(&now, NULL);
	timersub(&now, tv, &delta);
	fprintf(stderr,
		"%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec);
}


/*
 * GPU address matching methods
 *
 * gethash: GPU computes and returns all address hashes.
 *  + Works with any matching method, including regular expressions.
 *  - The CPU will not be able to keep up with mid- to high-end GPUs.
 *
 * prefix: GPU computes hash, searches a range list, and discards.
 *  + Fast, minimal work for CPU.
 */

static int
vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot)
{
	vg_exec_context_t *vxcp = &vocp->base;
	vg_context_t *vcp = vocp->base.vxc_vc;
	vg_test_func_t test_func = vcp->vc_test;
	unsigned char *ocl_hashes_out;
	int i, res = 0, round;

	ocl_hashes_out = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 0, 0);

	if (!ocl_hashes_out) {
		fprintf(stderr,
			"ERROR: Could not map hash result buffer "
			"for slot %d\n", slot);
		return 2;
	}

	round = vocp->voc_ocl_cols * vocp->voc_ocl_rows;

	for (i = 0; i < round; i++, vxcp->vxc_delta++) {
		memcpy(&vxcp->vxc_binres[1],
		       ocl_hashes_out + (20*i),
		       20);

		res = test_func(vxcp);
		if (res)
			break;
	}

	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out);
	return res;
}

static int
vg_ocl_gethash_init(vg_ocl_context_t *vocp)
{
	int i;

	if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get"))
		return 0;

	for (i = 0; i < vocp->voc_nslots; i++) {
		/* Each slot gets its own hash output buffer */
		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0,
					     20 *
					     vocp->voc_ocl_rows *
					     vocp->voc_ocl_cols, 1))
			return 0;
	}

	vocp->voc_rekey_func = NULL;
	vocp->voc_check_func = vg_ocl_gethash_check;
	return 1;
}


static int
vg_ocl_prefix_rekey(vg_ocl_context_t *vocp)
{
	vg_context_t *vcp = vocp->base.vxc_vc;
	unsigned char *ocl_targets_in;
	uint32_t *ocl_found_out;
	int i;

	/* Set the found indicator for each slot to -1 */
	for (i = 0; i < vocp->voc_nslots; i++) {
		ocl_found_out = (uint32_t *)
			vg_ocl_map_arg_buffer(vocp, i, 0, 1);
		if (!ocl_found_out) {
			fprintf(stderr,
				"ERROR: Could not map result buffer"
				" for slot %d (rekey)\n", i);
			return -1;
		}
		ocl_found_out[0] = 0xffffffff;
		vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out);
	}

	if (vocp->voc_pattern_rewrite) {
		/* Count number of range records */
		i = vg_context_hash160_sort(vcp, NULL);
		if (!i)
			return 0;

		if (i > vocp->voc_pattern_alloc) {
			/* (re)allocate target buffer */
			if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0))
				return -1;
			vocp->voc_pattern_alloc = i;
		}

		/* Write range records */
		ocl_targets_in = (unsigned char *)
			vg_ocl_map_arg_buffer(vocp, 0, 5, 1);
		if (!ocl_targets_in) {
			fprintf(stderr,
				"ERROR: Could not map hash target buffer\n");
			return -1;
		}
		vg_context_hash160_sort(vcp, ocl_targets_in);
		vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in);
		vg_ocl_kernel_int_arg(vocp, -1, 4, i);

		vocp->voc_pattern_rewrite = 0;
	}
	return 1;
}

static int
vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot)
{
	vg_exec_context_t *vxcp = &vocp->base;
	vg_context_t *vcp = vocp->base.vxc_vc;
	vg_test_func_t test_func = vcp->vc_test;
	uint32_t *ocl_found_out;
	uint32_t found_delta;
	int orig_delta, tablesize;
	int res = 0;

	/* Retrieve the found indicator */
	ocl_found_out = (uint32_t *)
		vg_ocl_map_arg_buffer(vocp, slot, 0, 2);
	if (!ocl_found_out) {
		fprintf(stderr,
			"ERROR: Could not map result buffer"
			" for slot %d\n", slot);
		return 2;
	}
	found_delta = ocl_found_out[0];

	if (found_delta != 0xffffffff) {
		/* GPU code claims match, verify with CPU version */
		orig_delta = vxcp->vxc_delta;
		vxcp->vxc_delta += found_delta;
		vg_exec_context_calc_address(vxcp);

		/* Make sure the GPU produced the expected hash */
		res = 0;
		if (!memcmp(vxcp->vxc_binres + 1,
			    ocl_found_out + 2,
			    20)) {
			res = test_func(vxcp);
		}
		if (res == 0) {
			/*
			 * The match was not found in
			 * the pattern list.  Hmm.
			 */
			tablesize = ocl_found_out[2];
			fprintf(stderr, "Match idx: %d\n", ocl_found_out[1]);
			fprintf(stderr, "CPU hash: ");
			fdumphex(stderr, vxcp->vxc_binres + 1, 20);
			fprintf(stderr, "GPU hash: ");
			fdumphex(stderr,
				 (unsigned char *) (ocl_found_out + 2), 20);
			fprintf(stderr, "Found delta: %d "
			       "Start delta: %d\n",
			       found_delta, orig_delta);
			res = 1;
		}
	} else {
		vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows);
	}

	vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out);
	return res;
}

static int
vg_ocl_prefix_init(vg_ocl_context_t *vocp)
{
	int i;

	if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix"))
		return 0;

	for (i = 0; i < vocp->voc_nslots; i++) {
		if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1))
			return 0;
	}
	vocp->voc_rekey_func = vg_ocl_prefix_rekey;
	vocp->voc_check_func = vg_ocl_prefix_check;
	vocp->voc_pattern_rewrite = 1;
	vocp->voc_pattern_alloc = 0;
	return 1;
}


static int
vg_ocl_config_pattern(vg_ocl_context_t *vocp)
{
	vg_context_t *vcp = vocp->base.vxc_vc;
	int i;

	i = vg_context_hash160_sort(vcp, NULL);
	if (i > 0) {
		if (vcp->vc_verbose > 1)
			fprintf(stderr, "Using OpenCL prefix matcher\n");
		/* Configure for prefix matching */
		return vg_ocl_prefix_init(vocp);
	}

	if (vcp->vc_verbose > 0)
		fprintf(stderr, "WARNING: Using CPU pattern matcher\n");
	return vg_ocl_gethash_init(vocp);
}


/*
 * Temporary buffer content verification functions
 * This provides a simple test of the kernel, the OpenCL compiler,
 * and the hardware.
 */
static int
vg_ocl_verify_temporary(vg_ocl_context_t *vocp, int slot, int z_inverted)
{
	vg_exec_context_t *vxcp = &vocp->base;
	unsigned char *point_tmp = NULL, *z_heap = NULL;
	unsigned char *ocl_points_in = NULL, *ocl_strides_in = NULL;
	const EC_GROUP *pgroup;
	EC_POINT *ppr = NULL, *ppc = NULL, *pps = NULL, *ppt = NULL;
	BIGNUM bnz, bnez, bnm, *bnzc;
	BN_CTX *bnctx = NULL;
	BN_MONT_CTX *bnmont;
	int ret = 0;
	int mismatches = 0, mm_r;
	int x, y, bx;
	static const unsigned char raw_modulus[] = {
		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
		0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
		0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
	};

	BN_init(&bnz);
	BN_init(&bnez);
	BN_init(&bnm);

	bnctx = BN_CTX_new();
	bnmont = BN_MONT_CTX_new();
	pgroup = EC_KEY_get0_group(vxcp->vxc_key);
	ppr = EC_POINT_new(pgroup);
	ppc = EC_POINT_new(pgroup);
	pps = EC_POINT_new(pgroup);
	ppt = EC_POINT_new(pgroup);

	if (!bnctx || !bnmont || !ppr || !ppc || !pps || !ppt) {
		fprintf(stderr, "ERROR: out of memory\n");
		goto out;
	}

	BN_bin2bn(raw_modulus, sizeof(raw_modulus), &bnm);
	BN_MONT_CTX_set(bnmont, &bnm, bnctx);

	if (z_inverted) {
		bnzc = &bnez;
	} else {
		bnzc = &pps->Z;
	}

	z_heap = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 1, 0);
	point_tmp = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 2, 0);
	ocl_points_in = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 3, 0);
	ocl_strides_in = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, slot, 4, 0);

	if (!z_heap || !point_tmp || !ocl_points_in || !ocl_strides_in) {
		fprintf(stderr, "ERROR: could not map OpenCL point buffers\n");
		goto out;
	}

	for (y = 0; y < vocp->voc_ocl_rows; y++) {
		vg_ocl_get_point(ppr, ocl_strides_in + (64*y));
		bx = y * vocp->voc_ocl_cols;
		mm_r = 0;

		for (x = 0; x < vocp->voc_ocl_cols; x++) {
			vg_ocl_get_point_tpa(ppc, ocl_points_in, x);
			assert(ppr->Z_is_one && ppc->Z_is_one);
			EC_POINT_add(pgroup, pps, ppc, ppr, bnctx);
			assert(!pps->Z_is_one);
			vg_ocl_get_point_tpa(ppt, point_tmp, bx + x);
			vg_ocl_get_bignum_tpa(&bnz, z_heap, bx + x);
			if (z_inverted) {
				BN_mod_inverse(&bnez, &pps->Z, &bnm, bnctx);
				BN_to_montgomery(&bnez, &bnez, bnmont, bnctx);
				BN_to_montgomery(&bnez, &bnez, bnmont, bnctx);
			}
			if (BN_cmp(&ppt->X, &pps->X) ||
			    BN_cmp(&ppt->Y, &pps->Y) ||
			    BN_cmp(&bnz, bnzc)) {
				if (!mismatches) {
					fprintf(stderr, "Base privkey: ");
					fdumpbn(stderr, EC_KEY_get0_private_key(
						       vxcp->vxc_key));
				}
				mismatches++;
				fprintf(stderr, "Mismatch for kernel %d, "
				       "offset %d (%d,%d)\n",
				       z_inverted, bx + x, y, x);
				if (!mm_r) {
					mm_r = 1;
					fprintf(stderr, "Row X   : ");
					fdumpbn(stderr, &ppr->X);
					fprintf(stderr, "Row Y   : ");
					fdumpbn(stderr, &ppr->Y);
				}

				fprintf(stderr, "Column X: ");
				fdumpbn(stderr, &ppc->X);
				fprintf(stderr, "Column Y: ");
				fdumpbn(stderr, &ppc->Y);

				if (BN_cmp(&ppt->X, &pps->X)) {
					fprintf(stderr, "Expect X: ");
					fdumpbn(stderr, &pps->X);
					fprintf(stderr, "Device X: ");
					fdumpbn(stderr, &ppt->X);
				}
				if (BN_cmp(&ppt->Y, &pps->Y)) {
					fprintf(stderr, "Expect Y: ");
					fdumpbn(stderr, &pps->Y);
					fprintf(stderr, "Device Y: ");
					fdumpbn(stderr, &ppt->Y);
				}
				if (BN_cmp(&bnz, bnzc)) {
					fprintf(stderr, "Expect Z: ");
					fdumpbn(stderr, bnzc);
					fprintf(stderr, "Device Z: ");
					fdumpbn(stderr, &bnz);
				}
			}
		}
	}

	ret = !mismatches;

out:
	if (z_heap)
		vg_ocl_unmap_arg_buffer(vocp, slot, 1, z_heap);
	if (point_tmp)
		vg_ocl_unmap_arg_buffer(vocp, slot, 2, point_tmp);
	if (ocl_points_in)
		vg_ocl_unmap_arg_buffer(vocp, slot, 3, ocl_points_in);
	if (ocl_strides_in)
		vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in);
	if (ppr)
		EC_POINT_free(ppr);
	if (ppc)
		EC_POINT_free(ppc);
	if (pps)
		EC_POINT_free(pps);
	if (ppt)
		EC_POINT_free(ppt);
	BN_clear_free(&bnz);
	BN_clear_free(&bnez);
	BN_clear_free(&bnm);
	if (bnmont)
		BN_MONT_CTX_free(bnmont);
	if (bnctx)
		BN_CTX_free(bnctx);
	return ret;
}

static int
vg_ocl_verify_k0(vg_ocl_context_t *vocp, int slot)
{
	return vg_ocl_verify_temporary(vocp, slot, 0);
}

static int
vg_ocl_verify_k1(vg_ocl_context_t *vocp, int slot)
{
	return vg_ocl_verify_temporary(vocp, slot, 1);
}

static void *
vg_opencl_thread(void *arg)
{
	vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
	vg_context_t *vcp = vocp->base.vxc_vc;
	int halt = 0;
	int slot = -1;
	int rows, cols, invsize;
	unsigned long long idleu, busyu;
	double pidle;
	struct timeval tv, tvt, tvd, idle, busy;

	memset(&idle, 0, sizeof(idle));
	memset(&busy, 0, sizeof(busy));

	while (1) {
		pthread_mutex_lock(&vocp->voc_lock);
		if (halt) {
			halt = 0;
			vocp->voc_halt = 1;
		}
		if (slot != -1) {
			assert(vocp->voc_ocl_slot == slot);
			vocp->voc_ocl_slot = -1;
			slot = -1;
			pthread_cond_signal(&vocp->voc_wait);
		}
		if (vocp->voc_ocl_slot == -1) {
			gettimeofday(&tv, NULL);
			while (vocp->voc_ocl_slot == -1) {
				if (vocp->voc_halt)
					goto out;
				pthread_cond_wait(&vocp->voc_wait,
						  &vocp->voc_lock);
			}
			gettimeofday(&tvt, NULL);
			timersub(&tvt, &tv, &tvd);
			timeradd(&tvd, &idle, &idle);
		}
		slot = vocp->voc_ocl_slot;
		rows = vocp->voc_ocl_rows;
		cols = vocp->voc_ocl_cols;
		invsize = vocp->voc_ocl_invsize;
		pthread_mutex_unlock(&vocp->voc_lock);

		gettimeofday(&tv, NULL);
		if (!vg_ocl_kernel_start(vocp, slot, cols, rows, invsize))
			halt = 1;

		if (!vg_ocl_kernel_wait(vocp, slot))
			halt = 1;

		if (vcp->vc_verbose > 1) {
			gettimeofday(&tvt, NULL);
			timersub(&tvt, &tv, &tvd);
			timeradd(&tvd, &busy, &busy);
			if ((busy.tv_sec + idle.tv_sec) > 1) {
				idleu = (1000000 * idle.tv_sec) + idle.tv_usec;
				busyu = (1000000 * busy.tv_sec) + busy.tv_usec;
				pidle = ((double) idleu) / (idleu + busyu);

				if (pidle > 0.01) {
					fprintf(stderr, "\rGPU idle: %.2f%%"
					       "                              "
				       "                                \n",
					       100 * pidle);
				}
				memset(&idle, 0, sizeof(idle));
				memset(&busy, 0, sizeof(busy));
			}
		}
	}
out:
	pthread_mutex_unlock(&vocp->voc_lock);
	return NULL;
}


/*
 * Address search thread main loop
 */

static void *
vg_opencl_loop(vg_exec_context_t *arg)
{
	vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg;
	int i;
	int round, nrows, ncols;
	int pattern_generation;

	const BN_ULONG rekey_max = 100000000;
	BN_ULONG npoints, rekey_at;

	EC_KEY *pkey = NULL;
	const EC_GROUP *pgroup;
	const EC_POINT *pgen;
	EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL;
	EC_POINT *pseek = NULL;

	unsigned char *ocl_points_in, *ocl_strides_in;

	vg_context_t *vcp = vocp->base.vxc_vc;
	vg_exec_context_t *vxcp = &vocp->base;

	int slot, nslots;
	int slot_busy = 0, slot_done = 0, halt = 0;
	int c = 0, output_interval = 1000;

	struct timeval tvstart;

	pkey = vxcp->vxc_key;
	pgroup = EC_KEY_get0_group(pkey);
	pgen = EC_GROUP_get0_generator(pgroup);

	round = vocp->voc_ocl_rows * vocp->voc_ocl_cols;

	if (!vcp->vc_remove_on_match &&
	    (vcp->vc_chance >= 1.0f) &&
	    (vcp->vc_chance < round) &&
	    (vcp->vc_verbose > 0)) {
		fprintf(stderr, "WARNING: low pattern difficulty\n");
		fprintf(stderr,
			"WARNING: better match throughput is possible "
			"using vanitygen on the CPU\n");
	}

	slot = 0;
	nslots = 2;
	vocp->voc_nslots = nslots;

	nrows = vocp->voc_ocl_rows;
	ncols = vocp->voc_ocl_cols;

	ppbase = (EC_POINT **) malloc((nrows + ncols) *
				      sizeof(EC_POINT*));
	if (!ppbase)
		goto enomem;

	for (i = 0; i < (nrows + ncols); i++) {
		ppbase[i] = EC_POINT_new(pgroup);
		if (!ppbase[i])
			goto enomem;
	}

	pprow = ppbase + ncols;
	pbatchinc = EC_POINT_new(pgroup);
	poffset = EC_POINT_new(pgroup);
	pseek = EC_POINT_new(pgroup);
	if (!pbatchinc || !poffset || !pseek)
		goto enomem;

	BN_set_word(&vxcp->vxc_bntmp, ncols);
	EC_POINT_mul(pgroup, pbatchinc, &vxcp->vxc_bntmp, NULL, NULL,
		     vxcp->vxc_bnctx);
	EC_POINT_make_affine(pgroup, pbatchinc, vxcp->vxc_bnctx);

	BN_set_word(&vxcp->vxc_bntmp, round);
	EC_POINT_mul(pgroup, poffset, &vxcp->vxc_bntmp, NULL, NULL,
		     vxcp->vxc_bnctx);
	EC_POINT_make_affine(pgroup, poffset, vxcp->vxc_bnctx);

	if (!vg_ocl_config_pattern(vocp))
		goto enomem;

	for (i = 0; i < nslots; i++) {
		/*
		 * Each work group gets its own:
		 * - Column point array
		 */
		if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * nrows, 1))
			goto enomem;
	}

	/*
	 * All instances share:
	 * - The z_heap and point scratch spaces
	 * - The row point array
	 */
	if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1,
			     round_up_pow2(32 * 2 * round, 4096), 0) ||
	    !vg_ocl_kernel_arg_alloc(vocp, -1, 2,
			     round_up_pow2(32 * 2 * round, 4096), 0) ||
	    !vg_ocl_kernel_arg_alloc(vocp, -1, 3,
			     round_up_pow2(32 * 2 * ncols, 4096), 1))
		goto enomem;

	npoints = 0;
	rekey_at = 0;
	vxcp->vxc_binres[0] = vcp->vc_addrtype;

	if (pthread_create(&vocp->voc_ocl_thread, NULL,
			   vg_opencl_thread, vocp))
		goto enomem;

	gettimeofday(&tvstart, NULL);

l_rekey:
	if (vocp->voc_rekey_func) {
		switch (vocp->voc_rekey_func(vocp)) {
		case 1:
			break;
		case 0:
			goto nopatterns;
		default:
			goto enomem;
		}
	}

	vg_exec_context_upgrade_lock(vxcp);

	pattern_generation = vcp->vc_pattern_generation;

	/* Generate a new random private key */
	EC_KEY_generate_key(pkey);
	npoints = 0;

	/* Determine rekey interval */
	EC_GROUP_get_order(pgroup, &vxcp->vxc_bntmp, vxcp->vxc_bnctx);
	BN_sub(&vxcp->vxc_bntmp2,
	       &vxcp->vxc_bntmp,
	       EC_KEY_get0_private_key(pkey));
	rekey_at = BN_get_word(&vxcp->vxc_bntmp2);
	if ((rekey_at == BN_MASK2) || (rekey_at > rekey_max))
		rekey_at = rekey_max;
	assert(rekey_at > 0);

	EC_POINT_copy(ppbase[0], EC_KEY_get0_public_key(pkey));

	vg_exec_context_downgrade_lock(vxcp);

	if (vcp->vc_pubkey_base) {
		EC_POINT_add(pgroup,
			     ppbase[0],
			     ppbase[0],
			     vcp->vc_pubkey_base,
			     vxcp->vxc_bnctx);
	}

	/* Build the base array of sequential points */
	for (i = 1; i < ncols; i++) {
		EC_POINT_add(pgroup,
			     ppbase[i],
			     ppbase[i-1],
			     pgen, vxcp->vxc_bnctx);
	}

	EC_POINTs_make_affine(pgroup, ncols, ppbase, vxcp->vxc_bnctx);

	/* Fill the sequential point array */
	ocl_points_in = (unsigned char *)
		vg_ocl_map_arg_buffer(vocp, 0, 3, 1);
	if (!ocl_points_in) {
		fprintf(stderr, "ERROR: Could not map column buffer\n");
		goto enomem;
	}
	for (i = 0; i < ncols; i++)
		vg_ocl_put_point_tpa(ocl_points_in, i, ppbase[i]);
	vg_ocl_unmap_arg_buffer(vocp, 0, 3, ocl_points_in);

	/*
	 * Set up the initial row increment table.
	 * Set the first element to pgen -- effectively
	 * skipping the exact key generated above.
	 */
	EC_POINT_copy(pprow[0], pgen);
	for (i = 1; i < nrows; i++) {
		EC_POINT_add(pgroup,
			     pprow[i],
			     pprow[i-1],
			     pbatchinc, vxcp->vxc_bnctx);
	}
	EC_POINTs_make_affine(pgroup, nrows, pprow, vxcp->vxc_bnctx);
	vxcp->vxc_delta = 1;
	npoints = 1;
	slot = 0;
	slot_busy = 0;
	slot_done = 0;

	while (1) {
		if (slot_done) {
			assert(rekey_at > 0);
			slot_done = 0;

			/* Call the result check function */
			switch (vocp->voc_check_func(vocp, slot)) {
			case 1:
				rekey_at = 0;
				break;
			case 2:
				halt = 1;
				break;
			default:
				break;
			}

			c += round;
			if (!halt && (c >= output_interval)) {
				output_interval =
					vg_output_timing(vcp, c, &tvstart);
				c = 0;
			}
			vg_exec_context_yield(vxcp);

			/* If the patterns changed, reload it to the GPU */
			if (vocp->voc_rekey_func &&
			    (pattern_generation !=
			     vcp->vc_pattern_generation)) {
				vocp->voc_pattern_rewrite = 1;
				rekey_at = 0;
			}
		}

		if (vcp->vc_halt)
			halt = 1;
		if (halt)
			break;

		if ((npoints + round) < rekey_at) {
			if (npoints > 1) {
				/* Move the row increments forward */
				for (i = 0; i < nrows; i++) {
					EC_POINT_add(pgroup,
						     pprow[i],
						     pprow[i],
						     poffset,
						     vxcp->vxc_bnctx);
				}

				EC_POINTs_make_affine(pgroup, nrows, pprow,
						      vxcp->vxc_bnctx);
			}

			/* Copy the row stride array to the device */
			ocl_strides_in = (unsigned char *)
				vg_ocl_map_arg_buffer(vocp, slot, 4, 1);
			if (!ocl_strides_in) {
				fprintf(stderr,
					"ERROR: Could not map row buffer "
					"for slot %d\n", slot);
				goto enomem;
			}
			memset(ocl_strides_in, 0, 64*nrows);
			for (i = 0; i < nrows; i++)
				vg_ocl_put_point(ocl_strides_in + (64*i),
						 pprow[i]);
			vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in);
			npoints += round;

			pthread_mutex_lock(&vocp->voc_lock);
			while (vocp->voc_ocl_slot != -1) {
				assert(slot_busy);
				pthread_cond_wait(&vocp->voc_wait,
						  &vocp->voc_lock);
			}

			if (vocp->voc_halt) {
				pthread_mutex_unlock(&vocp->voc_lock);
				halt = 1;
				break;
			}

			vocp->voc_ocl_slot = slot;
			pthread_cond_signal(&vocp->voc_wait);
			pthread_mutex_unlock(&vocp->voc_lock);

			slot_done = slot_busy;
			slot_busy = 1;
			slot = (slot + 1) % nslots;

		} else {
			if (slot_busy) {
				pthread_mutex_lock(&vocp->voc_lock);
				while (vocp->voc_ocl_slot != -1) {
					assert(vocp->voc_ocl_slot ==
					       ((slot + nslots - 1) % nslots));
					pthread_cond_wait(&vocp->voc_wait,
							  &vocp->voc_lock);
				}
				pthread_mutex_unlock(&vocp->voc_lock);
				slot_busy = 0;
				slot_done = 1;
			}

			if (!rekey_at ||
			    (!slot_done && ((npoints + round) >= rekey_at)))
				goto l_rekey;
		}
	}

	if (0) {
	enomem:
		fprintf(stderr, "ERROR: allocation failure?\n");
	nopatterns:
		;
	}

	if (halt) {
		if (vcp->vc_verbose > 1) {
			printf("Halting...");
			fflush(stdout);
		}
		pthread_mutex_lock(&vocp->voc_lock);
		vocp->voc_halt = 1;
		pthread_cond_signal(&vocp->voc_wait);
		while (vocp->voc_ocl_slot != -1) {
			assert(slot_busy);
			pthread_cond_wait(&vocp->voc_wait,
					  &vocp->voc_lock);
		}
		slot_busy = 0;
		pthread_mutex_unlock(&vocp->voc_lock);
		pthread_join(vocp->voc_ocl_thread, NULL);
		if (vcp->vc_verbose > 1)
			printf("done!\n");
	}

	vg_exec_context_yield(vxcp);

	if (ppbase) {
		for (i = 0; i < (nrows + ncols); i++)
			if (ppbase[i])
				EC_POINT_free(ppbase[i]);
		free(ppbase);
	}
	if (pbatchinc)
		EC_POINT_free(pbatchinc);

	/* Release the argument buffers */
	vg_ocl_free_args(vocp);
	vocp->voc_halt = 0;
	vocp->voc_ocl_slot = -1;
	vg_context_thread_exit(vcp);
	return NULL;
}


/*
 * OpenCL platform/device selection junk
 */

static int
get_device_list(cl_platform_id pid, cl_device_id **list_out)
{
	cl_uint nd;
	cl_int res;
	cl_device_id *ids;
	res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, NULL, &nd);
	if (res != CL_SUCCESS) {
		vg_ocl_error(NULL, res, "clGetDeviceIDs(0)");
		*list_out = NULL;
		return -1;
	}
	if (nd) {
		ids = (cl_device_id *) malloc(nd * sizeof(*ids));
		if (ids == NULL) {
			fprintf(stderr, "Could not allocate device ID list\n");
			*list_out = NULL;
			return -1;
		}
		res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, nd, ids, NULL);
		if (res != CL_SUCCESS) {
			vg_ocl_error(NULL, res, "clGetDeviceIDs(n)");
			free(ids);
			*list_out = NULL;
			return -1;
		}
		*list_out = ids;
	}
	return nd;
}

static void
show_devices(cl_platform_id pid, cl_device_id *ids, int nd, int base)
{
	int i;
	char nbuf[128];
	char vbuf[128];
	size_t len;
	cl_int res;

	for (i = 0; i < nd; i++) {
		res = clGetDeviceInfo(ids[i], CL_DEVICE_NAME,
				      sizeof(nbuf), nbuf, &len);
		if (res != CL_SUCCESS)
			continue;
		if (len >= sizeof(nbuf))
			len = sizeof(nbuf) - 1;
		nbuf[len] = '\0';
		res = clGetDeviceInfo(ids[i], CL_DEVICE_VENDOR,
				      sizeof(vbuf), vbuf, &len);
		if (res != CL_SUCCESS)
			continue;
		if (len >= sizeof(vbuf))
			len = sizeof(vbuf) - 1;
		vbuf[len] = '\0';
		fprintf(stderr, "  %d: [%s] %s\n", i + base, vbuf, nbuf);
	}
}

static cl_device_id
get_device(cl_platform_id pid, int num)
{
	int nd;
	cl_device_id id, *ids;

	nd = get_device_list(pid, &ids);
	if (nd < 0)
		return NULL;
	if (!nd) {
		fprintf(stderr, "No OpenCL devices found\n");
		return NULL;
	}
	if (num < 0) {
		if (nd == 1)
			num = 0;
		else
			num = nd;
	}
	if (num < nd) {
		id = ids[num];
		free(ids);
		return id;
	}
	free(ids);
	return NULL;
}

static int
get_platform_list(cl_platform_id **list_out)
{
	cl_uint np;
	cl_int res;
	cl_platform_id *ids;
	res = clGetPlatformIDs(0, NULL, &np);
	if (res != CL_SUCCESS) {
		vg_ocl_error(NULL, res, "clGetPlatformIDs(0)");
		*list_out = NULL;
		return -1;
	}
	if (np) {
		ids = (cl_platform_id *) malloc(np * sizeof(*ids));
		if (ids == NULL) {
			fprintf(stderr,
				"Could not allocate platform ID list\n");
			*list_out = NULL;
			return -1;
		}
		res = clGetPlatformIDs(np, ids, NULL);
		if (res != CL_SUCCESS) {
			vg_ocl_error(NULL, res, "clGetPlatformIDs(n)");
			free(ids);
			*list_out = NULL;
			return -1;
		}
		*list_out = ids;
	}
	return np;
}

void
show_platforms(cl_platform_id *ids, int np, int base)
{
	int i;
	char nbuf[128];
	char vbuf[128];
	size_t len;
	cl_int res;

	for (i = 0; i < np; i++) {
		res = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME,
					sizeof(nbuf), nbuf, &len);
		if (res != CL_SUCCESS) {
			vg_ocl_error(NULL, res, "clGetPlatformInfo(NAME)");
			continue;
		}
		if (len >= sizeof(nbuf))
			len = sizeof(nbuf) - 1;
		nbuf[len] = '\0';
		res = clGetPlatformInfo(ids[i], CL_PLATFORM_VENDOR,
					sizeof(vbuf), vbuf, &len);
		if (res != CL_SUCCESS) {
			vg_ocl_error(NULL, res, "clGetPlatformInfo(VENDOR)");
			continue;
		}
		if (len >= sizeof(vbuf))
			len = sizeof(vbuf) - 1;
		vbuf[len] = '\0';
		fprintf(stderr, "%d: [%s] %s\n", i + base, vbuf, nbuf);
	}
}

static cl_platform_id
get_platform(int num)
{
	int np;
	cl_platform_id id, *ids;

	np = get_platform_list(&ids);
	if (np < 0)
		return NULL;
	if (!np) {
		fprintf(stderr, "No OpenCL platforms available\n");
		return NULL;
	}
	if (num < 0) {
		if (np == 1)
			num = 0;
		else
			num = np;
	}
	if (num < np) {
		id = ids[num];
		free(ids);
		return id;
	}
	free(ids);
	return NULL;
}

void
vg_ocl_enumerate_devices(void)
{
	cl_platform_id *pids;
	cl_device_id *dids;
	int np, nd, i;

	np = get_platform_list(&pids);
	if (!np) {
		fprintf(stderr, "No OpenCL platforms available\n");
		return;
	}
	fprintf(stderr, "Available OpenCL platforms:\n");
	for (i = 0; i < np; i++) {
		show_platforms(&pids[i], 1, i);
		nd = get_device_list(pids[i], &dids);
		if (!nd) {
			fprintf(stderr, "  -- No devices\n");
		} else {
			show_devices(pids[i], dids, nd, 0);
		}
	}
}

static cl_device_id
get_opencl_device(int platformidx, int deviceidx)
{
	cl_platform_id pid;
	cl_device_id did = NULL;

	pid = get_platform(platformidx);
	if (pid) {
		did = get_device(pid, deviceidx);
		if (did)
			return did;
	}
	return NULL;
}


vg_ocl_context_t *
vg_ocl_context_new(vg_context_t *vcp,
		   int platformidx, int deviceidx, int safe_mode, int verify,
		   int worksize, int nthreads, int nrows, int ncols,
		   int invsize)
{
	cl_device_id did;
	int round, full_threads, wsmult;
	cl_ulong memsize, allocsize;
	vg_ocl_context_t *vocp;

	/* Find the device */
	did = get_opencl_device(platformidx, deviceidx);
	if (!did) {
		return 0;
	}

	vocp = (vg_ocl_context_t *) malloc(sizeof(*vocp));
	if (!vocp)
		return NULL;

	/* Open the device and compile the kernel */
	if (!vg_ocl_init(vcp, vocp, did, safe_mode)) {
		free(vocp);
		return NULL;
	}

	if (verify) {
		if (vcp->vc_verbose > 0) {
			fprintf(stderr, "WARNING: "
				"Hardware verification mode enabled\n");
		}
		if (!nthreads)
			nthreads = 1;
		vocp->voc_verify_func[0] = vg_ocl_verify_k0;
		vocp->voc_verify_func[1] = vg_ocl_verify_k1;
	}

	/*
	 * nrows: number of point rows per job
	 * ncols: number of point columns per job
	 * invsize: number of modular inversion tasks per job
	 *    (each task performs (nrows*ncols)/invsize inversions)
	 * nslots: number of kernels
	 *    (create two, keep one running while we service the other or wait)
	 */

	if (!nthreads) {
		/* Pick nthreads sufficient to saturate one compute unit */
		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
		    CL_DEVICE_TYPE_CPU)
			nthreads = 1;
		else
			nthreads = vg_ocl_device_getsizet(vocp->voc_ocldid,
					CL_DEVICE_MAX_WORK_GROUP_SIZE);
	}

	full_threads = vg_ocl_device_getsizet(vocp->voc_ocldid,
					      CL_DEVICE_MAX_COMPUTE_UNITS);
	full_threads *= nthreads;

	/*
	 * The work size selection is complicated, and the most
	 * important factor is the batch size of the heap_invert kernel.
	 * Each value added to the batch trades one complete modular
	 * inversion for four multiply operations.  Ideally the work
	 * size would be as large as possible.  The practical limiting
	 * factors are:
	 * 1. Available memory
	 * 2. Responsiveness and operational latency
	 *
	 * We take a naive approach and limit batch size to a point of
	 * sufficiently diminishing returns, hoping that responsiveness
	 * will be sufficient.
	 *
	 * The measured value for the OpenSSL implementations on my CPU
	 * is 80:1.  This causes heap_invert to get batches of 20 or so
	 * for free, and receive 10% incremental returns at 200.  The CPU
	 * work size is therefore set to 256.
	 *
	 * The ratio on most GPUs with the oclvanitygen implementations
	 * is closer to 500:1, and larger batches are required for
	 * good performance.
	 */
	if (!worksize) {
		if (vg_ocl_device_gettype(vocp->voc_ocldid) &
		    CL_DEVICE_TYPE_GPU)
			worksize = 2048;
		else
			worksize = 256;
	}

	if (!ncols) {
		memsize = vg_ocl_device_getulong(vocp->voc_ocldid,
					CL_DEVICE_GLOBAL_MEM_SIZE);
		allocsize = vg_ocl_device_getulong(vocp->voc_ocldid,
					CL_DEVICE_MAX_MEM_ALLOC_SIZE);
		memsize /= 2;
		ncols = full_threads;
		nrows = 2;
		/* Find row and column counts close to sqrt(full_threads) */
		while ((ncols > nrows) && !(ncols & 1)) {
			ncols /= 2;
			nrows *= 2;
		}

		/*
		 * Increase row & column counts to satisfy work size
		 * multiplier or fill available memory.
		 */
		wsmult = 1;
		while ((!worksize || ((wsmult * 2) <= worksize)) &&
		       ((ncols * nrows * 2 * 128) < memsize) &&
		       ((ncols * nrows * 2 * 64) < allocsize)) {
			if (ncols > nrows)
				nrows *= 2;
			else
				ncols *= 2;
			wsmult *= 2;
		}
	}

	round = nrows * ncols;

	if (!invsize) {
		invsize = 2;
		while (!(round % (invsize << 1)) &&
		       ((round / invsize) > full_threads))
			invsize <<= 1;
	}

	if (vcp->vc_verbose > 1) {
		fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows);
		fprintf(stderr, "Modular inverse: %d threads, %d ops each\n",
			round/invsize, invsize);
	}

	if ((round % invsize) || !is_pow2(invsize) || (invsize < 2)) {
		if (vcp->vc_verbose <= 1) {
			fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows);
			fprintf(stderr,
				"Modular inverse: %d threads, %d ops each\n",
				round/invsize, invsize);
		}
		if (round % invsize)
			fprintf(stderr,
				"Modular inverse work size must "
				"evenly divide points\n");
		else
			fprintf(stderr,
				"Modular inverse work per task (%d) "
				"must be a power of 2\n", invsize);
		goto out_fail;
	}

	vocp->voc_ocl_rows = nrows;
	vocp->voc_ocl_cols = ncols;
	vocp->voc_ocl_invsize = invsize;

	return vocp;

out_fail:
	vg_ocl_context_free(vocp);
	return NULL;
}

vg_ocl_context_t *
vg_ocl_context_new_from_devstr(vg_context_t *vcp, const char *devstr,
			       int safemode, int verify)
{
	int platformidx, deviceidx;
	int worksize = 0, nthreads = 0, nrows = 0, ncols = 0, invsize = 0;

	char *dsd, *part, *part2, *save, *param;

	dsd = strdup(devstr);
	if (!dsd)
		return NULL;

	save = NULL;
	part = strtok_r(dsd, ",", &save);

	part2 = strchr(part, ':');
	if (!part2) {
		fprintf(stderr, "Invalid device specifier '%s'\n", part);
		free(dsd);
		return NULL;
	}

	*part2 = '\0';
	platformidx = atoi(part);
	deviceidx = atoi(part2 + 1);

	while ((part = strtok_r(NULL, ",", &save)) != NULL) {
		param = strchr(part, '=');
		if (!param) {
			fprintf(stderr, "Unrecognized parameter '%s'\n", part);
			continue;
		}

		*param = '\0';
		param++;

		if (!strcmp(part, "grid")) {
			ncols = strtol(param, &part2, 0);
			if (part2 && *part2 == 'x') {
				nrows = strtol(part2+1, NULL, 0);
			}
			if (!nrows || !ncols) {
				fprintf(stderr,
					"Invalid grid size '%s'\n", param);
				nrows = 0;
				ncols = 0;
				continue;
			}
		}

		else if (!strcmp(part, "invsize")) {
			invsize = atoi(param);
			if (!invsize) {
				fprintf(stderr,
					"Invalid modular inverse size '%s'\n",
					param);
				continue;
			}
			if (invsize & (invsize - 1)) {
				fprintf(stderr,
					"Modular inverse size %d must be "
					"a power of 2\n", invsize);
				invsize = 0;
				continue;
			}
		}

		else if (!strcmp(part, "threads")) {
			nthreads = atoi(param);
			if (nthreads == 0) {
				fprintf(stderr,
					"Invalid thread count '%s'\n", param);
				continue;
			}
		}

		else {
			fprintf(stderr, "Unrecognized parameter '%s'\n", part);
		}
	}

	free(dsd);

	return vg_ocl_context_new(vcp, platformidx, deviceidx, safemode,
				  verify, worksize, nthreads, nrows, ncols,
				  invsize);
}


void
vg_ocl_context_free(vg_ocl_context_t *vocp)
{
	vg_ocl_del(vocp);
	free(vocp);
}