diff --git a/Makefile b/Makefile index cebe1f9..55c17b8 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,21 @@ LIBS=-lpcre -lcrypto -lm -lpthread CFLAGS=-ggdb -O3 -Wall -OBJS=vanitygen.o oclvanitygen.o keyconv.o pattern.o util.o -PROGS=vanitygen keyconv +OBJS=vanitygen.o oclvanitygen.o oclvanityminer.o oclengine.o keyconv.o pattern.o util.o +PROGS=vanitygen oclvanitygen oclvanityminer keyconv + +most: vanitygen keyconv all: $(PROGS) vanitygen: vanitygen.o pattern.o util.o $(CC) $^ -o $@ $(CFLAGS) $(LIBS) -oclvanitygen: oclvanitygen.o pattern.o util.o +oclvanitygen: oclvanitygen.o oclengine.o pattern.o util.o $(CC) $^ -o $@ $(CFLAGS) $(LIBS) -lOpenCL +oclvanityminer: oclvanityminer.o oclengine.o pattern.o util.o + $(CC) $^ -o $@ $(CFLAGS) $(LIBS) -lOpenCL -lcurl + keyconv: keyconv.o util.o $(CC) $^ -o $@ $(CFLAGS) $(LIBS) diff --git a/Makefile.Win32 b/Makefile.Win32 index 77b523e..40072ff 100644 --- a/Makefile.Win32 +++ b/Makefile.Win32 @@ -2,29 +2,42 @@ CC = cl OPENSSL_DIR = C:\OpenSSL-Win32 PTHREADS_DIR = C:\pthreads-w32-2-8-0-release PCRE_DIR = C:\pcre-7.9-src +CURL_DIR = C:\curl-7.26.0-x86\builds\libcurl-release-static-ssl-static-ipv6-sspi OPENCL_DIR = "C:\Program Files (x86)\AMD APP" OPENCL_INCLUDE = /I$(OPENCL_DIR)\include OPENCL_LIBS = $(OPENCL_DIR)\lib\x86\OpenCL.lib -CFLAGS = /D_WIN32 /DPTW32_STATIC_LIB /DPCRE_STATIC /I$(OPENSSL_DIR)\include /I$(PTHREADS_DIR) /I$(PCRE_DIR) -LIBS = $(OPENSSL_DIR)\lib\libeay32.lib $(PTHREADS_DIR)\pthreadVC2.lib $(PCRE_DIR)\pcre.lib ws2_32.lib -OBJS = vanitygen.obj oclvanitygen.obj keyconv.obj pattern.obj util.obj winglue.obj +CURL_INCLUDE = /I$(CURL_DIR)\include /DCURL_STATICLIB +CURL_LIBS = $(CURL_DIR)\lib\libcurl_a.lib +CFLAGS_BASE = /D_WIN32 /DPTW32_STATIC_LIB /DPCRE_STATIC /I$(OPENSSL_DIR)\inc32 /I$(PTHREADS_DIR) /I$(PCRE_DIR) /Ox /Zi +CFLAGS = $(CFLAGS_BASE) /GL +LIBS = $(OPENSSL_DIR)\out32\libeay32.lib $(PTHREADS_DIR)\pthreadVC2.lib $(PCRE_DIR)\pcre.lib ws2_32.lib user32.lib advapi32.lib gdi32.lib /LTCG /DEBUG +OBJS = vanitygen.obj oclvanitygen.obj oclengine.obj oclvanityminer.obj keyconv.obj pattern.obj util.obj winglue.obj -all: vanitygen.exe +all: vanitygen.exe keyconv.exe vanitygen.exe: vanitygen.obj pattern.obj util.obj winglue.obj link /nologo /out:$@ $** $(LIBS) -oclvanitygen.exe: oclvanitygen.obj pattern.obj util.obj winglue.obj - link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS) +oclvanitygen.exe: oclvanitygen.obj oclengine.obj pattern.obj util.obj winglue.obj + link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS) $(CURL_LIBS) + +oclvanityminer.exe: oclvanityminer.obj oclengine.obj pattern.obj util.obj winglue.obj + link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS) $(CURL_LIBS) keyconv.exe: keyconv.obj util.obj winglue.obj - link /nologo /out:$@ $** $(LIBS) $(OPENCL_LIBS) + link /nologo /out:$@ $** $(LIBS) .c.obj: @$(CC) /nologo $(CFLAGS) /c /Tp$< /Fo$@ +oclengine.obj: oclengine.c + @$(CC) /nologo $(CFLAGS_BASE) $(OPENCL_INCLUDE) /c /Tpoclengine.c /Fo$@ + oclvanitygen.obj: oclvanitygen.c - @$(CC) /nologo $(CFLAGS) $(OPENCL_INCLUDE) /c /Tpoclvanitygen.c /Fo$@ + @$(CC) /nologo $(CFLAGS_BASE) /c /Tpoclvanitygen.c /Fo$@ + +oclvanityminer.obj: oclvanityminer.c + @$(CC) /nologo $(CFLAGS_BASE) $(CURL_INCLUDE) /c /Tpoclvanityminer.c /Fo$@ clean: - del vanitygen.exe $(OBJS) + del vanitygen.exe oclvanitygen.exe oclvanityminer.exe keyconv.exe $(OBJS) diff --git a/keyconv.c b/keyconv.c index 1283fa1..0fac934 100644 --- a/keyconv.c +++ b/keyconv.c @@ -28,7 +28,8 @@ usage(const char *progname) "-8 Output key in PKCS#8 form\n" "-e Encrypt output key, prompt for password\n" "-E Encrypt output key with (UNSAFE)\n" -"-c Combine private key parts to make complete private key", +"-c Combine private key parts to make complete private key\n" +"-v Verbose output\n", version, progname); } @@ -47,10 +48,11 @@ main(int argc, char **argv) int privtype, addrtype; int pkcs8 = 0; int pass_prompt = 0; + int verbose = 0; int opt; int res; - while ((opt = getopt(argc, argv, "8E:ec:")) != -1) { + while ((opt = getopt(argc, argv, "8E:ec:v")) != -1) { switch (opt) { case '8': pkcs8 = 1; @@ -75,6 +77,9 @@ main(int argc, char **argv) case 'c': key2_in = optarg; break; + case 'v': + verbose = 1; + break; default: usage(argv[0]); return 1; @@ -149,6 +154,15 @@ main(int argc, char **argv) default: addrtype = 0; break; } + if (verbose) { + unsigned char *pend = (unsigned char *) pbuf; + res = i2o_ECPublicKey(pkey, &pend); + fprintf(stderr, "Pubkey (hex): "); + dumphex((unsigned char *)pbuf, res); + fprintf(stderr, "Privkey (hex): "); + dumpbn(EC_KEY_get0_private_key(pkey)); + } + if (pkcs8) { res = vg_pkcs8_encode_privkey(pbuf, sizeof(pbuf), pkey, pass_in); diff --git a/oclengine.c b/oclengine.c new file mode 100644 index 0000000..2b59835 --- /dev/null +++ b/oclengine.c @@ -0,0 +1,2591 @@ +/* + * Vanitygen, vanity bitcoin address generator + * Copyright (C) 2011 + * + * Vanitygen is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * Vanitygen is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with Vanitygen. If not, see . + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#ifdef __APPLE__ +#include +#ifndef CL_CALLBACK +#define CL_CALLBACK +#endif +#else +#include +#endif + +#include "oclengine.h" +#include "pattern.h" +#include "util.h" + + +#define MAX_SLOT 2 +#define MAX_ARG 6 +#define MAX_KERNEL 3 + +#define is_pow2(v) (!((v) & ((v)-1))) +#define round_up_pow2(x, a) (((x) + ((a)-1)) & ~((a)-1)) + +static void vg_ocl_free_args(vg_ocl_context_t *vocp); + + +/* OpenCL address searching mode */ +struct _vg_ocl_context_s; +typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *); +typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot); + +struct _vg_ocl_context_s { + vg_exec_context_t base; + cl_device_id voc_ocldid; + cl_context voc_oclctx; + cl_command_queue voc_oclcmdq; + cl_program voc_oclprog; + vg_ocl_init_t voc_init_func; + vg_ocl_init_t voc_rekey_func; + vg_ocl_check_t voc_check_func; + int voc_quirks; + int voc_nslots; + cl_kernel voc_oclkernel[MAX_SLOT][MAX_KERNEL]; + cl_event voc_oclkrnwait[MAX_SLOT]; + cl_mem voc_args[MAX_SLOT][MAX_ARG]; + size_t voc_arg_size[MAX_SLOT][MAX_ARG]; + + int voc_pattern_rewrite; + int voc_pattern_alloc; + + vg_ocl_check_t voc_verify_func[MAX_KERNEL]; + + pthread_t voc_ocl_thread; + pthread_mutex_t voc_lock; + pthread_cond_t voc_wait; + int voc_ocl_slot; + int voc_ocl_rows; + int voc_ocl_cols; + int voc_ocl_invsize; + int voc_halt; + int voc_dump_done; +}; + + +/* Thread synchronization stubs */ +void +vg_exec_downgrade_lock(vg_exec_context_t *vxcp) +{ +} + +int +vg_exec_upgrade_lock(vg_exec_context_t *vxcp) +{ + return 0; +} + + +/* + * OpenCL debugging and support + */ + +static const char * +vg_ocl_strerror(cl_int ret) +{ +#define OCL_STATUS(st) case st: return #st; + switch (ret) { + OCL_STATUS(CL_SUCCESS); + OCL_STATUS(CL_DEVICE_NOT_FOUND); + OCL_STATUS(CL_DEVICE_NOT_AVAILABLE); + OCL_STATUS(CL_COMPILER_NOT_AVAILABLE); + OCL_STATUS(CL_MEM_OBJECT_ALLOCATION_FAILURE); + OCL_STATUS(CL_OUT_OF_RESOURCES); + OCL_STATUS(CL_OUT_OF_HOST_MEMORY); + OCL_STATUS(CL_PROFILING_INFO_NOT_AVAILABLE); + OCL_STATUS(CL_MEM_COPY_OVERLAP); + OCL_STATUS(CL_IMAGE_FORMAT_MISMATCH); + OCL_STATUS(CL_IMAGE_FORMAT_NOT_SUPPORTED); + OCL_STATUS(CL_BUILD_PROGRAM_FAILURE); + OCL_STATUS(CL_MAP_FAILURE); +#if defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) + OCL_STATUS(CL_MISALIGNED_SUB_BUFFER_OFFSET); +#endif /* defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) */ +#if defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) + OCL_STATUS(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); +#endif /* defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) */ + OCL_STATUS(CL_INVALID_VALUE); + OCL_STATUS(CL_INVALID_DEVICE_TYPE); + OCL_STATUS(CL_INVALID_PLATFORM); + OCL_STATUS(CL_INVALID_DEVICE); + OCL_STATUS(CL_INVALID_CONTEXT); + OCL_STATUS(CL_INVALID_QUEUE_PROPERTIES); + OCL_STATUS(CL_INVALID_COMMAND_QUEUE); + OCL_STATUS(CL_INVALID_HOST_PTR); + OCL_STATUS(CL_INVALID_MEM_OBJECT); + OCL_STATUS(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); + OCL_STATUS(CL_INVALID_IMAGE_SIZE); + OCL_STATUS(CL_INVALID_SAMPLER); + OCL_STATUS(CL_INVALID_BINARY); + OCL_STATUS(CL_INVALID_BUILD_OPTIONS); + OCL_STATUS(CL_INVALID_PROGRAM); + OCL_STATUS(CL_INVALID_PROGRAM_EXECUTABLE); + OCL_STATUS(CL_INVALID_KERNEL_NAME); + OCL_STATUS(CL_INVALID_KERNEL_DEFINITION); + OCL_STATUS(CL_INVALID_KERNEL); + OCL_STATUS(CL_INVALID_ARG_INDEX); + OCL_STATUS(CL_INVALID_ARG_VALUE); + OCL_STATUS(CL_INVALID_ARG_SIZE); + OCL_STATUS(CL_INVALID_KERNEL_ARGS); + OCL_STATUS(CL_INVALID_WORK_DIMENSION); + OCL_STATUS(CL_INVALID_WORK_GROUP_SIZE); + OCL_STATUS(CL_INVALID_WORK_ITEM_SIZE); + OCL_STATUS(CL_INVALID_GLOBAL_OFFSET); + OCL_STATUS(CL_INVALID_EVENT_WAIT_LIST); + OCL_STATUS(CL_INVALID_EVENT); + OCL_STATUS(CL_INVALID_OPERATION); + OCL_STATUS(CL_INVALID_GL_OBJECT); + OCL_STATUS(CL_INVALID_BUFFER_SIZE); + OCL_STATUS(CL_INVALID_MIP_LEVEL); + OCL_STATUS(CL_INVALID_GLOBAL_WORK_SIZE); +#if defined(CL_INVALID_PROPERTY) + OCL_STATUS(CL_INVALID_PROPERTY); +#endif /* defined(CL_INVALID_PROPERTY) */ +#undef OCL_STATUS + default: { + static char tmp[64]; + snprintf(tmp, sizeof(tmp), "Unknown code %d", ret); + return tmp; + } + } +} + +/* Get device strings, using a static buffer -- caveat emptor */ +static const char * +vg_ocl_platform_getstr(cl_platform_id pid, cl_platform_info param) +{ + static char platform_str[1024]; + cl_int ret; + size_t size_ret; + ret = clGetPlatformInfo(pid, param, + sizeof(platform_str), platform_str, + &size_ret); + if (ret != CL_SUCCESS) { + snprintf(platform_str, sizeof(platform_str), + "clGetPlatformInfo(%d): %s", + param, vg_ocl_strerror(ret)); + } + return platform_str; +} + +static cl_platform_id +vg_ocl_device_getplatform(cl_device_id did) +{ + cl_int ret; + cl_platform_id val; + size_t size_ret; + ret = clGetDeviceInfo(did, CL_DEVICE_PLATFORM, + sizeof(val), &val, &size_ret); + if (ret != CL_SUCCESS) { + fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_PLATFORM): %s", + vg_ocl_strerror(ret)); + } + return val; +} + +static cl_device_type +vg_ocl_device_gettype(cl_device_id did) +{ + cl_int ret; + cl_device_type val; + size_t size_ret; + ret = clGetDeviceInfo(did, CL_DEVICE_TYPE, + sizeof(val), &val, &size_ret); + if (ret != CL_SUCCESS) { + fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_TYPE): %s", + vg_ocl_strerror(ret)); + } + return val; +} + +static const char * +vg_ocl_device_getstr(cl_device_id did, cl_device_info param) +{ + static char device_str[1024]; + cl_int ret; + size_t size_ret; + ret = clGetDeviceInfo(did, param, + sizeof(device_str), device_str, + &size_ret); + if (ret != CL_SUCCESS) { + snprintf(device_str, sizeof(device_str), + "clGetDeviceInfo(%d): %s", + param, vg_ocl_strerror(ret)); + } + return device_str; +} + +static size_t +vg_ocl_device_getsizet(cl_device_id did, cl_device_info param) +{ + cl_int ret; + size_t val; + size_t size_ret; + ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); + if (ret != CL_SUCCESS) { + fprintf(stderr, + "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); + } + return val; +} + +static cl_ulong +vg_ocl_device_getulong(cl_device_id did, cl_device_info param) +{ + cl_int ret; + cl_ulong val; + size_t size_ret; + ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); + if (ret != CL_SUCCESS) { + fprintf(stderr, + "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); + } + return val; +} + +static cl_uint +vg_ocl_device_getuint(cl_device_id did, cl_device_info param) +{ + cl_int ret; + cl_uint val; + size_t size_ret; + ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); + if (ret != CL_SUCCESS) { + fprintf(stderr, + "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); + } + return val; +} + +void +vg_ocl_dump_info(vg_ocl_context_t *vocp) +{ + cl_device_id did; + if (vocp->base.vxc_vc && (vocp->base.vxc_vc->vc_verbose < 1)) + return; + if (vocp->voc_dump_done) + return; + did = vocp->voc_ocldid; + fprintf(stderr, "Device: %s\n", + vg_ocl_device_getstr(did, CL_DEVICE_NAME)); + fprintf(stderr, "Vendor: %s (%04x)\n", + vg_ocl_device_getstr(did, CL_DEVICE_VENDOR), + vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID)); + fprintf(stderr, "Driver: %s\n", + vg_ocl_device_getstr(did, CL_DRIVER_VERSION)); + fprintf(stderr, "Profile: %s\n", + vg_ocl_device_getstr(did, CL_DEVICE_PROFILE)); + fprintf(stderr, "Version: %s\n", + vg_ocl_device_getstr(did, CL_DEVICE_VERSION)); + fprintf(stderr, "Max compute units: %"PRSIZET"d\n", + vg_ocl_device_getsizet(did, CL_DEVICE_MAX_COMPUTE_UNITS)); + fprintf(stderr, "Max workgroup size: %"PRSIZET"d\n", + vg_ocl_device_getsizet(did, CL_DEVICE_MAX_WORK_GROUP_SIZE)); + fprintf(stderr, "Global memory: %ld\n", + vg_ocl_device_getulong(did, CL_DEVICE_GLOBAL_MEM_SIZE)); + fprintf(stderr, "Max allocation: %ld\n", + vg_ocl_device_getulong(did, CL_DEVICE_MAX_MEM_ALLOC_SIZE)); + vocp->voc_dump_done = 1; +} + +void +vg_ocl_error(vg_ocl_context_t *vocp, int code, const char *desc) +{ + const char *err = vg_ocl_strerror(code); + if (desc) { + fprintf(stderr, "%s: %s\n", desc, err); + } else { + fprintf(stderr, "%s\n", err); + } + + if (vocp && vocp->voc_ocldid) + vg_ocl_dump_info(vocp); +} + +static void +vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog) +{ + size_t logbufsize, logsize; + char *log; + int off = 0; + cl_int ret; + + ret = clGetProgramBuildInfo(prog, + vocp->voc_ocldid, + CL_PROGRAM_BUILD_LOG, + 0, NULL, + &logbufsize); + if (ret != CL_SUCCESS) { + vg_ocl_error(NULL, ret, "clGetProgramBuildInfo"); + return; + } + + log = (char *) malloc(logbufsize); + if (!log) { + fprintf(stderr, "Could not allocate build log buffer\n"); + return; + } + + ret = clGetProgramBuildInfo(prog, + vocp->voc_ocldid, + CL_PROGRAM_BUILD_LOG, + logbufsize, + log, + &logsize); + if (ret != CL_SUCCESS) { + vg_ocl_error(NULL, ret, "clGetProgramBuildInfo"); + + } else { + /* Remove leading newlines and trailing newlines/whitespace */ + log[logbufsize-1] = '\0'; + for (off = logsize - 1; off >= 0; off--) { + if ((log[off] != '\r') && + (log[off] != '\n') && + (log[off] != ' ') && + (log[off] != '\t') && + (log[off] != '\0')) + break; + log[off] = '\0'; + } + for (off = 0; off < logbufsize; off++) { + if ((log[off] != '\r') && + (log[off] != '\n')) + break; + } + + fprintf(stderr, "Build log:\n%s\n", &log[off]); + } + free(log); +} + +/* + * OpenCL per-exec functions + */ + +enum { + VG_OCL_DEEP_PREPROC_UNROLL = (1 << 0), + VG_OCL_PRAGMA_UNROLL = (1 << 1), + VG_OCL_EXPENSIVE_BRANCHES = (1 << 2), + VG_OCL_DEEP_VLIW = (1 << 3), + VG_OCL_AMD_BFI_INT = (1 << 4), + VG_OCL_NV_VERBOSE = (1 << 5), + VG_OCL_BROKEN = (1 << 6), + VG_OCL_NO_BINARIES = (1 << 7), + + VG_OCL_OPTIMIZATIONS = (VG_OCL_DEEP_PREPROC_UNROLL | + VG_OCL_PRAGMA_UNROLL | + VG_OCL_EXPENSIVE_BRANCHES | + VG_OCL_DEEP_VLIW | + VG_OCL_AMD_BFI_INT), + +}; + +static int +vg_ocl_get_quirks(vg_ocl_context_t *vocp) +{ + uint32_t vend; + const char *dvn; + unsigned int quirks = 0; + + quirks |= VG_OCL_DEEP_PREPROC_UNROLL; + + vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID); + switch (vend) { + case 0x10de: /* NVIDIA */ + /* + * NVIDIA's compiler seems to take a really really long + * time when using preprocessor unrolling, but works + * well with pragma unroll. + */ + quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL; + quirks |= VG_OCL_PRAGMA_UNROLL; + quirks |= VG_OCL_NV_VERBOSE; + break; + case 0x1002: /* AMD/ATI */ + /* + * AMD's compiler works best with preprocesor unrolling. + * Pragma unroll is unreliable with AMD's compiler and + * seems to crash based on whether the gods were smiling + * when Catalyst was last installed/upgraded. + */ + if (vg_ocl_device_gettype(vocp->voc_ocldid) & + CL_DEVICE_TYPE_GPU) { + quirks |= VG_OCL_EXPENSIVE_BRANCHES; + quirks |= VG_OCL_DEEP_VLIW; + dvn = vg_ocl_device_getstr(vocp->voc_ocldid, + CL_DEVICE_EXTENSIONS); + if (dvn && strstr(dvn, "cl_amd_media_ops")) + quirks |= VG_OCL_AMD_BFI_INT; + + dvn = vg_ocl_device_getstr(vocp->voc_ocldid, + CL_DEVICE_NAME); + if (!strcmp(dvn, "ATI RV710")) { + quirks &= ~VG_OCL_OPTIMIZATIONS; + quirks |= VG_OCL_NO_BINARIES; + } + } + break; + default: + break; + } + return quirks; +} + +static int +vg_ocl_create_kernel(vg_ocl_context_t *vocp, int knum, const char *func) +{ + int i; + cl_kernel krn; + cl_int ret; + + for (i = 0; i < MAX_SLOT; i++) { + krn = clCreateKernel(vocp->voc_oclprog, func, &ret); + if (!krn) { + fprintf(stderr, "clCreateKernel(%d): ", i); + vg_ocl_error(vocp, ret, NULL); + while (--i >= 0) { + clReleaseKernel(vocp->voc_oclkernel[i][knum]); + vocp->voc_oclkernel[i][knum] = NULL; + } + return 0; + } + vocp->voc_oclkernel[i][knum] = krn; + vocp->voc_oclkrnwait[i] = NULL; + } + return 1; +} + +static void +vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts, + const char *program, size_t size, + unsigned char *hash_out) +{ + MD5_CTX ctx; + cl_platform_id pid; + const char *str; + + MD5_Init(&ctx); + pid = vg_ocl_device_getplatform(vocp->voc_ocldid); + str = vg_ocl_platform_getstr(pid, CL_PLATFORM_NAME); + MD5_Update(&ctx, str, strlen(str) + 1); + str = vg_ocl_platform_getstr(pid, CL_PLATFORM_VERSION); + MD5_Update(&ctx, str, strlen(str) + 1); + str = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME); + MD5_Update(&ctx, str, strlen(str) + 1); + if (opts) + MD5_Update(&ctx, opts, strlen(opts) + 1); + if (size) + MD5_Update(&ctx, program, size); + MD5_Final(hash_out, &ctx); +} + +typedef struct { + unsigned char e_ident[16]; + uint16_t e_type; + uint16_t e_machine; + uint32_t e_version; + uint32_t e_entry; + uint32_t e_phoff; + uint32_t e_shoff; + uint32_t e_flags; + uint16_t e_ehsize; + uint16_t e_phentsize; + uint16_t e_phnum; + uint16_t e_shentsize; + uint16_t e_shnum; + uint16_t e_shstrndx; +} vg_elf32_header_t; + +typedef struct { + uint32_t sh_name; + uint32_t sh_type; + uint32_t sh_flags; + uint32_t sh_addr; + uint32_t sh_offset; + uint32_t sh_size; + uint32_t sh_link; + uint32_t sh_info; + uint32_t sh_addralign; + uint32_t sh_entsize; +} vg_elf32_shdr_t; + +static int +vg_ocl_amd_patch_inner(unsigned char *binary, size_t size) +{ + vg_elf32_header_t *ehp; + vg_elf32_shdr_t *shp, *nshp; + uint32_t *instr; + size_t off; + int i, n, txt2idx, patched; + + ehp = (vg_elf32_header_t *) binary; + if ((size < sizeof(*ehp)) || + memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) || + !ehp->e_shoff) + return 0; + + off = ehp->e_shoff + (ehp->e_shstrndx * ehp->e_shentsize); + nshp = (vg_elf32_shdr_t *) (binary + off); + if ((off + sizeof(*nshp)) > size) + return 0; + + shp = (vg_elf32_shdr_t *) (binary + ehp->e_shoff); + n = 0; + txt2idx = 0; + for (i = 0; i < ehp->e_shnum; i++) { + off = nshp->sh_offset + shp[i].sh_name; + if (((off + 6) >= size) || + memcmp(binary + off, ".text", 6)) + continue; + n++; + if (n == 2) + txt2idx = i; + } + if (n != 2) + return 0; + + off = shp[txt2idx].sh_offset; + instr = (uint32_t *) (binary + off); + n = shp[txt2idx].sh_size / 4; + patched = 0; + for (i = 0; i < n; i += 2) { + if (((instr[i] & 0x02001000) == 0) && + ((instr[i+1] & 0x9003f000) == 0x0001a000)) { + instr[i+1] ^= (0x0001a000 ^ 0x0000c000); + patched++; + } + } + + return patched; +} + +static int +vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size) +{ + vg_context_t *vcp = vocp->base.vxc_vc; + vg_elf32_header_t *ehp; + unsigned char *ptr; + size_t offset = 1; + int ninner = 0, nrun, npatched = 0; + + ehp = (vg_elf32_header_t *) binary; + if ((size < sizeof(*ehp)) || + memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\0", 8) || + !ehp->e_shoff) + return 0; + + offset = 1; + while (offset < (size - 8)) { + ptr = (unsigned char *) memchr(binary + offset, + 0x7f, + size - offset); + if (!ptr) + return npatched; + offset = ptr - binary; + ehp = (vg_elf32_header_t *) ptr; + if (((size - offset) < sizeof(*ehp)) || + memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) || + !ehp->e_shoff) { + offset += 1; + continue; + } + + ninner++; + nrun = vg_ocl_amd_patch_inner(ptr, size - offset); + npatched += nrun; + if (vcp->vc_verbose > 1) + fprintf(stderr, "AMD BFI_INT: patched %d instructions " + "in kernel %d\n", + nrun, ninner); + npatched++; + offset += 1; + } + return npatched; +} + + +static int +vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp, + const char *filename, const char *opts) +{ + FILE *kfp; + char *buf, *tbuf; + int len, fromsource = 0, patched = 0; + size_t sz, szr; + cl_program prog; + cl_int ret, sts; + unsigned char prog_hash[16]; + char bin_name[64]; + + if (vcp->vc_verbose > 1) + fprintf(stderr, + "OpenCL compiler flags: %s\n", opts ? opts : ""); + + sz = 128 * 1024; + buf = (char *) malloc(sz); + if (!buf) { + fprintf(stderr, "Could not allocate program buffer\n"); + return 0; + } + + kfp = fopen(filename, "r"); + if (!kfp) { + fprintf(stderr, "Error loading kernel file '%s': %s\n", + filename, strerror(errno)); + free(buf); + return 0; + } + + len = fread(buf, 1, sz, kfp); + fclose(kfp); + + if (!len) { + fprintf(stderr, "Short read on CL kernel\n"); + free(buf); + return 0; + } + + vg_ocl_hash_program(vocp, opts, buf, len, prog_hash); + snprintf(bin_name, sizeof(bin_name), + "%02x%02x%02x%02x%02x%02x%02x%02x" + "%02x%02x%02x%02x%02x%02x%02x%02x.oclbin", + prog_hash[0], prog_hash[1], prog_hash[2], prog_hash[3], + prog_hash[4], prog_hash[5], prog_hash[6], prog_hash[7], + prog_hash[8], prog_hash[9], prog_hash[10], prog_hash[11], + prog_hash[12], prog_hash[13], prog_hash[14], prog_hash[15]); + + if (vocp->voc_quirks & VG_OCL_NO_BINARIES) { + kfp = NULL; + if (vcp->vc_verbose > 1) + fprintf(stderr, "Binary OpenCL programs disabled\n"); + } else { + kfp = fopen(bin_name, "rb"); + } + + if (!kfp) { + /* No binary available, create with source */ + fromsource = 1; + sz = len; + prog = clCreateProgramWithSource(vocp->voc_oclctx, + 1, (const char **) &buf, &sz, + &ret); + } else { + if (vcp->vc_verbose > 1) + fprintf(stderr, "Loading kernel binary %s\n", bin_name); + szr = 0; + while (!feof(kfp)) { + len = fread(buf + szr, 1, sz - szr, kfp); + if (!len) { + fprintf(stderr, + "Short read on CL kernel binary\n"); + fclose(kfp); + free(buf); + return 0; + } + szr += len; + if (szr == sz) { + tbuf = (char *) realloc(buf, sz*2); + if (!tbuf) { + fprintf(stderr, + "Could not expand CL kernel " + "binary buffer\n"); + fclose(kfp); + free(buf); + return 0; + } + buf = tbuf; + sz *= 2; + } + } + fclose(kfp); + rebuild: + prog = clCreateProgramWithBinary(vocp->voc_oclctx, + 1, &vocp->voc_ocldid, + &szr, + (const unsigned char **) &buf, + &sts, + &ret); + } + free(buf); + if (!prog) { + vg_ocl_error(vocp, ret, "clCreateProgramWithSource"); + return 0; + } + + if (vcp->vc_verbose > 0) { + if (fromsource && !patched) { + fprintf(stderr, + "Compiling kernel, can take minutes..."); + fflush(stderr); + } + } + ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL); + if (ret != CL_SUCCESS) { + if ((vcp->vc_verbose > 0) && fromsource && !patched) + fprintf(stderr, "failure.\n"); + vg_ocl_error(NULL, ret, "clBuildProgram"); + } else if ((vcp->vc_verbose > 0) && fromsource && !patched) { + fprintf(stderr, "done!\n"); + } + if ((ret != CL_SUCCESS) || + ((vcp->vc_verbose > 1) && fromsource && !patched)) { + vg_ocl_buildlog(vocp, prog); + } + if (ret != CL_SUCCESS) { + vg_ocl_dump_info(vocp); + clReleaseProgram(prog); + return 0; + } + + if (fromsource && !(vocp->voc_quirks & VG_OCL_NO_BINARIES)) { + ret = clGetProgramInfo(prog, + CL_PROGRAM_BINARY_SIZES, + sizeof(szr), &szr, + &sz); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, + "WARNING: clGetProgramInfo(BINARY_SIZES)"); + goto out; + } + if (sz == 0) { + fprintf(stderr, + "WARNING: zero-length CL kernel binary\n"); + goto out; + } + + buf = (char *) malloc(szr); + if (!buf) { + fprintf(stderr, + "WARNING: Could not allocate %"PRSIZET"d bytes " + "for CL binary\n", + szr); + goto out; + } + + ret = clGetProgramInfo(prog, + CL_PROGRAM_BINARIES, + sizeof(buf), &buf, + &sz); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, + "WARNING: clGetProgramInfo(BINARIES)"); + free(buf); + goto out; + } + + if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) { + patched = vg_ocl_amd_patch(vocp, + (unsigned char *) buf, szr); + if (patched > 0) { + if (vcp->vc_verbose > 1) + fprintf(stderr, + "AMD BFI_INT patch complete\n"); + clReleaseProgram(prog); + goto rebuild; + } + fprintf(stderr, + "WARNING: AMD BFI_INT patching failed\n"); + if (patched < 0) { + /* Program was incompletely modified */ + free(buf); + goto out; + } + } + + kfp = fopen(bin_name, "wb"); + if (!kfp) { + fprintf(stderr, "WARNING: " + "could not save CL kernel binary: %s\n", + strerror(errno)); + } else { + sz = fwrite(buf, 1, szr, kfp); + fclose(kfp); + if (sz != szr) { + fprintf(stderr, + "WARNING: short write on CL kernel " + "binary file: expected " + "%"PRSIZET"d, got %"PRSIZET"d\n", + szr, sz); + unlink(bin_name); + } + } + free(buf); + } + +out: + vocp->voc_oclprog = prog; + if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") || + !vg_ocl_create_kernel(vocp, 1, "heap_invert")) { + clReleaseProgram(vocp->voc_oclprog); + vocp->voc_oclprog = NULL; + return 0; + } + + return 1; +} + +static void CL_CALLBACK +vg_ocl_context_callback(const char *errinfo, + const void *private_info, + size_t cb, + void *user_data) +{ + fprintf(stderr, "vg_ocl_context_callback error: %s\n", errinfo); +} + +static int +vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did, + int safe_mode) +{ + cl_int ret; + char optbuf[128]; + int end = 0; + + memset(vocp, 0, sizeof(*vocp)); + vg_exec_context_init(vcp, &vocp->base); + + pthread_mutex_init(&vocp->voc_lock, NULL); + pthread_cond_init(&vocp->voc_wait, NULL); + vocp->voc_ocl_slot = -1; + + vocp->voc_ocldid = did; + + if (vcp->vc_verbose > 1) + vg_ocl_dump_info(vocp); + + vocp->voc_quirks = vg_ocl_get_quirks(vocp); + + if ((vocp->voc_quirks & VG_OCL_BROKEN) && (vcp->vc_verbose > 0)) { + char yesbuf[16]; + printf("Type 'yes' to continue: "); + fflush(stdout); + if (!fgets(yesbuf, sizeof(yesbuf), stdin) || + strncmp(yesbuf, "yes", 3)) + exit(1); + } + + vocp->voc_oclctx = clCreateContext(NULL, + 1, &did, + vg_ocl_context_callback, + NULL, + &ret); + if (!vocp->voc_oclctx) { + vg_ocl_error(vocp, ret, "clCreateContext"); + return 0; + } + + vocp->voc_oclcmdq = clCreateCommandQueue(vocp->voc_oclctx, + vocp->voc_ocldid, + 0, &ret); + if (!vocp->voc_oclcmdq) { + vg_ocl_error(vocp, ret, "clCreateCommandQueue"); + return 0; + } + + if (safe_mode) + vocp->voc_quirks &= ~VG_OCL_OPTIMIZATIONS; + + end = 0; + optbuf[end] = '\0'; + if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL) + end += snprintf(optbuf + end, sizeof(optbuf) - end, + "-DDEEP_PREPROC_UNROLL "); + if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL) + end += snprintf(optbuf + end, sizeof(optbuf) - end, + "-DPRAGMA_UNROLL "); + if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES) + end += snprintf(optbuf + end, sizeof(optbuf) - end, + "-DVERY_EXPENSIVE_BRANCHES "); + if (vocp->voc_quirks & VG_OCL_DEEP_VLIW) + end += snprintf(optbuf + end, sizeof(optbuf) - end, + "-DDEEP_VLIW "); + if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT) + end += snprintf(optbuf + end, sizeof(optbuf) - end, + "-DAMD_BFI_INT "); + if (vocp->voc_quirks & VG_OCL_NV_VERBOSE) + end += snprintf(optbuf + end, sizeof(optbuf) - end, + "-cl-nv-verbose "); + + if (!vg_ocl_load_program(vcp, vocp, "calc_addrs.cl", optbuf)) + return 0; + return 1; +} + +static void +vg_ocl_del(vg_ocl_context_t *vocp) +{ + vg_ocl_free_args(vocp); + if (vocp->voc_oclprog) { + clReleaseProgram(vocp->voc_oclprog); + vocp->voc_oclprog = NULL; + } + if (vocp->voc_oclcmdq) { + clReleaseCommandQueue(vocp->voc_oclcmdq); + vocp->voc_oclcmdq = NULL; + } + if (vocp->voc_oclctx) { + clReleaseContext(vocp->voc_oclctx); + vocp->voc_oclctx = NULL; + } + pthread_cond_destroy(&vocp->voc_wait); + pthread_mutex_destroy(&vocp->voc_lock); + vg_exec_context_del(&vocp->base); +} + +static int vg_ocl_arg_map[][8] = { + /* hashes_out / found */ + { 2, 0, -1 }, + /* z_heap */ + { 0, 1, 1, 0, 2, 2, -1 }, + /* point_tmp */ + { 0, 0, 2, 1, -1 }, + /* row_in */ + { 0, 2, -1 }, + /* col_in */ + { 0, 3, -1 }, + /* target_table */ + { 2, 3, -1 }, +}; + +static int +vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot, + int arg, size_t size, int host) +{ + cl_mem clbuf; + cl_int ret; + int i, j, knum, karg; + + for (i = 0; i < MAX_SLOT; i++) { + if ((i != slot) && (slot >= 0)) + continue; + if (vocp->voc_args[i][arg]) { + clReleaseMemObject(vocp->voc_args[i][arg]); + vocp->voc_args[i][arg] = NULL; + vocp->voc_arg_size[i][arg] = 0; + } + } + + clbuf = clCreateBuffer(vocp->voc_oclctx, + CL_MEM_READ_WRITE | + (host ? CL_MEM_ALLOC_HOST_PTR : 0), + size, + NULL, + &ret); + if (!clbuf) { + fprintf(stderr, "clCreateBuffer(%d,%d): ", slot, arg); + vg_ocl_error(vocp, ret, NULL); + return 0; + } + + for (i = 0; i < MAX_SLOT; i++) { + if ((i != slot) && (slot >= 0)) + continue; + + clRetainMemObject(clbuf); + vocp->voc_args[i][arg] = clbuf; + vocp->voc_arg_size[i][arg] = size; + + for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) { + knum = vg_ocl_arg_map[arg][j]; + karg = vg_ocl_arg_map[arg][j+1]; + ret = clSetKernelArg(vocp->voc_oclkernel[i][knum], + karg, + sizeof(clbuf), + &clbuf); + + if (ret) { + fprintf(stderr, + "clSetKernelArg(%d,%d): ", knum, karg); + vg_ocl_error(vocp, ret, NULL); + return 0; + } + } + } + + clReleaseMemObject(clbuf); + return 1; +} + +int +vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg, + void *buffer, size_t size) +{ + cl_int slot, ret; + + slot = (wslot < 0) ? 0 : wslot; + + assert((slot >= 0) && (slot < MAX_SLOT)); + assert(size <= vocp->voc_arg_size[slot][arg]); + + ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq, + vocp->voc_args[slot][arg], + CL_TRUE, + 0, size, + buffer, + 0, NULL, + NULL); + + if (ret) { + fprintf(stderr, "clEnqueueWriteBuffer(%d): ", arg); + vg_ocl_error(vocp, ret, NULL); + return 0; + } + + return 1; +} + +static void * +vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot, + int arg, int rw) +{ + void *buf; + cl_int ret; + + assert((slot >= 0) && (slot < MAX_SLOT)); + + buf = clEnqueueMapBuffer(vocp->voc_oclcmdq, + vocp->voc_args[slot][arg], + CL_TRUE, + (rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE) + : (rw ? CL_MAP_WRITE : CL_MAP_READ), + 0, vocp->voc_arg_size[slot][arg], + 0, NULL, + NULL, + &ret); + if (!buf) { + fprintf(stderr, "clEnqueueMapBuffer(%d): ", arg); + vg_ocl_error(vocp, ret, NULL); + return NULL; + } + return buf; +} + +static void +vg_ocl_unmap_arg_buffer(vg_ocl_context_t *vocp, int slot, + int arg, void *buf) +{ + cl_int ret; + cl_event ev; + + assert((slot >= 0) && (slot < MAX_SLOT)); + + ret = clEnqueueUnmapMemObject(vocp->voc_oclcmdq, + vocp->voc_args[slot][arg], + buf, + 0, NULL, + &ev); + if (ret != CL_SUCCESS) { + fprintf(stderr, "clEnqueueUnmapMemObject(%d): ", arg); + vg_ocl_error(vocp, ret, NULL); + return; + } + + ret = clWaitForEvents(1, &ev); + clReleaseEvent(ev); + if (ret != CL_SUCCESS) { + fprintf(stderr, "clWaitForEvent(clUnmapMemObject,%d): ", arg); + vg_ocl_error(vocp, ret, NULL); + } +} + +int +vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot, + int arg, int value) +{ + cl_int ret; + int i; + + for (i = 0; i < MAX_SLOT; i++) { + if ((i != slot) && (slot >= 0)) + continue; + ret = clSetKernelArg(vocp->voc_oclkernel[i][2], + arg, + sizeof(value), + &value); + if (ret) { + fprintf(stderr, "clSetKernelArg(%d): ", arg); + vg_ocl_error(vocp, ret, NULL); + return 0; + } + } + return 1; +} + +int +vg_ocl_kernel_buffer_arg(vg_ocl_context_t *vocp, int slot, + int arg, void *value, size_t size) +{ + cl_int ret; + int i, j, knum, karg; + + for (i = 0; i < MAX_SLOT; i++) { + if ((i != slot) && (slot >= 0)) + continue; + for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) { + knum = vg_ocl_arg_map[arg][j]; + karg = vg_ocl_arg_map[arg][j+1]; + ret = clSetKernelArg(vocp->voc_oclkernel[i][knum], + karg, + size, + value); + if (ret) { + fprintf(stderr, + "clSetKernelArg(%d,%d): ", knum, karg); + vg_ocl_error(vocp, ret, NULL); + return 0; + } + } + } + return 1; +} + +static void +vg_ocl_free_args(vg_ocl_context_t *vocp) +{ + int i, arg; + for (i = 0; i < MAX_SLOT; i++) { + for (arg = 0; arg < MAX_ARG; arg++) { + if (vocp->voc_args[i][arg]) { + clReleaseMemObject(vocp->voc_args[i][arg]); + vocp->voc_args[i][arg] = NULL; + vocp->voc_arg_size[i][arg] = 0; + } + } + } +} + +int +vg_ocl_kernel_dead(vg_ocl_context_t *vocp, int slot) +{ + return (vocp->voc_oclkrnwait[slot] == NULL); +} + +static int +vg_ocl_kernel_start(vg_ocl_context_t *vocp, int slot, int ncol, int nrow, + int invsize) +{ + cl_int val, ret; + cl_event ev; + size_t globalws[2] = { ncol, nrow }; + size_t invws = (ncol * nrow) / invsize; + + assert(!vocp->voc_oclkrnwait[slot]); + + /* heap_invert() preconditions */ + assert(is_pow2(invsize) && (invsize > 1)); + + val = invsize; + ret = clSetKernelArg(vocp->voc_oclkernel[slot][1], + 1, + sizeof(val), + &val); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, "clSetKernelArg(ncol)"); + return 0; + } + ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, + vocp->voc_oclkernel[slot][0], + 2, + NULL, globalws, NULL, + 0, NULL, + &ev); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, "clEnqueueNDRange(0)"); + return 0; + } + + ret = clWaitForEvents(1, &ev); + clReleaseEvent(ev); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,0)"); + return 0; + } + + if (vocp->voc_verify_func[0] && + !(vocp->voc_verify_func[0])(vocp, slot)) { + fprintf(stderr, "ERROR: Kernel 0 failed verification test\n"); + return 0; + } + + ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, + vocp->voc_oclkernel[slot][1], + 1, + NULL, &invws, NULL, + 0, NULL, + &ev); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, "clEnqueueNDRange(1)"); + return 0; + } + + ret = clWaitForEvents(1, &ev); + clReleaseEvent(ev); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,1)"); + return 0; + } + + if (vocp->voc_verify_func[1] && + !(vocp->voc_verify_func[1])(vocp, slot)) { + fprintf(stderr, "ERROR: Kernel 1 failed verification test\n"); + return 0; + } + + ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, + vocp->voc_oclkernel[slot][2], + 2, + NULL, globalws, NULL, + 0, NULL, + &ev); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, "clEnqueueNDRange(2)"); + return 0; + } + + vocp->voc_oclkrnwait[slot] = ev; + return 1; +} + +static int +vg_ocl_kernel_wait(vg_ocl_context_t *vocp, int slot) +{ + cl_event ev; + cl_int ret; + + ev = vocp->voc_oclkrnwait[slot]; + vocp->voc_oclkrnwait[slot] = NULL; + if (ev) { + ret = clWaitForEvents(1, &ev); + clReleaseEvent(ev); + if (ret != CL_SUCCESS) { + vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,e)"); + return 0; + } + } + return 1; +} + + +static INLINE void +vg_ocl_get_bignum_raw(BIGNUM *bn, const unsigned char *buf) +{ + bn_expand(bn, 256); + memcpy(bn->d, buf, 32); + bn->top = (32 / sizeof(BN_ULONG)); +} + +static INLINE void +vg_ocl_put_bignum_raw(unsigned char *buf, const BIGNUM *bn) +{ + int bnlen = (bn->top * sizeof(BN_ULONG)); + if (bnlen >= 32) { + memcpy(buf, bn->d, 32); + } else { + memcpy(buf, bn->d, bnlen); + memset(buf + bnlen, 0, 32 - bnlen); + } +} + +#define ACCESS_BUNDLE 1024 +#define ACCESS_STRIDE (ACCESS_BUNDLE/8) + +static void +vg_ocl_get_bignum_tpa(BIGNUM *bn, const unsigned char *buf, int cell) +{ + unsigned char bnbuf[32]; + int start, i; + + start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (cell % ACCESS_STRIDE)); + for (i = 0; i < 8; i++) + memcpy(bnbuf+(i*4), + buf + 4*(start + i*ACCESS_STRIDE), + 4); + + vg_ocl_get_bignum_raw(bn, bnbuf); +} + +/* + * Absolutely disgusting. + * We want points in Montgomery form, and it's a lot easier to read the + * coordinates from the structure than to export and re-montgomeryize. + */ + +struct ec_point_st { + const EC_METHOD *meth; + BIGNUM X; + BIGNUM Y; + BIGNUM Z; + int Z_is_one; +}; + +static INLINE void +vg_ocl_get_point(EC_POINT *ppnt, const unsigned char *buf) +{ + static const unsigned char mont_one[] = { 0x01,0x00,0x00,0x03,0xd1 }; + vg_ocl_get_bignum_raw(&ppnt->X, buf); + vg_ocl_get_bignum_raw(&ppnt->Y, buf + 32); + if (!ppnt->Z_is_one) { + ppnt->Z_is_one = 1; + BN_bin2bn(mont_one, sizeof(mont_one), &ppnt->Z); + } +} + +static INLINE void +vg_ocl_put_point(unsigned char *buf, const EC_POINT *ppnt) +{ + assert(ppnt->Z_is_one); + vg_ocl_put_bignum_raw(buf, &ppnt->X); + vg_ocl_put_bignum_raw(buf + 32, &ppnt->Y); +} + +static void +vg_ocl_put_point_tpa(unsigned char *buf, int cell, const EC_POINT *ppnt) +{ + unsigned char pntbuf[64]; + int start, i; + + vg_ocl_put_point(pntbuf, ppnt); + + start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (cell % (ACCESS_STRIDE/2))); + for (i = 0; i < 8; i++) + memcpy(buf + 4*(start + i*ACCESS_STRIDE), + pntbuf+(i*4), + 4); + for (i = 0; i < 8; i++) + memcpy(buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)), + pntbuf+32+(i*4), + 4); +} + +static void +vg_ocl_get_point_tpa(EC_POINT *ppnt, const unsigned char *buf, int cell) +{ + unsigned char pntbuf[64]; + int start, i; + + start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (cell % (ACCESS_STRIDE/2))); + for (i = 0; i < 8; i++) + memcpy(pntbuf+(i*4), + buf + 4*(start + i*ACCESS_STRIDE), + 4); + for (i = 0; i < 8; i++) + memcpy(pntbuf+32+(i*4), + buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)), + 4); + + vg_ocl_get_point(ppnt, pntbuf); +} + +void +show_elapsed(struct timeval *tv, const char *place) +{ + struct timeval now, delta; + gettimeofday(&now, NULL); + timersub(&now, tv, &delta); + fprintf(stderr, + "%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec); +} + + +/* + * GPU address matching methods + * + * gethash: GPU computes and returns all address hashes. + * + Works with any matching method, including regular expressions. + * - The CPU will not be able to keep up with mid- to high-end GPUs. + * + * prefix: GPU computes hash, searches a range list, and discards. + * + Fast, minimal work for CPU. + */ + +static int +vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot) +{ + vg_exec_context_t *vxcp = &vocp->base; + vg_context_t *vcp = vocp->base.vxc_vc; + vg_test_func_t test_func = vcp->vc_test; + unsigned char *ocl_hashes_out; + int i, res = 0, round; + + ocl_hashes_out = (unsigned char *) + vg_ocl_map_arg_buffer(vocp, slot, 0, 0); + + round = vocp->voc_ocl_cols * vocp->voc_ocl_rows; + + for (i = 0; i < round; i++, vxcp->vxc_delta++) { + memcpy(&vxcp->vxc_binres[1], + ocl_hashes_out + (20*i), + 20); + + res = test_func(vxcp); + if (res) + break; + } + + vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out); + return res; +} + +static int +vg_ocl_gethash_init(vg_ocl_context_t *vocp) +{ + int i; + + if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get")) + return 0; + + for (i = 0; i < vocp->voc_nslots; i++) { + /* Each slot gets its own hash output buffer */ + if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, + 20 * + vocp->voc_ocl_rows * + vocp->voc_ocl_cols, 1)) + return 0; + } + + vocp->voc_rekey_func = NULL; + vocp->voc_check_func = vg_ocl_gethash_check; + return 1; +} + + +static int +vg_ocl_prefix_rekey(vg_ocl_context_t *vocp) +{ + vg_context_t *vcp = vocp->base.vxc_vc; + unsigned char *ocl_targets_in; + uint32_t *ocl_found_out; + int i; + + /* Set the found indicator for each slot to -1 */ + for (i = 0; i < vocp->voc_nslots; i++) { + ocl_found_out = (uint32_t *) + vg_ocl_map_arg_buffer(vocp, i, 0, 1); + ocl_found_out[0] = 0xffffffff; + vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out); + } + + if (vocp->voc_pattern_rewrite) { + /* Count number of range records */ + i = vg_context_hash160_sort(vcp, NULL); + if (!i) { + fprintf(stderr, + "No range records available, exiting\n"); + return 0; + } + + if (i > vocp->voc_pattern_alloc) { + /* (re)allocate target buffer */ + if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0)) + return 0; + vocp->voc_pattern_alloc = i; + } + + /* Write range records */ + ocl_targets_in = (unsigned char *) + vg_ocl_map_arg_buffer(vocp, 0, 5, 1); + vg_context_hash160_sort(vcp, ocl_targets_in); + vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in); + vg_ocl_kernel_int_arg(vocp, -1, 4, i); + + vocp->voc_pattern_rewrite = 0; + } + return 1; +} + +static int +vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot) +{ + vg_exec_context_t *vxcp = &vocp->base; + vg_context_t *vcp = vocp->base.vxc_vc; + vg_test_func_t test_func = vcp->vc_test; + uint32_t *ocl_found_out; + uint32_t found_delta; + int orig_delta, tablesize; + int res = 0; + + /* Retrieve the found indicator */ + ocl_found_out = (uint32_t *) + vg_ocl_map_arg_buffer(vocp, slot, 0, 2); + found_delta = ocl_found_out[0]; + + if (found_delta != 0xffffffff) { + /* GPU code claims match, verify with CPU version */ + orig_delta = vxcp->vxc_delta; + vxcp->vxc_delta += found_delta; + vg_exec_context_calc_address(vxcp); + + /* Make sure the GPU produced the expected hash */ + res = 0; + if (!memcmp(vxcp->vxc_binres + 1, + ocl_found_out + 2, + 20)) { + res = test_func(vxcp); + } + if (res == 0) { + /* + * The match was not found in + * the pattern list. Hmm. + */ + tablesize = ocl_found_out[2]; + fprintf(stderr, "Match idx: %d\n", ocl_found_out[1]); + fprintf(stderr, "CPU hash: "); + fdumphex(stderr, vxcp->vxc_binres + 1, 20); + fprintf(stderr, "GPU hash: "); + fdumphex(stderr, + (unsigned char *) (ocl_found_out + 2), 20); + fprintf(stderr, "Found delta: %d " + "Start delta: %d\n", + found_delta, orig_delta); + res = 1; + } + vocp->voc_pattern_rewrite = 1; + } else { + vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows); + } + + vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out); + return res; +} + +static int +vg_ocl_prefix_init(vg_ocl_context_t *vocp) +{ + int i; + + if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix")) + return 0; + + for (i = 0; i < vocp->voc_nslots; i++) { + if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1)) + return 0; + } + vocp->voc_rekey_func = vg_ocl_prefix_rekey; + vocp->voc_check_func = vg_ocl_prefix_check; + vocp->voc_pattern_rewrite = 1; + vocp->voc_pattern_alloc = 0; + return 1; +} + + +static int +vg_ocl_config_pattern(vg_ocl_context_t *vocp) +{ + vg_context_t *vcp = vocp->base.vxc_vc; + int i; + + i = vg_context_hash160_sort(vcp, NULL); + if (i > 0) { + if (vcp->vc_verbose > 1) + fprintf(stderr, "Using OpenCL prefix matcher\n"); + /* Configure for prefix matching */ + return vg_ocl_prefix_init(vocp); + } + + if (vcp->vc_verbose > 0) + fprintf(stderr, "WARNING: Using CPU pattern matcher\n"); + return vg_ocl_gethash_init(vocp); +} + + +/* + * Temporary buffer content verification functions + * This provides a simple test of the kernel, the OpenCL compiler, + * and the hardware. + */ +static int +vg_ocl_verify_temporary(vg_ocl_context_t *vocp, int slot, int z_inverted) +{ + vg_exec_context_t *vxcp = &vocp->base; + unsigned char *point_tmp = NULL, *z_heap = NULL; + unsigned char *ocl_points_in = NULL, *ocl_strides_in = NULL; + const EC_GROUP *pgroup; + EC_POINT *ppr = NULL, *ppc = NULL, *pps = NULL, *ppt = NULL; + BIGNUM bnz, bnez, bnm, *bnzc; + BN_CTX *bnctx = NULL; + BN_MONT_CTX *bnmont; + int ret = 0; + int mismatches = 0, mm_r; + int x, y, bx; + static const unsigned char raw_modulus[] = { + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F + }; + + BN_init(&bnz); + BN_init(&bnez); + BN_init(&bnm); + + bnctx = BN_CTX_new(); + bnmont = BN_MONT_CTX_new(); + pgroup = EC_KEY_get0_group(vxcp->vxc_key); + ppr = EC_POINT_new(pgroup); + ppc = EC_POINT_new(pgroup); + pps = EC_POINT_new(pgroup); + ppt = EC_POINT_new(pgroup); + + if (!bnctx || !bnmont || !ppr || !ppc || !pps || !ppt) { + fprintf(stderr, "ERROR: out of memory\n"); + goto out; + } + + BN_bin2bn(raw_modulus, sizeof(raw_modulus), &bnm); + BN_MONT_CTX_set(bnmont, &bnm, bnctx); + + if (z_inverted) { + bnzc = &bnez; + } else { + bnzc = &pps->Z; + } + + z_heap = (unsigned char *) + vg_ocl_map_arg_buffer(vocp, slot, 1, 0); + point_tmp = (unsigned char *) + vg_ocl_map_arg_buffer(vocp, slot, 2, 0); + ocl_points_in = (unsigned char *) + vg_ocl_map_arg_buffer(vocp, slot, 3, 0); + ocl_strides_in = (unsigned char *) + vg_ocl_map_arg_buffer(vocp, slot, 4, 0); + + if (!z_heap || !point_tmp || !ocl_points_in || !ocl_strides_in) { + fprintf(stderr, "ERROR: could not map OpenCL point buffers\n"); + goto out; + } + + for (y = 0; y < vocp->voc_ocl_rows; y++) { + vg_ocl_get_point(ppr, ocl_strides_in + (64*y)); + bx = y * vocp->voc_ocl_cols; + mm_r = 0; + + for (x = 0; x < vocp->voc_ocl_cols; x++) { + vg_ocl_get_point_tpa(ppc, ocl_points_in, x); + assert(ppr->Z_is_one && ppc->Z_is_one); + EC_POINT_add(pgroup, pps, ppc, ppr, bnctx); + assert(!pps->Z_is_one); + vg_ocl_get_point_tpa(ppt, point_tmp, bx + x); + vg_ocl_get_bignum_tpa(&bnz, z_heap, bx + x); + if (z_inverted) { + BN_mod_inverse(&bnez, &pps->Z, &bnm, bnctx); + BN_to_montgomery(&bnez, &bnez, bnmont, bnctx); + BN_to_montgomery(&bnez, &bnez, bnmont, bnctx); + } + if (BN_cmp(&ppt->X, &pps->X) || + BN_cmp(&ppt->Y, &pps->Y) || + BN_cmp(&bnz, bnzc)) { + if (!mismatches) { + fprintf(stderr, "Base privkey: "); + fdumpbn(stderr, EC_KEY_get0_private_key( + vxcp->vxc_key)); + } + mismatches++; + fprintf(stderr, "Mismatch for kernel %d, " + "offset %d (%d,%d)\n", + z_inverted, bx + x, y, x); + if (!mm_r) { + mm_r = 1; + fprintf(stderr, "Row X : "); + fdumpbn(stderr, &ppr->X); + fprintf(stderr, "Row Y : "); + fdumpbn(stderr, &ppr->Y); + } + + fprintf(stderr, "Column X: "); + fdumpbn(stderr, &ppc->X); + fprintf(stderr, "Column Y: "); + fdumpbn(stderr, &ppc->Y); + + if (BN_cmp(&ppt->X, &pps->X)) { + fprintf(stderr, "Expect X: "); + fdumpbn(stderr, &pps->X); + fprintf(stderr, "Device X: "); + fdumpbn(stderr, &ppt->X); + } + if (BN_cmp(&ppt->Y, &pps->Y)) { + fprintf(stderr, "Expect Y: "); + fdumpbn(stderr, &pps->Y); + fprintf(stderr, "Device Y: "); + fdumpbn(stderr, &ppt->Y); + } + if (BN_cmp(&bnz, bnzc)) { + fprintf(stderr, "Expect Z: "); + fdumpbn(stderr, bnzc); + fprintf(stderr, "Device Z: "); + fdumpbn(stderr, &bnz); + } + } + } + } + + ret = !mismatches; + +out: + if (z_heap) + vg_ocl_unmap_arg_buffer(vocp, slot, 1, z_heap); + if (point_tmp) + vg_ocl_unmap_arg_buffer(vocp, slot, 2, point_tmp); + if (ocl_points_in) + vg_ocl_unmap_arg_buffer(vocp, slot, 3, ocl_points_in); + if (ocl_strides_in) + vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in); + if (ppr) + EC_POINT_free(ppr); + if (ppc) + EC_POINT_free(ppc); + if (pps) + EC_POINT_free(pps); + if (ppt) + EC_POINT_free(ppt); + BN_clear_free(&bnz); + BN_clear_free(&bnez); + BN_clear_free(&bnm); + if (bnmont) + BN_MONT_CTX_free(bnmont); + if (bnctx) + BN_CTX_free(bnctx); + return ret; +} + +static int +vg_ocl_verify_k0(vg_ocl_context_t *vocp, int slot) +{ + return vg_ocl_verify_temporary(vocp, slot, 0); +} + +static int +vg_ocl_verify_k1(vg_ocl_context_t *vocp, int slot) +{ + return vg_ocl_verify_temporary(vocp, slot, 1); +} + +static void * +vg_opencl_thread(void *arg) +{ + vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg; + vg_context_t *vcp = vocp->base.vxc_vc; + int halt = 0; + int slot = -1; + int rows, cols, invsize; + unsigned long long idleu, busyu; + double pidle; + struct timeval tv, tvt, tvd, idle, busy; + + memset(&idle, 0, sizeof(idle)); + memset(&busy, 0, sizeof(busy)); + + while (1) { + pthread_mutex_lock(&vocp->voc_lock); + if (halt) { + halt = 0; + vocp->voc_halt = 1; + } + if (slot != -1) { + assert(vocp->voc_ocl_slot == slot); + vocp->voc_ocl_slot = -1; + slot = -1; + pthread_cond_signal(&vocp->voc_wait); + } + if (vocp->voc_ocl_slot == -1) { + gettimeofday(&tv, NULL); + while (vocp->voc_ocl_slot == -1) { + if (vocp->voc_halt) + goto out; + pthread_cond_wait(&vocp->voc_wait, + &vocp->voc_lock); + } + gettimeofday(&tvt, NULL); + timersub(&tvt, &tv, &tvd); + timeradd(&tvd, &idle, &idle); + } + slot = vocp->voc_ocl_slot; + rows = vocp->voc_ocl_rows; + cols = vocp->voc_ocl_cols; + invsize = vocp->voc_ocl_invsize; + pthread_mutex_unlock(&vocp->voc_lock); + + gettimeofday(&tv, NULL); + if (!vg_ocl_kernel_start(vocp, slot, cols, rows, invsize)) + halt = 1; + + if (!vg_ocl_kernel_wait(vocp, slot)) + halt = 1; + + if (vcp->vc_verbose > 1) { + gettimeofday(&tvt, NULL); + timersub(&tvt, &tv, &tvd); + timeradd(&tvd, &busy, &busy); + if ((busy.tv_sec + idle.tv_sec) > 1) { + idleu = (1000000 * idle.tv_sec) + idle.tv_usec; + busyu = (1000000 * busy.tv_sec) + busy.tv_usec; + pidle = ((double) idleu) / (idleu + busyu); + + if (pidle > 0.01) { + fprintf(stderr, "\rGPU idle: %.2f%%" + " " + " \n", + 100 * pidle); + } + memset(&idle, 0, sizeof(idle)); + memset(&busy, 0, sizeof(busy)); + } + } + } +out: + pthread_mutex_unlock(&vocp->voc_lock); + return NULL; +} + + +/* + * Address search thread main loop + */ + +void * +vg_opencl_loop(void *arg) +{ + vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg; + int i; + int round, nrows, ncols; + + const BN_ULONG rekey_max = 100000000; + BN_ULONG npoints, rekey_at; + + EC_KEY *pkey = NULL; + const EC_GROUP *pgroup; + const EC_POINT *pgen; + EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL; + EC_POINT *pseek = NULL; + + unsigned char *ocl_points_in, *ocl_strides_in; + + vg_context_t *vcp = vocp->base.vxc_vc; + vg_exec_context_t *vxcp = &vocp->base; + + int slot, nslots; + int slot_busy = 0, slot_done = 0, halt = 0; + int c = 0, output_interval = 1000; + + struct timeval tvstart; + + pkey = vxcp->vxc_key; + pgroup = EC_KEY_get0_group(pkey); + pgen = EC_GROUP_get0_generator(pgroup); + + round = vocp->voc_ocl_rows * vocp->voc_ocl_cols; + + if (!vcp->vc_remove_on_match && + (vcp->vc_chance >= 1.0f) && + (vcp->vc_chance < round) && + (vcp->vc_verbose > 0)) { + fprintf(stderr, "WARNING: low pattern difficulty\n"); + fprintf(stderr, + "WARNING: better match throughput is possible " + "using vanitygen on the CPU\n"); + } + + slot = 0; + nslots = 2; + vocp->voc_nslots = nslots; + + nrows = vocp->voc_ocl_rows; + ncols = vocp->voc_ocl_cols; + + ppbase = (EC_POINT **) malloc((nrows + ncols) * + sizeof(EC_POINT*)); + if (!ppbase) + goto enomem; + + for (i = 0; i < (nrows + ncols); i++) { + ppbase[i] = EC_POINT_new(pgroup); + if (!ppbase[i]) + goto enomem; + } + + pprow = ppbase + ncols; + pbatchinc = EC_POINT_new(pgroup); + poffset = EC_POINT_new(pgroup); + pseek = EC_POINT_new(pgroup); + if (!pbatchinc || !poffset || !pseek) + goto enomem; + + BN_set_word(&vxcp->vxc_bntmp, ncols); + EC_POINT_mul(pgroup, pbatchinc, &vxcp->vxc_bntmp, NULL, NULL, + vxcp->vxc_bnctx); + EC_POINT_make_affine(pgroup, pbatchinc, vxcp->vxc_bnctx); + + BN_set_word(&vxcp->vxc_bntmp, round); + EC_POINT_mul(pgroup, poffset, &vxcp->vxc_bntmp, NULL, NULL, + vxcp->vxc_bnctx); + EC_POINT_make_affine(pgroup, poffset, vxcp->vxc_bnctx); + + if (!vg_ocl_config_pattern(vocp)) + goto enomem; + + for (i = 0; i < nslots; i++) { + /* + * Each work group gets its own: + * - Column point array + */ + if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * nrows, 1)) + goto enomem; + } + + /* + * All instances share: + * - The z_heap and point scratch spaces + * - The row point array + */ + if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1, + round_up_pow2(32 * 2 * round, 4096), 0) || + !vg_ocl_kernel_arg_alloc(vocp, -1, 2, + round_up_pow2(32 * 2 * round, 4096), 0) || + !vg_ocl_kernel_arg_alloc(vocp, -1, 3, + round_up_pow2(32 * 2 * ncols, 4096), 1)) + goto enomem; + + npoints = 0; + rekey_at = 0; + vxcp->vxc_binres[0] = vcp->vc_addrtype; + + if (pthread_create(&vocp->voc_ocl_thread, NULL, + vg_opencl_thread, vocp)) + goto enomem; + + gettimeofday(&tvstart, NULL); + +l_rekey: + if (vocp->voc_rekey_func && + !vocp->voc_rekey_func(vocp)) + goto enomem; + + /* Generate a new random private key */ + EC_KEY_generate_key(pkey); + npoints = 0; + + /* Determine rekey interval */ + EC_GROUP_get_order(pgroup, &vxcp->vxc_bntmp, vxcp->vxc_bnctx); + BN_sub(&vxcp->vxc_bntmp2, + &vxcp->vxc_bntmp, + EC_KEY_get0_private_key(pkey)); + rekey_at = BN_get_word(&vxcp->vxc_bntmp2); + if ((rekey_at == BN_MASK2) || (rekey_at > rekey_max)) + rekey_at = rekey_max; + assert(rekey_at > 0); + + EC_POINT_copy(ppbase[0], EC_KEY_get0_public_key(pkey)); + + if (vcp->vc_pubkey_base) { + EC_POINT_add(pgroup, + ppbase[0], + ppbase[0], + vcp->vc_pubkey_base, + vxcp->vxc_bnctx); + } + + /* Build the base array of sequential points */ + for (i = 1; i < ncols; i++) { + EC_POINT_add(pgroup, + ppbase[i], + ppbase[i-1], + pgen, vxcp->vxc_bnctx); + } + + EC_POINTs_make_affine(pgroup, ncols, ppbase, vxcp->vxc_bnctx); + + /* Fill the sequential point array */ + ocl_points_in = (unsigned char *) + vg_ocl_map_arg_buffer(vocp, 0, 3, 1); + if (!ocl_points_in) + goto enomem; + for (i = 0; i < ncols; i++) + vg_ocl_put_point_tpa(ocl_points_in, i, ppbase[i]); + vg_ocl_unmap_arg_buffer(vocp, 0, 3, ocl_points_in); + + /* + * Set up the initial row increment table. + * Set the first element to pgen -- effectively + * skipping the exact key generated above. + */ + EC_POINT_copy(pprow[0], pgen); + for (i = 1; i < nrows; i++) { + EC_POINT_add(pgroup, + pprow[i], + pprow[i-1], + pbatchinc, vxcp->vxc_bnctx); + } + EC_POINTs_make_affine(pgroup, nrows, pprow, vxcp->vxc_bnctx); + vxcp->vxc_delta = 1; + npoints = 1; + slot = 0; + slot_busy = 0; + slot_done = 0; + + while (1) { + if (slot_done) { + assert(rekey_at > 0); + slot_done = 0; + + /* Call the result check function */ + switch (vocp->voc_check_func(vocp, slot)) { + case 1: + rekey_at = 0; + break; + case 2: + halt = 1; + break; + default: + break; + } + + c += round; + if (!halt && (c >= output_interval)) { + output_interval = + vg_output_timing(vcp, c, &tvstart); + c = 0; + } + } + + if (vcp->vc_halt) + halt = 1; + if (halt) + break; + + if ((npoints + round) < rekey_at) { + if (npoints > 1) { + /* Move the row increments forward */ + for (i = 0; i < nrows; i++) { + EC_POINT_add(pgroup, + pprow[i], + pprow[i], + poffset, + vxcp->vxc_bnctx); + } + + EC_POINTs_make_affine(pgroup, nrows, pprow, + vxcp->vxc_bnctx); + } + + /* Copy the row stride array to the device */ + ocl_strides_in = (unsigned char *) + vg_ocl_map_arg_buffer(vocp, slot, 4, 1); + if (!ocl_strides_in) + goto enomem; + memset(ocl_strides_in, 0, 64*nrows); + for (i = 0; i < nrows; i++) + vg_ocl_put_point(ocl_strides_in + (64*i), + pprow[i]); + vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in); + npoints += round; + + pthread_mutex_lock(&vocp->voc_lock); + while (vocp->voc_ocl_slot != -1) { + assert(slot_busy); + pthread_cond_wait(&vocp->voc_wait, + &vocp->voc_lock); + } + + if (vocp->voc_halt) { + pthread_mutex_unlock(&vocp->voc_lock); + halt = 1; + break; + } + + vocp->voc_ocl_slot = slot; + pthread_cond_signal(&vocp->voc_wait); + pthread_mutex_unlock(&vocp->voc_lock); + + slot_done = slot_busy; + slot_busy = 1; + slot = (slot + 1) % nslots; + + } else { + if (slot_busy) { + pthread_mutex_lock(&vocp->voc_lock); + while (vocp->voc_ocl_slot != -1) { + assert(vocp->voc_ocl_slot == + ((slot + nslots - 1) % nslots)); + pthread_cond_wait(&vocp->voc_wait, + &vocp->voc_lock); + } + pthread_mutex_unlock(&vocp->voc_lock); + slot_busy = 0; + slot_done = 1; + } + + if (!rekey_at || + (!slot_done && ((npoints + round) >= rekey_at))) + goto l_rekey; + } + } + + if (0) { + enomem: + fprintf(stderr, "ERROR: allocation failure?\n"); + } + + if (halt) { + if (vcp->vc_verbose > 1) { + printf("Halting..."); + fflush(stdout); + } + pthread_mutex_lock(&vocp->voc_lock); + vocp->voc_halt = 1; + pthread_cond_signal(&vocp->voc_wait); + while (vocp->voc_ocl_slot != -1) { + assert(slot_busy); + pthread_cond_wait(&vocp->voc_wait, + &vocp->voc_lock); + } + slot_busy = 0; + pthread_mutex_unlock(&vocp->voc_lock); + pthread_join(vocp->voc_ocl_thread, NULL); + if (vcp->vc_verbose > 1) + printf("done!\n"); + } + + if (ppbase) { + for (i = 0; i < (nrows + ncols); i++) + if (ppbase[i]) + EC_POINT_free(ppbase[i]); + free(ppbase); + } + if (pbatchinc) + EC_POINT_free(pbatchinc); + + /* Release the argument buffers */ + vg_ocl_free_args(vocp); + vocp->voc_halt = 0; + vocp->voc_ocl_slot = -1; + return NULL; +} + + + + +/* + * OpenCL platform/device selection junk + */ + +static int +get_device_list(cl_platform_id pid, cl_device_id **list_out) +{ + cl_uint nd; + cl_int res; + cl_device_id *ids; + res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, NULL, &nd); + if (res != CL_SUCCESS) { + vg_ocl_error(NULL, res, "clGetDeviceIDs(0)"); + *list_out = NULL; + return -1; + } + if (nd) { + ids = (cl_device_id *) malloc(nd * sizeof(*ids)); + if (ids == NULL) { + fprintf(stderr, "Could not allocate device ID list\n"); + *list_out = NULL; + return -1; + } + res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, nd, ids, NULL); + if (res != CL_SUCCESS) { + vg_ocl_error(NULL, res, "clGetDeviceIDs(n)"); + free(ids); + *list_out = NULL; + return -1; + } + *list_out = ids; + } + return nd; +} + +static void +show_devices(cl_platform_id pid, cl_device_id *ids, int nd, int base) +{ + int i; + char nbuf[128]; + char vbuf[128]; + size_t len; + cl_int res; + + for (i = 0; i < nd; i++) { + res = clGetDeviceInfo(ids[i], CL_DEVICE_NAME, + sizeof(nbuf), nbuf, &len); + if (res != CL_SUCCESS) + continue; + if (len >= sizeof(nbuf)) + len = sizeof(nbuf) - 1; + nbuf[len] = '\0'; + res = clGetDeviceInfo(ids[i], CL_DEVICE_VENDOR, + sizeof(vbuf), vbuf, &len); + if (res != CL_SUCCESS) + continue; + if (len >= sizeof(vbuf)) + len = sizeof(vbuf) - 1; + vbuf[len] = '\0'; + fprintf(stderr, " %d: [%s] %s\n", i + base, vbuf, nbuf); + } +} + +static cl_device_id +get_device(cl_platform_id pid, int num) +{ + int nd; + cl_device_id id, *ids; + + nd = get_device_list(pid, &ids); + if (nd < 0) + return NULL; + if (!nd) { + fprintf(stderr, "No OpenCL devices found\n"); + return NULL; + } + if (num < 0) { + if (nd == 1) + num = 0; + else + num = nd; + } + if (num < nd) { + id = ids[num]; + free(ids); + return id; + } + free(ids); + return NULL; +} + +static int +get_platform_list(cl_platform_id **list_out) +{ + cl_uint np; + cl_int res; + cl_platform_id *ids; + res = clGetPlatformIDs(0, NULL, &np); + if (res != CL_SUCCESS) { + vg_ocl_error(NULL, res, "clGetPlatformIDs(0)"); + *list_out = NULL; + return -1; + } + if (np) { + ids = (cl_platform_id *) malloc(np * sizeof(*ids)); + if (ids == NULL) { + fprintf(stderr, + "Could not allocate platform ID list\n"); + *list_out = NULL; + return -1; + } + res = clGetPlatformIDs(np, ids, NULL); + if (res != CL_SUCCESS) { + vg_ocl_error(NULL, res, "clGetPlatformIDs(n)"); + free(ids); + *list_out = NULL; + return -1; + } + *list_out = ids; + } + return np; +} + +void +show_platforms(cl_platform_id *ids, int np, int base) +{ + int i; + char nbuf[128]; + char vbuf[128]; + size_t len; + cl_int res; + + for (i = 0; i < np; i++) { + res = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, + sizeof(nbuf), nbuf, &len); + if (res != CL_SUCCESS) { + vg_ocl_error(NULL, res, "clGetPlatformInfo(NAME)"); + continue; + } + if (len >= sizeof(nbuf)) + len = sizeof(nbuf) - 1; + nbuf[len] = '\0'; + res = clGetPlatformInfo(ids[i], CL_PLATFORM_VENDOR, + sizeof(vbuf), vbuf, &len); + if (res != CL_SUCCESS) { + vg_ocl_error(NULL, res, "clGetPlatformInfo(VENDOR)"); + continue; + } + if (len >= sizeof(vbuf)) + len = sizeof(vbuf) - 1; + vbuf[len] = '\0'; + fprintf(stderr, "%d: [%s] %s\n", i + base, vbuf, nbuf); + } +} + +static cl_platform_id +get_platform(int num) +{ + int np; + cl_platform_id id, *ids; + + np = get_platform_list(&ids); + if (np < 0) + return NULL; + if (!np) { + fprintf(stderr, "No OpenCL platforms available\n"); + return NULL; + } + if (num < 0) { + if (np == 1) + num = 0; + else + num = np; + } + if (num < np) { + id = ids[num]; + free(ids); + return id; + } + free(ids); + return NULL; +} + +void +enumerate_opencl(void) +{ + cl_platform_id *pids; + cl_device_id *dids; + int np, nd, i; + + np = get_platform_list(&pids); + if (!np) { + fprintf(stderr, "No OpenCL platforms available\n"); + return; + } + fprintf(stderr, "Available OpenCL platforms:\n"); + for (i = 0; i < np; i++) { + show_platforms(&pids[i], 1, i); + nd = get_device_list(pids[i], &dids); + if (!nd) { + fprintf(stderr, " -- No devices\n"); + } else { + show_devices(pids[i], dids, nd, 0); + } + } +} + +cl_device_id +get_opencl_device(int platformidx, int deviceidx) +{ + cl_platform_id pid; + cl_device_id did = NULL; + + pid = get_platform(platformidx); + if (pid) { + did = get_device(pid, deviceidx); + if (did) + return did; + } + enumerate_opencl(); + return NULL; +} + + +vg_ocl_context_t * +vg_ocl_context_new(vg_context_t *vcp, + int platformidx, int deviceidx, int safe_mode, int verify, + int worksize, int nthreads, int nrows, int ncols, + int invsize) +{ + cl_device_id did; + int round, full_threads, wsmult; + cl_ulong memsize, allocsize; + vg_ocl_context_t *vocp; + + /* Find the device */ + did = get_opencl_device(platformidx, deviceidx); + if (!did) { + return 0; + } + + vocp = (vg_ocl_context_t *) malloc(sizeof(*vocp)); + if (!vocp) + return NULL; + + /* Open the device and compile the kernel */ + if (!vg_ocl_init(vcp, vocp, did, safe_mode)) { + free(vocp); + return NULL; + } + + if (verify) { + if (vcp->vc_verbose > 0) { + fprintf(stderr, "WARNING: " + "Hardware verification mode enabled\n"); + } + if (!nthreads) + nthreads = 1; + vocp->voc_verify_func[0] = vg_ocl_verify_k0; + vocp->voc_verify_func[1] = vg_ocl_verify_k1; + } + + /* + * nrows: number of point rows per job + * ncols: number of point columns per job + * invsize: number of modular inversion tasks per job + * (each task performs (nrows*ncols)/invsize inversions) + * nslots: number of kernels + * (create two, keep one running while we service the other or wait) + */ + + if (!nthreads) { + /* Pick nthreads sufficient to saturate one compute unit */ + if (vg_ocl_device_gettype(vocp->voc_ocldid) & + CL_DEVICE_TYPE_CPU) + nthreads = 1; + else + nthreads = vg_ocl_device_getsizet(vocp->voc_ocldid, + CL_DEVICE_MAX_WORK_GROUP_SIZE); + } + + full_threads = vg_ocl_device_getsizet(vocp->voc_ocldid, + CL_DEVICE_MAX_COMPUTE_UNITS); + full_threads *= nthreads; + + /* + * The work size selection is complicated, and the most + * important factor is the batch size of the heap_invert kernel. + * Each value added to the batch trades one complete modular + * inversion for four multiply operations. Ideally the work + * size would be as large as possible. The practical limiting + * factors are: + * 1. Available memory + * 2. Responsiveness and operational latency + * + * We take a naive approach and limit batch size to a point of + * sufficiently diminishing returns, hoping that responsiveness + * will be sufficient. + * + * The measured value for the OpenSSL implementations on my CPU + * is 80:1. This causes heap_invert to get batches of 20 or so + * for free, and receive 10% incremental returns at 200. The CPU + * work size is therefore set to 256. + * + * The ratio on most GPUs with the oclvanitygen implementations + * is closer to 500:1, and larger batches are required for + * good performance. + */ + if (!worksize) { + if (vg_ocl_device_gettype(vocp->voc_ocldid) & + CL_DEVICE_TYPE_GPU) + worksize = 2048; + else + worksize = 256; + } + + if (!ncols) { + memsize = vg_ocl_device_getulong(vocp->voc_ocldid, + CL_DEVICE_GLOBAL_MEM_SIZE); + allocsize = vg_ocl_device_getulong(vocp->voc_ocldid, + CL_DEVICE_MAX_MEM_ALLOC_SIZE); + memsize /= 2; + ncols = full_threads; + nrows = 2; + /* Find row and column counts close to sqrt(full_threads) */ + while ((ncols > nrows) && !(ncols & 1)) { + ncols /= 2; + nrows *= 2; + } + + /* + * Increase row & column counts to satisfy work size + * multiplier or fill available memory. + */ + wsmult = 1; + while ((!worksize || ((wsmult * 2) <= worksize)) && + ((ncols * nrows * 2 * 128) < memsize) && + ((ncols * nrows * 2 * 64) < allocsize)) { + if (ncols > nrows) + nrows *= 2; + else + ncols *= 2; + wsmult *= 2; + } + } + + round = nrows * ncols; + + if (!invsize) { + invsize = 2; + while (!(round % (invsize << 1)) && + ((round / invsize) > full_threads)) + invsize <<= 1; + } + + if (vcp->vc_verbose > 1) { + fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows); + fprintf(stderr, "Modular inverse: %d threads, %d ops each\n", + round/invsize, invsize); + } + + if ((round % invsize) || !is_pow2(invsize) || (invsize < 2)) { + if (vcp->vc_verbose <= 1) { + fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows); + fprintf(stderr, + "Modular inverse: %d threads, %d ops each\n", + round/invsize, invsize); + } + if (round % invsize) + fprintf(stderr, + "Modular inverse work size must " + "evenly divide points\n"); + else + fprintf(stderr, + "Modular inverse work per task (%d) " + "must be a power of 2\n", invsize); + goto out_fail; + } + + vocp->voc_ocl_rows = nrows; + vocp->voc_ocl_cols = ncols; + vocp->voc_ocl_invsize = invsize; + + return vocp; + +out_fail: + vg_ocl_context_free(vocp); + return NULL; +} + +void +vg_ocl_context_free(vg_ocl_context_t *vocp) +{ + vg_ocl_del(vocp); + free(vocp); +} diff --git a/oclengine.h b/oclengine.h new file mode 100644 index 0000000..d13c1f0 --- /dev/null +++ b/oclengine.h @@ -0,0 +1,35 @@ +/* + * Vanitygen, vanity bitcoin address generator + * Copyright (C) 2011 + * + * Vanitygen is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * Vanitygen is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with Vanitygen. If not, see . + */ + +#if !defined (__VG_OCLENGINE_H__) +#define __VG_OCLENGINE_H__ + +#include "pattern.h" + +typedef struct _vg_ocl_context_s vg_ocl_context_t; + +extern vg_ocl_context_t *vg_ocl_context_new( + vg_context_t *vcp, int platformidx, int deviceidx, + int safe_mode, int verify, + int worksize, int nthreads, int nrows, int ncols, + int invsize); +extern void vg_ocl_context_free(vg_ocl_context_t *vocp); + +extern void *vg_opencl_loop(void *vocp); + +#endif /* !defined (__VG_OCLENGINE_H__) */ diff --git a/oclvanitygen.c b/oclvanitygen.c index 7b204fb..0284ea0 100644 --- a/oclvanitygen.c +++ b/oclvanitygen.c @@ -21,2517 +21,17 @@ #include #include -#include - -#include -#include -#include -#include - -#ifdef __APPLE__ -#include -#ifndef CL_CALLBACK -#define CL_CALLBACK -#endif -#else -#include -#endif - -#include "pattern.h" -#include "util.h" - - -const char *version = "0.17"; -const int debug = 0; - -#define MAX_SLOT 2 -#define MAX_ARG 6 -#define MAX_KERNEL 3 - -#define is_pow2(v) (!((v) & ((v)-1))) -#define round_up_pow2(x, a) (((x) + ((a)-1)) & ~((a)-1)) - -/* OpenCL address searching mode */ -struct _vg_ocl_context_s; -typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *); -typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot); - -typedef struct _vg_ocl_context_s { - vg_exec_context_t base; - cl_device_id voc_ocldid; - cl_context voc_oclctx; - cl_command_queue voc_oclcmdq; - cl_program voc_oclprog; - vg_ocl_init_t voc_init_func; - vg_ocl_init_t voc_rekey_func; - vg_ocl_check_t voc_check_func; - int voc_quirks; - int voc_nslots; - cl_kernel voc_oclkernel[MAX_SLOT][MAX_KERNEL]; - cl_event voc_oclkrnwait[MAX_SLOT]; - cl_mem voc_args[MAX_SLOT][MAX_ARG]; - size_t voc_arg_size[MAX_SLOT][MAX_ARG]; - - int voc_pattern_rewrite; - int voc_pattern_alloc; - - vg_ocl_check_t voc_verify_func[MAX_KERNEL]; - - pthread_t voc_ocl_thread; - pthread_mutex_t voc_lock; - pthread_cond_t voc_wait; - int voc_ocl_slot; - int voc_ocl_rows; - int voc_ocl_cols; - int voc_ocl_invsize; - int voc_halt; - int voc_rekey; - int voc_dump_done; -} vg_ocl_context_t; - - -/* Thread synchronization stubs */ -void -vg_exec_downgrade_lock(vg_exec_context_t *vxcp) -{ -} - -int -vg_exec_upgrade_lock(vg_exec_context_t *vxcp) -{ - return 0; -} - - -/* - * OpenCL debugging and support - */ - -const char * -vg_ocl_strerror(cl_int ret) -{ -#define OCL_STATUS(st) case st: return #st; - switch (ret) { - OCL_STATUS(CL_SUCCESS); - OCL_STATUS(CL_DEVICE_NOT_FOUND); - OCL_STATUS(CL_DEVICE_NOT_AVAILABLE); - OCL_STATUS(CL_COMPILER_NOT_AVAILABLE); - OCL_STATUS(CL_MEM_OBJECT_ALLOCATION_FAILURE); - OCL_STATUS(CL_OUT_OF_RESOURCES); - OCL_STATUS(CL_OUT_OF_HOST_MEMORY); - OCL_STATUS(CL_PROFILING_INFO_NOT_AVAILABLE); - OCL_STATUS(CL_MEM_COPY_OVERLAP); - OCL_STATUS(CL_IMAGE_FORMAT_MISMATCH); - OCL_STATUS(CL_IMAGE_FORMAT_NOT_SUPPORTED); - OCL_STATUS(CL_BUILD_PROGRAM_FAILURE); - OCL_STATUS(CL_MAP_FAILURE); -#if defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) - OCL_STATUS(CL_MISALIGNED_SUB_BUFFER_OFFSET); -#endif /* defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) */ -#if defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) - OCL_STATUS(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); -#endif /* defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) */ - OCL_STATUS(CL_INVALID_VALUE); - OCL_STATUS(CL_INVALID_DEVICE_TYPE); - OCL_STATUS(CL_INVALID_PLATFORM); - OCL_STATUS(CL_INVALID_DEVICE); - OCL_STATUS(CL_INVALID_CONTEXT); - OCL_STATUS(CL_INVALID_QUEUE_PROPERTIES); - OCL_STATUS(CL_INVALID_COMMAND_QUEUE); - OCL_STATUS(CL_INVALID_HOST_PTR); - OCL_STATUS(CL_INVALID_MEM_OBJECT); - OCL_STATUS(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); - OCL_STATUS(CL_INVALID_IMAGE_SIZE); - OCL_STATUS(CL_INVALID_SAMPLER); - OCL_STATUS(CL_INVALID_BINARY); - OCL_STATUS(CL_INVALID_BUILD_OPTIONS); - OCL_STATUS(CL_INVALID_PROGRAM); - OCL_STATUS(CL_INVALID_PROGRAM_EXECUTABLE); - OCL_STATUS(CL_INVALID_KERNEL_NAME); - OCL_STATUS(CL_INVALID_KERNEL_DEFINITION); - OCL_STATUS(CL_INVALID_KERNEL); - OCL_STATUS(CL_INVALID_ARG_INDEX); - OCL_STATUS(CL_INVALID_ARG_VALUE); - OCL_STATUS(CL_INVALID_ARG_SIZE); - OCL_STATUS(CL_INVALID_KERNEL_ARGS); - OCL_STATUS(CL_INVALID_WORK_DIMENSION); - OCL_STATUS(CL_INVALID_WORK_GROUP_SIZE); - OCL_STATUS(CL_INVALID_WORK_ITEM_SIZE); - OCL_STATUS(CL_INVALID_GLOBAL_OFFSET); - OCL_STATUS(CL_INVALID_EVENT_WAIT_LIST); - OCL_STATUS(CL_INVALID_EVENT); - OCL_STATUS(CL_INVALID_OPERATION); - OCL_STATUS(CL_INVALID_GL_OBJECT); - OCL_STATUS(CL_INVALID_BUFFER_SIZE); - OCL_STATUS(CL_INVALID_MIP_LEVEL); - OCL_STATUS(CL_INVALID_GLOBAL_WORK_SIZE); -#if defined(CL_INVALID_PROPERTY) - OCL_STATUS(CL_INVALID_PROPERTY); -#endif /* defined(CL_INVALID_PROPERTY) */ -#undef OCL_STATUS - default: { - static char tmp[64]; - snprintf(tmp, sizeof(tmp), "Unknown code %d", ret); - return tmp; - } - } -} - -/* Get device strings, using a static buffer -- caveat emptor */ -const char * -vg_ocl_platform_getstr(cl_platform_id pid, cl_platform_info param) -{ - static char platform_str[1024]; - cl_int ret; - size_t size_ret; - ret = clGetPlatformInfo(pid, param, - sizeof(platform_str), platform_str, - &size_ret); - if (ret != CL_SUCCESS) { - snprintf(platform_str, sizeof(platform_str), - "clGetPlatformInfo(%d): %s", - param, vg_ocl_strerror(ret)); - } - return platform_str; -} - -cl_platform_id -vg_ocl_device_getplatform(cl_device_id did) -{ - cl_int ret; - cl_platform_id val; - size_t size_ret; - ret = clGetDeviceInfo(did, CL_DEVICE_PLATFORM, - sizeof(val), &val, &size_ret); - if (ret != CL_SUCCESS) { - fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_PLATFORM): %s", - vg_ocl_strerror(ret)); - } - return val; -} - -cl_device_type -vg_ocl_device_gettype(cl_device_id did) -{ - cl_int ret; - cl_device_type val; - size_t size_ret; - ret = clGetDeviceInfo(did, CL_DEVICE_TYPE, - sizeof(val), &val, &size_ret); - if (ret != CL_SUCCESS) { - fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_TYPE): %s", - vg_ocl_strerror(ret)); - } - return val; -} - -const char * -vg_ocl_device_getstr(cl_device_id did, cl_device_info param) -{ - static char device_str[1024]; - cl_int ret; - size_t size_ret; - ret = clGetDeviceInfo(did, param, - sizeof(device_str), device_str, - &size_ret); - if (ret != CL_SUCCESS) { - snprintf(device_str, sizeof(device_str), - "clGetDeviceInfo(%d): %s", - param, vg_ocl_strerror(ret)); - } - return device_str; -} - -size_t -vg_ocl_device_getsizet(cl_device_id did, cl_device_info param) -{ - cl_int ret; - size_t val; - size_t size_ret; - ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); - if (ret != CL_SUCCESS) { - fprintf(stderr, - "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); - } - return val; -} - -cl_ulong -vg_ocl_device_getulong(cl_device_id did, cl_device_info param) -{ - cl_int ret; - cl_ulong val; - size_t size_ret; - ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); - if (ret != CL_SUCCESS) { - fprintf(stderr, - "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); - } - return val; -} - -cl_uint -vg_ocl_device_getuint(cl_device_id did, cl_device_info param) -{ - cl_int ret; - cl_uint val; - size_t size_ret; - ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); - if (ret != CL_SUCCESS) { - fprintf(stderr, - "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); - } - return val; -} - -void -vg_ocl_dump_info(vg_ocl_context_t *vocp) -{ - cl_device_id did; - if (vocp->base.vxc_vc && (vocp->base.vxc_vc->vc_verbose < 1)) - return; - if (vocp->voc_dump_done) - return; - did = vocp->voc_ocldid; - fprintf(stderr, "Device: %s\n", - vg_ocl_device_getstr(did, CL_DEVICE_NAME)); - fprintf(stderr, "Vendor: %s (%04x)\n", - vg_ocl_device_getstr(did, CL_DEVICE_VENDOR), - vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID)); - fprintf(stderr, "Driver: %s\n", - vg_ocl_device_getstr(did, CL_DRIVER_VERSION)); - fprintf(stderr, "Profile: %s\n", - vg_ocl_device_getstr(did, CL_DEVICE_PROFILE)); - fprintf(stderr, "Version: %s\n", - vg_ocl_device_getstr(did, CL_DEVICE_VERSION)); - fprintf(stderr, "Max compute units: %"PRSIZET"d\n", - vg_ocl_device_getsizet(did, CL_DEVICE_MAX_COMPUTE_UNITS)); - fprintf(stderr, "Max workgroup size: %"PRSIZET"d\n", - vg_ocl_device_getsizet(did, CL_DEVICE_MAX_WORK_GROUP_SIZE)); - fprintf(stderr, "Global memory: %ld\n", - vg_ocl_device_getulong(did, CL_DEVICE_GLOBAL_MEM_SIZE)); - fprintf(stderr, "Max allocation: %ld\n", - vg_ocl_device_getulong(did, CL_DEVICE_MAX_MEM_ALLOC_SIZE)); - vocp->voc_dump_done = 1; -} - -void -vg_ocl_error(vg_ocl_context_t *vocp, int code, const char *desc) -{ - const char *err = vg_ocl_strerror(code); - if (desc) { - fprintf(stderr, "%s: %s\n", desc, err); - } else { - fprintf(stderr, "%s\n", err); - } - - if (vocp && vocp->voc_ocldid) - vg_ocl_dump_info(vocp); -} - -void -vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog) -{ - size_t logbufsize, logsize; - char *log; - int off = 0; - cl_int ret; - - ret = clGetProgramBuildInfo(prog, - vocp->voc_ocldid, - CL_PROGRAM_BUILD_LOG, - 0, NULL, - &logbufsize); - if (ret != CL_SUCCESS) { - vg_ocl_error(NULL, ret, "clGetProgramBuildInfo"); - return; - } - - log = (char *) malloc(logbufsize); - if (!log) { - fprintf(stderr, "Could not allocate build log buffer\n"); - return; - } - - ret = clGetProgramBuildInfo(prog, - vocp->voc_ocldid, - CL_PROGRAM_BUILD_LOG, - logbufsize, - log, - &logsize); - if (ret != CL_SUCCESS) { - vg_ocl_error(NULL, ret, "clGetProgramBuildInfo"); - - } else { - /* Remove leading newlines and trailing newlines/whitespace */ - log[logbufsize-1] = '\0'; - for (off = logsize - 1; off >= 0; off--) { - if ((log[off] != '\r') && - (log[off] != '\n') && - (log[off] != ' ') && - (log[off] != '\t') && - (log[off] != '\0')) - break; - log[off] = '\0'; - } - for (off = 0; off < logbufsize; off++) { - if ((log[off] != '\r') && - (log[off] != '\n')) - break; - } - - fprintf(stderr, "Build log:\n%s\n", &log[off]); - } - free(log); -} - -/* - * OpenCL per-exec functions - */ - -enum { - VG_OCL_DEEP_PREPROC_UNROLL = (1 << 0), - VG_OCL_PRAGMA_UNROLL = (1 << 1), - VG_OCL_EXPENSIVE_BRANCHES = (1 << 2), - VG_OCL_DEEP_VLIW = (1 << 3), - VG_OCL_AMD_BFI_INT = (1 << 4), - VG_OCL_NV_VERBOSE = (1 << 5), - VG_OCL_BROKEN = (1 << 6), - VG_OCL_NO_BINARIES = (1 << 7), - - VG_OCL_OPTIMIZATIONS = (VG_OCL_DEEP_PREPROC_UNROLL | - VG_OCL_PRAGMA_UNROLL | - VG_OCL_EXPENSIVE_BRANCHES | - VG_OCL_DEEP_VLIW | - VG_OCL_AMD_BFI_INT), - -}; - -int -vg_ocl_get_quirks(vg_ocl_context_t *vocp) -{ - uint32_t vend; - const char *dvn; - unsigned int quirks = 0; - - quirks |= VG_OCL_DEEP_PREPROC_UNROLL; - - vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID); - switch (vend) { - case 0x10de: /* NVIDIA */ - /* - * NVIDIA's compiler seems to take a really really long - * time when using preprocessor unrolling, but works - * well with pragma unroll. - */ - quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL; - quirks |= VG_OCL_PRAGMA_UNROLL; - quirks |= VG_OCL_NV_VERBOSE; - break; - case 0x1002: /* AMD/ATI */ - /* - * AMD's compiler works best with preprocesor unrolling. - * Pragma unroll is unreliable with AMD's compiler and - * seems to crash based on whether the gods were smiling - * when Catalyst was last installed/upgraded. - */ - if (vg_ocl_device_gettype(vocp->voc_ocldid) & - CL_DEVICE_TYPE_GPU) { - quirks |= VG_OCL_EXPENSIVE_BRANCHES; - quirks |= VG_OCL_DEEP_VLIW; - dvn = vg_ocl_device_getstr(vocp->voc_ocldid, - CL_DEVICE_EXTENSIONS); - if (dvn && strstr(dvn, "cl_amd_media_ops")) - quirks |= VG_OCL_AMD_BFI_INT; - - dvn = vg_ocl_device_getstr(vocp->voc_ocldid, - CL_DEVICE_NAME); - if (!strcmp(dvn, "ATI RV710")) { - quirks &= ~VG_OCL_OPTIMIZATIONS; - quirks |= VG_OCL_NO_BINARIES; - } - } - break; - default: - break; - } - return quirks; -} - -int -vg_ocl_create_kernel(vg_ocl_context_t *vocp, int knum, const char *func) -{ - int i; - cl_kernel krn; - cl_int ret; - - for (i = 0; i < MAX_SLOT; i++) { - krn = clCreateKernel(vocp->voc_oclprog, func, &ret); - if (!krn) { - fprintf(stderr, "clCreateKernel(%d): ", i); - vg_ocl_error(vocp, ret, NULL); - while (--i >= 0) { - clReleaseKernel(vocp->voc_oclkernel[i][knum]); - vocp->voc_oclkernel[i][knum] = NULL; - } - return 0; - } - vocp->voc_oclkernel[i][knum] = krn; - vocp->voc_oclkrnwait[i] = NULL; - } - return 1; -} - -void -vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts, - const char *program, size_t size, - unsigned char *hash_out) -{ - MD5_CTX ctx; - cl_platform_id pid; - const char *str; - - MD5_Init(&ctx); - pid = vg_ocl_device_getplatform(vocp->voc_ocldid); - str = vg_ocl_platform_getstr(pid, CL_PLATFORM_NAME); - MD5_Update(&ctx, str, strlen(str) + 1); - str = vg_ocl_platform_getstr(pid, CL_PLATFORM_VERSION); - MD5_Update(&ctx, str, strlen(str) + 1); - str = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME); - MD5_Update(&ctx, str, strlen(str) + 1); - if (opts) - MD5_Update(&ctx, opts, strlen(opts) + 1); - if (size) - MD5_Update(&ctx, program, size); - MD5_Final(hash_out, &ctx); -} - -typedef struct { - unsigned char e_ident[16]; - uint16_t e_type; - uint16_t e_machine; - uint32_t e_version; - uint32_t e_entry; - uint32_t e_phoff; - uint32_t e_shoff; - uint32_t e_flags; - uint16_t e_ehsize; - uint16_t e_phentsize; - uint16_t e_phnum; - uint16_t e_shentsize; - uint16_t e_shnum; - uint16_t e_shstrndx; -} vg_elf32_header_t; - -typedef struct { - uint32_t sh_name; - uint32_t sh_type; - uint32_t sh_flags; - uint32_t sh_addr; - uint32_t sh_offset; - uint32_t sh_size; - uint32_t sh_link; - uint32_t sh_info; - uint32_t sh_addralign; - uint32_t sh_entsize; -} vg_elf32_shdr_t; - -int -vg_ocl_amd_patch_inner(unsigned char *binary, size_t size) -{ - vg_elf32_header_t *ehp; - vg_elf32_shdr_t *shp, *nshp; - uint32_t *instr; - size_t off; - int i, n, txt2idx, patched; - - ehp = (vg_elf32_header_t *) binary; - if ((size < sizeof(*ehp)) || - memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) || - !ehp->e_shoff) - return 0; - - off = ehp->e_shoff + (ehp->e_shstrndx * ehp->e_shentsize); - nshp = (vg_elf32_shdr_t *) (binary + off); - if ((off + sizeof(*nshp)) > size) - return 0; - - shp = (vg_elf32_shdr_t *) (binary + ehp->e_shoff); - n = 0; - txt2idx = 0; - for (i = 0; i < ehp->e_shnum; i++) { - off = nshp->sh_offset + shp[i].sh_name; - if (((off + 6) >= size) || - memcmp(binary + off, ".text", 6)) - continue; - n++; - if (n == 2) - txt2idx = i; - } - if (n != 2) - return 0; - - off = shp[txt2idx].sh_offset; - instr = (uint32_t *) (binary + off); - n = shp[txt2idx].sh_size / 4; - patched = 0; - for (i = 0; i < n; i += 2) { - if (((instr[i] & 0x02001000) == 0) && - ((instr[i+1] & 0x9003f000) == 0x0001a000)) { - instr[i+1] ^= (0x0001a000 ^ 0x0000c000); - patched++; - } - } - - return patched; -} - -int -vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size) -{ - vg_context_t *vcp = vocp->base.vxc_vc; - vg_elf32_header_t *ehp; - unsigned char *ptr; - size_t offset = 1; - int ninner = 0, nrun, npatched = 0; - - ehp = (vg_elf32_header_t *) binary; - if ((size < sizeof(*ehp)) || - memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\0", 8) || - !ehp->e_shoff) - return 0; - - offset = 1; - while (offset < (size - 8)) { - ptr = (unsigned char *) memchr(binary + offset, - 0x7f, - size - offset); - if (!ptr) - return npatched; - offset = ptr - binary; - ehp = (vg_elf32_header_t *) ptr; - if (((size - offset) < sizeof(*ehp)) || - memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) || - !ehp->e_shoff) { - offset += 1; - continue; - } - - ninner++; - nrun = vg_ocl_amd_patch_inner(ptr, size - offset); - npatched += nrun; - if (vcp->vc_verbose > 1) - fprintf(stderr, "AMD BFI_INT: patched %d instructions " - "in kernel %d\n", - nrun, ninner); - npatched++; - offset += 1; - } - return npatched; -} - - -int -vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp, - const char *filename, const char *opts) -{ - FILE *kfp; - char *buf, *tbuf; - int len, fromsource = 0, patched = 0; - size_t sz, szr; - cl_program prog; - cl_int ret, sts; - unsigned char prog_hash[16]; - char bin_name[64]; - - if (vcp->vc_verbose > 1) - fprintf(stderr, - "OpenCL compiler flags: %s\n", opts ? opts : ""); - - sz = 128 * 1024; - buf = (char *) malloc(sz); - if (!buf) { - fprintf(stderr, "Could not allocate program buffer\n"); - return 0; - } - - kfp = fopen(filename, "r"); - if (!kfp) { - fprintf(stderr, "Error loading kernel file '%s': %s\n", - filename, strerror(errno)); - free(buf); - return 0; - } - - len = fread(buf, 1, sz, kfp); - fclose(kfp); - - if (!len) { - fprintf(stderr, "Short read on CL kernel\n"); - free(buf); - return 0; - } - - vg_ocl_hash_program(vocp, opts, buf, len, prog_hash); - snprintf(bin_name, sizeof(bin_name), - "%02x%02x%02x%02x%02x%02x%02x%02x" - "%02x%02x%02x%02x%02x%02x%02x%02x.oclbin", - prog_hash[0], prog_hash[1], prog_hash[2], prog_hash[3], - prog_hash[4], prog_hash[5], prog_hash[6], prog_hash[7], - prog_hash[8], prog_hash[9], prog_hash[10], prog_hash[11], - prog_hash[12], prog_hash[13], prog_hash[14], prog_hash[15]); - - if (vocp->voc_quirks & VG_OCL_NO_BINARIES) { - kfp = NULL; - if (vcp->vc_verbose > 1) - fprintf(stderr, "Binary OpenCL programs disabled\n"); - } else { - kfp = fopen(bin_name, "rb"); - } - - if (!kfp) { - /* No binary available, create with source */ - fromsource = 1; - sz = len; - prog = clCreateProgramWithSource(vocp->voc_oclctx, - 1, (const char **) &buf, &sz, - &ret); - } else { - if (vcp->vc_verbose > 1) - fprintf(stderr, "Loading kernel binary %s\n", bin_name); - szr = 0; - while (!feof(kfp)) { - len = fread(buf + szr, 1, sz - szr, kfp); - if (!len) { - fprintf(stderr, - "Short read on CL kernel binary\n"); - fclose(kfp); - free(buf); - return 0; - } - szr += len; - if (szr == sz) { - tbuf = (char *) realloc(buf, sz*2); - if (!tbuf) { - fprintf(stderr, - "Could not expand CL kernel " - "binary buffer\n"); - fclose(kfp); - free(buf); - return 0; - } - buf = tbuf; - sz *= 2; - } - } - fclose(kfp); - rebuild: - prog = clCreateProgramWithBinary(vocp->voc_oclctx, - 1, &vocp->voc_ocldid, - &szr, - (const unsigned char **) &buf, - &sts, - &ret); - } - free(buf); - if (!prog) { - vg_ocl_error(vocp, ret, "clCreateProgramWithSource"); - return 0; - } - - if (vcp->vc_verbose > 0) { - if (fromsource && !patched) { - fprintf(stderr, - "Compiling kernel, can take minutes..."); - fflush(stderr); - } - } - ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL); - if (ret != CL_SUCCESS) { - if ((vcp->vc_verbose > 0) && fromsource && !patched) - fprintf(stderr, "failure.\n"); - vg_ocl_error(NULL, ret, "clBuildProgram"); - } else if ((vcp->vc_verbose > 0) && fromsource && !patched) { - fprintf(stderr, "done!\n"); - } - if ((ret != CL_SUCCESS) || - ((vcp->vc_verbose > 1) && fromsource && !patched)) { - vg_ocl_buildlog(vocp, prog); - } - if (ret != CL_SUCCESS) { - vg_ocl_dump_info(vocp); - clReleaseProgram(prog); - return 0; - } - - if (fromsource && !(vocp->voc_quirks & VG_OCL_NO_BINARIES)) { - ret = clGetProgramInfo(prog, - CL_PROGRAM_BINARY_SIZES, - sizeof(szr), &szr, - &sz); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, - "WARNING: clGetProgramInfo(BINARY_SIZES)"); - goto out; - } - if (sz == 0) { - fprintf(stderr, - "WARNING: zero-length CL kernel binary\n"); - goto out; - } - - buf = (char *) malloc(szr); - if (!buf) { - fprintf(stderr, - "WARNING: Could not allocate %"PRSIZET"d bytes " - "for CL binary\n", - szr); - goto out; - } - - ret = clGetProgramInfo(prog, - CL_PROGRAM_BINARIES, - sizeof(buf), &buf, - &sz); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, - "WARNING: clGetProgramInfo(BINARIES)"); - free(buf); - goto out; - } - - if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) { - patched = vg_ocl_amd_patch(vocp, - (unsigned char *) buf, szr); - if (patched > 0) { - if (vcp->vc_verbose > 1) - fprintf(stderr, - "AMD BFI_INT patch complete\n"); - clReleaseProgram(prog); - goto rebuild; - } - fprintf(stderr, - "WARNING: AMD BFI_INT patching failed\n"); - if (patched < 0) { - /* Program was incompletely modified */ - free(buf); - goto out; - } - } - - kfp = fopen(bin_name, "wb"); - if (!kfp) { - fprintf(stderr, "WARNING: " - "could not save CL kernel binary: %s\n", - strerror(errno)); - } else { - sz = fwrite(buf, 1, szr, kfp); - fclose(kfp); - if (sz != szr) { - fprintf(stderr, - "WARNING: short write on CL kernel " - "binary file: expected " - "%"PRSIZET"d, got %"PRSIZET"d\n", - szr, sz); - unlink(bin_name); - } - } - free(buf); - } - -out: - vocp->voc_oclprog = prog; - if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") || - !vg_ocl_create_kernel(vocp, 1, "heap_invert")) { - clReleaseProgram(vocp->voc_oclprog); - vocp->voc_oclprog = NULL; - return 0; - } - - return 1; -} - -void CL_CALLBACK -vg_ocl_context_callback(const char *errinfo, - const void *private_info, - size_t cb, - void *user_data) -{ - fprintf(stderr, "vg_ocl_context_callback error: %s\n", errinfo); -} - -int -vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did, - int safe_mode) -{ - cl_int ret; - char optbuf[128]; - int end = 0; - - memset(vocp, 0, sizeof(*vocp)); - vg_exec_context_init(vcp, &vocp->base); - - pthread_mutex_init(&vocp->voc_lock, NULL); - pthread_cond_init(&vocp->voc_wait, NULL); - vocp->voc_ocl_slot = -1; - - vocp->voc_ocldid = did; - - if (vcp->vc_verbose > 1) - vg_ocl_dump_info(vocp); - - vocp->voc_quirks = vg_ocl_get_quirks(vocp); - - if ((vocp->voc_quirks & VG_OCL_BROKEN) && (vcp->vc_verbose > 0)) { - char yesbuf[16]; - printf("Type 'yes' to continue: "); - fflush(stdout); - if (!fgets(yesbuf, sizeof(yesbuf), stdin) || - strncmp(yesbuf, "yes", 3)) - exit(1); - } - - vocp->voc_oclctx = clCreateContext(NULL, - 1, &did, - vg_ocl_context_callback, - NULL, - &ret); - if (!vocp->voc_oclctx) { - vg_ocl_error(vocp, ret, "clCreateContext"); - return 0; - } - - vocp->voc_oclcmdq = clCreateCommandQueue(vocp->voc_oclctx, - vocp->voc_ocldid, - 0, &ret); - if (!vocp->voc_oclcmdq) { - vg_ocl_error(vocp, ret, "clCreateCommandQueue"); - return 0; - } - - if (safe_mode) - vocp->voc_quirks &= ~VG_OCL_OPTIMIZATIONS; - - end = 0; - optbuf[end] = '\0'; - if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL) - end += snprintf(optbuf + end, sizeof(optbuf) - end, - "-DDEEP_PREPROC_UNROLL "); - if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL) - end += snprintf(optbuf + end, sizeof(optbuf) - end, - "-DPRAGMA_UNROLL "); - if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES) - end += snprintf(optbuf + end, sizeof(optbuf) - end, - "-DVERY_EXPENSIVE_BRANCHES "); - if (vocp->voc_quirks & VG_OCL_DEEP_VLIW) - end += snprintf(optbuf + end, sizeof(optbuf) - end, - "-DDEEP_VLIW "); - if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT) - end += snprintf(optbuf + end, sizeof(optbuf) - end, - "-DAMD_BFI_INT "); - if (vocp->voc_quirks & VG_OCL_NV_VERBOSE) - end += snprintf(optbuf + end, sizeof(optbuf) - end, - "-cl-nv-verbose "); - - if (!vg_ocl_load_program(vcp, vocp, "calc_addrs.cl", optbuf)) - return 0; - return 1; -} - -void -vg_ocl_del(vg_ocl_context_t *vocp) -{ - if (vocp->voc_oclprog) { - clReleaseProgram(vocp->voc_oclprog); - vocp->voc_oclprog = NULL; - } - if (vocp->voc_oclcmdq) { - clReleaseCommandQueue(vocp->voc_oclcmdq); - vocp->voc_oclcmdq = NULL; - } - if (vocp->voc_oclctx) { - clReleaseContext(vocp->voc_oclctx); - vocp->voc_oclctx = NULL; - } - pthread_cond_destroy(&vocp->voc_wait); - pthread_mutex_destroy(&vocp->voc_lock); - vg_exec_context_del(&vocp->base); -} - -static int vg_ocl_arg_map[][8] = { - /* hashes_out / found */ - { 2, 0, -1 }, - /* z_heap */ - { 0, 1, 1, 0, 2, 2, -1 }, - /* point_tmp */ - { 0, 0, 2, 1, -1 }, - /* row_in */ - { 0, 2, -1 }, - /* col_in */ - { 0, 3, -1 }, - /* target_table */ - { 2, 3, -1 }, -}; - -int -vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot, - int arg, size_t size, int host) -{ - cl_mem clbuf; - cl_int ret; - int i, j, knum, karg; - - for (i = 0; i < MAX_SLOT; i++) { - if ((i != slot) && (slot >= 0)) - continue; - if (vocp->voc_args[i][arg]) { - clReleaseMemObject(vocp->voc_args[i][arg]); - vocp->voc_args[i][arg] = NULL; - vocp->voc_arg_size[i][arg] = 0; - } - } - - clbuf = clCreateBuffer(vocp->voc_oclctx, - CL_MEM_READ_WRITE | - (host ? CL_MEM_ALLOC_HOST_PTR : 0), - size, - NULL, - &ret); - if (!clbuf) { - fprintf(stderr, "clCreateBuffer(%d,%d): ", slot, arg); - vg_ocl_error(vocp, ret, NULL); - return 0; - } - - for (i = 0; i < MAX_SLOT; i++) { - if ((i != slot) && (slot >= 0)) - continue; - - clRetainMemObject(clbuf); - vocp->voc_args[i][arg] = clbuf; - vocp->voc_arg_size[i][arg] = size; - - for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) { - knum = vg_ocl_arg_map[arg][j]; - karg = vg_ocl_arg_map[arg][j+1]; - ret = clSetKernelArg(vocp->voc_oclkernel[i][knum], - karg, - sizeof(clbuf), - &clbuf); - - if (ret) { - fprintf(stderr, - "clSetKernelArg(%d,%d): ", knum, karg); - vg_ocl_error(vocp, ret, NULL); - return 0; - } - } - } - - clReleaseMemObject(clbuf); - return 1; -} - -int -vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg, - void *buffer, size_t size) -{ - cl_int slot, ret; - - slot = (wslot < 0) ? 0 : wslot; - - assert((slot >= 0) && (slot < MAX_SLOT)); - assert(size <= vocp->voc_arg_size[slot][arg]); - - ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq, - vocp->voc_args[slot][arg], - CL_TRUE, - 0, size, - buffer, - 0, NULL, - NULL); - - if (ret) { - fprintf(stderr, "clEnqueueWriteBuffer(%d): ", arg); - vg_ocl_error(vocp, ret, NULL); - return 0; - } - - return 1; -} - -void * -vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot, - int arg, int rw) -{ - void *buf; - cl_int ret; - - assert((slot >= 0) && (slot < MAX_SLOT)); - - buf = clEnqueueMapBuffer(vocp->voc_oclcmdq, - vocp->voc_args[slot][arg], - CL_TRUE, - (rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE) - : (rw ? CL_MAP_WRITE : CL_MAP_READ), - 0, vocp->voc_arg_size[slot][arg], - 0, NULL, - NULL, - &ret); - if (!buf) { - fprintf(stderr, "clEnqueueMapBuffer(%d): ", arg); - vg_ocl_error(vocp, ret, NULL); - return NULL; - } - return buf; -} - -void -vg_ocl_unmap_arg_buffer(vg_ocl_context_t *vocp, int slot, - int arg, void *buf) -{ - cl_int ret; - cl_event ev; - - assert((slot >= 0) && (slot < MAX_SLOT)); - - ret = clEnqueueUnmapMemObject(vocp->voc_oclcmdq, - vocp->voc_args[slot][arg], - buf, - 0, NULL, - &ev); - if (ret != CL_SUCCESS) { - fprintf(stderr, "clEnqueueUnmapMemObject(%d): ", arg); - vg_ocl_error(vocp, ret, NULL); - return; - } - - ret = clWaitForEvents(1, &ev); - clReleaseEvent(ev); - if (ret != CL_SUCCESS) { - fprintf(stderr, "clWaitForEvent(clUnmapMemObject,%d): ", arg); - vg_ocl_error(vocp, ret, NULL); - } -} - -int -vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot, - int arg, int value) -{ - cl_int ret; - int i; - - for (i = 0; i < MAX_SLOT; i++) { - if ((i != slot) && (slot >= 0)) - continue; - ret = clSetKernelArg(vocp->voc_oclkernel[i][2], - arg, - sizeof(value), - &value); - if (ret) { - fprintf(stderr, "clSetKernelArg(%d): ", arg); - vg_ocl_error(vocp, ret, NULL); - return 0; - } - } - return 1; -} - -int -vg_ocl_kernel_buffer_arg(vg_ocl_context_t *vocp, int slot, - int arg, void *value, size_t size) -{ - cl_int ret; - int i, j, knum, karg; - - for (i = 0; i < MAX_SLOT; i++) { - if ((i != slot) && (slot >= 0)) - continue; - for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) { - knum = vg_ocl_arg_map[arg][j]; - karg = vg_ocl_arg_map[arg][j+1]; - ret = clSetKernelArg(vocp->voc_oclkernel[i][knum], - karg, - size, - value); - if (ret) { - fprintf(stderr, - "clSetKernelArg(%d,%d): ", knum, karg); - vg_ocl_error(vocp, ret, NULL); - return 0; - } - } - } - return 1; -} - -int -vg_ocl_kernel_dead(vg_ocl_context_t *vocp, int slot) -{ - return (vocp->voc_oclkrnwait[slot] == NULL); -} - -int -vg_ocl_kernel_start(vg_ocl_context_t *vocp, int slot, int ncol, int nrow, - int invsize) -{ - cl_int val, ret; - cl_event ev; - size_t globalws[2] = { ncol, nrow }; - size_t invws = (ncol * nrow) / invsize; - - assert(!vocp->voc_oclkrnwait[slot]); - - /* heap_invert() preconditions */ - assert(is_pow2(invsize) && (invsize > 1)); - - val = invsize; - ret = clSetKernelArg(vocp->voc_oclkernel[slot][1], - 1, - sizeof(val), - &val); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, "clSetKernelArg(ncol)"); - return 0; - } - ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, - vocp->voc_oclkernel[slot][0], - 2, - NULL, globalws, NULL, - 0, NULL, - &ev); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, "clEnqueueNDRange(0)"); - return 0; - } - - ret = clWaitForEvents(1, &ev); - clReleaseEvent(ev); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,0)"); - return 0; - } - - if (vocp->voc_verify_func[0] && - !(vocp->voc_verify_func[0])(vocp, slot)) { - fprintf(stderr, "ERROR: Kernel 0 failed verification test\n"); - return 0; - } - - ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, - vocp->voc_oclkernel[slot][1], - 1, - NULL, &invws, NULL, - 0, NULL, - &ev); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, "clEnqueueNDRange(1)"); - return 0; - } - - ret = clWaitForEvents(1, &ev); - clReleaseEvent(ev); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,1)"); - return 0; - } - - if (vocp->voc_verify_func[1] && - !(vocp->voc_verify_func[1])(vocp, slot)) { - fprintf(stderr, "ERROR: Kernel 1 failed verification test\n"); - return 0; - } - - ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, - vocp->voc_oclkernel[slot][2], - 2, - NULL, globalws, NULL, - 0, NULL, - &ev); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, "clEnqueueNDRange(2)"); - return 0; - } - - vocp->voc_oclkrnwait[slot] = ev; - return 1; -} - -int -vg_ocl_kernel_wait(vg_ocl_context_t *vocp, int slot) -{ - cl_event ev; - cl_int ret; - - ev = vocp->voc_oclkrnwait[slot]; - vocp->voc_oclkrnwait[slot] = NULL; - if (ev) { - ret = clWaitForEvents(1, &ev); - clReleaseEvent(ev); - if (ret != CL_SUCCESS) { - vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,e)"); - return 0; - } - } - return 1; -} - - -INLINE void -vg_ocl_get_bignum_raw(BIGNUM *bn, const unsigned char *buf) -{ - bn_expand(bn, 256); - memcpy(bn->d, buf, 32); - bn->top = (32 / sizeof(BN_ULONG)); -} - -INLINE void -vg_ocl_put_bignum_raw(unsigned char *buf, const BIGNUM *bn) -{ - int bnlen = (bn->top * sizeof(BN_ULONG)); - if (bnlen >= 32) { - memcpy(buf, bn->d, 32); - } else { - memcpy(buf, bn->d, bnlen); - memset(buf + bnlen, 0, 32 - bnlen); - } -} - -#define ACCESS_BUNDLE 1024 -#define ACCESS_STRIDE (ACCESS_BUNDLE/8) - -void -vg_ocl_get_bignum_tpa(BIGNUM *bn, const unsigned char *buf, int cell) -{ - unsigned char bnbuf[32]; - int start, i; - - start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + - (cell % ACCESS_STRIDE)); - for (i = 0; i < 8; i++) - memcpy(bnbuf+(i*4), - buf + 4*(start + i*ACCESS_STRIDE), - 4); - - vg_ocl_get_bignum_raw(bn, bnbuf); -} - -/* - * Absolutely disgusting. - * We want points in Montgomery form, and it's a lot easier to read the - * coordinates from the structure than to export and re-montgomeryize. - */ - -struct ec_point_st { - const EC_METHOD *meth; - BIGNUM X; - BIGNUM Y; - BIGNUM Z; - int Z_is_one; -}; - -INLINE void -vg_ocl_get_point(EC_POINT *ppnt, const unsigned char *buf) -{ - static const unsigned char mont_one[] = { 0x01,0x00,0x00,0x03,0xd1 }; - vg_ocl_get_bignum_raw(&ppnt->X, buf); - vg_ocl_get_bignum_raw(&ppnt->Y, buf + 32); - if (!ppnt->Z_is_one) { - ppnt->Z_is_one = 1; - BN_bin2bn(mont_one, sizeof(mont_one), &ppnt->Z); - } -} - -INLINE void -vg_ocl_put_point(unsigned char *buf, const EC_POINT *ppnt) -{ - assert(ppnt->Z_is_one); - vg_ocl_put_bignum_raw(buf, &ppnt->X); - vg_ocl_put_bignum_raw(buf + 32, &ppnt->Y); -} - -void -vg_ocl_put_point_tpa(unsigned char *buf, int cell, const EC_POINT *ppnt) -{ - unsigned char pntbuf[64]; - int start, i; - - vg_ocl_put_point(pntbuf, ppnt); - - start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + - (cell % (ACCESS_STRIDE/2))); - for (i = 0; i < 8; i++) - memcpy(buf + 4*(start + i*ACCESS_STRIDE), - pntbuf+(i*4), - 4); - for (i = 0; i < 8; i++) - memcpy(buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)), - pntbuf+32+(i*4), - 4); -} - -void -vg_ocl_get_point_tpa(EC_POINT *ppnt, const unsigned char *buf, int cell) -{ - unsigned char pntbuf[64]; - int start, i; - - start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + - (cell % (ACCESS_STRIDE/2))); - for (i = 0; i < 8; i++) - memcpy(pntbuf+(i*4), - buf + 4*(start + i*ACCESS_STRIDE), - 4); - for (i = 0; i < 8; i++) - memcpy(pntbuf+32+(i*4), - buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)), - 4); - - vg_ocl_get_point(ppnt, pntbuf); -} - -void -show_elapsed(struct timeval *tv, const char *place) -{ - struct timeval now, delta; - gettimeofday(&now, NULL); - timersub(&now, tv, &delta); - fprintf(stderr, - "%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec); -} - - -/* - * GPU address matching methods - * - * gethash: GPU computes and returns all address hashes. - * + Works with any matching method, including regular expressions. - * - The CPU will not be able to keep up with mid- to high-end GPUs. - * - * prefix: GPU computes hash, searches a range list, and discards. - * + Fast, minimal work for CPU. - */ - -int -vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot) -{ - vg_exec_context_t *vxcp = &vocp->base; - vg_context_t *vcp = vocp->base.vxc_vc; - vg_test_func_t test_func = vcp->vc_test; - unsigned char *ocl_hashes_out; - int i, res = 0, round; - - ocl_hashes_out = (unsigned char *) - vg_ocl_map_arg_buffer(vocp, slot, 0, 0); - - round = vocp->voc_ocl_cols * vocp->voc_ocl_rows; - - for (i = 0; i < round; i++, vxcp->vxc_delta++) { - memcpy(&vxcp->vxc_binres[1], - ocl_hashes_out + (20*i), - 20); - - res = test_func(vxcp); - if (res) - break; - } - - vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out); - return res; -} - -int -vg_ocl_gethash_init(vg_ocl_context_t *vocp) -{ - int i; - - if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get")) - return 0; - - for (i = 0; i < vocp->voc_nslots; i++) { - /* Each slot gets its own hash output buffer */ - if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, - 20 * - vocp->voc_ocl_rows * - vocp->voc_ocl_cols, 1)) - return 0; - } - - vocp->voc_rekey_func = NULL; - vocp->voc_check_func = vg_ocl_gethash_check; - return 1; -} - - -static int -vg_ocl_prefix_rekey(vg_ocl_context_t *vocp) -{ - vg_context_t *vcp = vocp->base.vxc_vc; - unsigned char *ocl_targets_in; - uint32_t *ocl_found_out; - int i; - - /* Set the found indicator for each slot to -1 */ - for (i = 0; i < vocp->voc_nslots; i++) { - ocl_found_out = (uint32_t *) - vg_ocl_map_arg_buffer(vocp, i, 0, 1); - ocl_found_out[0] = 0xffffffff; - vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out); - } - - if (vocp->voc_pattern_rewrite) { - /* Count number of range records */ - i = vg_context_hash160_sort(vcp, NULL); - if (!i) { - fprintf(stderr, - "No range records available, exiting\n"); - return 0; - } - - if (i > vocp->voc_pattern_alloc) { - /* (re)allocate target buffer */ - if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0)) - return 0; - vocp->voc_pattern_alloc = i; - } - - /* Write range records */ - ocl_targets_in = (unsigned char *) - vg_ocl_map_arg_buffer(vocp, 0, 5, 1); - vg_context_hash160_sort(vcp, ocl_targets_in); - vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in); - vg_ocl_kernel_int_arg(vocp, -1, 4, i); - - vocp->voc_pattern_rewrite = 0; - } - return 1; -} - -static int -vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot) -{ - vg_exec_context_t *vxcp = &vocp->base; - vg_context_t *vcp = vocp->base.vxc_vc; - vg_test_func_t test_func = vcp->vc_test; - uint32_t *ocl_found_out; - uint32_t found_delta; - int orig_delta, tablesize; - int res = 0; - - /* Retrieve the found indicator */ - ocl_found_out = (uint32_t *) - vg_ocl_map_arg_buffer(vocp, slot, 0, 2); - found_delta = ocl_found_out[0]; - - if (found_delta != 0xffffffff) { - /* GPU code claims match, verify with CPU version */ - orig_delta = vxcp->vxc_delta; - vxcp->vxc_delta += found_delta; - vg_exec_context_calc_address(vxcp); - - /* Make sure the GPU produced the expected hash */ - res = 0; - if (!memcmp(vxcp->vxc_binres + 1, - ocl_found_out + 2, - 20)) { - res = test_func(vxcp); - } - if (res == 0) { - /* - * The match was not found in - * the pattern list. Hmm. - */ - tablesize = ocl_found_out[2]; - fprintf(stderr, "Match idx: %d\n", ocl_found_out[1]); - fprintf(stderr, "CPU hash: "); - fdumphex(stderr, vxcp->vxc_binres + 1, 20); - fprintf(stderr, "GPU hash: "); - fdumphex(stderr, - (unsigned char *) (ocl_found_out + 2), 20); - fprintf(stderr, "Found delta: %d " - "Start delta: %d\n", - found_delta, orig_delta); - res = 1; - } - vocp->voc_pattern_rewrite = 1; - } else { - vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows); - } - - vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out); - return res; -} - -int -vg_ocl_prefix_init(vg_ocl_context_t *vocp) -{ - int i; - - if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix")) - return 0; - - for (i = 0; i < vocp->voc_nslots; i++) { - if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1)) - return 0; - } - vocp->voc_rekey_func = vg_ocl_prefix_rekey; - vocp->voc_check_func = vg_ocl_prefix_check; - vocp->voc_pattern_rewrite = 1; - vocp->voc_pattern_alloc = 0; - return 1; -} - - -int -vg_ocl_config_pattern(vg_ocl_context_t *vocp) -{ - vg_context_t *vcp = vocp->base.vxc_vc; - int i; - - i = vg_context_hash160_sort(vcp, NULL); - if (i > 0) { - if (vcp->vc_verbose > 1) - fprintf(stderr, "Using OpenCL prefix matcher\n"); - /* Configure for prefix matching */ - return vg_ocl_prefix_init(vocp); - } - - if (vcp->vc_verbose > 0) - fprintf(stderr, "WARNING: Using CPU pattern matcher\n"); - return vg_ocl_gethash_init(vocp); -} - - -/* - * Temporary buffer content verification functions - * This provides a simple test of the kernel, the OpenCL compiler, - * and the hardware. - */ -int -vg_ocl_verify_temporary(vg_ocl_context_t *vocp, int slot, int z_inverted) -{ - vg_exec_context_t *vxcp = &vocp->base; - unsigned char *point_tmp = NULL, *z_heap = NULL; - unsigned char *ocl_points_in = NULL, *ocl_strides_in = NULL; - const EC_GROUP *pgroup; - EC_POINT *ppr = NULL, *ppc = NULL, *pps = NULL, *ppt = NULL; - BIGNUM bnz, bnez, bnm, *bnzc; - BN_CTX *bnctx = NULL; - BN_MONT_CTX *bnmont; - int ret = 0; - int mismatches = 0, mm_r; - int x, y, bx; - static const unsigned char raw_modulus[] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F - }; - - BN_init(&bnz); - BN_init(&bnez); - BN_init(&bnm); - - bnctx = BN_CTX_new(); - bnmont = BN_MONT_CTX_new(); - pgroup = EC_KEY_get0_group(vxcp->vxc_key); - ppr = EC_POINT_new(pgroup); - ppc = EC_POINT_new(pgroup); - pps = EC_POINT_new(pgroup); - ppt = EC_POINT_new(pgroup); - - if (!bnctx || !bnmont || !ppr || !ppc || !pps || !ppt) { - fprintf(stderr, "ERROR: out of memory\n"); - goto out; - } - - BN_bin2bn(raw_modulus, sizeof(raw_modulus), &bnm); - BN_MONT_CTX_set(bnmont, &bnm, bnctx); - - if (z_inverted) { - bnzc = &bnez; - } else { - bnzc = &pps->Z; - } - - z_heap = (unsigned char *) - vg_ocl_map_arg_buffer(vocp, slot, 1, 0); - point_tmp = (unsigned char *) - vg_ocl_map_arg_buffer(vocp, slot, 2, 0); - ocl_points_in = (unsigned char *) - vg_ocl_map_arg_buffer(vocp, slot, 3, 0); - ocl_strides_in = (unsigned char *) - vg_ocl_map_arg_buffer(vocp, slot, 4, 0); - - if (!z_heap || !point_tmp || !ocl_points_in || !ocl_strides_in) { - fprintf(stderr, "ERROR: could not map OpenCL point buffers\n"); - goto out; - } - - for (y = 0; y < vocp->voc_ocl_rows; y++) { - vg_ocl_get_point(ppr, ocl_strides_in + (64*y)); - bx = y * vocp->voc_ocl_cols; - mm_r = 0; - - for (x = 0; x < vocp->voc_ocl_cols; x++) { - vg_ocl_get_point_tpa(ppc, ocl_points_in, x); - assert(ppr->Z_is_one && ppc->Z_is_one); - EC_POINT_add(pgroup, pps, ppc, ppr, bnctx); - assert(!pps->Z_is_one); - vg_ocl_get_point_tpa(ppt, point_tmp, bx + x); - vg_ocl_get_bignum_tpa(&bnz, z_heap, bx + x); - if (z_inverted) { - BN_mod_inverse(&bnez, &pps->Z, &bnm, bnctx); - BN_to_montgomery(&bnez, &bnez, bnmont, bnctx); - BN_to_montgomery(&bnez, &bnez, bnmont, bnctx); - } - if (BN_cmp(&ppt->X, &pps->X) || - BN_cmp(&ppt->Y, &pps->Y) || - BN_cmp(&bnz, bnzc)) { - if (!mismatches) { - fprintf(stderr, "Base privkey: "); - fdumpbn(stderr, EC_KEY_get0_private_key( - vxcp->vxc_key)); - } - mismatches++; - fprintf(stderr, "Mismatch for kernel %d, " - "offset %d (%d,%d)\n", - z_inverted, bx + x, y, x); - if (!mm_r) { - mm_r = 1; - fprintf(stderr, "Row X : "); - fdumpbn(stderr, &ppr->X); - fprintf(stderr, "Row Y : "); - fdumpbn(stderr, &ppr->Y); - } - - fprintf(stderr, "Column X: "); - fdumpbn(stderr, &ppc->X); - fprintf(stderr, "Column Y: "); - fdumpbn(stderr, &ppc->Y); - - if (BN_cmp(&ppt->X, &pps->X)) { - fprintf(stderr, "Expect X: "); - fdumpbn(stderr, &pps->X); - fprintf(stderr, "Device X: "); - fdumpbn(stderr, &ppt->X); - } - if (BN_cmp(&ppt->Y, &pps->Y)) { - fprintf(stderr, "Expect Y: "); - fdumpbn(stderr, &pps->Y); - fprintf(stderr, "Device Y: "); - fdumpbn(stderr, &ppt->Y); - } - if (BN_cmp(&bnz, bnzc)) { - fprintf(stderr, "Expect Z: "); - fdumpbn(stderr, bnzc); - fprintf(stderr, "Device Z: "); - fdumpbn(stderr, &bnz); - } - } - } - } - - ret = !mismatches; - -out: - if (z_heap) - vg_ocl_unmap_arg_buffer(vocp, slot, 1, z_heap); - if (point_tmp) - vg_ocl_unmap_arg_buffer(vocp, slot, 2, point_tmp); - if (ocl_points_in) - vg_ocl_unmap_arg_buffer(vocp, slot, 3, ocl_points_in); - if (ocl_strides_in) - vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in); - if (ppr) - EC_POINT_free(ppr); - if (ppc) - EC_POINT_free(ppc); - if (pps) - EC_POINT_free(pps); - if (ppt) - EC_POINT_free(ppt); - BN_clear_free(&bnz); - BN_clear_free(&bnez); - BN_clear_free(&bnm); - if (bnmont) - BN_MONT_CTX_free(bnmont); - if (bnctx) - BN_CTX_free(bnctx); - return ret; -} - -int -vg_ocl_verify_k0(vg_ocl_context_t *vocp, int slot) -{ - return vg_ocl_verify_temporary(vocp, slot, 0); -} - -int -vg_ocl_verify_k1(vg_ocl_context_t *vocp, int slot) -{ - return vg_ocl_verify_temporary(vocp, slot, 1); -} - -void * -vg_opencl_thread(void *arg) -{ - vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg; - vg_context_t *vcp = vocp->base.vxc_vc; - int halt = 0; - int slot = -1; - int rows, cols, invsize; - unsigned long long idleu, busyu; - double pidle; - struct timeval tv, tvt, tvd, idle, busy; - - memset(&idle, 0, sizeof(idle)); - memset(&busy, 0, sizeof(busy)); - - while (1) { - pthread_mutex_lock(&vocp->voc_lock); - if (halt) { - halt = 0; - vocp->voc_halt = 1; - } - if (slot != -1) { - assert(vocp->voc_ocl_slot == slot); - vocp->voc_ocl_slot = -1; - slot = -1; - pthread_cond_signal(&vocp->voc_wait); - } - if (vocp->voc_halt) - break; - if (vocp->voc_ocl_slot == -1) { - gettimeofday(&tv, NULL); - while (vocp->voc_ocl_slot == -1) { - pthread_cond_wait(&vocp->voc_wait, - &vocp->voc_lock); - if (vocp->voc_halt) - goto out; - } - gettimeofday(&tvt, NULL); - timersub(&tvt, &tv, &tvd); - timeradd(&tvd, &idle, &idle); - } - assert(!vocp->voc_rekey); - assert(!vocp->voc_halt); - slot = vocp->voc_ocl_slot; - rows = vocp->voc_ocl_rows; - cols = vocp->voc_ocl_cols; - invsize = vocp->voc_ocl_invsize; - pthread_mutex_unlock(&vocp->voc_lock); - - gettimeofday(&tv, NULL); - if (!vg_ocl_kernel_start(vocp, slot, cols, rows, invsize)) - halt = 1; - - if (!vg_ocl_kernel_wait(vocp, slot)) - halt = 1; - - if (vcp->vc_verbose > 1) { - gettimeofday(&tvt, NULL); - timersub(&tvt, &tv, &tvd); - timeradd(&tvd, &busy, &busy); - if ((busy.tv_sec + idle.tv_sec) > 1) { - idleu = (1000000 * idle.tv_sec) + idle.tv_usec; - busyu = (1000000 * busy.tv_sec) + busy.tv_usec; - pidle = ((double) idleu) / (idleu + busyu); - - if (pidle > 0.01) { - fprintf(stderr, "\rGPU idle: %.2f%%" - " " - " \n", - 100 * pidle); - } - memset(&idle, 0, sizeof(idle)); - memset(&busy, 0, sizeof(busy)); - } - } - } -out: - pthread_mutex_unlock(&vocp->voc_lock); - return NULL; -} - - -/* - * Address search thread main loop - */ - -void * -vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode, int verify, - int worksize, int nthreads, int nrows, int ncols, int invsize) -{ - int i; - int round, full_threads, wsmult; - cl_ulong memsize, allocsize; - - const BN_ULONG rekey_max = 100000000; - BN_ULONG npoints, rekey_at; - - EC_KEY *pkey = NULL; - const EC_GROUP *pgroup; - const EC_POINT *pgen; - EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL; - EC_POINT *pseek = NULL; - - unsigned char *ocl_points_in, *ocl_strides_in; - - vg_ocl_context_t ctx; - vg_ocl_context_t *vocp = &ctx; - vg_exec_context_t *vxcp = &vocp->base; - - int slot, nslots; - int slot_busy = 0, slot_done = 0, halt = 0; - int c = 0, output_interval = 1000; - - struct timeval tvstart; - - if (!vg_ocl_init(vcp, &ctx, did, safe_mode)) - return NULL; - - if (verify) { - if (vcp->vc_verbose > 0) { - fprintf(stderr, "WARNING: " - "Hardware verification mode enabled\n"); - } - if (!nthreads) - nthreads = 1; - vocp->voc_verify_func[0] = vg_ocl_verify_k0; - vocp->voc_verify_func[1] = vg_ocl_verify_k1; - } - - pkey = vxcp->vxc_key; - pgroup = EC_KEY_get0_group(pkey); - pgen = EC_GROUP_get0_generator(pgroup); - - /* - * nrows: number of point rows per job - * ncols: number of point columns per job - * invsize: number of modular inversion tasks per job - * (each task performs (nrows*ncols)/invsize inversions) - * nslots: number of kernels - * (create two, keep one running while we service the other or wait) - */ - - if (!nthreads) { - /* Pick nthreads sufficient to saturate one compute unit */ - if (vg_ocl_device_gettype(vocp->voc_ocldid) & - CL_DEVICE_TYPE_CPU) - nthreads = 1; - else - nthreads = vg_ocl_device_getsizet(vocp->voc_ocldid, - CL_DEVICE_MAX_WORK_GROUP_SIZE); - } - - full_threads = vg_ocl_device_getsizet(vocp->voc_ocldid, - CL_DEVICE_MAX_COMPUTE_UNITS); - full_threads *= nthreads; - - /* - * The work size selection is complicated, and the most - * important factor is the batch size of the heap_invert kernel. - * Each value added to the batch trades one complete modular - * inversion for four multiply operations. Ideally the work - * size would be as large as possible. The practical limiting - * factors are: - * 1. Available memory - * 2. Responsiveness and operational latency - * - * We take a naive approach and limit batch size to a point of - * sufficiently diminishing returns, hoping that responsiveness - * will be sufficient. - * - * The measured value for the OpenSSL implementations on my CPU - * is 80:1. This causes heap_invert to get batches of 20 or so - * for free, and receive 10% incremental returns at 200. The CPU - * work size is therefore set to 256. - * - * The ratio on most GPUs with the oclvanitygen implementations - * is closer to 500:1, and larger batches are required for - * good performance. - */ - if (!worksize) { - if (vg_ocl_device_gettype(vocp->voc_ocldid) & - CL_DEVICE_TYPE_GPU) - worksize = 2048; - else - worksize = 256; - } - - if (!ncols) { - memsize = vg_ocl_device_getulong(vocp->voc_ocldid, - CL_DEVICE_GLOBAL_MEM_SIZE); - allocsize = vg_ocl_device_getulong(vocp->voc_ocldid, - CL_DEVICE_MAX_MEM_ALLOC_SIZE); - memsize /= 2; - ncols = full_threads; - nrows = 2; - /* Find row and column counts close to sqrt(full_threads) */ - while ((ncols > nrows) && !(ncols & 1)) { - ncols /= 2; - nrows *= 2; - } - - /* - * Increase row & column counts to satisfy work size - * multiplier or fill available memory. - */ - wsmult = 1; - while ((!worksize || ((wsmult * 2) <= worksize)) && - ((ncols * nrows * 2 * 128) < memsize) && - ((ncols * nrows * 2 * 64) < allocsize)) { - if (ncols > nrows) - nrows *= 2; - else - ncols *= 2; - wsmult *= 2; - } - } - - round = nrows * ncols; - - if (!invsize) { - invsize = 2; - while (!(round % (invsize << 1)) && - ((round / invsize) > full_threads)) - invsize <<= 1; - } - - if (vcp->vc_verbose > 1) { - fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows); - fprintf(stderr, "Modular inverse: %d threads, %d ops each\n", - round/invsize, invsize); - } - - if ((round % invsize) || !is_pow2(invsize) || (invsize < 2)) { - if (vcp->vc_verbose <= 1) { - fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows); - fprintf(stderr, - "Modular inverse: %d threads, %d ops each\n", - round/invsize, invsize); - } - if (round % invsize) - fprintf(stderr, - "Modular inverse work size must " - "evenly divide points\n"); - else - fprintf(stderr, - "Modular inverse work per task (%d) " - "must be a power of 2\n", invsize); - goto out; - } - - if (!vcp->vc_remove_on_match && - (vcp->vc_chance >= 1.0f) && - (vcp->vc_chance < round) && - (vcp->vc_verbose > 0)) { - fprintf(stderr, "WARNING: low pattern difficulty\n"); - fprintf(stderr, - "WARNING: better match throughput is possible " - "using vanitygen on the CPU\n"); - } - - nslots = 2; - slot = 0; - vocp->voc_ocl_rows = nrows; - vocp->voc_ocl_cols = ncols; - vocp->voc_ocl_invsize = invsize; - vocp->voc_nslots = nslots; - - ppbase = (EC_POINT **) malloc((nrows + ncols) * - sizeof(EC_POINT*)); - if (!ppbase) - goto enomem; - - for (i = 0; i < (nrows + ncols); i++) { - ppbase[i] = EC_POINT_new(pgroup); - if (!ppbase[i]) - goto enomem; - } - - pprow = ppbase + ncols; - pbatchinc = EC_POINT_new(pgroup); - poffset = EC_POINT_new(pgroup); - pseek = EC_POINT_new(pgroup); - if (!pbatchinc || !poffset || !pseek) - goto enomem; - - BN_set_word(&vxcp->vxc_bntmp, ncols); - EC_POINT_mul(pgroup, pbatchinc, &vxcp->vxc_bntmp, NULL, NULL, - vxcp->vxc_bnctx); - EC_POINT_make_affine(pgroup, pbatchinc, vxcp->vxc_bnctx); - - BN_set_word(&vxcp->vxc_bntmp, round); - EC_POINT_mul(pgroup, poffset, &vxcp->vxc_bntmp, NULL, NULL, - vxcp->vxc_bnctx); - EC_POINT_make_affine(pgroup, poffset, vxcp->vxc_bnctx); - - if (!vg_ocl_config_pattern(vocp)) - goto enomem; - - for (i = 0; i < nslots; i++) { - /* - * Each work group gets its own: - * - Column point array - */ - if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * nrows, 1)) - goto enomem; - } - - /* - * All instances share: - * - The z_heap and point scratch spaces - * - The row point array - */ - if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1, - round_up_pow2(32 * 2 * round, 4096), 0) || - !vg_ocl_kernel_arg_alloc(vocp, -1, 2, - round_up_pow2(32 * 2 * round, 4096), 0) || - !vg_ocl_kernel_arg_alloc(vocp, -1, 3, - round_up_pow2(32 * 2 * ncols, 4096), 1)) - goto enomem; - - npoints = 0; - rekey_at = 0; - vxcp->vxc_binres[0] = vcp->vc_addrtype; - - if (pthread_create(&vocp->voc_ocl_thread, NULL, - vg_opencl_thread, vocp)) - goto enomem; - - gettimeofday(&tvstart, NULL); - -l_rekey: - if (vocp->voc_rekey_func && - !vocp->voc_rekey_func(vocp)) - goto enomem; - - /* Generate a new random private key */ - EC_KEY_generate_key(pkey); - npoints = 0; - - /* Determine rekey interval */ - EC_GROUP_get_order(pgroup, &vxcp->vxc_bntmp, vxcp->vxc_bnctx); - BN_sub(&vxcp->vxc_bntmp2, - &vxcp->vxc_bntmp, - EC_KEY_get0_private_key(pkey)); - rekey_at = BN_get_word(&vxcp->vxc_bntmp2); - if ((rekey_at == BN_MASK2) || (rekey_at > rekey_max)) - rekey_at = rekey_max; - assert(rekey_at > 0); - - EC_POINT_copy(ppbase[0], EC_KEY_get0_public_key(pkey)); - - if (vcp->vc_pubkey_base) { - EC_POINT_add(pgroup, - ppbase[0], - ppbase[0], - vcp->vc_pubkey_base, - vxcp->vxc_bnctx); - } - - /* Build the base array of sequential points */ - for (i = 1; i < ncols; i++) { - EC_POINT_add(pgroup, - ppbase[i], - ppbase[i-1], - pgen, vxcp->vxc_bnctx); - } - - EC_POINTs_make_affine(pgroup, ncols, ppbase, vxcp->vxc_bnctx); - - /* Fill the sequential point array */ - ocl_points_in = (unsigned char *) - vg_ocl_map_arg_buffer(vocp, 0, 3, 1); - if (!ocl_points_in) - goto enomem; - for (i = 0; i < ncols; i++) - vg_ocl_put_point_tpa(ocl_points_in, i, ppbase[i]); - vg_ocl_unmap_arg_buffer(vocp, 0, 3, ocl_points_in); - - /* - * Set up the initial row increment table. - * Set the first element to pgen -- effectively - * skipping the exact key generated above. - */ - EC_POINT_copy(pprow[0], pgen); - for (i = 1; i < nrows; i++) { - EC_POINT_add(pgroup, - pprow[i], - pprow[i-1], - pbatchinc, vxcp->vxc_bnctx); - } - EC_POINTs_make_affine(pgroup, nrows, pprow, vxcp->vxc_bnctx); - vxcp->vxc_delta = 1; - npoints = 1; - slot = 0; - slot_busy = 0; - slot_done = 0; - - while (1) { - if (slot_done) { - assert(rekey_at > 0); - slot_done = 0; - - /* Call the result check function */ - switch (vocp->voc_check_func(vocp, slot)) { - case 1: - rekey_at = 0; - break; - case 2: - halt = 1; - break; - default: - break; - } - - c += round; - if (!halt && (c >= output_interval)) { - output_interval = - vg_output_timing(vcp, c, &tvstart); - c = 0; - } - } - - if (halt) - break; - - if ((npoints + round) < rekey_at) { - if (npoints > 1) { - /* Move the row increments forward */ - for (i = 0; i < nrows; i++) { - EC_POINT_add(pgroup, - pprow[i], - pprow[i], - poffset, - vxcp->vxc_bnctx); - } - - EC_POINTs_make_affine(pgroup, nrows, pprow, - vxcp->vxc_bnctx); - } - - /* Copy the row stride array to the device */ - ocl_strides_in = (unsigned char *) - vg_ocl_map_arg_buffer(vocp, slot, 4, 1); - if (!ocl_strides_in) - goto enomem; - memset(ocl_strides_in, 0, 64*nrows); - for (i = 0; i < nrows; i++) - vg_ocl_put_point(ocl_strides_in + (64*i), - pprow[i]); - vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in); - npoints += round; - - pthread_mutex_lock(&vocp->voc_lock); - while (vocp->voc_ocl_slot != -1) { - assert(slot_busy); - pthread_cond_wait(&vocp->voc_wait, - &vocp->voc_lock); - } - - if (vocp->voc_halt) { - pthread_mutex_unlock(&vocp->voc_lock); - halt = 1; - break; - } - - vocp->voc_ocl_slot = slot; - pthread_cond_signal(&vocp->voc_wait); - pthread_mutex_unlock(&vocp->voc_lock); - - slot_done = slot_busy; - slot_busy = 1; - slot = (slot + 1) % nslots; - - } else { - if (slot_busy) { - pthread_mutex_lock(&vocp->voc_lock); - while (vocp->voc_ocl_slot != -1) { - assert(vocp->voc_ocl_slot == - ((slot + nslots - 1) % nslots)); - pthread_cond_wait(&vocp->voc_wait, - &vocp->voc_lock); - } - pthread_mutex_unlock(&vocp->voc_lock); - slot_busy = 0; - slot_done = 1; - } - - if (!rekey_at || - (!slot_done && ((npoints + round) >= rekey_at))) - goto l_rekey; - } - } - - if (0) { - enomem: - fprintf(stderr, "ERROR: allocation failure?\n"); - } - -out: - if (halt) { - if (vcp->vc_verbose > 1) { - printf("Halting..."); - fflush(stdout); - } - pthread_mutex_lock(&vocp->voc_lock); - vocp->voc_halt = 1; - pthread_cond_signal(&vocp->voc_wait); - while (vocp->voc_ocl_slot != -1) { - assert(slot_busy); - pthread_cond_wait(&vocp->voc_wait, - &vocp->voc_lock); - } - slot_busy = 0; - pthread_mutex_unlock(&vocp->voc_lock); - pthread_join(vocp->voc_ocl_thread, NULL); - if (vcp->vc_verbose > 1) - printf("done!\n"); - } - - if (ppbase) { - for (i = 0; i < (nrows + ncols); i++) - if (ppbase[i]) - EC_POINT_free(ppbase[i]); - free(ppbase); - } - if (pbatchinc) - EC_POINT_free(pbatchinc); - - vg_ocl_del(vocp); - - return NULL; -} - - - - -/* - * OpenCL platform/device selection junk - */ - -int -get_device_list(cl_platform_id pid, cl_device_id **list_out) -{ - cl_uint nd; - cl_int res; - cl_device_id *ids; - res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, NULL, &nd); - if (res != CL_SUCCESS) { - vg_ocl_error(NULL, res, "clGetDeviceIDs(0)"); - *list_out = NULL; - return -1; - } - if (nd) { - ids = (cl_device_id *) malloc(nd * sizeof(*ids)); - if (ids == NULL) { - fprintf(stderr, "Could not allocate device ID list\n"); - *list_out = NULL; - return -1; - } - res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, nd, ids, NULL); - if (res != CL_SUCCESS) { - vg_ocl_error(NULL, res, "clGetDeviceIDs(n)"); - free(ids); - *list_out = NULL; - return -1; - } - *list_out = ids; - } - return nd; -} - -void -show_devices(cl_platform_id pid, cl_device_id *ids, int nd, int base) -{ - int i; - char nbuf[128]; - char vbuf[128]; - size_t len; - cl_int res; - - for (i = 0; i < nd; i++) { - res = clGetDeviceInfo(ids[i], CL_DEVICE_NAME, - sizeof(nbuf), nbuf, &len); - if (res != CL_SUCCESS) - continue; - if (len >= sizeof(nbuf)) - len = sizeof(nbuf) - 1; - nbuf[len] = '\0'; - res = clGetDeviceInfo(ids[i], CL_DEVICE_VENDOR, - sizeof(vbuf), vbuf, &len); - if (res != CL_SUCCESS) - continue; - if (len >= sizeof(vbuf)) - len = sizeof(vbuf) - 1; - vbuf[len] = '\0'; - fprintf(stderr, " %d: [%s] %s\n", i + base, vbuf, nbuf); - } -} - -cl_device_id -get_device(cl_platform_id pid, int num) -{ - int nd; - cl_device_id id, *ids; - - nd = get_device_list(pid, &ids); - if (nd < 0) - return NULL; - if (!nd) { - fprintf(stderr, "No OpenCL devices found\n"); - return NULL; - } - if (num < 0) { - if (nd == 1) - num = 0; - else - num = nd; - } - if (num < nd) { - id = ids[num]; - free(ids); - return id; - } - free(ids); - return NULL; -} - -int -get_platform_list(cl_platform_id **list_out) -{ - cl_uint np; - cl_int res; - cl_platform_id *ids; - res = clGetPlatformIDs(0, NULL, &np); - if (res != CL_SUCCESS) { - vg_ocl_error(NULL, res, "clGetPlatformIDs(0)"); - *list_out = NULL; - return -1; - } - if (np) { - ids = (cl_platform_id *) malloc(np * sizeof(*ids)); - if (ids == NULL) { - fprintf(stderr, - "Could not allocate platform ID list\n"); - *list_out = NULL; - return -1; - } - res = clGetPlatformIDs(np, ids, NULL); - if (res != CL_SUCCESS) { - vg_ocl_error(NULL, res, "clGetPlatformIDs(n)"); - free(ids); - *list_out = NULL; - return -1; - } - *list_out = ids; - } - return np; -} - -void -show_platforms(cl_platform_id *ids, int np, int base) -{ - int i; - char nbuf[128]; - char vbuf[128]; - size_t len; - cl_int res; - - for (i = 0; i < np; i++) { - res = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, - sizeof(nbuf), nbuf, &len); - if (res != CL_SUCCESS) { - vg_ocl_error(NULL, res, "clGetPlatformInfo(NAME)"); - continue; - } - if (len >= sizeof(nbuf)) - len = sizeof(nbuf) - 1; - nbuf[len] = '\0'; - res = clGetPlatformInfo(ids[i], CL_PLATFORM_VENDOR, - sizeof(vbuf), vbuf, &len); - if (res != CL_SUCCESS) { - vg_ocl_error(NULL, res, "clGetPlatformInfo(VENDOR)"); - continue; - } - if (len >= sizeof(vbuf)) - len = sizeof(vbuf) - 1; - vbuf[len] = '\0'; - fprintf(stderr, "%d: [%s] %s\n", i + base, vbuf, nbuf); - } -} - -cl_platform_id -get_platform(int num) -{ - int np; - cl_platform_id id, *ids; - - np = get_platform_list(&ids); - if (np < 0) - return NULL; - if (!np) { - fprintf(stderr, "No OpenCL platforms available\n"); - return NULL; - } - if (num < 0) { - if (np == 1) - num = 0; - else - num = np; - } - if (num < np) { - id = ids[num]; - free(ids); - return id; - } - free(ids); - return NULL; -} - -void -enumerate_opencl(void) -{ - cl_platform_id *pids; - cl_device_id *dids; - int np, nd, i; - - np = get_platform_list(&pids); - if (!np) { - fprintf(stderr, "No OpenCL platforms available\n"); - return; - } - fprintf(stderr, "Available OpenCL platforms:\n"); - for (i = 0; i < np; i++) { - show_platforms(&pids[i], 1, i); - nd = get_device_list(pids[i], &dids); - if (!nd) { - fprintf(stderr, " -- No devices\n"); - } else { - show_devices(pids[i], dids, nd, 0); - } - } -} +#include +#include +#include -cl_device_id -get_opencl_device(int platformidx, int deviceidx) -{ - cl_platform_id pid; - cl_device_id did = NULL; +#include "oclengine.h" +#include "pattern.h" +#include "util.h" - pid = get_platform(platformidx); - if (pid) { - did = get_device(pid, deviceidx); - if (did) - return did; - } - enumerate_opencl(); - return NULL; -} +const char *version = "0.17"; +const int debug = 0; void @@ -2597,7 +97,7 @@ main(int argc, char **argv) int verify_mode = 0; int safe_mode = 0; vg_context_t *vcp = NULL; - cl_device_id did; + vg_ocl_context_t *vocp = NULL; EC_POINT *pubkey_base = NULL; const char *result_file = NULL; const char *key_password = NULL; @@ -2814,7 +314,11 @@ main(int argc, char **argv) vcp->vc_remove_on_match = remove_on_match; vcp->vc_pubkey_base = pubkey_base; - if (!vg_context_add_patterns(vcp, patterns, npatterns)) + vcp->vc_output_match = vg_output_match_console; + vcp->vc_output_timing = vg_output_timing_console; + + if (!vg_context_add_patterns(vcp, (const char ** const)patterns, + npatterns)) return 1; if (!vcp->vc_npatterns) { @@ -2839,12 +343,14 @@ main(int argc, char **argv) fprintf(stderr, "Regular expressions: %ld\n", vcp->vc_npatterns); - did = get_opencl_device(platformidx, deviceidx); - if (!did) { + vocp = vg_ocl_context_new(vcp, platformidx, deviceidx, + safe_mode, verify_mode, + worksize, nthreads, nrows, ncols, invsize); + if (!vocp) { return 1; } - vg_opencl_loop(vcp, did, safe_mode, verify_mode, - worksize, nthreads, nrows, ncols, invsize); + vg_opencl_loop(vocp); + vg_ocl_context_free(vocp); return 0; } diff --git a/oclvanityminer.c b/oclvanityminer.c new file mode 100644 index 0000000..67b6058 --- /dev/null +++ b/oclvanityminer.c @@ -0,0 +1,748 @@ +/* + * Vanitygen, vanity bitcoin address generator + * Copyright (C) 2011 + * + * Vanitygen is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * Vanitygen is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with Vanitygen. If not, see . + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +#include "oclengine.h" +#include "pattern.h" +#include "util.h" + + +const char *version = "0.17"; +const int debug = 0; + + +typedef struct workitem_s { + const char *pattern; + const char *comment; + EC_POINT *pubkey; + int addrtype; + double difficulty; + double reward; + double value; +} workitem_t; + +typedef struct server_request_s { + int request_status; + const EC_GROUP *group; + + char *part_buf; + size_t part_off; + size_t part_end; + size_t part_size; + + workitem_t **items; + int nitems; + int nalloc; +} server_request_t; + +void +server_workitem_free(workitem_t *wip) +{ + if (wip->pubkey) + EC_POINT_free(wip->pubkey); + free(wip); +} + +static workitem_t * +server_workitem_new(server_request_t *reqp, + const char *pfx, const char *pubkey_s, + const char *addrtype_s, const char *reward_s, + const char *comment) +{ + workitem_t *wip; + EC_POINT *pubkey; + int addrtype; + double reward; + double difficulty; + + addrtype = atoi(addrtype_s); + if ((addrtype < 0) || (addrtype > 255)) + return NULL; + + reward = strtod(reward_s, NULL); + if (reward < 0.0) + return NULL; + + difficulty = vg_prefix_get_difficulty(addrtype, pfx); + if (difficulty == 0.0) + return NULL; + + pubkey = EC_POINT_hex2point(reqp->group, pubkey_s, NULL, NULL); + if (pubkey == NULL) + return NULL; + + + wip = (workitem_t *) malloc(sizeof(*wip) + + strlen(pfx) + + strlen(comment) + 2); + memset(wip, 0, sizeof(*wip)); + wip->pattern = (char *) (wip + 1); + strcpy((char *)wip->pattern, pfx); + wip->comment = wip->pattern + (strlen(wip->pattern) + 1); + strcpy((char *) wip->comment, comment); + wip->pubkey = pubkey; + wip->addrtype = addrtype; + wip->difficulty = difficulty; + wip->reward = reward; + wip->value = (reward * 1000000.0 * 3600.0) / difficulty; + + return wip; +} + +static int +server_workitem_ptr_comp(const void *pa, const void *pb) +{ + workitem_t *a = *(workitem_t **) pa, *b = *(workitem_t **) pb; + return (a->value > b->value) ? -1 : ((a->value < b->value) ? 1 : 0); +} + + +typedef struct server_context_s { + EC_KEY *dummy_key; + const char *url; + const char *credit_addr; + char *getwork; + char *submit; + int verbose; +} server_context_t; + +static int +server_workitem_equal(server_context_t *ctxp, workitem_t *a, workitem_t *b) +{ + if (strcmp(a->pattern, b->pattern)) + return 0; + if (EC_POINT_cmp(EC_KEY_get0_group(ctxp->dummy_key), + a->pubkey, b->pubkey, NULL)) + return 0; + return 1; +} + + +void +server_context_free(server_context_t *ctxp) +{ + if (ctxp->dummy_key) + EC_KEY_free(ctxp->dummy_key); + if (ctxp->getwork) + free(ctxp->getwork); + if (ctxp->submit) + free(ctxp->submit); + free(ctxp); +} + +server_context_t * +server_context_new(const char *url, const char *credit_addr) +{ + server_context_t *ctxp; + int urllen = strlen(url); + int addrlen = strlen(credit_addr); + ctxp = (server_context_t *) + malloc(sizeof(*ctxp) + urllen + addrlen + 2); + memset(ctxp, 0, sizeof(*ctxp)); + ctxp->url = (const char *) (ctxp + 1); + ctxp->credit_addr = (const char *) (ctxp->url + urllen + 1); + strcpy((char *) ctxp->url, url); + strcpy((char *) ctxp->credit_addr, credit_addr); + + ctxp->dummy_key = vg_exec_context_new_key(); + ctxp->getwork = (char *) malloc(urllen + 9); + ctxp->submit = (char *) malloc(urllen + 7); + if (url[urllen - 1] == '/') { + snprintf(ctxp->getwork, urllen + 9, "%sgetWork", url); + snprintf(ctxp->submit, urllen + 7, "%ssolve", url); + } else { + snprintf(ctxp->getwork, urllen + 9, "%s/getWork", url); + snprintf(ctxp->submit, urllen + 7, "%s/solve", url); + } + + return ctxp; +} + + +int +server_workitem_add(server_request_t *reqp, workitem_t *wip) +{ + int nalloc; + + if ((reqp->nitems + 1) >= reqp->nalloc) { + nalloc = reqp->nalloc * 2; + if (nalloc == 0) + nalloc = 16; + if (nalloc > 65536) + return -1; + reqp->items = (workitem_t **) + realloc(reqp->items, nalloc * sizeof(*reqp->items)); + if (reqp->items == NULL) + return -1; + reqp->nalloc = nalloc; + } + reqp->items[reqp->nitems++] = wip; + return 0; +} + + +static int +server_body_reader(const char *buf, size_t elemsize, size_t len, void *param) +{ + server_request_t *reqp = (server_request_t *) param; + char *line, *sep, *pfx, *pubkey_s, *addrtype_s, *reward_s, *comment; + workitem_t *wip; + + if (!len) + return 0; + + if ((reqp->part_size < (reqp->part_end + len)) && + (reqp->part_off > 0)) { + memmove(reqp->part_buf, + reqp->part_buf + reqp->part_off, + reqp->part_end - reqp->part_off); + reqp->part_end -= reqp->part_off; + reqp->part_off = 0; + } + + if (reqp->part_size < (reqp->part_end + len)) { + if (reqp->part_size == 0) + reqp->part_size = 4096; + while (reqp->part_size < (reqp->part_end + len)) { + reqp->part_size *= 2; + if (reqp->part_size > (1024*1024)) { + fprintf(stderr, "Line too long from server"); + reqp->request_status = 0; + return -1; + } + } + reqp->part_buf = (char *) realloc(reqp->part_buf, + reqp->part_size); + if (!reqp->part_buf) { + fprintf(stderr, "Out of memory"); + return -1; + } + } + + memcpy(reqp->part_buf + reqp->part_end, buf, len); + reqp->part_end += len; + + line = reqp->part_buf; + while (1) { + sep = strchr(line, '\n'); + if (!sep) + break; + pfx = line; + *sep = '\0'; + line = sep + 1; + sep = strchr(pfx, ':'); + if (!sep) + goto bad_line; + *sep = '\0'; sep += 1; + pubkey_s = sep; + sep = strchr(sep, ':'); + if (!sep) + goto bad_line; + *sep = '\0'; sep += 1; + addrtype_s = sep; + sep = strchr(sep, ':'); + if (!sep) + goto bad_line; + *sep = '\0'; sep += 1; + reward_s = sep; + sep = strchr(sep, ';'); + if (!sep) + goto bad_line; + *sep = '\0'; sep += 1; + comment = sep; + + wip = server_workitem_new(reqp, pfx, pubkey_s, addrtype_s, + reward_s, comment); + if (!wip) + goto bad_line; + if (server_workitem_add(reqp, wip)) { + server_workitem_free(wip); + goto bad_line; + } + continue; + + bad_line: + ; + } + + reqp->part_off = line - reqp->part_buf; + if (reqp->part_off == reqp->part_end) { + reqp->part_off = 0; + reqp->part_end = 0; + } + + return len; +} + +void +dump_work(workitem_t **workarray) +{ + workitem_t *wip; + int i; + printf("Available bounties:\n"); + for (i = 0; workarray[i] != NULL; i++) { + wip = workarray[i]; + printf("Pattern: \"%s\" Reward: %f Value: %f BTC/MkeyHr\n", + wip->pattern, + wip->reward, + wip->value); + } +} + +void +free_work_array(workitem_t **workarray, workitem_t *except) +{ + int i; + if (workarray) { + for (i = 0; workarray[i] != NULL; i++) { + if (workarray[i] != except) + server_workitem_free(workarray[i]); + } + free(workarray); + } +} + +workitem_t ** +server_context_getwork(server_context_t *ctxp) +{ + CURLcode res; + server_request_t *reqp; + CURL *creq; + + reqp = (server_request_t *) malloc(sizeof(*reqp)); + memset(reqp, 0, sizeof(*reqp)); + + reqp->group = EC_KEY_get0_group(ctxp->dummy_key); + + creq = curl_easy_init(); + if (curl_easy_setopt(creq, CURLOPT_URL, ctxp->getwork) || + curl_easy_setopt(creq, CURLOPT_VERBOSE, ctxp->verbose > 1) || + curl_easy_setopt(creq, CURLOPT_WRITEFUNCTION, + server_body_reader) || + curl_easy_setopt(creq, CURLOPT_WRITEDATA, reqp)) { + fprintf(stderr, "Failed to set up libcurl\n"); + exit(1); + } + + res = curl_easy_perform(creq); + if (res != CURLE_OK) { + fprintf(stderr, "Get work request failed: %s\n", + curl_easy_strerror(res)); + curl_easy_cleanup(creq); + free_work_array(reqp->items, NULL); + return NULL; + } + + if (reqp->items) { + qsort(reqp->items, reqp->nitems, sizeof(*(reqp->items)), + server_workitem_ptr_comp); + reqp->items[reqp->nitems] = NULL; + } + + curl_easy_cleanup(creq); + return reqp->items; +} + + +int +server_context_submit_solution(server_context_t *ctxp, + workitem_t *work, + const char *privkey) +{ + char urlbuf[8192]; + char *pubhex; + CURL *creq; + CURLcode res; + + pubhex = EC_POINT_point2hex(EC_KEY_get0_group(ctxp->dummy_key), + work->pubkey, + POINT_CONVERSION_UNCOMPRESSED, + NULL); + snprintf(urlbuf, sizeof(urlbuf), + "%s?key=%s%%3A%s&privateKey=%s&bitcoinAddress=%s", + ctxp->submit, + work->pattern, + pubhex, + privkey, + ctxp->credit_addr); + OPENSSL_free(pubhex); + creq = curl_easy_init(); + if (curl_easy_setopt(creq, CURLOPT_URL, urlbuf) || + curl_easy_setopt(creq, CURLOPT_VERBOSE, ctxp->verbose > 1) || + curl_easy_setopt(creq, CURLOPT_POST, 1)) { + fprintf(stderr, "Failed to set up libcurl\n"); + exit(1); + } + + res = curl_easy_perform(creq); + if (res != CURLE_OK) { + fprintf(stderr, "Submission failed: %s\n", + curl_easy_strerror(res)); + curl_easy_cleanup(creq); + return -1; + } + + curl_easy_cleanup(creq); + return 0; +} + +static pthread_mutex_t soln_lock; +static pthread_cond_t soln_cond; +static char *soln_pattern = NULL; +static char *soln_private_key = NULL; + +void +free_soln() +{ + if (soln_pattern) { + free(soln_pattern); + soln_pattern = NULL; + } + if (soln_private_key) { + OPENSSL_free(soln_private_key); + soln_private_key = NULL; + } +} + +void +output_match_work_complete(vg_context_t *vcp, EC_KEY *pkey, const char *pattern) +{ + vg_output_match_console(vcp, pkey, pattern); + pthread_mutex_lock(&soln_lock); + free_soln(); + soln_pattern = strdup(pattern); + soln_private_key = BN_bn2hex(EC_KEY_get0_private_key(pkey)); + + /* Signal the generator to stop */ + vcp->vc_halt = 1; + + /* Wake up the main thread, if it's sleeping */ + pthread_cond_broadcast(&soln_cond); + pthread_mutex_unlock(&soln_lock); +} + +int +check_solution(server_context_t *scp, workitem_t *wip) +{ + int res = 0; + pthread_mutex_lock(&soln_lock); + if (soln_private_key != NULL) { + assert(!strcmp(soln_pattern, wip->pattern)); + server_context_submit_solution(scp, wip, soln_private_key); + free_soln(); + res = 1; + } + pthread_mutex_unlock(&soln_lock); + return res; +} + +void +usage(const char *name) +{ + fprintf(stderr, +"oclVanityMiner %s (" OPENSSL_VERSION_TEXT ")\n" +"Usage: %s -u -a \n" +"Organized vanity address mining client using OpenCL. Contacts the specified\n" +"bounty pool server, downloads a list of active bounties, and attempts to\n" +"generate the address with the best difficulty to reward ratio. Maintains\n" +"contact with the bounty pool server and periodically refreshes the bounty\n" +"list.\n" +"\n" +"Options:\n" +"-u Bounty pool URL\n" +"-a
Credit address for completed work\n" +"-i Set server polling interval in seconds (default 90)\n" +"-v Verbose output\n" +"-q Quiet output\n" +"-p Select OpenCL platform\n" +"-d Select OpenCL device\n" +"-S Safe mode, disable OpenCL loop unrolling optimizations\n" +"-w Set work items per thread in a work unit\n" +"-t Set target thread count per multiprocessor\n" +"-g x Set grid size\n" +"-b Set modular inverse ops per thread\n" +"-V Enable kernel/OpenCL/hardware verification (SLOW)\n", +version, name); +} + + +int +main(int argc, char **argv) +{ + const char *url = NULL; + const char *credit_addr = NULL; + int opt; + int platformidx = -1, deviceidx = -1; + char *pend; + int verbose = 1; + int interval = 90; + int nthreads = 0; + int worksize = 0; + int nrows = 0, ncols = 0; + int invsize = 0; + int verify_mode = 0; + int safe_mode = 0; + vg_context_t *vcp = NULL; + vg_ocl_context_t *vocp = NULL; + + int res; + int thread_started = 0; + pthread_t thread; + workitem_t *active_wip = NULL; + + server_context_t *scp = NULL; + workitem_t *wip = NULL, **wipa; + int wip_index; + int was_sleeping = 0; + + struct timeval tv; + struct timespec sleepy; + + pthread_mutex_init(&soln_lock, NULL); + pthread_cond_init(&soln_cond, NULL); + + if (argc == 1) { + usage(argv[0]); + return 1; + } + + while ((opt = getopt(argc, argv, + "u:a:vqp:d:w:t:g:b:VSh?i:")) != -1) { + switch (opt) { + case 'u': + url = optarg; + break; + case 'a': + credit_addr = optarg; + break; + case 'v': + verbose = 2; + break; + case 'q': + verbose = 0; + break; + case 'i': + interval = atoi(optarg); + if (interval < 60) { + fprintf(stderr, + "Invalid interval '%s'\n", optarg); + return 1; + } + case 'p': + platformidx = atoi(optarg); + break; + case 'd': + deviceidx = atoi(optarg); + break; + case 'w': + worksize = atoi(optarg); + if (worksize == 0) { + fprintf(stderr, + "Invalid work size '%s'\n", optarg); + return 1; + } + break; + case 't': + nthreads = atoi(optarg); + if (nthreads == 0) { + fprintf(stderr, + "Invalid thread count '%s'\n", optarg); + return 1; + } + break; + case 'g': + nrows = 0; + ncols = strtol(optarg, &pend, 0); + if (pend && *pend == 'x') { + nrows = strtol(pend+1, NULL, 0); + } + if (!nrows || !ncols) { + fprintf(stderr, + "Invalid grid size '%s'\n", optarg); + return 1; + } + break; + case 'b': + invsize = atoi(optarg); + if (!invsize) { + fprintf(stderr, + "Invalid modular inverse size '%s'\n", + optarg); + return 1; + } + if (invsize & (invsize - 1)) { + fprintf(stderr, + "Modular inverse size must be " + "a power of 2\n"); + return 1; + } + break; + case 'V': + verify_mode = 1; + break; + case 'S': + safe_mode = 1; + break; + default: + usage(argv[0]); + return 1; + } + } + +#if OPENSSL_VERSION_NUMBER < 0x10000000L + /* Complain about older versions of OpenSSL */ + if (verbose > 0) { + fprintf(stderr, + "WARNING: Built with " OPENSSL_VERSION_TEXT "\n" + "WARNING: Use OpenSSL 1.0.0d+ for best performance\n"); + } +#endif + curl_easy_init(); + + vcp = vg_prefix_context_new(0, 128, 0); + + vcp->vc_verbose = verbose; + + vcp->vc_output_match = output_match_work_complete; + vcp->vc_output_timing = vg_output_timing_console; + + + if (!url) { + fprintf(stderr, "ERROR: No server URL specified\n"); + return 1; + } + if (!credit_addr) { + fprintf(stderr, "ERROR: No reward address specified\n"); + return 1; + } + if (!vg_b58_decode_check(credit_addr, NULL, 0)) { + fprintf(stderr, "ERROR: Invalid reward address specified\n"); + return 1; + } + + scp = server_context_new(url, credit_addr); + scp->verbose = verbose; + wipa = NULL; + + while (1) { + if (!wipa) { + wipa = server_context_getwork(scp); + wip_index = 0; + } + + if (wipa) { + wip = wipa[wip_index]; + if (wip) + wip_index += 1; + } else + wip = NULL; + + /* If the work item is the same as the one we're executing, + keep it */ + if (wip && active_wip && + server_workitem_equal(scp, active_wip, wip)) + wip = active_wip; + + if (thread_started && (!active_wip || (wip != active_wip))) { + /* If a thread is running, stop it */ + vcp->vc_halt = 1; + pthread_join(thread, NULL); + thread_started = 0; + vcp->vc_halt = 0; + if (active_wip) { + check_solution(scp, active_wip); + active_wip = NULL; + } + vg_context_clear_all_patterns(vcp); + } + + if (!wip) { + if (!was_sleeping) { + fprintf(stderr, + "No work available, sleeping\n"); + was_sleeping = 1; + } + + } else if (!active_wip) { + was_sleeping = 0; + fprintf(stderr, + "Searching for pattern: \"%s\" " + "Reward: %f Value: %f BTC/MkeyHr\n", + wip->pattern, + wip->reward, + wip->value); + vcp->vc_addrtype = wip->addrtype; + vcp->vc_pubkey_base = wip->pubkey; + if (!vg_context_add_patterns(vcp, &wip->pattern, 1)) + return 1; + assert(vcp->vc_npatterns); + + if (!vocp) { + vocp = vg_ocl_context_new(vcp, + platformidx, deviceidx, + safe_mode, verify_mode, + worksize, nthreads, nrows, + ncols, invsize); + if (!vocp) + return 1; + } + + res = pthread_create(&thread, NULL, + vg_opencl_loop, vocp); + thread_started = 1; + active_wip = wip; + } + + /* Wait for something to happen */ + gettimeofday(&tv, NULL); + sleepy.tv_sec = tv.tv_sec; + sleepy.tv_nsec = tv.tv_usec * 1000; + sleepy.tv_sec += interval; + + pthread_mutex_lock(&soln_lock); + if (!soln_private_key) + res = pthread_cond_timedwait(&soln_cond, + &soln_lock, &sleepy); + pthread_mutex_unlock(&soln_lock); + + if (res == 0) { + if (check_solution(scp, active_wip)) + active_wip = NULL; + } + else if (res == ETIMEDOUT) { + if (wipa) { + free_work_array(wipa, active_wip); + wipa = NULL; + } + } + } + + return 0; +} diff --git a/pattern.c b/pattern.c index 9f874d6..384b0ae 100644 --- a/pattern.c +++ b/pattern.c @@ -141,20 +141,13 @@ typedef struct _timing_info_s { int vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last) { - static unsigned long long total = 0, prevfound = 0, sincelast = 0; static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; - static timing_info_t *timing_head = NULL; pthread_t me; struct timeval tvnow, tv; timing_info_t *tip, *mytip; - unsigned long long rate, myrate = 0, mytime; - double count, prob, time, targ; - char linebuf[80]; - char *unit; - int rem, p, i; - - const double targs[] = { 0.5, 0.75, 0.8, 0.9, 0.95, 1.0 }; + unsigned long long rate, myrate = 0, mytime, total, sincelast; + int p, i; /* Compute the rate */ gettimeofday(&tvnow, NULL); @@ -167,7 +160,8 @@ vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last) pthread_mutex_lock(&mutex); me = pthread_self(); - for (tip = timing_head, mytip = NULL; tip != NULL; tip = tip->ti_next) { + for (tip = vcp->vc_timing_head, mytip = NULL; + tip != NULL; tip = tip->ti_next) { if (pthread_equal(tip->ti_thread, me)) { mytip = tip; p = ((tip->ti_hist_last + 1) % timing_hist_size); @@ -190,9 +184,9 @@ vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last) } if (!mytip) { mytip = (timing_info_t *) malloc(sizeof(*tip)); - mytip->ti_next = timing_head; + mytip->ti_next = vcp->vc_timing_head; mytip->ti_thread = me; - timing_head = mytip; + vcp->vc_timing_head = mytip; mytip->ti_hist_last = 0; mytip->ti_hist_time[0] = mytime; mytip->ti_hist_work[0] = cycle; @@ -205,20 +199,47 @@ vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last) rate += myrate; } - total += cycle; - if (prevfound != vcp->vc_found) { - prevfound = vcp->vc_found; - sincelast = 0; + vcp->vc_timing_total += cycle; + if (vcp->vc_timing_prevfound != vcp->vc_found) { + vcp->vc_timing_prevfound = vcp->vc_found; + vcp->vc_timing_sincelast = 0; } - sincelast += cycle; - count = sincelast; + vcp->vc_timing_sincelast += cycle; - if (mytip != timing_head) { + if (mytip != vcp->vc_timing_head) { pthread_mutex_unlock(&mutex); return myrate; } + total = vcp->vc_timing_total; + sincelast = vcp->vc_timing_sincelast; pthread_mutex_unlock(&mutex); + vcp->vc_output_timing(vcp, sincelast, rate, total); + return myrate; +} + +static void +vg_timing_info_free(vg_context_t *vcp) +{ + timing_info_t *tp; + while (vcp->vc_timing_head != NULL) { + tp = vcp->vc_timing_head; + vcp->vc_timing_head = tp->ti_next; + free(tp); + } +} + +void +vg_output_timing_console(vg_context_t *vcp, double count, + unsigned long long rate, unsigned long long total) +{ + double prob, time, targ; + char *unit; + char linebuf[80]; + int rem, p, i; + + const double targs[] = { 0.5, 0.75, 0.8, 0.9, 0.95, 1.0 }; + targ = rate; unit = "key/s"; if (targ > 1000) { @@ -314,11 +335,10 @@ vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last) } printf("\r%s", linebuf); fflush(stdout); - return myrate; } void -vg_output_match(vg_context_t *vcp, EC_KEY *pkey, const char *pattern) +vg_output_match_console(vg_context_t *vcp, EC_KEY *pkey, const char *pattern) { unsigned char key_buf[512], *pend; char addr_buf[64], addr2_buf[64]; @@ -386,6 +406,7 @@ vg_output_match(vg_context_t *vcp, EC_KEY *pkey, const char *pattern) printf("Privkey (ASN1): "); dumphex(key_buf, len); } + } if (!vcp->vc_result_file || (vcp->vc_verbose > 0)) { @@ -420,20 +441,26 @@ vg_output_match(vg_context_t *vcp, EC_KEY *pkey, const char *pattern) } - void vg_context_free(vg_context_t *vcp) { + vg_timing_info_free(vcp); vcp->vc_free(vcp); } int vg_context_add_patterns(vg_context_t *vcp, - char ** const patterns, int npatterns) + const char ** const patterns, int npatterns) { return vcp->vc_add_patterns(vcp, patterns, npatterns); } +void +vg_context_clear_all_patterns(vg_context_t *vcp) +{ + vcp->vc_clear_all_patterns(vcp); +} + int vg_context_hash160_sort(vg_context_t *vcp, void *buf) { @@ -699,6 +726,21 @@ out: return ret; } +static void +free_ranges(BIGNUM **ranges) +{ + BN_free(ranges[0]); + BN_free(ranges[1]); + ranges[0] = NULL; + ranges[1] = NULL; + if (ranges[2]) { + BN_free(ranges[2]); + BN_free(ranges[3]); + ranges[2] = NULL; + ranges[3] = NULL; + } +} + /* * AVL tree implementation */ @@ -1364,7 +1406,7 @@ typedef struct _vg_prefix_context_s { } vg_prefix_context_t; static void -vg_prefix_context_free(vg_context_t *vcp) +vg_prefix_context_clear_all_patterns(vg_context_t *vcp) { vg_prefix_context_t *vcpp = (vg_prefix_context_t *) vcp; vg_prefix_t *vp; @@ -1378,6 +1420,17 @@ vg_prefix_context_free(vg_context_t *vcp) } assert(npfx_left == vcpp->base.vc_npatterns); + vcpp->base.vc_npatterns = 0; + vcpp->base.vc_npatterns_start = 0; + vcpp->base.vc_found = 0; + BN_clear(&vcpp->vcp_difficulty); +} + +static void +vg_prefix_context_free(vg_context_t *vcp) +{ + vg_prefix_context_t *vcpp = (vg_prefix_context_t *) vcp; + vg_prefix_context_clear_all_patterns(vcp); BN_clear_free(&vcpp->vcp_difficulty); free(vcpp); } @@ -1407,7 +1460,7 @@ vg_prefix_context_next_difficulty(vg_prefix_context_t *vcpp, static int vg_prefix_context_add_patterns(vg_context_t *vcp, - char ** const patterns, int npatterns) + const char ** const patterns, int npatterns) { vg_prefix_context_t *vcpp = (vg_prefix_context_t *) vcp; prefix_case_iter_t caseiter; @@ -1556,6 +1609,47 @@ vg_prefix_context_add_patterns(vg_context_t *vcp, return ret; } +double +vg_prefix_get_difficulty(int addrtype, const char *pattern) +{ + BN_CTX *bnctx; + BIGNUM result, bntmp; + BIGNUM *ranges[4]; + char *dbuf; + int ret; + double diffret = 0.0; + + bnctx = BN_CTX_new(); + BN_init(&result); + BN_init(&bntmp); + + ret = get_prefix_ranges(addrtype, + pattern, ranges, bnctx); + + if (ret == 0) { + BN_sub(&bntmp, ranges[1], ranges[0]); + BN_add(&result, &result, &bntmp); + if (ranges[2]) { + BN_sub(&bntmp, ranges[3], ranges[2]); + BN_add(&result, &result, &bntmp); + } + free_ranges(ranges); + + BN_clear(&bntmp); + BN_set_bit(&bntmp, 192); + BN_div(&result, NULL, &bntmp, &result, bnctx); + + dbuf = BN_bn2dec(&result); + diffret = strtod(dbuf, NULL); + OPENSSL_free(dbuf); + } + + BN_clear_free(&result); + BN_clear_free(&bntmp); + BN_CTX_free(bnctx); + return diffret; +} + static int vg_prefix_test(vg_exec_context_t *vxcp) @@ -1579,7 +1673,8 @@ research: goto research; vg_exec_context_consolidate_key(vxcp); - vg_output_match(&vcpp->base, vxcp->vxc_key, vp->vp_pattern); + vcpp->base.vc_output_match(&vcpp->base, vxcp->vxc_key, + vp->vp_pattern); vcpp->base.vc_found++; @@ -1673,6 +1768,8 @@ vg_prefix_context_new(int addrtype, int privtype, int caseinsensitive) vcpp->base.vc_chance = 0.0; vcpp->base.vc_free = vg_prefix_context_free; vcpp->base.vc_add_patterns = vg_prefix_context_add_patterns; + vcpp->base.vc_clear_all_patterns = + vg_prefix_context_clear_all_patterns; vcpp->base.vc_test = vg_prefix_test; vcpp->base.vc_hash160_sort = vg_prefix_hash160_sort; avl_root_init(&vcpp->vcp_avlroot); @@ -1695,7 +1792,7 @@ typedef struct _vg_regex_context_s { static int vg_regex_context_add_patterns(vg_context_t *vcp, - char ** const patterns, int npatterns) + const char ** const patterns, int npatterns) { vg_regex_context_t *vcrp = (vg_regex_context_t *) vcp; const char *pcre_errptr; @@ -1770,7 +1867,7 @@ vg_regex_context_add_patterns(vg_context_t *vcp, } static void -vg_regex_context_free(vg_context_t *vcp) +vg_regex_context_clear_all_patterns(vg_context_t *vcp) { vg_regex_context_t *vcrp = (vg_regex_context_t *) vcp; int i; @@ -1779,6 +1876,16 @@ vg_regex_context_free(vg_context_t *vcp) pcre_free(vcrp->vcr_regex_extra[i]); pcre_free(vcrp->vcr_regex[i]); } + vcrp->base.vc_npatterns = 0; + vcrp->base.vc_npatterns_start = 0; + vcrp->base.vc_found = 0; +} + +static void +vg_regex_context_free(vg_context_t *vcp) +{ + vg_regex_context_t *vcrp = (vg_regex_context_t *) vcp; + vg_regex_context_clear_all_patterns(vcp); if (vcrp->vcr_nalloc) free(vcrp->vcr_regex); free(vcrp); @@ -1860,8 +1967,8 @@ restart_loop: goto restart_loop; vg_exec_context_consolidate_key(vxcp); - vg_output_match(&vcrp->base, vxcp->vxc_key, - vcrp->vcr_regex_pat[i]); + vcrp->base.vc_output_match(&vcrp->base, vxcp->vxc_key, + vcrp->vcr_regex_pat[i]); vcrp->base.vc_found++; if (vcrp->base.vc_remove_on_match) { @@ -1902,6 +2009,8 @@ vg_regex_context_new(int addrtype, int privtype) vcrp->base.vc_chance = 0.0; vcrp->base.vc_free = vg_regex_context_free; vcrp->base.vc_add_patterns = vg_regex_context_add_patterns; + vcrp->base.vc_clear_all_patterns = + vg_regex_context_clear_all_patterns; vcrp->base.vc_test = vg_regex_test; vcrp->base.vc_hash160_sort = NULL; vcrp->vcr_regex = NULL; diff --git a/pattern.h b/pattern.h index 70f86d6..a775937 100644 --- a/pattern.h +++ b/pattern.h @@ -56,16 +56,24 @@ extern void vg_exec_context_consolidate_key(vg_exec_context_t *vxcp); extern void vg_exec_context_calc_address(vg_exec_context_t *vxcp); extern EC_KEY *vg_exec_context_new_key(void); - /* Implementation-specific lock/unlock/consolidate */ extern void vg_exec_downgrade_lock(vg_exec_context_t *vxcp); extern int vg_exec_upgrade_lock(vg_exec_context_t *vxcp); + typedef void (*vg_free_func_t)(vg_context_t *); typedef int (*vg_add_pattern_func_t)(vg_context_t *, - char ** const patterns, int npatterns); + const char ** const patterns, + int npatterns); +typedef void (*vg_clear_all_patterns_func_t)(vg_context_t *); typedef int (*vg_test_func_t)(vg_exec_context_t *); typedef int (*vg_hash160_sort_func_t)(vg_context_t *vcp, void *buf); +typedef void (*vg_output_error_func_t)(vg_context_t *vcp, const char *info); +typedef void (*vg_output_match_func_t)(vg_context_t *vcp, EC_KEY *pkey, + const char *pattern); +typedef void (*vg_output_timing_func_t)(vg_context_t *vcp, double count, + unsigned long long rate, + unsigned long long total); enum vg_format { VCF_PUBKEY, @@ -84,28 +92,50 @@ struct _vg_context_s { const char *vc_key_protect_pass; int vc_remove_on_match; int vc_verbose; - vg_free_func_t vc_free; - vg_add_pattern_func_t vc_add_patterns; - vg_test_func_t vc_test; - vg_hash160_sort_func_t vc_hash160_sort; enum vg_format vc_format; int vc_pubkeytype; EC_POINT *vc_pubkey_base; + int vc_halt; + + /* Internal methods */ + vg_free_func_t vc_free; + vg_add_pattern_func_t vc_add_patterns; + vg_clear_all_patterns_func_t vc_clear_all_patterns; + vg_test_func_t vc_test; + vg_hash160_sort_func_t vc_hash160_sort; + + /* Performance related members */ + unsigned long long vc_timing_total; + unsigned long long vc_timing_prevfound; + unsigned long long vc_timing_sincelast; + struct _timing_info_s *vc_timing_head; + + /* External methods */ + vg_output_error_func_t vc_output_error; + vg_output_match_func_t vc_output_match; + vg_output_timing_func_t vc_output_timing; }; extern void vg_context_free(vg_context_t *vcp); extern int vg_context_add_patterns(vg_context_t *vcp, - char ** const patterns, int npatterns); + const char ** const patterns, int npatterns); +extern void vg_context_clear_all_patterns(vg_context_t *vcp); extern int vg_context_hash160_sort(vg_context_t *vcp, void *buf); extern vg_context_t *vg_prefix_context_new(int addrtype, int privtype, int caseinsensitive); +extern double vg_prefix_get_difficulty(int addrtype, const char *pattern); + extern vg_context_t *vg_regex_context_new(int addrtype, int privtype); extern int vg_output_timing(vg_context_t *vcp, int cycle, struct timeval *last); -extern void vg_output_match(vg_context_t *vcp, EC_KEY *pkey, - const char *pattern); + +extern void vg_output_match_console(vg_context_t *vcp, EC_KEY *pkey, + const char *pattern); +extern void vg_output_timing_console(vg_context_t *vcp, double count, + unsigned long long rate, + unsigned long long total); #endif /* !defined (__VG_PATTERN_H__) */ diff --git a/vanitygen.c b/vanitygen.c index d11dd04..e368729 100644 --- a/vanitygen.c +++ b/vanitygen.c @@ -280,7 +280,7 @@ vg_thread_loop(void *arg) hash_len = 65; } - while (1) { + while (!vcp->vc_halt) { if (++npoints >= rekey_at) { pthread_mutex_lock(&vg_thread_lock); /* Generate a new random private key */ @@ -704,7 +704,11 @@ main(int argc, char **argv) vcp->vc_pubkeytype = pubkeytype; vcp->vc_pubkey_base = pubkey_base; - if (!vg_context_add_patterns(vcp, patterns, npatterns)) + vcp->vc_output_match = vg_output_match_console; + vcp->vc_output_timing = vg_output_timing_console; + + if (!vg_context_add_patterns(vcp, (const char ** const) patterns, + npatterns)) return 1; if (!vcp->vc_npatterns) { diff --git a/winglue.c b/winglue.c index 10c0d02..b249e6a 100644 --- a/winglue.c +++ b/winglue.c @@ -73,11 +73,9 @@ count_processors(void) * struct timeval compatibility for Win32 */ -#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 -#else -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif +#define TIMESPEC_TO_FILETIME_OFFSET \ + ( ((unsigned __int64) 27111902 << 32) + \ + (unsigned __int64) 3577643008 ) int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -88,14 +86,13 @@ gettimeofday(struct timeval *tv, struct timezone *tz) if (NULL != tv) { GetSystemTimeAsFileTime(&ft); - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tmpres /= 10; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); + tv->tv_sec = (int) ((*(unsigned __int64 *) &ft - + TIMESPEC_TO_FILETIME_OFFSET) / + 10000000); + tv->tv_usec = (int) ((*(unsigned __int64 *) &ft - + TIMESPEC_TO_FILETIME_OFFSET - + ((unsigned __int64) tv->tv_sec * + (unsigned __int64) 10000000)) / 10); } return 0;