/* * Vanitygen, vanity bitcoin address generator * Copyright (C) 2011 * * Vanitygen is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * any later version. * * Vanitygen is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with Vanitygen. If not, see . */ #include #include #include #include #include #include #include #include #include #ifdef __APPLE__ #include #ifndef CL_CALLBACK #define CL_CALLBACK #endif #else #include #endif #include "oclengine.h" #include "pattern.h" #include "util.h" #define MAX_SLOT 2 #define MAX_ARG 6 #define MAX_KERNEL 3 #define is_pow2(v) (!((v) & ((v)-1))) #define round_up_pow2(x, a) (((x) + ((a)-1)) & ~((a)-1)) static void vg_ocl_free_args(vg_ocl_context_t *vocp); static void *vg_opencl_loop(vg_exec_context_t *arg); /* OpenCL address searching mode */ struct _vg_ocl_context_s; typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *); typedef int (*vg_ocl_check_t)(struct _vg_ocl_context_s *, int slot); struct _vg_ocl_context_s { vg_exec_context_t base; cl_device_id voc_ocldid; cl_context voc_oclctx; cl_command_queue voc_oclcmdq; cl_program voc_oclprog; vg_ocl_init_t voc_init_func; vg_ocl_init_t voc_rekey_func; vg_ocl_check_t voc_check_func; int voc_quirks; int voc_nslots; cl_kernel voc_oclkernel[MAX_SLOT][MAX_KERNEL]; cl_event voc_oclkrnwait[MAX_SLOT]; cl_mem voc_args[MAX_SLOT][MAX_ARG]; size_t voc_arg_size[MAX_SLOT][MAX_ARG]; int voc_pattern_rewrite; int voc_pattern_alloc; vg_ocl_check_t voc_verify_func[MAX_KERNEL]; pthread_t voc_ocl_thread; pthread_mutex_t voc_lock; pthread_cond_t voc_wait; int voc_ocl_slot; int voc_ocl_rows; int voc_ocl_cols; int voc_ocl_invsize; int voc_halt; int voc_dump_done; }; /* Thread synchronization stubs */ void vg_exec_downgrade_lock(vg_exec_context_t *vxcp) { } int vg_exec_upgrade_lock(vg_exec_context_t *vxcp) { return 0; } /* * OpenCL debugging and support */ static const char * vg_ocl_strerror(cl_int ret) { #define OCL_STATUS(st) case st: return #st; switch (ret) { OCL_STATUS(CL_SUCCESS); OCL_STATUS(CL_DEVICE_NOT_FOUND); OCL_STATUS(CL_DEVICE_NOT_AVAILABLE); OCL_STATUS(CL_COMPILER_NOT_AVAILABLE); OCL_STATUS(CL_MEM_OBJECT_ALLOCATION_FAILURE); OCL_STATUS(CL_OUT_OF_RESOURCES); OCL_STATUS(CL_OUT_OF_HOST_MEMORY); OCL_STATUS(CL_PROFILING_INFO_NOT_AVAILABLE); OCL_STATUS(CL_MEM_COPY_OVERLAP); OCL_STATUS(CL_IMAGE_FORMAT_MISMATCH); OCL_STATUS(CL_IMAGE_FORMAT_NOT_SUPPORTED); OCL_STATUS(CL_BUILD_PROGRAM_FAILURE); OCL_STATUS(CL_MAP_FAILURE); #if defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) OCL_STATUS(CL_MISALIGNED_SUB_BUFFER_OFFSET); #endif /* defined(CL_MISALIGNED_SUB_BUFFER_OFFSET) */ #if defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) OCL_STATUS(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST); #endif /* defined(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) */ OCL_STATUS(CL_INVALID_VALUE); OCL_STATUS(CL_INVALID_DEVICE_TYPE); OCL_STATUS(CL_INVALID_PLATFORM); OCL_STATUS(CL_INVALID_DEVICE); OCL_STATUS(CL_INVALID_CONTEXT); OCL_STATUS(CL_INVALID_QUEUE_PROPERTIES); OCL_STATUS(CL_INVALID_COMMAND_QUEUE); OCL_STATUS(CL_INVALID_HOST_PTR); OCL_STATUS(CL_INVALID_MEM_OBJECT); OCL_STATUS(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); OCL_STATUS(CL_INVALID_IMAGE_SIZE); OCL_STATUS(CL_INVALID_SAMPLER); OCL_STATUS(CL_INVALID_BINARY); OCL_STATUS(CL_INVALID_BUILD_OPTIONS); OCL_STATUS(CL_INVALID_PROGRAM); OCL_STATUS(CL_INVALID_PROGRAM_EXECUTABLE); OCL_STATUS(CL_INVALID_KERNEL_NAME); OCL_STATUS(CL_INVALID_KERNEL_DEFINITION); OCL_STATUS(CL_INVALID_KERNEL); OCL_STATUS(CL_INVALID_ARG_INDEX); OCL_STATUS(CL_INVALID_ARG_VALUE); OCL_STATUS(CL_INVALID_ARG_SIZE); OCL_STATUS(CL_INVALID_KERNEL_ARGS); OCL_STATUS(CL_INVALID_WORK_DIMENSION); OCL_STATUS(CL_INVALID_WORK_GROUP_SIZE); OCL_STATUS(CL_INVALID_WORK_ITEM_SIZE); OCL_STATUS(CL_INVALID_GLOBAL_OFFSET); OCL_STATUS(CL_INVALID_EVENT_WAIT_LIST); OCL_STATUS(CL_INVALID_EVENT); OCL_STATUS(CL_INVALID_OPERATION); OCL_STATUS(CL_INVALID_GL_OBJECT); OCL_STATUS(CL_INVALID_BUFFER_SIZE); OCL_STATUS(CL_INVALID_MIP_LEVEL); OCL_STATUS(CL_INVALID_GLOBAL_WORK_SIZE); #if defined(CL_INVALID_PROPERTY) OCL_STATUS(CL_INVALID_PROPERTY); #endif /* defined(CL_INVALID_PROPERTY) */ #undef OCL_STATUS default: { static char tmp[64]; snprintf(tmp, sizeof(tmp), "Unknown code %d", ret); return tmp; } } } /* Get device strings, using a static buffer -- caveat emptor */ static const char * vg_ocl_platform_getstr(cl_platform_id pid, cl_platform_info param) { static char platform_str[1024]; cl_int ret; size_t size_ret; ret = clGetPlatformInfo(pid, param, sizeof(platform_str), platform_str, &size_ret); if (ret != CL_SUCCESS) { snprintf(platform_str, sizeof(platform_str), "clGetPlatformInfo(%d): %s", param, vg_ocl_strerror(ret)); } return platform_str; } static cl_platform_id vg_ocl_device_getplatform(cl_device_id did) { cl_int ret; cl_platform_id val; size_t size_ret; ret = clGetDeviceInfo(did, CL_DEVICE_PLATFORM, sizeof(val), &val, &size_ret); if (ret != CL_SUCCESS) { fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_PLATFORM): %s", vg_ocl_strerror(ret)); } return val; } static cl_device_type vg_ocl_device_gettype(cl_device_id did) { cl_int ret; cl_device_type val; size_t size_ret; ret = clGetDeviceInfo(did, CL_DEVICE_TYPE, sizeof(val), &val, &size_ret); if (ret != CL_SUCCESS) { fprintf(stderr, "clGetDeviceInfo(CL_DEVICE_TYPE): %s", vg_ocl_strerror(ret)); } return val; } static const char * vg_ocl_device_getstr(cl_device_id did, cl_device_info param) { static char device_str[1024]; cl_int ret; size_t size_ret; ret = clGetDeviceInfo(did, param, sizeof(device_str), device_str, &size_ret); if (ret != CL_SUCCESS) { snprintf(device_str, sizeof(device_str), "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); } return device_str; } static size_t vg_ocl_device_getsizet(cl_device_id did, cl_device_info param) { cl_int ret; size_t val; size_t size_ret; ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); if (ret != CL_SUCCESS) { fprintf(stderr, "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); } return val; } static cl_ulong vg_ocl_device_getulong(cl_device_id did, cl_device_info param) { cl_int ret; cl_ulong val; size_t size_ret; ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); if (ret != CL_SUCCESS) { fprintf(stderr, "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); } return val; } static cl_uint vg_ocl_device_getuint(cl_device_id did, cl_device_info param) { cl_int ret; cl_uint val; size_t size_ret; ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret); if (ret != CL_SUCCESS) { fprintf(stderr, "clGetDeviceInfo(%d): %s", param, vg_ocl_strerror(ret)); } return val; } void vg_ocl_dump_info(vg_ocl_context_t *vocp) { cl_device_id did; if (vocp->base.vxc_vc && (vocp->base.vxc_vc->vc_verbose < 1)) return; if (vocp->voc_dump_done) return; did = vocp->voc_ocldid; fprintf(stderr, "Device: %s\n", vg_ocl_device_getstr(did, CL_DEVICE_NAME)); fprintf(stderr, "Vendor: %s (%04x)\n", vg_ocl_device_getstr(did, CL_DEVICE_VENDOR), vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID)); fprintf(stderr, "Driver: %s\n", vg_ocl_device_getstr(did, CL_DRIVER_VERSION)); fprintf(stderr, "Profile: %s\n", vg_ocl_device_getstr(did, CL_DEVICE_PROFILE)); fprintf(stderr, "Version: %s\n", vg_ocl_device_getstr(did, CL_DEVICE_VERSION)); fprintf(stderr, "Max compute units: %"PRSIZET"d\n", vg_ocl_device_getsizet(did, CL_DEVICE_MAX_COMPUTE_UNITS)); fprintf(stderr, "Max workgroup size: %"PRSIZET"d\n", vg_ocl_device_getsizet(did, CL_DEVICE_MAX_WORK_GROUP_SIZE)); fprintf(stderr, "Global memory: %ld\n", vg_ocl_device_getulong(did, CL_DEVICE_GLOBAL_MEM_SIZE)); fprintf(stderr, "Max allocation: %ld\n", vg_ocl_device_getulong(did, CL_DEVICE_MAX_MEM_ALLOC_SIZE)); vocp->voc_dump_done = 1; } void vg_ocl_error(vg_ocl_context_t *vocp, int code, const char *desc) { const char *err = vg_ocl_strerror(code); if (desc) { fprintf(stderr, "%s: %s\n", desc, err); } else { fprintf(stderr, "%s\n", err); } if (vocp && vocp->voc_ocldid) vg_ocl_dump_info(vocp); } static void vg_ocl_buildlog(vg_ocl_context_t *vocp, cl_program prog) { size_t logbufsize, logsize; char *log; int off = 0; cl_int ret; ret = clGetProgramBuildInfo(prog, vocp->voc_ocldid, CL_PROGRAM_BUILD_LOG, 0, NULL, &logbufsize); if (ret != CL_SUCCESS) { vg_ocl_error(NULL, ret, "clGetProgramBuildInfo"); return; } log = (char *) malloc(logbufsize); if (!log) { fprintf(stderr, "Could not allocate build log buffer\n"); return; } ret = clGetProgramBuildInfo(prog, vocp->voc_ocldid, CL_PROGRAM_BUILD_LOG, logbufsize, log, &logsize); if (ret != CL_SUCCESS) { vg_ocl_error(NULL, ret, "clGetProgramBuildInfo"); } else { /* Remove leading newlines and trailing newlines/whitespace */ log[logbufsize-1] = '\0'; for (off = logsize - 1; off >= 0; off--) { if ((log[off] != '\r') && (log[off] != '\n') && (log[off] != ' ') && (log[off] != '\t') && (log[off] != '\0')) break; log[off] = '\0'; } for (off = 0; off < logbufsize; off++) { if ((log[off] != '\r') && (log[off] != '\n')) break; } fprintf(stderr, "Build log:\n%s\n", &log[off]); } free(log); } /* * OpenCL per-exec functions */ enum { VG_OCL_DEEP_PREPROC_UNROLL = (1 << 0), VG_OCL_PRAGMA_UNROLL = (1 << 1), VG_OCL_EXPENSIVE_BRANCHES = (1 << 2), VG_OCL_DEEP_VLIW = (1 << 3), VG_OCL_AMD_BFI_INT = (1 << 4), VG_OCL_NV_VERBOSE = (1 << 5), VG_OCL_BROKEN = (1 << 6), VG_OCL_NO_BINARIES = (1 << 7), VG_OCL_OPTIMIZATIONS = (VG_OCL_DEEP_PREPROC_UNROLL | VG_OCL_PRAGMA_UNROLL | VG_OCL_EXPENSIVE_BRANCHES | VG_OCL_DEEP_VLIW | VG_OCL_AMD_BFI_INT), }; static int vg_ocl_get_quirks(vg_ocl_context_t *vocp) { uint32_t vend; const char *dvn; unsigned int quirks = 0; quirks |= VG_OCL_DEEP_PREPROC_UNROLL; vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID); switch (vend) { case 0x10de: /* NVIDIA */ /* * NVIDIA's compiler seems to take a really really long * time when using preprocessor unrolling, but works * well with pragma unroll. */ quirks &= ~VG_OCL_DEEP_PREPROC_UNROLL; quirks |= VG_OCL_PRAGMA_UNROLL; quirks |= VG_OCL_NV_VERBOSE; break; case 0x1002: /* AMD/ATI */ /* * AMD's compiler works best with preprocesor unrolling. * Pragma unroll is unreliable with AMD's compiler and * seems to crash based on whether the gods were smiling * when Catalyst was last installed/upgraded. */ if (vg_ocl_device_gettype(vocp->voc_ocldid) & CL_DEVICE_TYPE_GPU) { quirks |= VG_OCL_EXPENSIVE_BRANCHES; quirks |= VG_OCL_DEEP_VLIW; dvn = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_EXTENSIONS); if (dvn && strstr(dvn, "cl_amd_media_ops")) quirks |= VG_OCL_AMD_BFI_INT; dvn = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME); if (!strcmp(dvn, "ATI RV710")) { quirks &= ~VG_OCL_OPTIMIZATIONS; quirks |= VG_OCL_NO_BINARIES; } } break; default: break; } return quirks; } static int vg_ocl_create_kernel(vg_ocl_context_t *vocp, int knum, const char *func) { int i; cl_kernel krn; cl_int ret; for (i = 0; i < MAX_SLOT; i++) { krn = clCreateKernel(vocp->voc_oclprog, func, &ret); if (!krn) { fprintf(stderr, "clCreateKernel(%d): ", i); vg_ocl_error(vocp, ret, NULL); while (--i >= 0) { clReleaseKernel(vocp->voc_oclkernel[i][knum]); vocp->voc_oclkernel[i][knum] = NULL; } return 0; } vocp->voc_oclkernel[i][knum] = krn; vocp->voc_oclkrnwait[i] = NULL; } return 1; } static void vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts, const char *program, size_t size, unsigned char *hash_out) { EVP_MD_CTX *mdctx; cl_platform_id pid; const char *str; mdctx = EVP_MD_CTX_create(); EVP_DigestInit_ex(mdctx, EVP_md5(), NULL); pid = vg_ocl_device_getplatform(vocp->voc_ocldid); str = vg_ocl_platform_getstr(pid, CL_PLATFORM_NAME); EVP_DigestUpdate(mdctx, str, strlen(str) + 1); str = vg_ocl_platform_getstr(pid, CL_PLATFORM_VERSION); EVP_DigestUpdate(mdctx, str, strlen(str) + 1); str = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME); EVP_DigestUpdate(mdctx, str, strlen(str) + 1); if (opts) EVP_DigestUpdate(mdctx, opts, strlen(opts) + 1); if (size) EVP_DigestUpdate(mdctx, program, size); EVP_DigestFinal_ex(mdctx, hash_out, NULL); EVP_MD_CTX_destroy(mdctx); } typedef struct { unsigned char e_ident[16]; uint16_t e_type; uint16_t e_machine; uint32_t e_version; uint32_t e_entry; uint32_t e_phoff; uint32_t e_shoff; uint32_t e_flags; uint16_t e_ehsize; uint16_t e_phentsize; uint16_t e_phnum; uint16_t e_shentsize; uint16_t e_shnum; uint16_t e_shstrndx; } vg_elf32_header_t; typedef struct { uint32_t sh_name; uint32_t sh_type; uint32_t sh_flags; uint32_t sh_addr; uint32_t sh_offset; uint32_t sh_size; uint32_t sh_link; uint32_t sh_info; uint32_t sh_addralign; uint32_t sh_entsize; } vg_elf32_shdr_t; static int vg_ocl_amd_patch_inner(unsigned char *binary, size_t size) { vg_elf32_header_t *ehp; vg_elf32_shdr_t *shp, *nshp; uint32_t *instr; size_t off; int i, n, txt2idx, patched; ehp = (vg_elf32_header_t *) binary; if ((size < sizeof(*ehp)) || memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) || !ehp->e_shoff) return 0; off = ehp->e_shoff + (ehp->e_shstrndx * ehp->e_shentsize); nshp = (vg_elf32_shdr_t *) (binary + off); if ((off + sizeof(*nshp)) > size) return 0; shp = (vg_elf32_shdr_t *) (binary + ehp->e_shoff); n = 0; txt2idx = 0; for (i = 0; i < ehp->e_shnum; i++) { off = nshp->sh_offset + shp[i].sh_name; if (((off + 6) >= size) || memcmp(binary + off, ".text", 6)) continue; n++; if (n == 2) txt2idx = i; } if (n != 2) return 0; off = shp[txt2idx].sh_offset; instr = (uint32_t *) (binary + off); n = shp[txt2idx].sh_size / 4; patched = 0; for (i = 0; i < n; i += 2) { if (((instr[i] & 0x02001000) == 0) && ((instr[i+1] & 0x9003f000) == 0x0001a000)) { instr[i+1] ^= (0x0001a000 ^ 0x0000c000); patched++; } } return patched; } static int vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size) { vg_context_t *vcp = vocp->base.vxc_vc; vg_elf32_header_t *ehp; unsigned char *ptr; size_t offset = 1; int ninner = 0, nrun, npatched = 0; ehp = (vg_elf32_header_t *) binary; if ((size < sizeof(*ehp)) || memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\0", 8) || !ehp->e_shoff) return 0; offset = 1; while (offset < (size - 8)) { ptr = (unsigned char *) memchr(binary + offset, 0x7f, size - offset); if (!ptr) return npatched; offset = ptr - binary; ehp = (vg_elf32_header_t *) ptr; if (((size - offset) < sizeof(*ehp)) || memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) || !ehp->e_shoff) { offset += 1; continue; } ninner++; nrun = vg_ocl_amd_patch_inner(ptr, size - offset); npatched += nrun; if (vcp->vc_verbose > 1) fprintf(stderr, "AMD BFI_INT: patched %d instructions " "in kernel %d\n", nrun, ninner); npatched++; offset += 1; } return npatched; } static int vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp, const char *filename, const char *opts) { FILE *kfp; char *buf, *tbuf; int len, fromsource = 0, patched = 0; size_t sz, szr; cl_program prog; cl_int ret, sts; unsigned char prog_hash[16]; char bin_name[64]; if (vcp->vc_verbose > 1) fprintf(stderr, "OpenCL compiler flags: %s\n", opts ? opts : ""); sz = 128 * 1024; buf = (char *) malloc(sz); if (!buf) { fprintf(stderr, "Could not allocate program buffer\n"); return 0; } kfp = fopen(filename, "r"); if (!kfp) { fprintf(stderr, "Error loading kernel file '%s': %s\n", filename, strerror(errno)); free(buf); return 0; } len = fread(buf, 1, sz, kfp); fclose(kfp); if (!len) { fprintf(stderr, "Short read on CL kernel\n"); free(buf); return 0; } vg_ocl_hash_program(vocp, opts, buf, len, prog_hash); snprintf(bin_name, sizeof(bin_name), "%02x%02x%02x%02x%02x%02x%02x%02x" "%02x%02x%02x%02x%02x%02x%02x%02x.oclbin", prog_hash[0], prog_hash[1], prog_hash[2], prog_hash[3], prog_hash[4], prog_hash[5], prog_hash[6], prog_hash[7], prog_hash[8], prog_hash[9], prog_hash[10], prog_hash[11], prog_hash[12], prog_hash[13], prog_hash[14], prog_hash[15]); if (vocp->voc_quirks & VG_OCL_NO_BINARIES) { kfp = NULL; if (vcp->vc_verbose > 1) fprintf(stderr, "Binary OpenCL programs disabled\n"); } else { kfp = fopen(bin_name, "rb"); } if (!kfp) { /* No binary available, create with source */ fromsource = 1; sz = len; prog = clCreateProgramWithSource(vocp->voc_oclctx, 1, (const char **) &buf, &sz, &ret); } else { if (vcp->vc_verbose > 1) fprintf(stderr, "Loading kernel binary %s\n", bin_name); szr = 0; while (!feof(kfp)) { len = fread(buf + szr, 1, sz - szr, kfp); if (!len) { fprintf(stderr, "Short read on CL kernel binary\n"); fclose(kfp); free(buf); return 0; } szr += len; if (szr == sz) { tbuf = (char *) realloc(buf, sz*2); if (!tbuf) { fprintf(stderr, "Could not expand CL kernel " "binary buffer\n"); fclose(kfp); free(buf); return 0; } buf = tbuf; sz *= 2; } } fclose(kfp); rebuild: prog = clCreateProgramWithBinary(vocp->voc_oclctx, 1, &vocp->voc_ocldid, &szr, (const unsigned char **) &buf, &sts, &ret); } free(buf); if (!prog) { vg_ocl_error(vocp, ret, "clCreateProgramWithSource"); return 0; } if (vcp->vc_verbose > 0) { if (fromsource && !patched) { fprintf(stderr, "Compiling kernel, can take minutes..."); fflush(stderr); } } ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL); if (ret != CL_SUCCESS) { if ((vcp->vc_verbose > 0) && fromsource && !patched) fprintf(stderr, "failure.\n"); vg_ocl_error(NULL, ret, "clBuildProgram"); } else if ((vcp->vc_verbose > 0) && fromsource && !patched) { fprintf(stderr, "done!\n"); } if ((ret != CL_SUCCESS) || ((vcp->vc_verbose > 1) && fromsource && !patched)) { vg_ocl_buildlog(vocp, prog); } if (ret != CL_SUCCESS) { vg_ocl_dump_info(vocp); clReleaseProgram(prog); return 0; } if (fromsource && !(vocp->voc_quirks & VG_OCL_NO_BINARIES)) { ret = clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, sizeof(szr), &szr, &sz); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "WARNING: clGetProgramInfo(BINARY_SIZES)"); goto out; } if (sz == 0) { fprintf(stderr, "WARNING: zero-length CL kernel binary\n"); goto out; } buf = (char *) malloc(szr); if (!buf) { fprintf(stderr, "WARNING: Could not allocate %"PRSIZET"d bytes " "for CL binary\n", szr); goto out; } ret = clGetProgramInfo(prog, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &sz); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "WARNING: clGetProgramInfo(BINARIES)"); free(buf); goto out; } if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) { patched = vg_ocl_amd_patch(vocp, (unsigned char *) buf, szr); if (patched > 0) { if (vcp->vc_verbose > 1) fprintf(stderr, "AMD BFI_INT patch complete\n"); clReleaseProgram(prog); goto rebuild; } fprintf(stderr, "WARNING: AMD BFI_INT patching failed\n"); if (patched < 0) { /* Program was incompletely modified */ free(buf); goto out; } } kfp = fopen(bin_name, "wb"); if (!kfp) { fprintf(stderr, "WARNING: " "could not save CL kernel binary: %s\n", strerror(errno)); } else { sz = fwrite(buf, 1, szr, kfp); fclose(kfp); if (sz != szr) { fprintf(stderr, "WARNING: short write on CL kernel " "binary file: expected " "%"PRSIZET"d, got %"PRSIZET"d\n", szr, sz); unlink(bin_name); } } free(buf); } out: vocp->voc_oclprog = prog; if (!vg_ocl_create_kernel(vocp, 0, "ec_add_grid") || !vg_ocl_create_kernel(vocp, 1, "heap_invert")) { clReleaseProgram(vocp->voc_oclprog); vocp->voc_oclprog = NULL; return 0; } return 1; } static void CL_CALLBACK vg_ocl_context_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data) { fprintf(stderr, "vg_ocl_context_callback error: %s\n", errinfo); } static int vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did, int safe_mode) { cl_int ret; char optbuf[128]; int end = 0; memset(vocp, 0, sizeof(*vocp)); vg_exec_context_init(vcp, &vocp->base); vocp->base.vxc_threadfunc = vg_opencl_loop; pthread_mutex_init(&vocp->voc_lock, NULL); pthread_cond_init(&vocp->voc_wait, NULL); vocp->voc_ocl_slot = -1; vocp->voc_ocldid = did; if (vcp->vc_verbose > 1) vg_ocl_dump_info(vocp); vocp->voc_quirks = vg_ocl_get_quirks(vocp); if ((vocp->voc_quirks & VG_OCL_BROKEN) && (vcp->vc_verbose > 0)) { char yesbuf[16]; printf("Type 'yes' to continue: "); fflush(stdout); if (!fgets(yesbuf, sizeof(yesbuf), stdin) || strncmp(yesbuf, "yes", 3)) exit(1); } vocp->voc_oclctx = clCreateContext(NULL, 1, &did, vg_ocl_context_callback, NULL, &ret); if (!vocp->voc_oclctx) { vg_ocl_error(vocp, ret, "clCreateContext"); return 0; } vocp->voc_oclcmdq = clCreateCommandQueue(vocp->voc_oclctx, vocp->voc_ocldid, 0, &ret); if (!vocp->voc_oclcmdq) { vg_ocl_error(vocp, ret, "clCreateCommandQueue"); return 0; } if (safe_mode) vocp->voc_quirks &= ~VG_OCL_OPTIMIZATIONS; end = 0; optbuf[end] = '\0'; if (vocp->voc_quirks & VG_OCL_DEEP_PREPROC_UNROLL) end += snprintf(optbuf + end, sizeof(optbuf) - end, "-DDEEP_PREPROC_UNROLL "); if (vocp->voc_quirks & VG_OCL_PRAGMA_UNROLL) end += snprintf(optbuf + end, sizeof(optbuf) - end, "-DPRAGMA_UNROLL "); if (vocp->voc_quirks & VG_OCL_EXPENSIVE_BRANCHES) end += snprintf(optbuf + end, sizeof(optbuf) - end, "-DVERY_EXPENSIVE_BRANCHES "); if (vocp->voc_quirks & VG_OCL_DEEP_VLIW) end += snprintf(optbuf + end, sizeof(optbuf) - end, "-DDEEP_VLIW "); if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT) end += snprintf(optbuf + end, sizeof(optbuf) - end, "-DAMD_BFI_INT "); if (vocp->voc_quirks & VG_OCL_NV_VERBOSE) end += snprintf(optbuf + end, sizeof(optbuf) - end, "-cl-nv-verbose "); if (!vg_ocl_load_program(vcp, vocp, "calc_addrs.cl", optbuf)) return 0; return 1; } static void vg_ocl_del(vg_ocl_context_t *vocp) { vg_ocl_free_args(vocp); if (vocp->voc_oclprog) { clReleaseProgram(vocp->voc_oclprog); vocp->voc_oclprog = NULL; } if (vocp->voc_oclcmdq) { clReleaseCommandQueue(vocp->voc_oclcmdq); vocp->voc_oclcmdq = NULL; } if (vocp->voc_oclctx) { clReleaseContext(vocp->voc_oclctx); vocp->voc_oclctx = NULL; } pthread_cond_destroy(&vocp->voc_wait); pthread_mutex_destroy(&vocp->voc_lock); vg_exec_context_del(&vocp->base); } static int vg_ocl_arg_map[][8] = { /* hashes_out / found */ { 2, 0, -1 }, /* z_heap */ { 0, 1, 1, 0, 2, 2, -1 }, /* point_tmp */ { 0, 0, 2, 1, -1 }, /* row_in */ { 0, 2, -1 }, /* col_in */ { 0, 3, -1 }, /* target_table */ { 2, 3, -1 }, }; static int vg_ocl_kernel_arg_alloc(vg_ocl_context_t *vocp, int slot, int arg, size_t size, int host) { cl_mem clbuf; cl_int ret; int i, j, knum, karg; for (i = 0; i < MAX_SLOT; i++) { if ((i != slot) && (slot >= 0)) continue; if (vocp->voc_args[i][arg]) { clReleaseMemObject(vocp->voc_args[i][arg]); vocp->voc_args[i][arg] = NULL; vocp->voc_arg_size[i][arg] = 0; } } clbuf = clCreateBuffer(vocp->voc_oclctx, CL_MEM_READ_WRITE | (host ? CL_MEM_ALLOC_HOST_PTR : 0), size, NULL, &ret); if (!clbuf) { fprintf(stderr, "clCreateBuffer(%d,%d): ", slot, arg); vg_ocl_error(vocp, ret, NULL); return 0; } for (i = 0; i < MAX_SLOT; i++) { if ((i != slot) && (slot >= 0)) continue; clRetainMemObject(clbuf); vocp->voc_args[i][arg] = clbuf; vocp->voc_arg_size[i][arg] = size; for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) { knum = vg_ocl_arg_map[arg][j]; karg = vg_ocl_arg_map[arg][j+1]; ret = clSetKernelArg(vocp->voc_oclkernel[i][knum], karg, sizeof(clbuf), &clbuf); if (ret) { fprintf(stderr, "clSetKernelArg(%d,%d): ", knum, karg); vg_ocl_error(vocp, ret, NULL); return 0; } } } clReleaseMemObject(clbuf); return 1; } int vg_ocl_copyout_arg(vg_ocl_context_t *vocp, int wslot, int arg, void *buffer, size_t size) { cl_int slot, ret; slot = (wslot < 0) ? 0 : wslot; assert((slot >= 0) && (slot < MAX_SLOT)); assert(size <= vocp->voc_arg_size[slot][arg]); ret = clEnqueueWriteBuffer(vocp->voc_oclcmdq, vocp->voc_args[slot][arg], CL_TRUE, 0, size, buffer, 0, NULL, NULL); if (ret) { fprintf(stderr, "clEnqueueWriteBuffer(%d): ", arg); vg_ocl_error(vocp, ret, NULL); return 0; } return 1; } static void * vg_ocl_map_arg_buffer(vg_ocl_context_t *vocp, int slot, int arg, int rw) { void *buf; cl_int ret; assert((slot >= 0) && (slot < MAX_SLOT)); buf = clEnqueueMapBuffer(vocp->voc_oclcmdq, vocp->voc_args[slot][arg], CL_TRUE, (rw == 2) ? (CL_MAP_READ|CL_MAP_WRITE) : (rw ? CL_MAP_WRITE : CL_MAP_READ), 0, vocp->voc_arg_size[slot][arg], 0, NULL, NULL, &ret); if (!buf) { fprintf(stderr, "clEnqueueMapBuffer(%d): ", arg); vg_ocl_error(vocp, ret, NULL); return NULL; } return buf; } static void vg_ocl_unmap_arg_buffer(vg_ocl_context_t *vocp, int slot, int arg, void *buf) { cl_int ret; cl_event ev; assert((slot >= 0) && (slot < MAX_SLOT)); ret = clEnqueueUnmapMemObject(vocp->voc_oclcmdq, vocp->voc_args[slot][arg], buf, 0, NULL, &ev); if (ret != CL_SUCCESS) { fprintf(stderr, "clEnqueueUnmapMemObject(%d): ", arg); vg_ocl_error(vocp, ret, NULL); return; } ret = clWaitForEvents(1, &ev); clReleaseEvent(ev); if (ret != CL_SUCCESS) { fprintf(stderr, "clWaitForEvent(clUnmapMemObject,%d): ", arg); vg_ocl_error(vocp, ret, NULL); } } int vg_ocl_kernel_int_arg(vg_ocl_context_t *vocp, int slot, int arg, int value) { cl_int ret; int i; for (i = 0; i < MAX_SLOT; i++) { if ((i != slot) && (slot >= 0)) continue; ret = clSetKernelArg(vocp->voc_oclkernel[i][2], arg, sizeof(value), &value); if (ret) { fprintf(stderr, "clSetKernelArg(%d): ", arg); vg_ocl_error(vocp, ret, NULL); return 0; } } return 1; } int vg_ocl_kernel_buffer_arg(vg_ocl_context_t *vocp, int slot, int arg, void *value, size_t size) { cl_int ret; int i, j, knum, karg; for (i = 0; i < MAX_SLOT; i++) { if ((i != slot) && (slot >= 0)) continue; for (j = 0; vg_ocl_arg_map[arg][j] >= 0; j += 2) { knum = vg_ocl_arg_map[arg][j]; karg = vg_ocl_arg_map[arg][j+1]; ret = clSetKernelArg(vocp->voc_oclkernel[i][knum], karg, size, value); if (ret) { fprintf(stderr, "clSetKernelArg(%d,%d): ", knum, karg); vg_ocl_error(vocp, ret, NULL); return 0; } } } return 1; } static void vg_ocl_free_args(vg_ocl_context_t *vocp) { int i, arg; for (i = 0; i < MAX_SLOT; i++) { for (arg = 0; arg < MAX_ARG; arg++) { if (vocp->voc_args[i][arg]) { clReleaseMemObject(vocp->voc_args[i][arg]); vocp->voc_args[i][arg] = NULL; vocp->voc_arg_size[i][arg] = 0; } } } } int vg_ocl_kernel_dead(vg_ocl_context_t *vocp, int slot) { return (vocp->voc_oclkrnwait[slot] == NULL); } static int vg_ocl_kernel_start(vg_ocl_context_t *vocp, int slot, int ncol, int nrow, int invsize) { cl_int val, ret; cl_event ev; size_t globalws[2] = { ncol, nrow }; size_t invws = (ncol * nrow) / invsize; assert(!vocp->voc_oclkrnwait[slot]); /* heap_invert() preconditions */ assert(is_pow2(invsize) && (invsize > 1)); val = invsize; ret = clSetKernelArg(vocp->voc_oclkernel[slot][1], 1, sizeof(val), &val); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "clSetKernelArg(ncol)"); return 0; } ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, vocp->voc_oclkernel[slot][0], 2, NULL, globalws, NULL, 0, NULL, &ev); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "clEnqueueNDRange(0)"); return 0; } ret = clWaitForEvents(1, &ev); clReleaseEvent(ev); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,0)"); return 0; } if (vocp->voc_verify_func[0] && !(vocp->voc_verify_func[0])(vocp, slot)) { fprintf(stderr, "ERROR: Kernel 0 failed verification test\n"); return 0; } ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, vocp->voc_oclkernel[slot][1], 1, NULL, &invws, NULL, 0, NULL, &ev); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "clEnqueueNDRange(1)"); return 0; } ret = clWaitForEvents(1, &ev); clReleaseEvent(ev); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,1)"); return 0; } if (vocp->voc_verify_func[1] && !(vocp->voc_verify_func[1])(vocp, slot)) { fprintf(stderr, "ERROR: Kernel 1 failed verification test\n"); return 0; } ret = clEnqueueNDRangeKernel(vocp->voc_oclcmdq, vocp->voc_oclkernel[slot][2], 2, NULL, globalws, NULL, 0, NULL, &ev); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "clEnqueueNDRange(2)"); return 0; } vocp->voc_oclkrnwait[slot] = ev; return 1; } static int vg_ocl_kernel_wait(vg_ocl_context_t *vocp, int slot) { cl_event ev; cl_int ret; ev = vocp->voc_oclkrnwait[slot]; vocp->voc_oclkrnwait[slot] = NULL; if (ev) { ret = clWaitForEvents(1, &ev); clReleaseEvent(ev); if (ret != CL_SUCCESS) { vg_ocl_error(vocp, ret, "clWaitForEvents(NDRange,e)"); return 0; } } return 1; } static INLINE void vg_ocl_get_bignum_raw(BIGNUM *bn, const unsigned char *buf) { bn_expand(bn, 256); memcpy(bn->d, buf, 32); bn->top = (32 / sizeof(BN_ULONG)); } static INLINE void vg_ocl_put_bignum_raw(unsigned char *buf, const BIGNUM *bn) { int bnlen = (bn->top * sizeof(BN_ULONG)); if (bnlen >= 32) { memcpy(buf, bn->d, 32); } else { memcpy(buf, bn->d, bnlen); memset(buf + bnlen, 0, 32 - bnlen); } } #define ACCESS_BUNDLE 1024 #define ACCESS_STRIDE (ACCESS_BUNDLE/8) static void vg_ocl_get_bignum_tpa(BIGNUM *bn, const unsigned char *buf, int cell) { unsigned char bnbuf[32]; int start, i; start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % ACCESS_STRIDE)); for (i = 0; i < 8; i++) memcpy(bnbuf+(i*4), buf + 4*(start + i*ACCESS_STRIDE), 4); vg_ocl_get_bignum_raw(bn, bnbuf); } /* * Absolutely disgusting. * We want points in Montgomery form, and it's a lot easier to read the * coordinates from the structure than to export and re-montgomeryize. */ struct ec_point_st { const EC_METHOD *meth; BIGNUM X; BIGNUM Y; BIGNUM Z; int Z_is_one; }; static INLINE void vg_ocl_get_point(EC_POINT *ppnt, const unsigned char *buf) { static const unsigned char mont_one[] = { 0x01,0x00,0x00,0x03,0xd1 }; vg_ocl_get_bignum_raw(&ppnt->X, buf); vg_ocl_get_bignum_raw(&ppnt->Y, buf + 32); if (!ppnt->Z_is_one) { ppnt->Z_is_one = 1; BN_bin2bn(mont_one, sizeof(mont_one), &ppnt->Z); } } static INLINE void vg_ocl_put_point(unsigned char *buf, const EC_POINT *ppnt) { assert(ppnt->Z_is_one); vg_ocl_put_bignum_raw(buf, &ppnt->X); vg_ocl_put_bignum_raw(buf + 32, &ppnt->Y); } static void vg_ocl_put_point_tpa(unsigned char *buf, int cell, const EC_POINT *ppnt) { unsigned char pntbuf[64]; int start, i; vg_ocl_put_point(pntbuf, ppnt); start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % (ACCESS_STRIDE/2))); for (i = 0; i < 8; i++) memcpy(buf + 4*(start + i*ACCESS_STRIDE), pntbuf+(i*4), 4); for (i = 0; i < 8; i++) memcpy(buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)), pntbuf+32+(i*4), 4); } static void vg_ocl_get_point_tpa(EC_POINT *ppnt, const unsigned char *buf, int cell) { unsigned char pntbuf[64]; int start, i; start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % (ACCESS_STRIDE/2))); for (i = 0; i < 8; i++) memcpy(pntbuf+(i*4), buf + 4*(start + i*ACCESS_STRIDE), 4); for (i = 0; i < 8; i++) memcpy(pntbuf+32+(i*4), buf + 4*(start + (ACCESS_STRIDE/2) + (i*ACCESS_STRIDE)), 4); vg_ocl_get_point(ppnt, pntbuf); } void show_elapsed(struct timeval *tv, const char *place) { struct timeval now, delta; gettimeofday(&now, NULL); timersub(&now, tv, &delta); fprintf(stderr, "%s spent %ld.%06lds\n", place, delta.tv_sec, delta.tv_usec); } /* * GPU address matching methods * * gethash: GPU computes and returns all address hashes. * + Works with any matching method, including regular expressions. * - The CPU will not be able to keep up with mid- to high-end GPUs. * * prefix: GPU computes hash, searches a range list, and discards. * + Fast, minimal work for CPU. */ static int vg_ocl_gethash_check(vg_ocl_context_t *vocp, int slot) { vg_exec_context_t *vxcp = &vocp->base; vg_context_t *vcp = vocp->base.vxc_vc; vg_test_func_t test_func = vcp->vc_test; unsigned char *ocl_hashes_out; int i, res = 0, round; ocl_hashes_out = (unsigned char *) vg_ocl_map_arg_buffer(vocp, slot, 0, 0); if (!ocl_hashes_out) { fprintf(stderr, "ERROR: Could not map hash result buffer " "for slot %d\n", slot); return 2; } round = vocp->voc_ocl_cols * vocp->voc_ocl_rows; for (i = 0; i < round; i++, vxcp->vxc_delta++) { memcpy(&vxcp->vxc_binres[1], ocl_hashes_out + (20*i), 20); res = test_func(vxcp); if (res) break; } vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_hashes_out); return res; } static int vg_ocl_gethash_init(vg_ocl_context_t *vocp) { int i; if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_get")) return 0; for (i = 0; i < vocp->voc_nslots; i++) { /* Each slot gets its own hash output buffer */ if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 20 * vocp->voc_ocl_rows * vocp->voc_ocl_cols, 1)) return 0; } vocp->voc_rekey_func = NULL; vocp->voc_check_func = vg_ocl_gethash_check; return 1; } static int vg_ocl_prefix_rekey(vg_ocl_context_t *vocp) { vg_context_t *vcp = vocp->base.vxc_vc; unsigned char *ocl_targets_in; uint32_t *ocl_found_out; int i; /* Set the found indicator for each slot to -1 */ for (i = 0; i < vocp->voc_nslots; i++) { ocl_found_out = (uint32_t *) vg_ocl_map_arg_buffer(vocp, i, 0, 1); if (!ocl_found_out) { fprintf(stderr, "ERROR: Could not map result buffer" " for slot %d (rekey)\n", i); return -1; } ocl_found_out[0] = 0xffffffff; vg_ocl_unmap_arg_buffer(vocp, i, 0, ocl_found_out); } if (vocp->voc_pattern_rewrite) { /* Count number of range records */ i = vg_context_hash160_sort(vcp, NULL); if (!i) return 0; if (i > vocp->voc_pattern_alloc) { /* (re)allocate target buffer */ if (!vg_ocl_kernel_arg_alloc(vocp, -1, 5, 40 * i, 0)) return -1; vocp->voc_pattern_alloc = i; } /* Write range records */ ocl_targets_in = (unsigned char *) vg_ocl_map_arg_buffer(vocp, 0, 5, 1); if (!ocl_targets_in) { fprintf(stderr, "ERROR: Could not map hash target buffer\n"); return -1; } vg_context_hash160_sort(vcp, ocl_targets_in); vg_ocl_unmap_arg_buffer(vocp, 0, 5, ocl_targets_in); vg_ocl_kernel_int_arg(vocp, -1, 4, i); vocp->voc_pattern_rewrite = 0; } return 1; } static int vg_ocl_prefix_check(vg_ocl_context_t *vocp, int slot) { vg_exec_context_t *vxcp = &vocp->base; vg_context_t *vcp = vocp->base.vxc_vc; vg_test_func_t test_func = vcp->vc_test; uint32_t *ocl_found_out; uint32_t found_delta; int orig_delta, tablesize; int res = 0; /* Retrieve the found indicator */ ocl_found_out = (uint32_t *) vg_ocl_map_arg_buffer(vocp, slot, 0, 2); if (!ocl_found_out) { fprintf(stderr, "ERROR: Could not map result buffer" " for slot %d\n", slot); return 2; } found_delta = ocl_found_out[0]; if (found_delta != 0xffffffff) { /* GPU code claims match, verify with CPU version */ orig_delta = vxcp->vxc_delta; vxcp->vxc_delta += found_delta; vg_exec_context_calc_address(vxcp); /* Make sure the GPU produced the expected hash */ res = 0; if (!memcmp(vxcp->vxc_binres + 1, ocl_found_out + 2, 20)) { res = test_func(vxcp); } if (res == 0) { /* * The match was not found in * the pattern list. Hmm. */ tablesize = ocl_found_out[2]; fprintf(stderr, "Match idx: %d\n", ocl_found_out[1]); fprintf(stderr, "CPU hash: "); fdumphex(stderr, vxcp->vxc_binres + 1, 20); fprintf(stderr, "GPU hash: "); fdumphex(stderr, (unsigned char *) (ocl_found_out + 2), 20); fprintf(stderr, "Found delta: %d " "Start delta: %d\n", found_delta, orig_delta); res = 1; } } else { vxcp->vxc_delta += (vocp->voc_ocl_cols * vocp->voc_ocl_rows); } vg_ocl_unmap_arg_buffer(vocp, slot, 0, ocl_found_out); return res; } static int vg_ocl_prefix_init(vg_ocl_context_t *vocp) { int i; if (!vg_ocl_create_kernel(vocp, 2, "hash_ec_point_search_prefix")) return 0; for (i = 0; i < vocp->voc_nslots; i++) { if (!vg_ocl_kernel_arg_alloc(vocp, i, 0, 28, 1)) return 0; } vocp->voc_rekey_func = vg_ocl_prefix_rekey; vocp->voc_check_func = vg_ocl_prefix_check; vocp->voc_pattern_rewrite = 1; vocp->voc_pattern_alloc = 0; return 1; } static int vg_ocl_config_pattern(vg_ocl_context_t *vocp) { vg_context_t *vcp = vocp->base.vxc_vc; int i; i = vg_context_hash160_sort(vcp, NULL); if (i > 0) { if (vcp->vc_verbose > 1) fprintf(stderr, "Using OpenCL prefix matcher\n"); /* Configure for prefix matching */ return vg_ocl_prefix_init(vocp); } if (vcp->vc_verbose > 0) fprintf(stderr, "WARNING: Using CPU pattern matcher\n"); return vg_ocl_gethash_init(vocp); } /* * Temporary buffer content verification functions * This provides a simple test of the kernel, the OpenCL compiler, * and the hardware. */ static int vg_ocl_verify_temporary(vg_ocl_context_t *vocp, int slot, int z_inverted) { vg_exec_context_t *vxcp = &vocp->base; unsigned char *point_tmp = NULL, *z_heap = NULL; unsigned char *ocl_points_in = NULL, *ocl_strides_in = NULL; const EC_GROUP *pgroup; EC_POINT *ppr = NULL, *ppc = NULL, *pps = NULL, *ppt = NULL; BIGNUM bnz, bnez, bnm, *bnzc; BN_CTX *bnctx = NULL; BN_MONT_CTX *bnmont; int ret = 0; int mismatches = 0, mm_r; int x, y, bx; static const unsigned char raw_modulus[] = { 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F }; BN_init(&bnz); BN_init(&bnez); BN_init(&bnm); bnctx = BN_CTX_new(); bnmont = BN_MONT_CTX_new(); pgroup = EC_KEY_get0_group(vxcp->vxc_key); ppr = EC_POINT_new(pgroup); ppc = EC_POINT_new(pgroup); pps = EC_POINT_new(pgroup); ppt = EC_POINT_new(pgroup); if (!bnctx || !bnmont || !ppr || !ppc || !pps || !ppt) { fprintf(stderr, "ERROR: out of memory\n"); goto out; } BN_bin2bn(raw_modulus, sizeof(raw_modulus), &bnm); BN_MONT_CTX_set(bnmont, &bnm, bnctx); if (z_inverted) { bnzc = &bnez; } else { bnzc = &pps->Z; } z_heap = (unsigned char *) vg_ocl_map_arg_buffer(vocp, slot, 1, 0); point_tmp = (unsigned char *) vg_ocl_map_arg_buffer(vocp, slot, 2, 0); ocl_points_in = (unsigned char *) vg_ocl_map_arg_buffer(vocp, slot, 3, 0); ocl_strides_in = (unsigned char *) vg_ocl_map_arg_buffer(vocp, slot, 4, 0); if (!z_heap || !point_tmp || !ocl_points_in || !ocl_strides_in) { fprintf(stderr, "ERROR: could not map OpenCL point buffers\n"); goto out; } for (y = 0; y < vocp->voc_ocl_rows; y++) { vg_ocl_get_point(ppr, ocl_strides_in + (64*y)); bx = y * vocp->voc_ocl_cols; mm_r = 0; for (x = 0; x < vocp->voc_ocl_cols; x++) { vg_ocl_get_point_tpa(ppc, ocl_points_in, x); assert(ppr->Z_is_one && ppc->Z_is_one); EC_POINT_add(pgroup, pps, ppc, ppr, bnctx); assert(!pps->Z_is_one); vg_ocl_get_point_tpa(ppt, point_tmp, bx + x); vg_ocl_get_bignum_tpa(&bnz, z_heap, bx + x); if (z_inverted) { BN_mod_inverse(&bnez, &pps->Z, &bnm, bnctx); BN_to_montgomery(&bnez, &bnez, bnmont, bnctx); BN_to_montgomery(&bnez, &bnez, bnmont, bnctx); } if (BN_cmp(&ppt->X, &pps->X) || BN_cmp(&ppt->Y, &pps->Y) || BN_cmp(&bnz, bnzc)) { if (!mismatches) { fprintf(stderr, "Base privkey: "); fdumpbn(stderr, EC_KEY_get0_private_key( vxcp->vxc_key)); } mismatches++; fprintf(stderr, "Mismatch for kernel %d, " "offset %d (%d,%d)\n", z_inverted, bx + x, y, x); if (!mm_r) { mm_r = 1; fprintf(stderr, "Row X : "); fdumpbn(stderr, &ppr->X); fprintf(stderr, "Row Y : "); fdumpbn(stderr, &ppr->Y); } fprintf(stderr, "Column X: "); fdumpbn(stderr, &ppc->X); fprintf(stderr, "Column Y: "); fdumpbn(stderr, &ppc->Y); if (BN_cmp(&ppt->X, &pps->X)) { fprintf(stderr, "Expect X: "); fdumpbn(stderr, &pps->X); fprintf(stderr, "Device X: "); fdumpbn(stderr, &ppt->X); } if (BN_cmp(&ppt->Y, &pps->Y)) { fprintf(stderr, "Expect Y: "); fdumpbn(stderr, &pps->Y); fprintf(stderr, "Device Y: "); fdumpbn(stderr, &ppt->Y); } if (BN_cmp(&bnz, bnzc)) { fprintf(stderr, "Expect Z: "); fdumpbn(stderr, bnzc); fprintf(stderr, "Device Z: "); fdumpbn(stderr, &bnz); } } } } ret = !mismatches; out: if (z_heap) vg_ocl_unmap_arg_buffer(vocp, slot, 1, z_heap); if (point_tmp) vg_ocl_unmap_arg_buffer(vocp, slot, 2, point_tmp); if (ocl_points_in) vg_ocl_unmap_arg_buffer(vocp, slot, 3, ocl_points_in); if (ocl_strides_in) vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in); if (ppr) EC_POINT_free(ppr); if (ppc) EC_POINT_free(ppc); if (pps) EC_POINT_free(pps); if (ppt) EC_POINT_free(ppt); BN_clear_free(&bnz); BN_clear_free(&bnez); BN_clear_free(&bnm); if (bnmont) BN_MONT_CTX_free(bnmont); if (bnctx) BN_CTX_free(bnctx); return ret; } static int vg_ocl_verify_k0(vg_ocl_context_t *vocp, int slot) { return vg_ocl_verify_temporary(vocp, slot, 0); } static int vg_ocl_verify_k1(vg_ocl_context_t *vocp, int slot) { return vg_ocl_verify_temporary(vocp, slot, 1); } static void * vg_opencl_thread(void *arg) { vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg; vg_context_t *vcp = vocp->base.vxc_vc; int halt = 0; int slot = -1; int rows, cols, invsize; unsigned long long idleu, busyu; double pidle; struct timeval tv, tvt, tvd, idle, busy; memset(&idle, 0, sizeof(idle)); memset(&busy, 0, sizeof(busy)); while (1) { pthread_mutex_lock(&vocp->voc_lock); if (halt) { halt = 0; vocp->voc_halt = 1; } if (slot != -1) { assert(vocp->voc_ocl_slot == slot); vocp->voc_ocl_slot = -1; slot = -1; pthread_cond_signal(&vocp->voc_wait); } if (vocp->voc_ocl_slot == -1) { gettimeofday(&tv, NULL); while (vocp->voc_ocl_slot == -1) { if (vocp->voc_halt) goto out; pthread_cond_wait(&vocp->voc_wait, &vocp->voc_lock); } gettimeofday(&tvt, NULL); timersub(&tvt, &tv, &tvd); timeradd(&tvd, &idle, &idle); } slot = vocp->voc_ocl_slot; rows = vocp->voc_ocl_rows; cols = vocp->voc_ocl_cols; invsize = vocp->voc_ocl_invsize; pthread_mutex_unlock(&vocp->voc_lock); gettimeofday(&tv, NULL); if (!vg_ocl_kernel_start(vocp, slot, cols, rows, invsize)) halt = 1; if (!vg_ocl_kernel_wait(vocp, slot)) halt = 1; if (vcp->vc_verbose > 1) { gettimeofday(&tvt, NULL); timersub(&tvt, &tv, &tvd); timeradd(&tvd, &busy, &busy); if ((busy.tv_sec + idle.tv_sec) > 1) { idleu = (1000000 * idle.tv_sec) + idle.tv_usec; busyu = (1000000 * busy.tv_sec) + busy.tv_usec; pidle = ((double) idleu) / (idleu + busyu); if (pidle > 0.01) { fprintf(stderr, "\rGPU idle: %.2f%%" " " " \n", 100 * pidle); } memset(&idle, 0, sizeof(idle)); memset(&busy, 0, sizeof(busy)); } } } out: pthread_mutex_unlock(&vocp->voc_lock); return NULL; } /* * Address search thread main loop */ static void * vg_opencl_loop(vg_exec_context_t *arg) { vg_ocl_context_t *vocp = (vg_ocl_context_t *) arg; int i; int round, nrows, ncols; int pattern_generation; const BN_ULONG rekey_max = 100000000; BN_ULONG npoints, rekey_at; EC_KEY *pkey = NULL; const EC_GROUP *pgroup; const EC_POINT *pgen; EC_POINT **ppbase = NULL, **pprow, *pbatchinc = NULL, *poffset = NULL; EC_POINT *pseek = NULL; unsigned char *ocl_points_in, *ocl_strides_in; vg_context_t *vcp = vocp->base.vxc_vc; vg_exec_context_t *vxcp = &vocp->base; int slot, nslots; int slot_busy = 0, slot_done = 0, halt = 0; int c = 0, output_interval = 1000; struct timeval tvstart; pkey = vxcp->vxc_key; pgroup = EC_KEY_get0_group(pkey); pgen = EC_GROUP_get0_generator(pgroup); round = vocp->voc_ocl_rows * vocp->voc_ocl_cols; if (!vcp->vc_remove_on_match && (vcp->vc_chance >= 1.0f) && (vcp->vc_chance < round) && (vcp->vc_verbose > 0)) { fprintf(stderr, "WARNING: low pattern difficulty\n"); fprintf(stderr, "WARNING: better match throughput is possible " "using vanitygen on the CPU\n"); } slot = 0; nslots = 2; vocp->voc_nslots = nslots; nrows = vocp->voc_ocl_rows; ncols = vocp->voc_ocl_cols; ppbase = (EC_POINT **) malloc((nrows + ncols) * sizeof(EC_POINT*)); if (!ppbase) goto enomem; for (i = 0; i < (nrows + ncols); i++) { ppbase[i] = EC_POINT_new(pgroup); if (!ppbase[i]) goto enomem; } pprow = ppbase + ncols; pbatchinc = EC_POINT_new(pgroup); poffset = EC_POINT_new(pgroup); pseek = EC_POINT_new(pgroup); if (!pbatchinc || !poffset || !pseek) goto enomem; BN_set_word(&vxcp->vxc_bntmp, ncols); EC_POINT_mul(pgroup, pbatchinc, &vxcp->vxc_bntmp, NULL, NULL, vxcp->vxc_bnctx); EC_POINT_make_affine(pgroup, pbatchinc, vxcp->vxc_bnctx); BN_set_word(&vxcp->vxc_bntmp, round); EC_POINT_mul(pgroup, poffset, &vxcp->vxc_bntmp, NULL, NULL, vxcp->vxc_bnctx); EC_POINT_make_affine(pgroup, poffset, vxcp->vxc_bnctx); if (!vg_ocl_config_pattern(vocp)) goto enomem; for (i = 0; i < nslots; i++) { /* * Each work group gets its own: * - Column point array */ if (!vg_ocl_kernel_arg_alloc(vocp, i, 4, 32 * 2 * nrows, 1)) goto enomem; } /* * All instances share: * - The z_heap and point scratch spaces * - The row point array */ if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1, round_up_pow2(32 * 2 * round, 4096), 0) || !vg_ocl_kernel_arg_alloc(vocp, -1, 2, round_up_pow2(32 * 2 * round, 4096), 0) || !vg_ocl_kernel_arg_alloc(vocp, -1, 3, round_up_pow2(32 * 2 * ncols, 4096), 1)) goto enomem; npoints = 0; rekey_at = 0; vxcp->vxc_binres[0] = vcp->vc_addrtype; if (pthread_create(&vocp->voc_ocl_thread, NULL, vg_opencl_thread, vocp)) goto enomem; gettimeofday(&tvstart, NULL); l_rekey: if (vocp->voc_rekey_func) { switch (vocp->voc_rekey_func(vocp)) { case 1: break; case 0: goto nopatterns; default: goto enomem; } } vg_exec_context_upgrade_lock(vxcp); pattern_generation = vcp->vc_pattern_generation; /* Generate a new random private key */ EC_KEY_generate_key(pkey); npoints = 0; /* Determine rekey interval */ EC_GROUP_get_order(pgroup, &vxcp->vxc_bntmp, vxcp->vxc_bnctx); BN_sub(&vxcp->vxc_bntmp2, &vxcp->vxc_bntmp, EC_KEY_get0_private_key(pkey)); rekey_at = BN_get_word(&vxcp->vxc_bntmp2); if ((rekey_at == BN_MASK2) || (rekey_at > rekey_max)) rekey_at = rekey_max; assert(rekey_at > 0); EC_POINT_copy(ppbase[0], EC_KEY_get0_public_key(pkey)); vg_exec_context_downgrade_lock(vxcp); if (vcp->vc_pubkey_base) { EC_POINT_add(pgroup, ppbase[0], ppbase[0], vcp->vc_pubkey_base, vxcp->vxc_bnctx); } /* Build the base array of sequential points */ for (i = 1; i < ncols; i++) { EC_POINT_add(pgroup, ppbase[i], ppbase[i-1], pgen, vxcp->vxc_bnctx); } EC_POINTs_make_affine(pgroup, ncols, ppbase, vxcp->vxc_bnctx); /* Fill the sequential point array */ ocl_points_in = (unsigned char *) vg_ocl_map_arg_buffer(vocp, 0, 3, 1); if (!ocl_points_in) { fprintf(stderr, "ERROR: Could not map column buffer\n"); goto enomem; } for (i = 0; i < ncols; i++) vg_ocl_put_point_tpa(ocl_points_in, i, ppbase[i]); vg_ocl_unmap_arg_buffer(vocp, 0, 3, ocl_points_in); /* * Set up the initial row increment table. * Set the first element to pgen -- effectively * skipping the exact key generated above. */ EC_POINT_copy(pprow[0], pgen); for (i = 1; i < nrows; i++) { EC_POINT_add(pgroup, pprow[i], pprow[i-1], pbatchinc, vxcp->vxc_bnctx); } EC_POINTs_make_affine(pgroup, nrows, pprow, vxcp->vxc_bnctx); vxcp->vxc_delta = 1; npoints = 1; slot = 0; slot_busy = 0; slot_done = 0; while (1) { if (slot_done) { assert(rekey_at > 0); slot_done = 0; /* Call the result check function */ switch (vocp->voc_check_func(vocp, slot)) { case 1: rekey_at = 0; break; case 2: halt = 1; break; default: break; } c += round; if (!halt && (c >= output_interval)) { output_interval = vg_output_timing(vcp, c, &tvstart); c = 0; } vg_exec_context_yield(vxcp); /* If the patterns changed, reload it to the GPU */ if (vocp->voc_rekey_func && (pattern_generation != vcp->vc_pattern_generation)) { vocp->voc_pattern_rewrite = 1; rekey_at = 0; } } if (vcp->vc_halt) halt = 1; if (halt) break; if ((npoints + round) < rekey_at) { if (npoints > 1) { /* Move the row increments forward */ for (i = 0; i < nrows; i++) { EC_POINT_add(pgroup, pprow[i], pprow[i], poffset, vxcp->vxc_bnctx); } EC_POINTs_make_affine(pgroup, nrows, pprow, vxcp->vxc_bnctx); } /* Copy the row stride array to the device */ ocl_strides_in = (unsigned char *) vg_ocl_map_arg_buffer(vocp, slot, 4, 1); if (!ocl_strides_in) { fprintf(stderr, "ERROR: Could not map row buffer " "for slot %d\n", slot); goto enomem; } memset(ocl_strides_in, 0, 64*nrows); for (i = 0; i < nrows; i++) vg_ocl_put_point(ocl_strides_in + (64*i), pprow[i]); vg_ocl_unmap_arg_buffer(vocp, slot, 4, ocl_strides_in); npoints += round; pthread_mutex_lock(&vocp->voc_lock); while (vocp->voc_ocl_slot != -1) { assert(slot_busy); pthread_cond_wait(&vocp->voc_wait, &vocp->voc_lock); } if (vocp->voc_halt) { pthread_mutex_unlock(&vocp->voc_lock); halt = 1; break; } vocp->voc_ocl_slot = slot; pthread_cond_signal(&vocp->voc_wait); pthread_mutex_unlock(&vocp->voc_lock); slot_done = slot_busy; slot_busy = 1; slot = (slot + 1) % nslots; } else { if (slot_busy) { pthread_mutex_lock(&vocp->voc_lock); while (vocp->voc_ocl_slot != -1) { assert(vocp->voc_ocl_slot == ((slot + nslots - 1) % nslots)); pthread_cond_wait(&vocp->voc_wait, &vocp->voc_lock); } pthread_mutex_unlock(&vocp->voc_lock); slot_busy = 0; slot_done = 1; } if (!rekey_at || (!slot_done && ((npoints + round) >= rekey_at))) goto l_rekey; } } if (0) { enomem: fprintf(stderr, "ERROR: allocation failure?\n"); nopatterns: ; } if (halt) { if (vcp->vc_verbose > 1) { printf("Halting..."); fflush(stdout); } pthread_mutex_lock(&vocp->voc_lock); vocp->voc_halt = 1; pthread_cond_signal(&vocp->voc_wait); while (vocp->voc_ocl_slot != -1) { assert(slot_busy); pthread_cond_wait(&vocp->voc_wait, &vocp->voc_lock); } slot_busy = 0; pthread_mutex_unlock(&vocp->voc_lock); pthread_join(vocp->voc_ocl_thread, NULL); if (vcp->vc_verbose > 1) printf("done!\n"); } vg_exec_context_yield(vxcp); if (ppbase) { for (i = 0; i < (nrows + ncols); i++) if (ppbase[i]) EC_POINT_free(ppbase[i]); free(ppbase); } if (pbatchinc) EC_POINT_free(pbatchinc); /* Release the argument buffers */ vg_ocl_free_args(vocp); vocp->voc_halt = 0; vocp->voc_ocl_slot = -1; vg_context_thread_exit(vcp); return NULL; } /* * OpenCL platform/device selection junk */ static int get_device_list(cl_platform_id pid, cl_device_id **list_out) { cl_uint nd; cl_int res; cl_device_id *ids; res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, 0, NULL, &nd); if (res != CL_SUCCESS) { vg_ocl_error(NULL, res, "clGetDeviceIDs(0)"); *list_out = NULL; return -1; } if (nd) { ids = (cl_device_id *) malloc(nd * sizeof(*ids)); if (ids == NULL) { fprintf(stderr, "Could not allocate device ID list\n"); *list_out = NULL; return -1; } res = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, nd, ids, NULL); if (res != CL_SUCCESS) { vg_ocl_error(NULL, res, "clGetDeviceIDs(n)"); free(ids); *list_out = NULL; return -1; } *list_out = ids; } return nd; } static void show_devices(cl_platform_id pid, cl_device_id *ids, int nd, int base) { int i; char nbuf[128]; char vbuf[128]; size_t len; cl_int res; for (i = 0; i < nd; i++) { res = clGetDeviceInfo(ids[i], CL_DEVICE_NAME, sizeof(nbuf), nbuf, &len); if (res != CL_SUCCESS) continue; if (len >= sizeof(nbuf)) len = sizeof(nbuf) - 1; nbuf[len] = '\0'; res = clGetDeviceInfo(ids[i], CL_DEVICE_VENDOR, sizeof(vbuf), vbuf, &len); if (res != CL_SUCCESS) continue; if (len >= sizeof(vbuf)) len = sizeof(vbuf) - 1; vbuf[len] = '\0'; fprintf(stderr, " %d: [%s] %s\n", i + base, vbuf, nbuf); } } static cl_device_id get_device(cl_platform_id pid, int num) { int nd; cl_device_id id, *ids; nd = get_device_list(pid, &ids); if (nd < 0) return NULL; if (!nd) { fprintf(stderr, "No OpenCL devices found\n"); return NULL; } if (num < 0) { if (nd == 1) num = 0; else num = nd; } if (num < nd) { id = ids[num]; free(ids); return id; } free(ids); return NULL; } static int get_platform_list(cl_platform_id **list_out) { cl_uint np; cl_int res; cl_platform_id *ids; res = clGetPlatformIDs(0, NULL, &np); if (res != CL_SUCCESS) { vg_ocl_error(NULL, res, "clGetPlatformIDs(0)"); *list_out = NULL; return -1; } if (np) { ids = (cl_platform_id *) malloc(np * sizeof(*ids)); if (ids == NULL) { fprintf(stderr, "Could not allocate platform ID list\n"); *list_out = NULL; return -1; } res = clGetPlatformIDs(np, ids, NULL); if (res != CL_SUCCESS) { vg_ocl_error(NULL, res, "clGetPlatformIDs(n)"); free(ids); *list_out = NULL; return -1; } *list_out = ids; } return np; } void show_platforms(cl_platform_id *ids, int np, int base) { int i; char nbuf[128]; char vbuf[128]; size_t len; cl_int res; for (i = 0; i < np; i++) { res = clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, sizeof(nbuf), nbuf, &len); if (res != CL_SUCCESS) { vg_ocl_error(NULL, res, "clGetPlatformInfo(NAME)"); continue; } if (len >= sizeof(nbuf)) len = sizeof(nbuf) - 1; nbuf[len] = '\0'; res = clGetPlatformInfo(ids[i], CL_PLATFORM_VENDOR, sizeof(vbuf), vbuf, &len); if (res != CL_SUCCESS) { vg_ocl_error(NULL, res, "clGetPlatformInfo(VENDOR)"); continue; } if (len >= sizeof(vbuf)) len = sizeof(vbuf) - 1; vbuf[len] = '\0'; fprintf(stderr, "%d: [%s] %s\n", i + base, vbuf, nbuf); } } static cl_platform_id get_platform(int num) { int np; cl_platform_id id, *ids; np = get_platform_list(&ids); if (np < 0) return NULL; if (!np) { fprintf(stderr, "No OpenCL platforms available\n"); return NULL; } if (num < 0) { if (np == 1) num = 0; else num = np; } if (num < np) { id = ids[num]; free(ids); return id; } free(ids); return NULL; } void vg_ocl_enumerate_devices(void) { cl_platform_id *pids; cl_device_id *dids; int np, nd, i; np = get_platform_list(&pids); if (!np) { fprintf(stderr, "No OpenCL platforms available\n"); return; } fprintf(stderr, "Available OpenCL platforms:\n"); for (i = 0; i < np; i++) { show_platforms(&pids[i], 1, i); nd = get_device_list(pids[i], &dids); if (!nd) { fprintf(stderr, " -- No devices\n"); } else { show_devices(pids[i], dids, nd, 0); } } } static cl_device_id get_opencl_device(int platformidx, int deviceidx) { cl_platform_id pid; cl_device_id did = NULL; pid = get_platform(platformidx); if (pid) { did = get_device(pid, deviceidx); if (did) return did; } return NULL; } vg_ocl_context_t * vg_ocl_context_new(vg_context_t *vcp, int platformidx, int deviceidx, int safe_mode, int verify, int worksize, int nthreads, int nrows, int ncols, int invsize) { cl_device_id did; int round, full_threads, wsmult; cl_ulong memsize, allocsize; vg_ocl_context_t *vocp; /* Find the device */ did = get_opencl_device(platformidx, deviceidx); if (!did) { return 0; } vocp = (vg_ocl_context_t *) malloc(sizeof(*vocp)); if (!vocp) return NULL; /* Open the device and compile the kernel */ if (!vg_ocl_init(vcp, vocp, did, safe_mode)) { free(vocp); return NULL; } if (verify) { if (vcp->vc_verbose > 0) { fprintf(stderr, "WARNING: " "Hardware verification mode enabled\n"); } if (!nthreads) nthreads = 1; vocp->voc_verify_func[0] = vg_ocl_verify_k0; vocp->voc_verify_func[1] = vg_ocl_verify_k1; } /* * nrows: number of point rows per job * ncols: number of point columns per job * invsize: number of modular inversion tasks per job * (each task performs (nrows*ncols)/invsize inversions) * nslots: number of kernels * (create two, keep one running while we service the other or wait) */ if (!nthreads) { /* Pick nthreads sufficient to saturate one compute unit */ if (vg_ocl_device_gettype(vocp->voc_ocldid) & CL_DEVICE_TYPE_CPU) nthreads = 1; else nthreads = vg_ocl_device_getsizet(vocp->voc_ocldid, CL_DEVICE_MAX_WORK_GROUP_SIZE); } full_threads = vg_ocl_device_getsizet(vocp->voc_ocldid, CL_DEVICE_MAX_COMPUTE_UNITS); full_threads *= nthreads; /* * The work size selection is complicated, and the most * important factor is the batch size of the heap_invert kernel. * Each value added to the batch trades one complete modular * inversion for four multiply operations. Ideally the work * size would be as large as possible. The practical limiting * factors are: * 1. Available memory * 2. Responsiveness and operational latency * * We take a naive approach and limit batch size to a point of * sufficiently diminishing returns, hoping that responsiveness * will be sufficient. * * The measured value for the OpenSSL implementations on my CPU * is 80:1. This causes heap_invert to get batches of 20 or so * for free, and receive 10% incremental returns at 200. The CPU * work size is therefore set to 256. * * The ratio on most GPUs with the oclvanitygen implementations * is closer to 500:1, and larger batches are required for * good performance. */ if (!worksize) { if (vg_ocl_device_gettype(vocp->voc_ocldid) & CL_DEVICE_TYPE_GPU) worksize = 2048; else worksize = 256; } if (!ncols) { memsize = vg_ocl_device_getulong(vocp->voc_ocldid, CL_DEVICE_GLOBAL_MEM_SIZE); allocsize = vg_ocl_device_getulong(vocp->voc_ocldid, CL_DEVICE_MAX_MEM_ALLOC_SIZE); memsize /= 2; ncols = full_threads; nrows = 2; /* Find row and column counts close to sqrt(full_threads) */ while ((ncols > nrows) && !(ncols & 1)) { ncols /= 2; nrows *= 2; } /* * Increase row & column counts to satisfy work size * multiplier or fill available memory. */ wsmult = 1; while ((!worksize || ((wsmult * 2) <= worksize)) && ((ncols * nrows * 2 * 128) < memsize) && ((ncols * nrows * 2 * 64) < allocsize)) { if (ncols > nrows) nrows *= 2; else ncols *= 2; wsmult *= 2; } } round = nrows * ncols; if (!invsize) { invsize = 2; while (!(round % (invsize << 1)) && ((round / invsize) > full_threads)) invsize <<= 1; } if (vcp->vc_verbose > 1) { fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows); fprintf(stderr, "Modular inverse: %d threads, %d ops each\n", round/invsize, invsize); } if ((round % invsize) || !is_pow2(invsize) || (invsize < 2)) { if (vcp->vc_verbose <= 1) { fprintf(stderr, "Grid size: %dx%d\n", ncols, nrows); fprintf(stderr, "Modular inverse: %d threads, %d ops each\n", round/invsize, invsize); } if (round % invsize) fprintf(stderr, "Modular inverse work size must " "evenly divide points\n"); else fprintf(stderr, "Modular inverse work per task (%d) " "must be a power of 2\n", invsize); goto out_fail; } vocp->voc_ocl_rows = nrows; vocp->voc_ocl_cols = ncols; vocp->voc_ocl_invsize = invsize; return vocp; out_fail: vg_ocl_context_free(vocp); return NULL; } vg_ocl_context_t * vg_ocl_context_new_from_devstr(vg_context_t *vcp, const char *devstr, int safemode, int verify) { int platformidx, deviceidx; int worksize = 0, nthreads = 0, nrows = 0, ncols = 0, invsize = 0; char *dsd, *part, *part2, *save, *param; dsd = strdup(devstr); if (!dsd) return NULL; save = NULL; part = strtok_r(dsd, ",", &save); part2 = strchr(part, ':'); if (!part2) { fprintf(stderr, "Invalid device specifier '%s'\n", part); free(dsd); return NULL; } *part2 = '\0'; platformidx = atoi(part); deviceidx = atoi(part2 + 1); while ((part = strtok_r(NULL, ",", &save)) != NULL) { param = strchr(part, '='); if (!param) { fprintf(stderr, "Unrecognized parameter '%s'\n", part); continue; } *param = '\0'; param++; if (!strcmp(part, "grid")) { ncols = strtol(param, &part2, 0); if (part2 && *part2 == 'x') { nrows = strtol(part2+1, NULL, 0); } if (!nrows || !ncols) { fprintf(stderr, "Invalid grid size '%s'\n", param); nrows = 0; ncols = 0; continue; } } else if (!strcmp(part, "invsize")) { invsize = atoi(param); if (!invsize) { fprintf(stderr, "Invalid modular inverse size '%s'\n", param); continue; } if (invsize & (invsize - 1)) { fprintf(stderr, "Modular inverse size %d must be " "a power of 2\n", invsize); invsize = 0; continue; } } else if (!strcmp(part, "threads")) { nthreads = atoi(param); if (nthreads == 0) { fprintf(stderr, "Invalid thread count '%s'\n", param); continue; } } else { fprintf(stderr, "Unrecognized parameter '%s'\n", part); } } free(dsd); return vg_ocl_context_new(vcp, platformidx, deviceidx, safemode, verify, worksize, nthreads, nrows, ncols, invsize); } void vg_ocl_context_free(vg_ocl_context_t *vocp) { vg_ocl_del(vocp); free(vocp); }