diff --git a/oclvanitygen.c b/oclvanitygen.c index f9167ec..dd1ff9a 100644 --- a/oclvanitygen.c +++ b/oclvanitygen.c @@ -1630,16 +1630,22 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode, full_threads *= nthreads; /* - * The work size should be set to a point of diminishing - * returns for the batch size of the heap_invert kernel. + * The work size selection is complicated, and the most + * important factor is the batch size of the heap_invert kernel. * Each value added to the batch trades one complete modular - * inversion for four multiply operations. - * Selection of a work size depends on the throughput ratio of - * the multiply and modular inversion operations. + * inversion for four multiply operations. Ideally the work + * size would be as large as possible. The practical limiting + * factors are: + * 1. Available memory + * 2. Responsiveness and operational latency + * + * We take a naive approach and limit batch size to a point of + * sufficiently diminishing returns, hoping that responsiveness + * will be sufficient. * * The measured value for the OpenSSL implementations on my CPU - * is 80:1. This causes heap_invert to break even with batches - * of 20, and receive 10% incremental returns at 200. The CPU + * is 80:1. This causes heap_invert to get batches of 20 or so + * for free, and receive 10% incremental returns at 200. The CPU * work size is therefore set to 256. * * The ratio on most GPUs with the oclvanitygen implementations @@ -1673,7 +1679,7 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode, * multiplier or fill available memory. */ wsmult = 1; - while ((!worksize || (wsmult < worksize)) && + while ((!worksize || ((wsmult * 2) < worksize)) && ((ncols * nrows * 2 * 128) < memsize) && ((ncols * nrows * 2 * 64) < allocsize)) { if (ncols > nrows) @@ -1714,6 +1720,15 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode, goto out; } + if (!vcp->vc_remove_on_match && + (vcp->vc_chance >= 1.0f) && + (vcp->vc_chance < round) && + (vcp->vc_verbose > 0)) { + printf("WARNING: low pattern difficulty\n"); + printf("WARNING: better match throughput is possible " + "using vanitygen on the CPU\n"); + } + nslots = 2; slot = 0; vocp->voc_ocl_rows = nrows;