diff --git a/ocl.c b/ocl.c index f7264447..ba8cde2f 100644 --- a/ocl.c +++ b/ocl.c @@ -472,17 +472,35 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) #ifdef USE_SCRYPT if (opt_scrypt) { + cl_ulong ma = gpus[gpu].max_alloc, mt; + int pow2 = 0; + if (!gpus[gpu].lookup_gap) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); gpus[gpu].lookup_gap = 2; } if (!gpus[gpu].thread_concurrency) { - gpus[gpu].thread_concurrency = gpus[gpu].max_alloc / 32768 / gpus[gpu].lookup_gap; + gpus[gpu].thread_concurrency = ma / 32768 / gpus[gpu].lookup_gap; if (gpus[gpu].shaders && gpus[gpu].thread_concurrency > gpus[gpu].shaders) gpus[gpu].thread_concurrency -= gpus[gpu].thread_concurrency % gpus[gpu].shaders; applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu, gpus[gpu].thread_concurrency); } + + /* If we have memory to spare, try to find a power of 2 value + * >= required amount to map nicely to an intensity */ + mt = gpus[gpu].thread_concurrency * 32768 * gpus[gpu].lookup_gap; + if (ma > mt) { + while (ma >>= 1) + pow2++; + ma = 1; + while (--pow2 && ma < mt) + ma <<= 1; + if (ma >= mt) { + gpus[gpu].max_alloc = ma; + applog(LOG_DEBUG, "Max alloc decreased to %lu", gpus[gpu].max_alloc); + } + } } #endif @@ -776,8 +794,8 @@ built: size_t ipt = (1024 / gpus[gpu].lookup_gap + (1024 % gpus[gpu].lookup_gap > 0)); size_t bufsize = 128 * ipt * gpus[gpu].thread_concurrency; - /* Always allocate the largest possible buffer allowed, even if we're not initially requiring it - * based on thread_concurrency, giving us some headroom for intensity levels. */ + /* Use the max alloc value which has been rounded to a power of + * 2 greater >= required amount earlier */ if (bufsize > gpus[gpu].max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u, your scrypt settings come to %u", gpu, gpus[gpu].max_alloc, bufsize);