From e5bd5f869d14bfe911270f069289a45371e56881 Mon Sep 17 00:00:00 2001
From: Wolf
Date: Fri, 20 May 2016 18:22:07 -0500
Subject: [PATCH] Added host code required to utilize a custom Quark AMD
 binary.

---
 .gitignore      |   1 +
 algorithm.c     | 113 +++++++++++++++++++++++++++++++++++++++++++++---
 algorithm.h     |   3 ++
 driver-opencl.c |  83 +++++++++++++++++++++++++++++++++++++++----
 ocl.c           |  43 +++++++++++++++++-
 ocl.h           |   2 +
 6 files changed, 228 insertions(+), 17 deletions(-)

diff --git a/.gitignore b/.gitignore
index ce23abd8..0f82734d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ minerd.exe
 autom4te.cache
 .deps
+m4
 Makefile
 Makefile.in
diff --git a/algorithm.c b/algorithm.c
index ad41a494..9e35eeb5 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -1007,6 +1007,108 @@ static cl_int queue_decred_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_u
   return status;
 }
 
+static cl_int queue_quarkcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_ulong le_target;
+  cl_int status = 0;
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+  // search
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  num = 0;
+  kernel = clState->extra_kernels;
+
+  for (int i = 0; i < 3; ++i, kernel++)
+  {
+    CL_SET_ARG(clState->padbuffer8);
+    CL_SET_ARG(clState->BranchBuffer[i << 1]);
+    CL_SET_ARG(clState->BranchBuffer[(i << 1) + 1]);
+    CL_SET_ARG(clState->GlobalThreadCount);
+    num = 0;
+
+    CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+    CL_SET_ARG(clState->BranchBuffer[i << 1]);
+
+    if (i == 2)
+    {
+      CL_SET_ARG(clState->outputBuffer);
+      CL_SET_ARG(le_target);
+    }
+
+    num = 0;
+
+    CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+    CL_SET_ARG(clState->BranchBuffer[(i << 1) + 1]);
+
+    if (i < 2)
+    {
+      CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+    }
+    else
+    {
+      CL_SET_ARG(clState->outputBuffer);
+      CL_SET_ARG(le_target);
+    }
+    num = 0;
+  }
+  /*
+  // search1
+  CL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch1Nonces);
+  CL_SET_ARG(clState->Branch2Nonces);
+  CL_SET_ARG(clState->GlobalThreadCount);
+  num = 0;
+  // search2
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch1Nonces);
+  num = 0;
+  // search3
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch2Nonces);
+  num = 0;
+  // search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // search5
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch3Nonces);
+  CL_SET_ARG(clState->Branch4Nonces);
+  CL_SET_ARG(clState->GlobalThreadCount);
+  num = 0;
+  // search6
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch3Nonces);
+  num = 0;
+  // search7
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch4Nonces);
+  num = 0;
+  // search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // search9
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch5Nonces);
+  CL_SET_ARG(clState->Branch6Nonces);
+  CL_SET_ARG(clState->GlobalThreadCount);
+  num = 0;
+  // search10
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch5Nonces);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+  num = 0;
+  // search11
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch6Nonces);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+  */
+  return status;
+}
+
 static algorithm_settings_t algos[] = {
   // kernels starting from this will have difficulty calculated by using litecoin algorithm
 #define A_SCRYPT(a) \
@@ -1051,13 +1153,10 @@ static algorithm_settings_t algos[] = {
 #undef A_YESCRYPT_MULTI
 
   // kernels starting from this will have difficulty calculated by using quarkcoin algorithm
-#define A_QUARK(a, b) \
-  { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options }
-  A_QUARK("quarkcoin", quarkcoin_regenhash),
-  A_QUARK("qubitcoin", qubitcoin_regenhash),
-  A_QUARK("animecoin", animecoin_regenhash),
-  A_QUARK("sifcoin", sifcoin_regenhash),
-#undef A_QUARK
+  { "quarkcoin", ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 11, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, quarkcoin_regenhash, NULL, NULL, queue_quarkcoin_kernel, gen_hash, append_x11_compiler_options },
+  { "qubitcoin", ALGO_QUBIT, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, qubitcoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
+  { "animecoin", ALGO_ANIME, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, animecoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
+  { "sifcoin", ALGO_SIF, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, sifcoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
 
   // kernels starting from this will have difficulty calculated by using bitcoin algorithm
 #define A_DARK(a, b) \
diff --git a/algorithm.h b/algorithm.h
index 5628d006..83e3d94a 100644
--- a/algorithm.h
+++ b/algorithm.h
@@ -22,6 +22,9 @@ typedef enum {
   ALGO_X15,
   ALGO_KECCAK,
   ALGO_QUARK,
+  ALGO_QUBIT,
+  ALGO_ANIME,
+  ALGO_SIF,
   ALGO_TWE,
   ALGO_FUGUE,
   ALGO_NIST,
diff --git a/driver-opencl.c b/driver-opencl.c
index 3e6667bc..9c91eed7 100644
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -1425,20 +1425,85 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
   if (clState->goffset)
     p_global_work_offset = (size_t *)&work->blk.nonce;
 
-  status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset,
-    globalThreads, localThreads, 0, NULL, NULL);
-  if (unlikely(status != CL_SUCCESS)) {
-    applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
-    return -1;
-  }
+  if (gpu->algorithm.type == ALGO_QUARK) {
+    cl_event WaitEvents[2];
+    cl_uint zero = 0;
+
+    for (int i = 0; i < 6; ++i) {
+      status = clEnqueueWriteBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_TRUE, sizeof(cl_uint) * clState->GlobalThreadCount, sizeof(cl_uint), &zero, 0, NULL, NULL);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to nuke BranchBuffer %d.", status, i);
+        return -1;
+      }
+    }
+
+    for (int i = 0, x = 0; i < 9; ++i, ++x) {
+      size_t BranchCount0 = 0, BranchCount1;
+      status = clEnqueueNDRangeKernel(clState->commandQueue, ((i) ? clState->extra_kernels[i] : clState->kernel), 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, WaitEvents);
+      if (status != CL_SUCCESS) {
+        if (i) applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
+        else applog(LOG_ERR, "Error %d while attempting to enqueue initial kernel.", status);
+        return -1;
+      }
+
+      clWaitForEvents(1, WaitEvents);
+      clReleaseEvent(WaitEvents[0]);
+
+      if (i) ++i;
+
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, WaitEvents);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
+        return -1;
+      }
+
+      clWaitForEvents(1, WaitEvents);
+      clReleaseEvent(WaitEvents[0]);
+
+      // Do a blocking read for the found counter in both buffers
+      // so we know how many threads to dispatch to each branch
+      status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[x << 1], CL_TRUE, sizeof(cl_uint) * globalThreads[0], sizeof(cl_uint), &BranchCount0, 0, NULL, NULL);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to read intermediate data from GPU.", status);
+        return -1;
+      }
+
+      BranchCount1 = globalThreads[0] - BranchCount0;
+      ++i;
+
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, &BranchCount0, localThreads, 0, NULL, WaitEvents);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
+        return -1;
+      }
+
+      ++i;
+
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, &BranchCount1, localThreads, 0, NULL, WaitEvents + 1);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
+        return -1;
+      }
-  for (i = 0; i < clState->n_extra_kernels; i++) {
-    status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset,
-      globalThreads, localThreads, 0, NULL, NULL);
+      clWaitForEvents(2, WaitEvents);
+      clReleaseEvent(WaitEvents[0]);
+      clReleaseEvent(WaitEvents[1]);
+    }
+  }
+  else {
+    status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL);
     if (unlikely(status != CL_SUCCESS)) {
       applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
       return -1;
     }
+
+    for (i = 0; i < clState->n_extra_kernels; i++) {
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL);
+      if (unlikely(status != CL_SUCCESS)) {
+        applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
+        return -1;
+      }
+    }
+  }
 
   status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
diff --git a/ocl.c b/ocl.c
index c34f191b..7f26842a 100644
--- a/ocl.c
+++ b/ocl.c
@@ -180,6 +180,35 @@ static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_co
   return status;
 }
 
+// Copied from set_threads_hashes() in driver-opencl.c
+static size_t CalcGlobalThreads(unsigned int compute_shaders, unsigned int minthreads, __maybe_unused int *intensity, __maybe_unused int *xintensity, __maybe_unused int *rawintensity, algorithm_t *algorithm)
+{
+  size_t threads = 0;
+  while (threads < minthreads) {
+
+    if (*rawintensity > 0) {
+      threads = *rawintensity;
+    }
+    else if (*xintensity > 0) {
+      threads = compute_shaders * ((algorithm->xintensity_shift) ? (1 << (algorithm->xintensity_shift + *xintensity)) : *xintensity);
+    }
+    else {
+      threads = 1 << (algorithm->intensity_shift + *intensity);
+    }
+
+    if (threads < minthreads) {
+      if (likely(*intensity < MAX_INTENSITY)) {
+        (*intensity)++;
+      }
+      else {
+        threads = minthreads;
+      }
+    }
+  }
+
+  return(threads);
+}
+
 _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm)
 {
   cl_int status = 0;
@@ -837,7 +866,19 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
     applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize);
   }
 
-  if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_MULTI) {
+  // By the way, if you change the intensity between now and opencl_scanhash()
+  // calculating the global thread count, God help you.
+  if (algorithm->type == ALGO_QUARK) {
+    clState->GlobalThreadCount = CalcGlobalThreads(clState->compute_shaders, clState->wsize, &cgpu->intensity, &cgpu->xintensity, &cgpu->rawintensity, &cgpu->algorithm);
+    for (int i = 0; i < 6; ++i) {
+      clState->BranchBuffer[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(cl_uint) * (clState->GlobalThreadCount + 2), NULL, &status);
+      if (status != CL_SUCCESS && !clState->BranchBuffer[i]) {
+        applog(LOG_ERR, "Error %d while creating BranchBuffer %d.", status, i);
+        return NULL;
+      }
+    }
+  }
+  else if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_MULTI) {
     // need additionnal buffers
     clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
     if (status != CL_SUCCESS && !clState->buffer1) {
diff --git a/ocl.h b/ocl.h
index 311db29e..e66275c8 100644
--- a/ocl.h
+++ b/ocl.h
@@ -13,6 +13,8 @@ typedef struct __clState {
   cl_mem outputBuffer;
   cl_mem CLbuffer0;
   cl_mem MidstateBuf;
+  cl_mem BranchBuffer[6];
+  size_t GlobalThreadCount;
   cl_mem padbuffer8;
   cl_mem buffer1;
   cl_mem buffer2;
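
Note on the dispatch logic (not part of the patch itself): the ALGO_QUARK loop in opencl_scanhash() interleaves its kernel indices, which makes the control flow hard to follow. The sketch below unrolls one round of that loop. The buffer layout and the round-to-kernel mapping are inferred from the loop's index arithmetic and the read-back offset above, not taken from the kernel source, and the helper name run_quark_round() is hypothetical, so treat this as an illustration rather than a drop-in replacement.

/*
 * Illustrative sketch only -- NOT part of the patch.  Assumptions inferred
 * from the host code above: the full-width kernels sort nonces into
 * BranchBuffer[2*r] / BranchBuffer[2*r + 1] and keep a running count in
 * element GlobalThreadCount of each buffer; round r uses the main kernel
 * (r == 0) or extra_kernels[4*r - 1], then extra_kernels[4*r] at full width,
 * then extra_kernels[4*r + 1] and extra_kernels[4*r + 2] sized by the counts.
 */
#include <CL/cl.h>
#include "ocl.h"

static cl_int run_quark_round(_clState *clState, int r, size_t *offset,
                              size_t *global, size_t *local)
{
  cl_kernel full0   = (r == 0) ? clState->kernel : clState->extra_kernels[4 * r - 1];
  cl_kernel full1   = clState->extra_kernels[4 * r];
  cl_kernel branch0 = clState->extra_kernels[4 * r + 1];
  cl_kernel branch1 = clState->extra_kernels[4 * r + 2];
  cl_uint count = 0;
  size_t BranchCount[2];
  cl_event ev[2];
  cl_int status;

  /* Two full-width passes; the queue may be out-of-order, so wait on the
   * first kernel's event before enqueueing the pass that depends on it. */
  status = clEnqueueNDRangeKernel(clState->commandQueue, full0, 1, offset, global, local, 0, NULL, &ev[0]);
  if (status != CL_SUCCESS) return status;
  clWaitForEvents(1, ev);
  clReleaseEvent(ev[0]);

  status = clEnqueueNDRangeKernel(clState->commandQueue, full1, 1, offset, global, local, 0, NULL, &ev[0]);
  if (status != CL_SUCCESS) return status;
  clWaitForEvents(1, ev);
  clReleaseEvent(ev[0]);

  /* Blocking read of the branch-0 counter; branch 1 gets the remainder. */
  status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[r << 1], CL_TRUE,
                               sizeof(cl_uint) * clState->GlobalThreadCount, sizeof(cl_uint),
                               &count, 0, NULL, NULL);
  if (status != CL_SUCCESS) return status;

  BranchCount[0] = count;
  BranchCount[1] = global[0] - BranchCount[0];

  /* Each branch kernel gets exactly as many work-items as it has nonces.
   * A count of zero would need to be special-cased; omitted here. */
  status = clEnqueueNDRangeKernel(clState->commandQueue, branch0, 1, offset, &BranchCount[0], local, 0, NULL, &ev[0]);
  if (status != CL_SUCCESS) return status;
  status = clEnqueueNDRangeKernel(clState->commandQueue, branch1, 1, offset, &BranchCount[1], local, 0, NULL, &ev[1]);
  if (status != CL_SUCCESS) return status;

  clWaitForEvents(2, ev);
  clReleaseEvent(ev[0]);
  clReleaseEvent(ev[1]);
  return CL_SUCCESS;
}

The same pattern repeats three times (rounds 0 through 2), which is consistent with the algorithm table entry declaring 11 extra kernels and the host allocating six BranchBuffer objects.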