From e5bd5f869d14bfe911270f069289a45371e56881 Mon Sep 17 00:00:00 2001
From: Wolf
Date: Fri, 20 May 2016 18:22:07 -0500
Subject: [PATCH] Added host code required to utilize a custom Quark AMD
 binary.

---
 .gitignore      |   1 +
 algorithm.c     | 113 +++++++++++++++++++++++++++++++++++++++++++++---
 algorithm.h     |   3 ++
 driver-opencl.c |  83 +++++++++++++++++++++++++++++++++++++++----
 ocl.c           |  43 +++++++++++++++++-
 ocl.h           |   2 +
 6 files changed, 228 insertions(+), 17 deletions(-)

diff --git a/.gitignore b/.gitignore
index ce23abd8..0f82734d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ minerd.exe
 autom4te.cache
 .deps
+m4
 Makefile
 Makefile.in
diff --git a/algorithm.c b/algorithm.c
index ad41a494..9e35eeb5 100644
--- a/algorithm.c
+++ b/algorithm.c
@@ -1007,6 +1007,108 @@ static cl_int queue_decred_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_u
   return status;
 }
 
+static cl_int queue_quarkcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_ulong le_target;
+  cl_int status = 0;
+  le_target = *(cl_ulong *)(blk->work->device_target + 24);
+  flip80(clState->cldata, blk->work->data);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+  // search
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->padbuffer8);
+  num = 0;
+  kernel = clState->extra_kernels;
+
+  for (int i = 0; i < 3; ++i, kernel++)
+  {
+    CL_SET_ARG(clState->padbuffer8);
+    CL_SET_ARG(clState->BranchBuffer[i << 1]);
+    CL_SET_ARG(clState->BranchBuffer[(i << 1) + 1]);
+    CL_SET_ARG(clState->GlobalThreadCount);
+    num = 0;
+
+    CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+    CL_SET_ARG(clState->BranchBuffer[i << 1]);
+
+    if (i == 2)
+    {
+      CL_SET_ARG(clState->outputBuffer);
+      CL_SET_ARG(le_target);
+    }
+
+    num = 0;
+
+    CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+    CL_SET_ARG(clState->BranchBuffer[(i << 1) + 1]);
+
+    if (i < 2)
+    {
+      CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+    }
+    else
+    {
+      CL_SET_ARG(clState->outputBuffer);
+      CL_SET_ARG(le_target);
+    }
+    num = 0;
+  }
+  /*
+  // search1
+  CL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch1Nonces);
+  CL_SET_ARG(clState->Branch2Nonces);
+  CL_SET_ARG(clState->GlobalThreadCount);
+  num = 0;
+  // search2
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch1Nonces);
+  num = 0;
+  // search3
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch2Nonces);
+  num = 0;
+  // search4
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // search5
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch3Nonces);
+  CL_SET_ARG(clState->Branch4Nonces);
+  CL_SET_ARG(clState->GlobalThreadCount);
+  num = 0;
+  // search6
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch3Nonces);
+  num = 0;
+  // search7
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch4Nonces);
+  num = 0;
+  // search8
+  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
+  // search9
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch5Nonces);
+  CL_SET_ARG(clState->Branch6Nonces);
+  CL_SET_ARG(clState->GlobalThreadCount);
+  num = 0;
+  // search10
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch5Nonces);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+  num = 0;
+  // search11
+  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(clState->Branch6Nonces);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(le_target);
+  */
+  return status;
+}
+
 static algorithm_settings_t algos[] = {
   // kernels starting from this will have difficulty calculated by using litecoin algorithm
 #define A_SCRYPT(a) \
@@ -1051,13 +1153,10 @@ static algorithm_settings_t algos[] = {
 #undef A_YESCRYPT_MULTI
 
   // kernels starting from this will have difficulty calculated by using quarkcoin algorithm
-#define A_QUARK(a, b) \
-  { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options }
-  A_QUARK("quarkcoin", quarkcoin_regenhash),
-  A_QUARK("qubitcoin", qubitcoin_regenhash),
-  A_QUARK("animecoin", animecoin_regenhash),
-  A_QUARK("sifcoin", sifcoin_regenhash),
-#undef A_QUARK
+  { "quarkcoin", ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 11, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, quarkcoin_regenhash, NULL, NULL, queue_quarkcoin_kernel, gen_hash, append_x11_compiler_options },
+  { "qubitcoin", ALGO_QUBIT, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, qubitcoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
+  { "animecoin", ALGO_ANIME, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, animecoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
+  { "sifcoin", ALGO_SIF, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, sifcoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
 
   // kernels starting from this will have difficulty calculated by using bitcoin algorithm
 #define A_DARK(a, b) \
diff --git a/algorithm.h b/algorithm.h
index 5628d006..83e3d94a 100644
--- a/algorithm.h
+++ b/algorithm.h
@@ -22,6 +22,9 @@ typedef enum {
   ALGO_X15,
   ALGO_KECCAK,
   ALGO_QUARK,
+  ALGO_QUBIT,
+  ALGO_ANIME,
+  ALGO_SIF,
   ALGO_TWE,
   ALGO_FUGUE,
   ALGO_NIST,
diff --git a/driver-opencl.c b/driver-opencl.c
index 3e6667bc..9c91eed7 100644
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -1425,20 +1425,85 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
   if (clState->goffset)
     p_global_work_offset = (size_t *)&work->blk.nonce;
 
-  status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset,
-    globalThreads, localThreads, 0, NULL, NULL);
-  if (unlikely(status != CL_SUCCESS)) {
-    applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
-    return -1;
-  }
+  if (gpu->algorithm.type == ALGO_QUARK) {
+    cl_event WaitEvents[2];
+    cl_uint zero = 0;
+
+    for (int i = 0; i < 6; ++i) {
+      status = clEnqueueWriteBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_TRUE, sizeof(cl_uint) * clState->GlobalThreadCount, sizeof(cl_uint), &zero, 0, NULL, NULL);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to nuke BranchBuffer %d.", status, i);
+        return -1;
+      }
+    }
+
+    for (int i = 0, x = 0; i < 9; ++i, ++x) {
+      size_t BranchCount0 = 0, BranchCount1;
+      status = clEnqueueNDRangeKernel(clState->commandQueue, ((i) ? clState->extra_kernels[i] : clState->kernel), 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, WaitEvents);
+      if (status != CL_SUCCESS) {
+        if (i) applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
+        else applog(LOG_ERR, "Error %d while attempting to enqueue initial kernel.", status);
+        return -1;
+      }
+
+      clWaitForEvents(1, WaitEvents);
+      clReleaseEvent(WaitEvents[0]);
+
+      if (i) ++i;
+
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, WaitEvents);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
+        return -1;
+      }
+
+      clWaitForEvents(1, WaitEvents);
+      clReleaseEvent(WaitEvents[0]);
+
+      // Do a blocking read for the found counter in both buffers
+      // so we know how many threads to dispatch to each branch
+      status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[x << 1], CL_TRUE, sizeof(cl_uint) * globalThreads[0], sizeof(cl_uint), &BranchCount0, 0, NULL, NULL);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to read intermediate data from GPU.", status);
+        return -1;
+      }
+
+      BranchCount1 = globalThreads[0] - BranchCount0;
+      ++i;
+
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, &BranchCount0, localThreads, 0, NULL, WaitEvents);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
+        return -1;
+      }
+
+      ++i;
+
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, &BranchCount1, localThreads, 0, NULL, WaitEvents + 1);
+      if (status != CL_SUCCESS) {
+        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
+        return -1;
+      }
-  for (i = 0; i < clState->n_extra_kernels; i++) {
-    status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset,
-      globalThreads, localThreads, 0, NULL, NULL);
+      clWaitForEvents(2, WaitEvents);
+      clReleaseEvent(WaitEvents[0]);
+      clReleaseEvent(WaitEvents[1]);
+    }
+  }
+  else {
+    status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL);
     if (unlikely(status != CL_SUCCESS)) {
       applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
       return -1;
     }
+
+    for (i = 0; i < clState->n_extra_kernels; i++) {
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL);
+      if (unlikely(status != CL_SUCCESS)) {
+        applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
+        return -1;
+      }
+    }
+  }
 
   status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
diff --git a/ocl.c b/ocl.c
index c34f191b..7f26842a 100644
--- a/ocl.c
+++ b/ocl.c
@@ -180,6 +180,35 @@ static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_co
   return status;
 }
 
+// Copied from set_threads_hashes() in driver-opencl.c
+static size_t CalcGlobalThreads(unsigned int compute_shaders, unsigned int minthreads, __maybe_unused int *intensity, __maybe_unused int *xintensity, __maybe_unused int *rawintensity, algorithm_t *algorithm)
+{
+  size_t threads = 0;
+  while (threads < minthreads) {
+
+    if (*rawintensity > 0) {
+      threads = *rawintensity;
+    }
+    else if (*xintensity > 0) {
+      threads = compute_shaders * ((algorithm->xintensity_shift) ? (1 << (algorithm->xintensity_shift + *xintensity)) : *xintensity);
+    }
+    else {
+      threads = 1 << (algorithm->intensity_shift + *intensity);
+    }
+
+    if (threads < minthreads) {
+      if (likely(*intensity < MAX_INTENSITY)) {
+        (*intensity)++;
+      }
+      else {
+        threads = minthreads;
+      }
+    }
+  }
+
+  return(threads);
+}
+
 _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm)
 {
   cl_int status = 0;
@@ -837,7 +866,19 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
     applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize);
   }
 
-  if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_MULTI) {
+  // By the way, if you change the intensity between now and opencl_scanhash()
+  // calculating the global thread count, God help you.
+  if (algorithm->type == ALGO_QUARK) {
+    clState->GlobalThreadCount = CalcGlobalThreads(clState->compute_shaders, clState->wsize, &cgpu->intensity, &cgpu->xintensity, &cgpu->rawintensity, &cgpu->algorithm);
+    for (int i = 0; i < 6; ++i) {
+      clState->BranchBuffer[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(cl_uint) * (clState->GlobalThreadCount + 2), NULL, &status);
+      if (status != CL_SUCCESS && !clState->BranchBuffer[i]) {
+        applog(LOG_ERR, "Error %d while creating BranchBuffer %d.", status, i);
+        return NULL;
+      }
+    }
+  }
+  else if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_MULTI) {
     // need additionnal buffers
     clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
     if (status != CL_SUCCESS && !clState->buffer1) {
diff --git a/ocl.h b/ocl.h
index 311db29e..e66275c8 100644
--- a/ocl.h
+++ b/ocl.h
@@ -13,6 +13,8 @@ typedef struct __clState {
   cl_mem outputBuffer;
   cl_mem CLbuffer0;
   cl_mem MidstateBuf;
+  cl_mem BranchBuffer[6];
+  size_t GlobalThreadCount;
   cl_mem padbuffer8;
   cl_mem buffer1;
   cl_mem buffer2;
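
Note on the dispatch logic (not part of the patch itself): the ALGO_QUARK loop in opencl_scanhash() interleaves its kernel indices, which makes the control flow hard to follow. The sketch below unrolls one round of that loop. The buffer layout and the round-to-kernel mapping are inferred from the loop's index arithmetic and the read-back offset above, not taken from the kernel source, and the helper name run_quark_round() is hypothetical, so treat this as an illustration rather than a drop-in replacement.

/*
 * Illustrative sketch only -- NOT part of the patch.  Assumptions inferred
 * from the host code above: the full-width kernels sort nonces into
 * BranchBuffer[2*r] / BranchBuffer[2*r + 1] and keep a running count in
 * element GlobalThreadCount of each buffer; round r uses the main kernel
 * (r == 0) or extra_kernels[4*r - 1], then extra_kernels[4*r] at full width,
 * then extra_kernels[4*r + 1] and extra_kernels[4*r + 2] sized by the counts.
 */
#include <CL/cl.h>
#include "ocl.h"

static cl_int run_quark_round(_clState *clState, int r, size_t *offset,
                              size_t *global, size_t *local)
{
  cl_kernel full0   = (r == 0) ? clState->kernel : clState->extra_kernels[4 * r - 1];
  cl_kernel full1   = clState->extra_kernels[4 * r];
  cl_kernel branch0 = clState->extra_kernels[4 * r + 1];
  cl_kernel branch1 = clState->extra_kernels[4 * r + 2];
  cl_uint count = 0;
  size_t BranchCount[2];
  cl_event ev[2];
  cl_int status;

  /* Two full-width passes; the queue may be out-of-order, so wait on the
   * first kernel's event before enqueueing the pass that depends on it. */
  status = clEnqueueNDRangeKernel(clState->commandQueue, full0, 1, offset, global, local, 0, NULL, &ev[0]);
  if (status != CL_SUCCESS) return status;
  clWaitForEvents(1, ev);
  clReleaseEvent(ev[0]);

  status = clEnqueueNDRangeKernel(clState->commandQueue, full1, 1, offset, global, local, 0, NULL, &ev[0]);
  if (status != CL_SUCCESS) return status;
  clWaitForEvents(1, ev);
  clReleaseEvent(ev[0]);

  /* Blocking read of the branch-0 counter; branch 1 gets the remainder. */
  status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[r << 1], CL_TRUE,
                               sizeof(cl_uint) * clState->GlobalThreadCount, sizeof(cl_uint),
                               &count, 0, NULL, NULL);
  if (status != CL_SUCCESS) return status;

  BranchCount[0] = count;
  BranchCount[1] = global[0] - BranchCount[0];

  /* Each branch kernel gets exactly as many work-items as it has nonces.
   * A count of zero would need to be special-cased; omitted here. */
  status = clEnqueueNDRangeKernel(clState->commandQueue, branch0, 1, offset, &BranchCount[0], local, 0, NULL, &ev[0]);
  if (status != CL_SUCCESS) return status;
  status = clEnqueueNDRangeKernel(clState->commandQueue, branch1, 1, offset, &BranchCount[1], local, 0, NULL, &ev[1]);
  if (status != CL_SUCCESS) return status;

  clWaitForEvents(2, ev);
  clReleaseEvent(ev[0]);
  clReleaseEvent(ev[1]);
  return CL_SUCCESS;
}

The same pattern repeats three times (rounds 0 through 2), which is consistent with the algorithm table entry declaring 11 extra kernels and the host allocating six BranchBuffer objects.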