Added host code required to utilize a custom Quark AMD binary.

9 years ago · e5bd5f869d
6 changed files with 228 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,6 +8,7 @@ minerd.exe
 autom4te.cache
 .deps
 m4
 Makefile
 Makefile.in
--- a/algorithm.c
+++ b/algorithm.c
@ -1007,6 +1007,108 @@ static cl_int queue_decred_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_u
  return status;
 }
 static cl_int queue_quarkcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
 {
  cl_kernel *kernel = &clState->kernel;
  unsigned int num = 0;
  cl_ulong le_target;
  cl_int status = 0;
  le_target = *(cl_ulong *)(blk->work->device_target + 24);
  flip80(clState->cldata, blk->work->data);
  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
  // search
  CL_SET_ARG(clState->CLbuffer0);
  CL_SET_ARG(clState->padbuffer8);
  num = 0;
  kernel = clState->extra_kernels;
  for(int i = 0; i < 3; ++i, kernel++)
  {
    CL_SET_ARG(clState->padbuffer8);
    CL_SET_ARG(clState->BranchBuffer[i << 1]);
    CL_SET_ARG(clState->BranchBuffer[(i << 1) + 1]);
    CL_SET_ARG(clState->GlobalThreadCount);
    num = 0;
    CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
    CL_SET_ARG(clState->BranchBuffer[i << 1]);
    if(i == 2)
    {
      CL_SET_ARG(clState->outputBuffer);
      CL_SET_ARG(le_target);
    }
    num = 0;
    CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
    CL_SET_ARG(clState->BranchBuffer[(i << 1) + 1]);
    if(i < 2)
    {
      CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
    }
    else
    {
      CL_SET_ARG(clState->outputBuffer);
      CL_SET_ARG(le_target);
    }
    num = 0;
  }
  /*
  // search1
  CL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch1Nonces);
  CL_SET_ARG(clState->Branch2Nonces);
  CL_SET_ARG(clState->GlobalThreadCount);
  num = 0;
  // search2
  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch1Nonces);
  num = 0;
  // search3
  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch2Nonces);
  num = 0;
  // search4
  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
  // search5
  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch3Nonces);
  CL_SET_ARG(clState->Branch4Nonces);
  CL_SET_ARG(clState->GlobalThreadCount);
  num = 0;
  // search6
  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch3Nonces);
  num = 0;
  //search7
  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch4Nonces);
  num = 0;
  // search8
  CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
  // search9
  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch5Nonces);
  CL_SET_ARG(clState->Branch6Nonces);
  CL_SET_ARG(clState->GlobalThreadCount);
  num = 0;
  // search10
  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch5Nonces);
  CL_SET_ARG(clState->outputBuffer);
  CL_SET_ARG(le_target);
  num = 0;
  // search11
  CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
  CL_SET_ARG(clState->Branch6Nonces);
  CL_SET_ARG(clState->outputBuffer);
  CL_SET_ARG(le_target);
  */
  return status;
 }
 static algorithm_settings_t algos[] = {
  // kernels starting from this will have difficulty calculated by using litecoin algorithm
 #define A_SCRYPT(a) \
@ -1051,13 +1153,10 @@ static algorithm_settings_t algos[] = {
 #undef A_YESCRYPT_MULTI
  // kernels starting from this will have difficulty calculated by using quarkcoin algorithm
-#define A_QUARK(a, b) \
+  { "quarkcoin", ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 11, 8 * 16 * 4194304, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, quarkcoin_regenhash, NULL, NULL, queue_quarkcoin_kernel, gen_hash, append_x11_compiler_options },
-  { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options }
+  { "qubitcoin", ALGO_QUBIT, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, qubitcoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
-  A_QUARK("quarkcoin", quarkcoin_regenhash),
+  { "animecoin", ALGO_ANIME, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, animecoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
-  A_QUARK("qubitcoin", qubitcoin_regenhash),
+  { "sifcoin", ALGO_SIF, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, sifcoin_regenhash, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options },
  A_QUARK("animecoin", animecoin_regenhash),
  A_QUARK("sifcoin", sifcoin_regenhash),
 #undef A_QUARK
  // kernels starting from this will have difficulty calculated by using bitcoin algorithm
 #define A_DARK(a, b) \
--- a/algorithm.h
+++ b/algorithm.h
@ -22,6 +22,9 @@ typedef enum {
  ALGO_X15,
  ALGO_KECCAK,
  ALGO_QUARK,
  ALGO_QUBIT,
  ALGO_ANIME,
  ALGO_SIF,
  ALGO_TWE,
  ALGO_FUGUE,
  ALGO_NIST,
--- a/driver-opencl.c
+++ b/driver-opencl.c
@ -1425,21 +1425,86 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
  if (clState->goffset)
    p_global_work_offset = (size_t *)&work->blk.nonce;
-  status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset,
+  if (gpu->algorithm.type == ALGO_QUARK) {
-    globalThreads, localThreads, 0, NULL, NULL);
+    cl_event WaitEvents[2];
    cl_uint zero = 0;
    for (int i = 0; i < 6; ++i) {
      status = clEnqueueWriteBuffer(clState->commandQueue, clState->BranchBuffer[i], CL_TRUE, sizeof(cl_uint) * clState->GlobalThreadCount, sizeof(cl_uint), &zero, 0, NULL, NULL);
      if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d while attempting to nuke BranchBuffer %d.", status, i);
        return -1;
      }
    }
    for (int i = 0, x = 0; i < 9; ++i, ++x) {
      size_t BranchCount0, BranchCount1;
      status = clEnqueueNDRangeKernel(clState->commandQueue, ((i) ? clState->extra_kernels[i] : clState->kernel), 1, p_global_work_offset, globalThreads, localThreads, 0,  NULL, WaitEvents);
      if (status != CL_SUCCESS) {
        if (i) applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
        else applog(LOG_ERR, "Error %d while attempting to enqueue initial kernel.", status);
        return -1;
      }
      clWaitForEvents(1, WaitEvents);
      clReleaseEvent(WaitEvents[0]);
      if (i) ++i;
      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, globalThreads, localThreads, 0,  NULL, WaitEvents);
      if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
        return -1;
      }
      clWaitForEvents(1, WaitEvents);
      clReleaseEvent(WaitEvents[0]);
      // Do a blocking read for the found counter in both buffers
      // so we know how many threads to dispatch to each branch
      status = clEnqueueReadBuffer(clState->commandQueue, clState->BranchBuffer[x << 1], CL_TRUE, sizeof(cl_uint) * globalThreads[0], sizeof(cl_uint), &BranchCount0, 0, NULL, NULL);
      if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d while attempting to read intermediate data from GPU.", status, i);
        return -1;
      }
      BranchCount1 = globalThreads[0] - BranchCount0;
      ++i;
      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, &BranchCount0, localThreads, 0,  NULL, WaitEvents);
      if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
        return -1;
      }
      ++i;
      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, &BranchCount1, localThreads, 0,  NULL, WaitEvents + 1);
      if (status != CL_SUCCESS) {
        applog(LOG_ERR, "Error %d while attempting to enqueue kernel with index %d.", status, i);
        return -1;
      }
      clWaitForEvents(2, WaitEvents);
      clReleaseEvent(WaitEvents[0]);
      clReleaseEvent(WaitEvents[1]);
    }
  }
  else {
    status = clEnqueueNDRangeKernel(clState->commandQueue, clState->kernel, 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL);
    if (unlikely(status != CL_SUCCESS)) {
      applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
      return -1;
    }
    for (i = 0; i < clState->n_extra_kernels; i++) {
-    status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset,
+      status = clEnqueueNDRangeKernel(clState->commandQueue, clState->extra_kernels[i], 1, p_global_work_offset, globalThreads, localThreads, 0, NULL, NULL);
      globalThreads, localThreads, 0, NULL, NULL);
      if (unlikely(status != CL_SUCCESS)) {
        applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
        return -1;
      }
    }
  }
  status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
    buffersize, thrdata->res, 0, NULL, NULL);
--- a/ocl.c
+++ b/ocl.c
@ -180,6 +180,35 @@ static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_co
  return status;
 }
 // Copied from set_threads_hashes() in driver-opencl.c
 static size_t CalcGlobalThreads(unsigned int compute_shaders, unsigned int minthreads, __maybe_unused int *intensity, __maybe_unused int *xintensity, __maybe_unused int *rawintensity, algorithm_t *algorithm)
 {
  size_t threads = 0;
  while (threads < minthreads) {
    if (*rawintensity > 0) {
      threads = *rawintensity;
    }
    else if (*xintensity > 0) {
      threads = compute_shaders * ((algorithm->xintensity_shift)?(1 << (algorithm->xintensity_shift + *xintensity)):*xintensity);
    }
    else {
      threads = 1 << (algorithm->intensity_shift + *intensity);
    }
    if (threads < minthreads) {
      if (likely(*intensity < MAX_INTENSITY)) {
        (*intensity)++;
      }
      else {
        threads = minthreads;
      }
    }
  }
  return(threads);
 }
 _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm)
 {
  cl_int status = 0;
@ -837,7 +866,19 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
      applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize);
    }
-    if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_MULTI) {
+    // By the way, if you change the intensity between now and opencl_scanhash()
    // calculating the global thread count, God help you.
    if (algorithm->type == ALGO_QUARK) {
      clState->GlobalThreadCount = CalcGlobalThreads(clState->compute_shaders, clState->wsize, &cgpu->intensity, &cgpu->xintensity, &cgpu->rawintensity, &cgpu->algorithm);
      for (int i = 0; i < 6; ++i) {
        clState->BranchBuffer[i] = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(cl_uint) * (clState->GlobalThreadCount + 2), NULL, &status);
        if (status != CL_SUCCESS && !clState->BranchBuffer[i]) {
          applog(LOG_ERR, "Error %d while creating BranchBuffer %d.", status, i);
          return NULL;
        }
      }
    }
    else if (algorithm->type == ALGO_YESCRYPT || algorithm->type == ALGO_YESCRYPT_MULTI) {
      // need additionnal buffers
      clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
      if (status != CL_SUCCESS && !clState->buffer1) {
--- a/ocl.h
+++ b/ocl.h
@ -13,6 +13,8 @@ typedef struct __clState {
  cl_mem outputBuffer;
  cl_mem CLbuffer0;
  cl_mem MidstateBuf;
  cl_mem BranchBuffer[6];
  size_t GlobalThreadCount;
  cl_mem padbuffer8;
  cl_mem buffer1;
  cl_mem buffer2;