diff --git a/configure.ac b/configure.ac index 47ef8b06..ea0270aa 100644 --- a/configure.ac +++ b/configure.ac @@ -304,10 +304,10 @@ fi AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to cgminer install]) -AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120222"], [Filename for phatk kernel]) -AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120222"], [Filename for poclbm kernel]) -AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120222"], [Filename for diakgcn kernel]) -AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120222"], [Filename for diablo kernel]) +AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120223"], [Filename for phatk kernel]) +AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120223"], [Filename for poclbm kernel]) +AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120223"], [Filename for diakgcn kernel]) +AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120223"], [Filename for diablo kernel]) AC_SUBST(OPENCL_LIBS) diff --git a/device-gpu.c b/device-gpu.c index a526c70c..cff18619 100644 --- a/device-gpu.c +++ b/device-gpu.c @@ -740,7 +740,7 @@ static _clState *clStates[MAX_GPUDEVICES]; #define CL_SET_ARG(var) status |= clSetKernelArg(*kernel, num++, sizeof(var), (void *)&var) #define CL_SET_VARG(args, var) status |= clSetKernelArg(*kernel, num++, args * sizeof(uint), (void *)var) -static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) +static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk) { cl_kernel *kernel = &clState->kernel; cl_uint vwidth = clState->vwidth; @@ -767,7 +767,7 @@ static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t nonces = alloca(sizeof(uint) * vwidth); for (i = 0; i < vwidth; i++) - nonces[i] = blk->nonce + (i * threads); + nonces[i] = blk->nonce + i; CL_SET_VARG(vwidth, nonces); CL_SET_BLKARG(fW0); @@ -791,8 +791,7 @@ static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t return status; } -static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk, - __maybe_unused cl_uint threads) +static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk) { cl_kernel *kernel = &clState->kernel; cl_uint vwidth = clState->vwidth; @@ -818,7 +817,7 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk, nonces = alloca(sizeof(uint) * vwidth); for (i = 0; i < vwidth; i++) - nonces[i] = blk->nonce + (i * threads); + nonces[i] = blk->nonce + i; CL_SET_VARG(vwidth, nonces); CL_SET_BLKARG(W16); @@ -835,8 +834,7 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk, return status; } -static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk, - __maybe_unused cl_uint threads) +static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk) { cl_kernel *kernel = &clState->kernel; cl_uint vwidth = clState->vwidth; @@ -894,7 +892,7 @@ static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk, return status; } -static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) +static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk) { cl_kernel *kernel = &clState->kernel; cl_uint vwidth = clState->vwidth; @@ -904,7 +902,7 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t nonces = alloca(sizeof(uint) * vwidth); for (i = 0; i < vwidth; i++) - nonces[i] = blk->nonce + (i * threads); + nonces[i] = blk->nonce + i; CL_SET_VARG(vwidth, nonces); CL_SET_BLKARG(PreVal0); @@ -1140,7 +1138,7 @@ static void get_opencl_statline(char *buf, struct cgpu_info *gpu) } struct opencl_thread_data { - cl_int (*queue_kernel_parameters)(_clState *, dev_blk_ctx *, cl_uint); + cl_int (*queue_kernel_parameters)(_clState *, dev_blk_ctx *); uint32_t *res; struct work *last_work; struct work _last_work; @@ -1313,7 +1311,7 @@ static uint64_t opencl_scanhash(struct thr_info *thr, struct work *work, localThreads[0], gpu->intensity); if (hashes > gpu->max_hashes) gpu->max_hashes = hashes; - status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]); + status = thrdata->queue_kernel_parameters(clState, &work->blk); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); return 0; diff --git a/diablo120222.cl b/diablo120223.cl similarity index 98% rename from diablo120222.cl rename to diablo120223.cl index 4bc7394d..a08e47cf 100644 --- a/diablo120222.cl +++ b/diablo120223.cl @@ -62,7 +62,13 @@ __kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) void search( z ZA[930]; - const z Znonce = base + (uint)(get_global_id(0)); +#ifdef VECTORS4 + const z Znonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); +#elif defined VECTORS2 + const z Znonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); +#else + const z Znonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); +#endif ZA[15] = Znonce + PreVal4_state0; @@ -1237,27 +1243,29 @@ __kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) void search( #define NFLAG (0x7F) #if defined(VECTORS4) - bool result = any(ZA[924] == 0x136032EDU); + ZA[924] ^= 0x136032EDU; + bool result = ZA[924].x & ZA[924].y & ZA[924].z & ZA[924].w; - if (result) { + if (!result) { output[FOUND] = FOUND; - if (ZA[924].x == 0x136032EDU) + if (!ZA[924].xU) output[NFLAG & Znonce.x] = Znonce.x; - if (ZA[924].y == 0x136032EDU) + if (!ZA[924].y) output[NFLAG & Znonce.y] = Znonce.y; - if (ZA[924].z == 0x136032EDU) + if (!ZA[924].z) output[NFLAG & Znonce.z] = Znonce.z; - if (ZA[924].w == 0x136032EDU) + if (!ZA[924].w) output[NFLAG & Znonce.w] = Znonce.w; } #elif defined(VECTORS2) - bool result = any(ZA[924] == 0x136032EDU); + ZA[924] ^= 0x136032EDU; + bool result = ZA[924].x & ZA[924].y; - if (result) { + if (!result) { output[FOUND] = FOUND; - if (ZA[924].x == 0x136032EDU) + if (!ZA[924].xU) output[NFLAG & Znonce.x] = Znonce.x; - if (ZA[924].y == 0x136032EDU) + if (!ZA[924].y) output[NFLAG & Znonce.y] = Znonce.y; } #else diff --git a/diakgcn120222.cl b/diakgcn120223.cl similarity index 100% rename from diakgcn120222.cl rename to diakgcn120223.cl diff --git a/phatk120222.cl b/phatk120223.cl similarity index 94% rename from phatk120222.cl rename to phatk120223.cl index 4e548117..f6f20784 100644 --- a/phatk120222.cl +++ b/phatk120223.cl @@ -186,13 +186,25 @@ void search( const uint state0, const uint state1, const uint state2, const uint W[16] = W16; W[17] = W17; - W[3] = base + (uint)get_global_id(0); +#ifdef VECTORS4 + //Less dependencies to get both the local id and group id and then add them + W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); + uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U); + //Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3 + W[18] = PreW18 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U}; +#elif defined VECTORS2 + W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); + uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U); + W[18] = PreW18 + (u){r, r ^ 0x2004000U}; +#else + W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U); W[18] = PreW18 + r; - +#endif //the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions + Vals[4] = PreVal4 + W[3]; Vals[0] = PreVal0 + W[3]; diff --git a/poclbm120222.cl b/poclbm120223.cl similarity index 98% rename from poclbm120222.cl rename to poclbm120223.cl index e7ba623e..4567fb8c 100644 --- a/poclbm120222.cl +++ b/poclbm120223.cl @@ -82,7 +82,13 @@ __kernel void search(const uint state0, const uint state1, const uint state2, co u W[24]; u *Vals = &W[16]; // Now put at W[16] to be in same array - const u nonce = base + (uint)(get_global_id(0)); +#ifdef VECTORS4 + const u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); +#elif defined VECTORS2 + const u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); +#else + const u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); +#endif Vals[0]=Preval0+nonce; @@ -164,7 +170,7 @@ Vals[1]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); Vals[1]+=Ma(Vals[4],Vals[2],Vals[3]); Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); Vals[0]+=ch(Vals[5],Vals[6],Vals[7]); -Vals[0]+=0xC19BF3F4U; +Vals[0]+=0xC19BF3F4; Vals[4]+=Vals[0]; Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); @@ -201,7 +207,7 @@ Vals[0]+=Vals[4]; Vals[4]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); -W[4]+=0x80000000U; +W[4]+=0x80000000; Vals[3]+=W[4]; Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); Vals[3]+=ch(Vals[0],Vals[1],Vals[2]); @@ -652,14 +658,14 @@ W[0]=Vals[0]; W[7]=state7; W[7]+=Vals[7]; -Vals[7]=0xF377ED68U; +Vals[7]=0xF377ED68; W[0]+=state0; Vals[7]+=W[0]; W[3]=state3; W[3]+=Vals[3]; -Vals[3]=0xa54ff53aU; +Vals[3]=0xa54ff53a; Vals[3]+=Vals[7]; W[1]=Vals[1]; @@ -668,7 +674,7 @@ W[1]+=state1; W[6]=state6; W[6]+=Vals[6]; -Vals[6]=0x90BB1E3CU; +Vals[6]=0x90BB1E3C; Vals[6]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); Vals[6]+=(0x9b05688cU^(Vals[3]&0xca0b3af3U)); @@ -684,7 +690,7 @@ Vals[6]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22)); W[5]=state5; W[5]+=Vals[5]; -Vals[5]=0x150C6645BU; +Vals[5]=0x150C6645B; Vals[5]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); Vals[5]+=ch(Vals[2],Vals[3],0x510e527fU); Vals[5]+=W[2]; @@ -697,7 +703,7 @@ Vals[5]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22)); W[4]=state4; W[4]+=Vals[4]; -Vals[4]=0x13AC42E24U; +Vals[4]=0x13AC42E24; Vals[4]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); Vals[4]+=ch(Vals[1],Vals[2],Vals[3]); Vals[4]+=W[3]; @@ -736,7 +742,7 @@ Vals[4]+=Vals[0]; Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); Vals[7]+=ch(Vals[4],Vals[5],Vals[6]); -Vals[7]+=0x5807AA98U; +Vals[7]+=0x15807AA98; Vals[0]+=Ma(Vals[3],Vals[1],Vals[2]); Vals[3]+=Vals[7]; Vals[7]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); @@ -779,7 +785,7 @@ Vals[1]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22)); Vals[1]+=Ma(Vals[4],Vals[2],Vals[3]); Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); Vals[0]+=ch(Vals[5],Vals[6],Vals[7]); -Vals[0]+=0xC19BF274U; +Vals[0]+=0xC19BF274; Vals[4]+=Vals[0]; Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); @@ -856,7 +862,7 @@ Vals[0]+=W[7]; Vals[4]+=Vals[0]; Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); -W[8]=0x80000000U; +W[8]=0x80000000; W[8]+=W[1]; W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); Vals[7]+=W[8];