diff --git a/device-gpu.c b/device-gpu.c index d40db48c..e39bff4b 100644 --- a/device-gpu.c +++ b/device-gpu.c @@ -673,39 +673,42 @@ void manage_gpu(void) #ifdef HAVE_OPENCL static _clState *clStates[MAX_GPUDEVICES]; +#define CL_SET_BLKARG(blkvar) status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->blkvar) +#define CL_SET_ARG(var) status |= clSetKernelArg(*kernel, num++, sizeof(var), (void *)&var) +#define CL_SET_VARG(args, var) status |= clSetKernelArg(*kernel, num++, args * sizeof(uint), (void *)var) + static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk) { cl_kernel *kernel = &clState->kernel; cl_int status = 0; int num = 0; - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce); - - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2); - - status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer), - (void *)&clState->outputBuffer); + CL_SET_BLKARG(ctx_a); + CL_SET_BLKARG(ctx_b); + CL_SET_BLKARG(ctx_c); + CL_SET_BLKARG(ctx_d); + CL_SET_BLKARG(ctx_e); + CL_SET_BLKARG(ctx_f); + CL_SET_BLKARG(ctx_g); + CL_SET_BLKARG(ctx_h); + CL_SET_BLKARG(cty_b); + CL_SET_BLKARG(cty_c); + CL_SET_BLKARG(cty_d); + CL_SET_BLKARG(cty_f); + CL_SET_BLKARG(cty_g); + CL_SET_BLKARG(cty_h); + CL_SET_BLKARG(nonce); + + CL_SET_BLKARG(fW0); + CL_SET_BLKARG(fW1); + CL_SET_BLKARG(fW2); + CL_SET_BLKARG(fW3); + CL_SET_BLKARG(fW15); + CL_SET_BLKARG(fW01r); + CL_SET_BLKARG(fcty_e); + CL_SET_BLKARG(fcty_e2); + + CL_SET_ARG(clState->outputBuffer); return status; } @@ -718,75 +721,87 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk) int i, num = 0; uint *nonces; - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h); - - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h); + CL_SET_BLKARG(ctx_a); + CL_SET_BLKARG(ctx_b); + CL_SET_BLKARG(ctx_c); + CL_SET_BLKARG(ctx_d); + CL_SET_BLKARG(ctx_e); + CL_SET_BLKARG(ctx_f); + CL_SET_BLKARG(ctx_g); + CL_SET_BLKARG(ctx_h); + + CL_SET_BLKARG(cty_b); + CL_SET_BLKARG(cty_c); + CL_SET_BLKARG(cty_d); + CL_SET_BLKARG(cty_f); + CL_SET_BLKARG(cty_g); + CL_SET_BLKARG(cty_h); nonces = alloca(sizeof(uint) * vwidth); for (i = 0; i < vwidth; i++) nonces[i] = blk->nonce + i; status |= clSetKernelArg(*kernel, num++, vwidth * sizeof(uint), (void *)nonces); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W16); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W17); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreVal4_2); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreVal0); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW18); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW19); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW31); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreW32); + CL_SET_BLKARG(W16); + CL_SET_BLKARG(W17); + CL_SET_BLKARG(PreVal4_2); + CL_SET_BLKARG(PreVal0); + CL_SET_BLKARG(PreW18); + CL_SET_BLKARG(PreW19); + CL_SET_BLKARG(PreW31); + CL_SET_BLKARG(PreW32); - status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer), - (void *)&clState->outputBuffer); + CL_SET_ARG(clState->outputBuffer); return status; } static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk) { + cl_uint vwidth = clState->preferred_vwidth; cl_kernel *kernel = &clState->kernel; cl_int status = 0; - int num = 0; + int i, num = 0; + uint *nonces; - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_d); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce); - - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e); - status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2); - - status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer), - (void *)&clState->outputBuffer); + nonces = alloca(sizeof(uint) * vwidth); + for (i = 0; i < vwidth; i++) + nonces[i] = blk->nonce + i; + CL_SET_VARG(vwidth, nonces); + + CL_SET_BLKARG(PreVal4); + CL_SET_BLKARG(cty_h); + CL_SET_BLKARG(cty_d); + CL_SET_BLKARG(PreVal0); + CL_SET_BLKARG(cty_b); + CL_SET_BLKARG(cty_c); + CL_SET_BLKARG(cty_f); + CL_SET_BLKARG(cty_g); + CL_SET_BLKARG(C1addK5); + CL_SET_BLKARG(B1addK6); + CL_SET_BLKARG(PreVal0addK7); + CL_SET_BLKARG(W16addK16); + CL_SET_BLKARG(W17addK17); + CL_SET_BLKARG(PreW18); + CL_SET_BLKARG(PreW19); + CL_SET_BLKARG(W16); + CL_SET_BLKARG(W17); + CL_SET_BLKARG(PreW31); + CL_SET_BLKARG(PreW32); + + CL_SET_BLKARG(ctx_a); + CL_SET_BLKARG(ctx_b); + CL_SET_BLKARG(ctx_c); + CL_SET_BLKARG(ctx_d); + CL_SET_BLKARG(ctx_e); + CL_SET_BLKARG(ctx_f); + CL_SET_BLKARG(ctx_g); + CL_SET_BLKARG(ctx_h); + + CL_SET_BLKARG(A0); + CL_SET_BLKARG(B0); + + CL_SET_ARG(clState->outputBuffer); return status; } diff --git a/diakgcn120208.cl b/diakgcn120208.cl index f8b263fb..84f02e11 100644 --- a/diakgcn120208.cl +++ b/diakgcn120208.cl @@ -3,9 +3,7 @@ // Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3. // The kernel was rewritten by me (Diapolo) and is still public-domain! -#ifdef VECTORS8 - typedef uint8 u; -#elif defined VECTORS4 +#if defined VECTORS4 typedef uint4 u; #elif defined VECTORS2 typedef uint2 u; @@ -31,9 +29,7 @@ #ifdef GOFFSET typedef uint uu; #else - #ifdef VECTORS8 - typedef uint8 uu; - #elif defined VECTORS4 + #if defined VECTORS4 typedef uint4 uu; #elif defined VECTORS2 typedef uint2 uu; @@ -67,29 +63,23 @@ __kernel u W[17]; u V[8]; -#ifdef VECTORS8 +#if defined VECTORS4 #ifdef GOFFSET - u nonce = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7); + u nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3); #else - u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + ((uint)get_local_id(0) * 8U) + base; - #endif -#elif defined VECTORS4 - #ifdef GOFFSET - u nonce = ((uint)get_global_id(0) << 2) + (u)(0, 1, 2, 3); - #else - u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + ((uint)get_local_id(0) * 4U) + base; + u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u); #endif #elif defined VECTORS2 #ifdef GOFFSET - u nonce = ((uint)get_global_id(0) << 1) + (u)(0, 1); + u nonce = base + (get_global_id(0)<<1) + (uint2)(0, 1); #else - u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) * 2U) + ((uint)get_local_id(0) * 2U) + base; + u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u); #endif #else #ifdef GOFFSET - u nonce = (uint)get_global_id(0); + u nonce = base + get_global_id(0); #else - u nonce = ((uint)get_group_id(0) * (uint)WORKSIZExVECSIZE) + (uint)get_local_id(0) + base; + u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE); #endif #endif @@ -589,22 +579,40 @@ __kernel V[7] += V[3] + W[10] + ch(124) + rot26(V[0]); -#ifdef VECTORS8 - u result = (u)(((V[7].s0 == 0x136032ed) * nonce.s0), ((V[7].s1 == 0x136032ed) * nonce.s1), ((V[7].s2 == 0x136032ed) * nonce.s2), ((V[7].s3 == 0x136032ed) * nonce.s3), - ((V[7].s4 == 0x136032ed) * nonce.s4), ((V[7].s5 == 0x136032ed) * nonce.s5), ((V[7].s6 == 0x136032ed) * nonce.s6), ((V[7].s7 == 0x136032ed) * nonce.s7)); - output[0 + (upsample(result.s0, result.s1) > 0)] = upsample(result.s0, result.s1); - output[2 + (upsample(result.s2, result.s3) > 1)] = upsample(result.s2, result.s3); - output[4 + (upsample(result.s4, result.s5) > 0)] = upsample(result.s4, result.s5); - output[6 + (upsample(result.s6, result.s7) > 1)] = upsample(result.s6, result.s7); -#elif defined VECTORS4 - u result = (u)(((V[7].x == 0x136032ed) * nonce.x), ((V[7].y == 0x136032ed) * nonce.y), ((V[7].z == 0x136032ed) * nonce.z), ((V[7].w == 0x136032ed) * nonce.w)); - output[0 + (upsample(result.x, result.y) > 0)] = upsample(result.x, result.y); - output[2 + (upsample(result.z, result.w) > 1)] = upsample(result.z, result.w); -#elif defined VECTORS2 - u result = (u)(((V[7].x == 0x136032ed) * nonce.x), ((V[7].y == 0x136032ed) * nonce.y)); - output[upsample(result.x, result.y) > 0] = upsample(result.x, result.y); + +#define FOUND (0x80) +#define NFLAG (0x7F) + +#ifdef VECTORS4 + V[7] ^= 0x136032ed; + + bool result = V[7].x & V[7].y & V[7].z & V[7].w; + + if (!result) { + if (!V[7].x) + output[FOUND] = output[NFLAG & W[3].x] = W[3].x; + if (!V[7].y) + output[FOUND] = output[NFLAG & W[3].y] = W[3].y; + if (!V[7].z) + output[FOUND] = output[NFLAG & W[3].z] = W[3].z; + if (!V[7].w) + output[FOUND] = output[NFLAG & W[3].w] = W[3].w; + } #else - u result = (V[7] == 0x136032ed) * nonce; - output[result != 0] = result; + #ifdef VECTORS2 + V[7] ^= 0x136032ed; + + bool result = V[7].x & V[7].y; + + if (!result) { + if (!V[7].x) + output[FOUND] = output[NFLAG & W[3].x] = W[3].x; + if (!V[7].y) + output[FOUND] = output[NFLAG & W[3].y] = W[3].y; + } + #else + if (V[7] == 0x136032ed) + output[FOUND] = output[NFLAG & W[3]] = W[3]; + #endif #endif } diff --git a/findnonce.c b/findnonce.c index 35fd14e0..da9c4ecb 100644 --- a/findnonce.c +++ b/findnonce.c @@ -115,7 +115,15 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { blk->PreVal4addT1 = blk->PreVal4 + blk->T1; - blk->T1substate0 = state[0] - blk->T1; + blk->T1substate0 = blk->ctx_a - blk->T1; + + blk->B1addK6 = blk->cty_b + 0x923f82a4; + blk->PreVal0addK7 = blk->PreVal0 + 0xab1c5ed5; + blk->W16addK16 = blk->W16 + 0xe49b69c1; + blk->W17addK17 = blk->W17 + 0xefbe4786; + + blk->A0 = blk->ctx_a + 0x98c7e2a2; + blk->B0 = blk->ctx_a + 0xfc08884d; } #define P(t) (W[(t)&0xF] = W[(t-16)&0xF] + (rotate(W[(t-15)&0xF], 25) ^ rotate(W[(t-15)&0xF], 14) ^ (W[(t-15)&0xF] >> 3)) + W[(t-7)&0xF] + (rotate(W[(t-2)&0xF], 15) ^ rotate(W[(t-2)&0xF], 13) ^ (W[(t-2)&0xF] >> 10))) diff --git a/miner.h b/miner.h index adedaaac..a13113c2 100644 --- a/miner.h +++ b/miner.h @@ -610,6 +610,10 @@ typedef struct { cl_uint PreW19; cl_uint PreW31; cl_uint PreW32; + + /* For diakgcn */ + cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17; + cl_uint A0, B0; } dev_blk_ctx; #else typedef struct {