Browse Source

Revert behaviour to old nonce init code.

nfactor-troky
Con Kolivas 13 years ago
parent
commit
70e8ade54f
  1. 8
      configure.ac
  2. 20
      device-gpu.c
  3. 30
      diablo120223.cl
  4. 0
      diakgcn120223.cl
  5. 16
      phatk120223.cl
  6. 28
      poclbm120223.cl

8
configure.ac

@ -304,10 +304,10 @@ fi
AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to cgminer install]) AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to cgminer install])
AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120222"], [Filename for phatk kernel]) AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120223"], [Filename for phatk kernel])
AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120222"], [Filename for poclbm kernel]) AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120223"], [Filename for poclbm kernel])
AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120222"], [Filename for diakgcn kernel]) AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120223"], [Filename for diakgcn kernel])
AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120222"], [Filename for diablo kernel]) AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120223"], [Filename for diablo kernel])
AC_SUBST(OPENCL_LIBS) AC_SUBST(OPENCL_LIBS)

20
device-gpu.c

@ -740,7 +740,7 @@ static _clState *clStates[MAX_GPUDEVICES];
#define CL_SET_ARG(var) status |= clSetKernelArg(*kernel, num++, sizeof(var), (void *)&var) #define CL_SET_ARG(var) status |= clSetKernelArg(*kernel, num++, sizeof(var), (void *)&var)
#define CL_SET_VARG(args, var) status |= clSetKernelArg(*kernel, num++, args * sizeof(uint), (void *)var) #define CL_SET_VARG(args, var) status |= clSetKernelArg(*kernel, num++, args * sizeof(uint), (void *)var)
static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk)
{ {
cl_kernel *kernel = &clState->kernel; cl_kernel *kernel = &clState->kernel;
cl_uint vwidth = clState->vwidth; cl_uint vwidth = clState->vwidth;
@ -767,7 +767,7 @@ static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t
nonces = alloca(sizeof(uint) * vwidth); nonces = alloca(sizeof(uint) * vwidth);
for (i = 0; i < vwidth; i++) for (i = 0; i < vwidth; i++)
nonces[i] = blk->nonce + (i * threads); nonces[i] = blk->nonce + i;
CL_SET_VARG(vwidth, nonces); CL_SET_VARG(vwidth, nonces);
CL_SET_BLKARG(fW0); CL_SET_BLKARG(fW0);
@ -791,8 +791,7 @@ static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t
return status; return status;
} }
static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk, static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk)
__maybe_unused cl_uint threads)
{ {
cl_kernel *kernel = &clState->kernel; cl_kernel *kernel = &clState->kernel;
cl_uint vwidth = clState->vwidth; cl_uint vwidth = clState->vwidth;
@ -818,7 +817,7 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk,
nonces = alloca(sizeof(uint) * vwidth); nonces = alloca(sizeof(uint) * vwidth);
for (i = 0; i < vwidth; i++) for (i = 0; i < vwidth; i++)
nonces[i] = blk->nonce + (i * threads); nonces[i] = blk->nonce + i;
CL_SET_VARG(vwidth, nonces); CL_SET_VARG(vwidth, nonces);
CL_SET_BLKARG(W16); CL_SET_BLKARG(W16);
@ -835,8 +834,7 @@ static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk,
return status; return status;
} }
static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk, static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk)
__maybe_unused cl_uint threads)
{ {
cl_kernel *kernel = &clState->kernel; cl_kernel *kernel = &clState->kernel;
cl_uint vwidth = clState->vwidth; cl_uint vwidth = clState->vwidth;
@ -894,7 +892,7 @@ static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk,
return status; return status;
} }
static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk)
{ {
cl_kernel *kernel = &clState->kernel; cl_kernel *kernel = &clState->kernel;
cl_uint vwidth = clState->vwidth; cl_uint vwidth = clState->vwidth;
@ -904,7 +902,7 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t
nonces = alloca(sizeof(uint) * vwidth); nonces = alloca(sizeof(uint) * vwidth);
for (i = 0; i < vwidth; i++) for (i = 0; i < vwidth; i++)
nonces[i] = blk->nonce + (i * threads); nonces[i] = blk->nonce + i;
CL_SET_VARG(vwidth, nonces); CL_SET_VARG(vwidth, nonces);
CL_SET_BLKARG(PreVal0); CL_SET_BLKARG(PreVal0);
@ -1140,7 +1138,7 @@ static void get_opencl_statline(char *buf, struct cgpu_info *gpu)
} }
struct opencl_thread_data { struct opencl_thread_data {
cl_int (*queue_kernel_parameters)(_clState *, dev_blk_ctx *, cl_uint); cl_int (*queue_kernel_parameters)(_clState *, dev_blk_ctx *);
uint32_t *res; uint32_t *res;
struct work *last_work; struct work *last_work;
struct work _last_work; struct work _last_work;
@ -1313,7 +1311,7 @@ static uint64_t opencl_scanhash(struct thr_info *thr, struct work *work,
localThreads[0], gpu->intensity); localThreads[0], gpu->intensity);
if (hashes > gpu->max_hashes) if (hashes > gpu->max_hashes)
gpu->max_hashes = hashes; gpu->max_hashes = hashes;
status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]); status = thrdata->queue_kernel_parameters(clState, &work->blk);
if (unlikely(status != CL_SUCCESS)) { if (unlikely(status != CL_SUCCESS)) {
applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
return 0; return 0;

30
diablo120222.cl → diablo120223.cl

@ -62,7 +62,13 @@ __kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) void search(
z ZA[930]; z ZA[930];
const z Znonce = base + (uint)(get_global_id(0)); #ifdef VECTORS4
const z Znonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
#elif defined VECTORS2
const z Znonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
#else
const z Znonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
#endif
ZA[15] = Znonce + PreVal4_state0; ZA[15] = Znonce + PreVal4_state0;
@ -1237,27 +1243,29 @@ __kernel __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) void search(
#define NFLAG (0x7F) #define NFLAG (0x7F)
#if defined(VECTORS4) #if defined(VECTORS4)
bool result = any(ZA[924] == 0x136032EDU); ZA[924] ^= 0x136032EDU;
bool result = ZA[924].x & ZA[924].y & ZA[924].z & ZA[924].w;
if (result) { if (!result) {
output[FOUND] = FOUND; output[FOUND] = FOUND;
if (ZA[924].x == 0x136032EDU) if (!ZA[924].xU)
output[NFLAG & Znonce.x] = Znonce.x; output[NFLAG & Znonce.x] = Znonce.x;
if (ZA[924].y == 0x136032EDU) if (!ZA[924].y)
output[NFLAG & Znonce.y] = Znonce.y; output[NFLAG & Znonce.y] = Znonce.y;
if (ZA[924].z == 0x136032EDU) if (!ZA[924].z)
output[NFLAG & Znonce.z] = Znonce.z; output[NFLAG & Znonce.z] = Znonce.z;
if (ZA[924].w == 0x136032EDU) if (!ZA[924].w)
output[NFLAG & Znonce.w] = Znonce.w; output[NFLAG & Znonce.w] = Znonce.w;
} }
#elif defined(VECTORS2) #elif defined(VECTORS2)
bool result = any(ZA[924] == 0x136032EDU); ZA[924] ^= 0x136032EDU;
bool result = ZA[924].x & ZA[924].y;
if (result) { if (!result) {
output[FOUND] = FOUND; output[FOUND] = FOUND;
if (ZA[924].x == 0x136032EDU) if (!ZA[924].xU)
output[NFLAG & Znonce.x] = Znonce.x; output[NFLAG & Znonce.x] = Znonce.x;
if (ZA[924].y == 0x136032EDU) if (!ZA[924].y)
output[NFLAG & Znonce.y] = Znonce.y; output[NFLAG & Znonce.y] = Znonce.y;
} }
#else #else

0
diakgcn120222.cl → diakgcn120223.cl

16
phatk120222.cl → phatk120223.cl

@ -186,13 +186,25 @@ void search( const uint state0, const uint state1, const uint state2, const uint
W[16] = W16; W[16] = W16;
W[17] = W17; W[17] = W17;
W[3] = base + (uint)get_global_id(0); #ifdef VECTORS4
//Less dependencies to get both the local id and group id and then add them
W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
W[18] = PreW18 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
#elif defined VECTORS2
W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
W[18] = PreW18 + (u){r, r ^ 0x2004000U};
#else
W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U); u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
W[18] = PreW18 + r; W[18] = PreW18 + r;
#endif
//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions //the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions
Vals[4] = PreVal4 + W[3]; Vals[4] = PreVal4 + W[3];
Vals[0] = PreVal0 + W[3]; Vals[0] = PreVal0 + W[3];

28
poclbm120222.cl → poclbm120223.cl

@ -82,7 +82,13 @@ __kernel void search(const uint state0, const uint state1, const uint state2, co
u W[24]; u W[24];
u *Vals = &W[16]; // Now put at W[16] to be in same array u *Vals = &W[16]; // Now put at W[16] to be in same array
const u nonce = base + (uint)(get_global_id(0)); #ifdef VECTORS4
const u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
#elif defined VECTORS2
const u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
#else
const u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
#endif
Vals[0]=Preval0+nonce; Vals[0]=Preval0+nonce;
@ -164,7 +170,7 @@ Vals[1]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
Vals[1]+=Ma(Vals[4],Vals[2],Vals[3]); Vals[1]+=Ma(Vals[4],Vals[2],Vals[3]);
Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
Vals[0]+=ch(Vals[5],Vals[6],Vals[7]); Vals[0]+=ch(Vals[5],Vals[6],Vals[7]);
Vals[0]+=0xC19BF3F4U; Vals[0]+=0xC19BF3F4;
Vals[4]+=Vals[0]; Vals[4]+=Vals[0];
Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
@ -201,7 +207,7 @@ Vals[0]+=Vals[4];
Vals[4]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22)); Vals[4]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U)); W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
W[4]+=0x80000000U; W[4]+=0x80000000;
Vals[3]+=W[4]; Vals[3]+=W[4];
Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25)); Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
Vals[3]+=ch(Vals[0],Vals[1],Vals[2]); Vals[3]+=ch(Vals[0],Vals[1],Vals[2]);
@ -652,14 +658,14 @@ W[0]=Vals[0];
W[7]=state7; W[7]=state7;
W[7]+=Vals[7]; W[7]+=Vals[7];
Vals[7]=0xF377ED68U; Vals[7]=0xF377ED68;
W[0]+=state0; W[0]+=state0;
Vals[7]+=W[0]; Vals[7]+=W[0];
W[3]=state3; W[3]=state3;
W[3]+=Vals[3]; W[3]+=Vals[3];
Vals[3]=0xa54ff53aU; Vals[3]=0xa54ff53a;
Vals[3]+=Vals[7]; Vals[3]+=Vals[7];
W[1]=Vals[1]; W[1]=Vals[1];
@ -668,7 +674,7 @@ W[1]+=state1;
W[6]=state6; W[6]=state6;
W[6]+=Vals[6]; W[6]+=Vals[6];
Vals[6]=0x90BB1E3CU; Vals[6]=0x90BB1E3C;
Vals[6]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25)); Vals[6]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
Vals[6]+=(0x9b05688cU^(Vals[3]&0xca0b3af3U)); Vals[6]+=(0x9b05688cU^(Vals[3]&0xca0b3af3U));
@ -684,7 +690,7 @@ Vals[6]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
W[5]=state5; W[5]=state5;
W[5]+=Vals[5]; W[5]+=Vals[5];
Vals[5]=0x150C6645BU; Vals[5]=0x150C6645B;
Vals[5]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25)); Vals[5]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
Vals[5]+=ch(Vals[2],Vals[3],0x510e527fU); Vals[5]+=ch(Vals[2],Vals[3],0x510e527fU);
Vals[5]+=W[2]; Vals[5]+=W[2];
@ -697,7 +703,7 @@ Vals[5]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
W[4]=state4; W[4]=state4;
W[4]+=Vals[4]; W[4]+=Vals[4];
Vals[4]=0x13AC42E24U; Vals[4]=0x13AC42E24;
Vals[4]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25)); Vals[4]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
Vals[4]+=ch(Vals[1],Vals[2],Vals[3]); Vals[4]+=ch(Vals[1],Vals[2],Vals[3]);
Vals[4]+=W[3]; Vals[4]+=W[3];
@ -736,7 +742,7 @@ Vals[4]+=Vals[0];
Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
Vals[7]+=ch(Vals[4],Vals[5],Vals[6]); Vals[7]+=ch(Vals[4],Vals[5],Vals[6]);
Vals[7]+=0x5807AA98U; Vals[7]+=0x15807AA98;
Vals[0]+=Ma(Vals[3],Vals[1],Vals[2]); Vals[0]+=Ma(Vals[3],Vals[1],Vals[2]);
Vals[3]+=Vals[7]; Vals[3]+=Vals[7];
Vals[7]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22)); Vals[7]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
@ -779,7 +785,7 @@ Vals[1]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
Vals[1]+=Ma(Vals[4],Vals[2],Vals[3]); Vals[1]+=Ma(Vals[4],Vals[2],Vals[3]);
Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25)); Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
Vals[0]+=ch(Vals[5],Vals[6],Vals[7]); Vals[0]+=ch(Vals[5],Vals[6],Vals[7]);
Vals[0]+=0xC19BF274U; Vals[0]+=0xC19BF274;
Vals[4]+=Vals[0]; Vals[4]+=Vals[0];
Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25)); Vals[7]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
@ -856,7 +862,7 @@ Vals[0]+=W[7];
Vals[4]+=Vals[0]; Vals[4]+=Vals[0];
Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22)); Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
W[8]=0x80000000U; W[8]=0x80000000;
W[8]+=W[1]; W[8]+=W[1];
W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U)); W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
Vals[7]+=W[8]; Vals[7]+=W[8];
Loading…
Cancel
Save