Browse Source

Use global offset parameter to diablo and poclbm kernel ONLY for 1 vector kernels.

nfactor-troky
Con Kolivas 13 years ago
parent
commit
621bcca7f5
  1. 31
      device-gpu.c
  2. 6
      diablo120222.cl
  3. 12
      ocl.c
  4. 1
      ocl.h
  5. 7
      poclbm120222.cl

31
device-gpu.c

@ -743,10 +743,8 @@ static _clState *clStates[MAX_GPUDEVICES];
static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads)
{ {
cl_kernel *kernel = &clState->kernel; cl_kernel *kernel = &clState->kernel;
cl_uint vwidth = clState->vwidth; unsigned int num = 0;
unsigned int i, num = 0;
cl_int status = 0; cl_int status = 0;
uint *nonces;
CL_SET_BLKARG(ctx_a); CL_SET_BLKARG(ctx_a);
CL_SET_BLKARG(ctx_b); CL_SET_BLKARG(ctx_b);
@ -765,10 +763,15 @@ static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t
CL_SET_BLKARG(cty_g); CL_SET_BLKARG(cty_g);
CL_SET_BLKARG(cty_h); CL_SET_BLKARG(cty_h);
nonces = alloca(sizeof(uint) * vwidth); if (!clState->goffset) {
cl_uint vwidth = clState->vwidth;
uint *nonces = alloca(sizeof(uint) * vwidth);
unsigned int i;
for (i = 0; i < vwidth; i++) for (i = 0; i < vwidth; i++)
nonces[i] = blk->nonce + (i * threads); nonces[i] = blk->nonce + (i * threads);
CL_SET_VARG(vwidth, nonces); CL_SET_VARG(vwidth, nonces);
}
CL_SET_BLKARG(fW0); CL_SET_BLKARG(fW0);
CL_SET_BLKARG(fW1); CL_SET_BLKARG(fW1);
@ -896,15 +899,19 @@ static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk,
static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads) static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads)
{ {
cl_kernel *kernel = &clState->kernel; cl_kernel *kernel = &clState->kernel;
cl_uint vwidth = clState->vwidth; unsigned int num = 0;
unsigned int i, num = 0;
cl_int status = 0; cl_int status = 0;
uint *nonces;
nonces = alloca(sizeof(uint) * vwidth); if (!clState->goffset) {
cl_uint vwidth = clState->vwidth;
uint *nonces = alloca(sizeof(uint) * vwidth);
unsigned int i;
for (i = 0; i < vwidth; i++) for (i = 0; i < vwidth; i++)
nonces[i] = blk->nonce + (i * threads); nonces[i] = blk->nonce + (i * threads);
CL_SET_VARG(vwidth, nonces); CL_SET_VARG(vwidth, nonces);
}
CL_SET_BLKARG(PreVal0); CL_SET_BLKARG(PreVal0);
CL_SET_BLKARG(PreVal0addK7); CL_SET_BLKARG(PreVal0addK7);
@ -1338,6 +1345,14 @@ static uint64_t opencl_scanhash(struct thr_info *thr, struct work *work,
memset(thrdata->res, 0, BUFFERSIZE); memset(thrdata->res, 0, BUFFERSIZE);
clFinish(clState->commandQueue); clFinish(clState->commandQueue);
} }
if (clState->goffset) {
size_t global_work_offset[1];
global_work_offset[0] = work->blk.nonce;
status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset,
globalThreads, localThreads, 0, NULL, NULL);
} else
status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
globalThreads, localThreads, 0, NULL, NULL); globalThreads, localThreads, 0, NULL, NULL);
if (unlikely(status != CL_SUCCESS)) { if (unlikely(status != CL_SUCCESS)) {

6
diablo120222.cl

@ -48,7 +48,9 @@ __kernel
__attribute__((vec_type_hint(z))) __attribute__((vec_type_hint(z)))
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
void search( void search(
#ifndef GOFFSET
const z base, const z base,
#endif
const uint PreVal4_state0, const uint PreVal4_state0_k7, const uint PreVal4_state0, const uint PreVal4_state0_k7,
const uint PreVal4_T1, const uint PreVal4_T1,
const uint W18, const uint W19, const uint W18, const uint W19,
@ -65,7 +67,11 @@ void search(
z ZA[25]; z ZA[25];
#ifdef GOFFSET
const z Znonce = (uint)(get_global_id(0));
#else
const z Znonce = base + (uint)(get_global_id(0)); const z Znonce = base + (uint)(get_global_id(0));
#endif
ZA[2]=Znonce; ZA[2]=Znonce;
ZA[2]+=PreVal4_state0; ZA[2]+=PreVal4_state0;

12
ocl.c

@ -335,7 +335,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
/* Create binary filename based on parameters passed to opencl /* Create binary filename based on parameters passed to opencl
* compiler to ensure we only load a binary that matches what would * compiler to ensure we only load a binary that matches what would
* have otherwise created. The filename is: * have otherwise created. The filename is:
* name + kernelname + v + vectors + w + work_size + l + sizeof(long) + .bin * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin
*/ */
char binaryfilename[255]; char binaryfilename[255];
char filename[255]; char filename[255];
@ -398,6 +398,10 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
gpus[gpu].vwidth = preferred_vwidth; gpus[gpu].vwidth = preferred_vwidth;
} }
if ((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO) &&
clState->vwidth == 1 && clState->hasOpenCL11plus)
clState->goffset = true;
if (gpus[gpu].work_size && gpus[gpu].work_size <= clState->max_work_size) if (gpus[gpu].work_size && gpus[gpu].work_size <= clState->max_work_size)
clState->wsize = gpus[gpu].work_size; clState->wsize = gpus[gpu].work_size;
else if (strstr(name, "Tahiti")) else if (strstr(name, "Tahiti"))
@ -431,7 +435,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
} }
strcat(binaryfilename, name); strcat(binaryfilename, name);
if (clState->goffset)
strcat(binaryfilename, "g");
strcat(binaryfilename, "v"); strcat(binaryfilename, "v");
sprintf(numbuf, "%d", clState->vwidth); sprintf(numbuf, "%d", clState->vwidth);
strcat(binaryfilename, numbuf); strcat(binaryfilename, numbuf);
@ -533,6 +538,9 @@ build:
} else } else
applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch"); applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");
if (clState->goffset)
strcat(CompilerOptions, " -D GOFFSET");
applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions); applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL); status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);
free(CompilerOptions); free(CompilerOptions);

1
ocl.h

@ -21,6 +21,7 @@ typedef struct {
cl_mem outputBuffer; cl_mem outputBuffer;
bool hasBitAlign; bool hasBitAlign;
bool hasOpenCL11plus; bool hasOpenCL11plus;
bool goffset;
cl_uint vwidth; cl_uint vwidth;
size_t max_work_size; size_t max_work_size;
size_t wsize; size_t wsize;

7
poclbm120222.cl

@ -73,7 +73,9 @@ void search(const uint state0, const uint state1, const uint state2, const uint
const uint state4, const uint state5, const uint state6, const uint state7, const uint state4, const uint state5, const uint state6, const uint state7,
const uint b1, const uint c1, const uint b1, const uint c1,
const uint f1, const uint g1, const uint h1, const uint f1, const uint g1, const uint h1,
#ifndef GOFFSET
const u base, const u base,
#endif
const uint fw0, const uint fw1, const uint fw2, const uint fw3, const uint fw15, const uint fw01r, const uint fw0, const uint fw1, const uint fw2, const uint fw3, const uint fw15, const uint fw01r,
const uint D1A, const uint C1addK5, const uint B1addK6, const uint D1A, const uint C1addK5, const uint B1addK6,
const uint W16addK16, const uint W17addK17, const uint W16addK16, const uint W17addK17,
@ -83,8 +85,11 @@ void search(const uint state0, const uint state1, const uint state2, const uint
u W[24]; u W[24];
u *Vals = &W[16]; // Now put at W[16] to be in same array u *Vals = &W[16]; // Now put at W[16] to be in same array
#ifdef GOFFSET
const u nonce = (uint)(get_global_id(0));
#else
const u nonce = base + (uint)(get_global_id(0)); const u nonce = base + (uint)(get_global_id(0));
#endif
Vals[0]=Preval0; Vals[0]=Preval0;
Vals[0]+=nonce; Vals[0]+=nonce;

Loading…
Cancel
Save