|
|
|
@ -97,14 +97,18 @@ int clDevicesNum() {
@@ -97,14 +97,18 @@ int clDevicesNum() {
|
|
|
|
|
return numDevices; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
void advance(char **area, unsigned *remaining, const char *marker) |
|
|
|
|
static int advance(char **area, unsigned *remaining, const char *marker) |
|
|
|
|
{ |
|
|
|
|
char *find = memmem(*area, *remaining, marker, strlen(marker)); |
|
|
|
|
|
|
|
|
|
if (!find) |
|
|
|
|
applog(LOG_ERR, "Marker \"%s\" not found", marker), exit(1); |
|
|
|
|
if (!find) { |
|
|
|
|
if (opt_debug) |
|
|
|
|
applog(LOG_DEBUG, "Marker \"%s\" not found", marker); |
|
|
|
|
return 0; |
|
|
|
|
} |
|
|
|
|
*remaining -= find - *area; |
|
|
|
|
*area = find; |
|
|
|
|
return 1; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define OP3_INST_BFE_UINT 4ULL |
|
|
|
@ -158,6 +162,7 @@ void patch_opcodes(char *w, unsigned remaining)
@@ -158,6 +162,7 @@ void patch_opcodes(char *w, unsigned remaining)
|
|
|
|
|
_clState *initCl(unsigned int gpu, char *name, size_t nameSize) |
|
|
|
|
{ |
|
|
|
|
bool hasBitAlign = false; |
|
|
|
|
bool patchbfi = false; |
|
|
|
|
cl_int status = 0; |
|
|
|
|
unsigned int i; |
|
|
|
|
|
|
|
|
@ -282,7 +287,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
@@ -282,7 +287,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
|
|
|
|
|
} |
|
|
|
|
find = strstr(extensions, camo); |
|
|
|
|
if (find) |
|
|
|
|
hasBitAlign = true; |
|
|
|
|
hasBitAlign = patchbfi = true; |
|
|
|
|
|
|
|
|
|
status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL); |
|
|
|
|
if (status != CL_SUCCESS) { |
|
|
|
@ -309,8 +314,15 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
@@ -309,8 +314,15 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
|
|
|
|
|
char *filename = "poclbm.cl"; |
|
|
|
|
|
|
|
|
|
int pl; |
|
|
|
|
char *source = file_contents(filename, &pl); |
|
|
|
|
char *source, *rawsource = file_contents(filename, &pl); |
|
|
|
|
size_t sourceSize[] = {(size_t)pl}; |
|
|
|
|
source = malloc(pl); |
|
|
|
|
retry: |
|
|
|
|
if (!source) { |
|
|
|
|
applog(LOG_ERR, "Unable to malloc source"); |
|
|
|
|
return NULL; |
|
|
|
|
} |
|
|
|
|
memcpy(source, rawsource, pl); |
|
|
|
|
|
|
|
|
|
if (opt_vectors) |
|
|
|
|
clState->preferred_vwidth = opt_vectors; |
|
|
|
@ -336,8 +348,22 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
@@ -336,8 +348,22 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
|
|
|
|
|
applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->preferred_vwidth); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/* Patch the source file defining BFI_INT */ |
|
|
|
|
/* Patch the source file defining BITALIGN */ |
|
|
|
|
if (hasBitAlign == true) { |
|
|
|
|
char *find = strstr(source, "BITALIGNX"); |
|
|
|
|
|
|
|
|
|
if (unlikely(!find)) { |
|
|
|
|
applog(LOG_ERR, "Unable to find BITALIGNX in source"); |
|
|
|
|
return NULL; |
|
|
|
|
} |
|
|
|
|
find += 8; // "BITALIGN"
|
|
|
|
|
strncpy(find, " ", 1); |
|
|
|
|
if (opt_debug) |
|
|
|
|
applog(LOG_DEBUG, "cl_amd_media_ops found, patched source with BITALIGN"); |
|
|
|
|
} else if (opt_debug) |
|
|
|
|
applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BITALIGN patch"); |
|
|
|
|
|
|
|
|
|
if (patchbfi == true) { |
|
|
|
|
char *find = strstr(source, "BFI_INTX"); |
|
|
|
|
|
|
|
|
|
if (unlikely(!find)) { |
|
|
|
@ -351,9 +377,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
@@ -351,9 +377,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
|
|
|
|
|
} else if (opt_debug) |
|
|
|
|
applog(LOG_DEBUG, "cl_amd_media_ops not found, will not BFI_INT patch"); |
|
|
|
|
|
|
|
|
|
applog(LOG_INFO, "Initialising kernel with%s BFI_INT patching, %d vectors and worksize %d", |
|
|
|
|
hasBitAlign ? "" : "out", clState->preferred_vwidth, clState->work_size); |
|
|
|
|
|
|
|
|
|
clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status); |
|
|
|
|
if (status != CL_SUCCESS) |
|
|
|
|
{ |
|
|
|
@ -376,7 +399,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
@@ -376,7 +399,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/* Patch the kernel if the hardware supports BFI_INT */ |
|
|
|
|
if (hasBitAlign == true) { |
|
|
|
|
if (patchbfi == true) { |
|
|
|
|
size_t nDevices; |
|
|
|
|
size_t * binary_sizes; |
|
|
|
|
char ** binaries; |
|
|
|
@ -402,15 +425,19 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
@@ -402,15 +425,19 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
|
|
|
|
|
* position and length at a fixed offset from that. Then go |
|
|
|
|
* back and find the 2nd incidence of \x7fELF (rewind by one |
|
|
|
|
* from ELF) and then patch the opcodes */ |
|
|
|
|
advance(&w, &remaining, ".text"); |
|
|
|
|
if (!advance(&w, &remaining, ".text")) |
|
|
|
|
{patchbfi = false; goto retry;} |
|
|
|
|
w++; remaining--; |
|
|
|
|
advance(&w, &remaining, ".text"); |
|
|
|
|
if (!advance(&w, &remaining, ".text")) |
|
|
|
|
{patchbfi = false; goto retry;} |
|
|
|
|
memcpy(&start, w + 285, 4); |
|
|
|
|
memcpy(&length, w + 289, 4); |
|
|
|
|
w = binaries[gpu]; remaining = binary_sizes[gpu]; |
|
|
|
|
advance(&w, &remaining, "ELF"); |
|
|
|
|
if (!advance(&w, &remaining, "ELF")) |
|
|
|
|
{patchbfi = false; goto retry;} |
|
|
|
|
w++; remaining--; |
|
|
|
|
advance(&w, &remaining, "ELF"); |
|
|
|
|
if (!advance(&w, &remaining, "ELF")) |
|
|
|
|
{patchbfi = false; goto retry;} |
|
|
|
|
w--; remaining++; |
|
|
|
|
w += start; remaining -= start; |
|
|
|
|
if (opt_debug) |
|
|
|
@ -433,6 +460,12 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
@@ -433,6 +460,12 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
|
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
free(source); |
|
|
|
|
free(rawsource); |
|
|
|
|
|
|
|
|
|
applog(LOG_INFO, "Initialising kernel with%s BFI_INT patching, %d vectors and worksize %d", |
|
|
|
|
patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size); |
|
|
|
|
|
|
|
|
|
/* create a cl program executable for all the devices specified */ |
|
|
|
|
status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); |
|
|
|
|
if(status != CL_SUCCESS) |
|
|
|
|