From 4d730577728b88351da6d05916c629d351a04d74 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Thu, 30 Jun 2011 10:36:19 +1000 Subject: [PATCH] Build binaries with unique filenames from the kernel generated and save them. Try to load this cached binary if it matches on next kernel instantiation. This speeds up start-up dramatically, and has a unique kernel binary for different kernel configurations. --- ocl.c | 170 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 134 insertions(+), 36 deletions(-) diff --git a/ocl.c b/ocl.c index 1c496cc7..574af6ea 100644 --- a/ocl.c +++ b/ocl.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include "findnonce.h" #include "ocl.h" @@ -307,40 +309,117 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) if (clState->max_work_size > 512) clState->max_work_size = 512; - ///////////////////////////////////////////////////////////////// - // Load CL file, build CL program object, create CL kernel object - ///////////////////////////////////////////////////////////////// - - /* Load a different kernel depending on whether it supports - * cl_amd_media_ops or not */ - char filename[10]; - - if (clState->hasBitAlign) - strcpy(filename, "phatk.cl"); + /* For some reason 2 vectors is still better even if the card says + * otherwise */ + if (clState->preferred_vwidth > 1) + clState->preferred_vwidth = 2; + if (opt_vectors) + clState->preferred_vwidth = opt_vectors; + if (opt_worksize && opt_worksize <= clState->max_work_size) + clState->work_size = opt_worksize; else - strcpy(filename, "poclbm.cl"); + clState->work_size = clState->max_work_size / clState->preferred_vwidth; + /* Create binary filename based on parameters passed to opencl + * compiler to ensure we only load a binary that matches what would + * have otherwise been created. 
The filename is: + * kernelname +/i bitalign + v + vectors + w + work_size + sizeof(long) + .bin + */ + char binaryfilename[255]; + char numbuf[10]; + char filename[10]; + FILE *binaryfile; + size_t *binary_sizes; + char **binaries; + size_t nDevices = 1; int pl; - char *source, *rawsource = file_contents(filename, &pl); + char *source, *rawsource; size_t sourceSize[] = {(size_t)pl}; + source = malloc(pl); -retry: if (!source) { applog(LOG_ERR, "Unable to malloc source"); return NULL; } - memcpy(source, rawsource, pl); - /* For some reason 2 vectors is still better even if the card says - * otherwise */ - if (clState->preferred_vwidth > 1) - clState->preferred_vwidth = 2; - if (opt_vectors) - clState->preferred_vwidth = opt_vectors; - if (opt_worksize && opt_worksize <= clState->max_work_size) - clState->work_size = opt_worksize; + if (clState->hasBitAlign) + strcpy(filename, "phatk.cl"); else - clState->work_size = clState->max_work_size / clState->preferred_vwidth; + strcpy(filename, "poclbm.cl"); + rawsource = file_contents(filename, &pl); + + binary_sizes = (size_t *)malloc(sizeof(size_t)*nDevices); + if (unlikely(!binary_sizes)) { + applog(LOG_ERR, "Unable to malloc binary_sizes"); + return NULL; + } + binaries = (char **)malloc(sizeof(char *)*nDevices); + if (unlikely(!binaries)) { + applog(LOG_ERR, "Unable to malloc binaries"); + return NULL; + } + + if (clState->hasBitAlign) { + strcpy(binaryfilename, "phatk"); + strcat(binaryfilename, "bitalign"); + } else + strcpy(binaryfilename, "poclbm"); + strcat(binaryfilename, "v"); + sprintf(numbuf, "%d", clState->preferred_vwidth); + strcat(binaryfilename, numbuf); + strcat(binaryfilename, "w"); + sprintf(numbuf, "%d", (int)clState->work_size); + strcat(binaryfilename, numbuf); + strcat(binaryfilename, "long"); + sprintf(numbuf, "%d", (int)sizeof(long)); + strcat(binaryfilename, numbuf); + strcat(binaryfilename, ".bin"); + + binaryfile = fopen(binaryfilename, "r"); + if (!binaryfile) { + if (opt_debug) + 
applog(LOG_DEBUG, "No binary found, generating from source"); + } else { + struct stat binary_stat; + + if (unlikely(stat(binaryfilename, &binary_stat))) { + if (opt_debug) + applog(LOG_DEBUG, "Unable to stat binary, generating from source"); + fclose(binaryfile); + goto build; + } + binary_sizes[gpu] = binary_stat.st_size; + binaries[gpu] = (char *)malloc(binary_sizes[gpu]); + if (unlikely(!binaries[gpu])) { + applog(LOG_ERR, "Unable to malloc binaries"); + fclose(binaryfile); + return NULL; + } + + if (fread(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu]) { + applog(LOG_ERR, "Unable to fread binaries[gpu]"); + fclose(binaryfile); + return NULL; + } + fclose(binaryfile); + + clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL); + if (status != CL_SUCCESS) + { + applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)"); + return NULL; + } + if (opt_debug) + applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename); + goto built; + } + + ///////////////////////////////////////////////////////////////// + // Load CL file, build CL program object, create CL kernel object + ///////////////////////////////////////////////////////////////// + +build: + memcpy(source, rawsource, pl); /* Patch the source file with the preferred_vwidth */ if (clState->preferred_vwidth > 1) { @@ -411,22 +490,24 @@ retry: /* Patch the kernel if the hardware supports BFI_INT */ if (patchbfi) { - size_t nDevices; - size_t * binary_sizes; - char ** binaries; - int err; - - /* figure out number of devices and the sizes of the binary for each device. 
*/ - err = clGetProgramInfo( clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(nDevices), &nDevices, NULL ); - binary_sizes = (size_t *)malloc( sizeof(size_t)*nDevices ); - err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL ); + /* figure out the size of the binary for each device. */ + status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*nDevices, binary_sizes, NULL ); + if (unlikely(status != CL_SUCCESS)) + { + applog(LOG_ERR, "Error: Getting program info. (clGetPlatformInfo)"); + return NULL; + } /* copy over all of the generated binaries. */ - binaries = (char **)malloc( sizeof(char *)*nDevices ); if (opt_debug) applog(LOG_DEBUG, "binary size %d : %d", gpu, binary_sizes[gpu]); binaries[gpu] = (char *)malloc( sizeof(char)*binary_sizes[gpu] ); - err = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL ); + status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*nDevices, binaries, NULL ); + if (unlikely(status != CL_SUCCESS)) + { + applog(LOG_ERR, "Error: Getting program info. 
(clGetPlatformInfo)"); + return NULL; + } unsigned remaining = binary_sizes[gpu]; char *w = binaries[gpu]; @@ -437,7 +518,7 @@ retry: * back and find the 2nd incidence of \x7ELF (rewind by one * from ELF) and then patch the opcocdes */ if (!advance(&w, &remaining, ".text")) - {patchbfi = 0; goto retry;} + {patchbfi = 0; goto build;} w++; remaining--; if (!advance(&w, &remaining, ".text")) { /* 32 bit builds only one ELF */ @@ -447,7 +528,7 @@ retry: memcpy(&length, w + 289, 4); w = binaries[gpu]; remaining = binary_sizes[gpu]; if (!advance(&w, &remaining, "ELF")) - {patchbfi = 0; goto retry;} + {patchbfi = 0; goto build;} w++; remaining--; if (!advance(&w, &remaining, "ELF")) { /* 32 bit builds only one ELF */ @@ -478,6 +559,23 @@ retry: free(source); free(rawsource); + /* Save the binary to be loaded next time */ + binaryfile = fopen(binaryfilename, "w"); + if (!binaryfile) { + /* Not a fatal problem, just means we build it again next time */ + if (opt_debug) + applog(LOG_DEBUG, "Unable to create file %s", binaryfilename); + } else { + if (unlikely(fwrite(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu])) { + applog(LOG_ERR, "Unable to fwrite to binaryfile"); + return NULL; + } + fclose(binaryfile); + } +built: + free(binaries); + free(binary_sizes); + applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT patching, %d vectors and worksize %d", filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size);