From fe62dc75fcb99b2afc3b9303fc5b08ee49fdafda Mon Sep 17 00:00:00 2001 From: troky Date: Wed, 25 Mar 2015 18:54:32 +0100 Subject: [PATCH] wolf's improvements #2 --- algorithm.c | 33 +++------- algorithm.h | 27 +++++++- ocl.c | 147 +++++++++++++++++++++++--------------------- ocl.h | 3 +- ocl/binary_kernel.c | 2 + ocl/build_kernel.c | 2 + ocl/build_kernel.h | 8 ++- 7 files changed, 122 insertions(+), 100 deletions(-) diff --git a/algorithm.c b/algorithm.c index f2f8ddaa..a3e382dd 100644 --- a/algorithm.c +++ b/algorithm.c @@ -655,10 +655,14 @@ static cl_int queue_whirlpoolx_kernel(struct __clState *clState, struct _dev_blk tmp[0] = 0; whirlpool_round(midblock, tmp); - for (int x = 0; x < 8; ++x) midblock[x] ^= key[x]; + for (int x = 0; x < 8; ++x) { + midblock[x] ^= key[x]; + } } - for (int i = 0; i < 8; ++i) midblock[i] ^= ((uint64_t *)(clState->cldata))[i]; + for (int i = 0; i < 8; ++i) { + midblock[i] ^= ((uint64_t *)(clState->cldata))[i]; + } status = clSetKernelArg(clState->kernel, 0, sizeof(cl_ulong8), (cl_ulong8 *)&midblock); status |= clSetKernelArg(clState->kernel, 1, sizeof(cl_ulong), (void *)(((uint64_t *)clState->cldata) + 8)); @@ -732,27 +736,6 @@ static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_un return status; } -typedef struct _algorithm_settings_t { - const char *name; /* Human-readable identifier */ - algorithm_type_t type; //common algorithm type - const char *kernelfile; /* alternate kernel file */ - double diff_multiplier1; - double diff_multiplier2; - double share_diff_multiplier; - uint32_t xintensity_shift; - uint32_t intensity_shift; - uint32_t found_idx; - unsigned long long diff_numerator; - uint32_t diff1targ; - size_t n_extra_kernels; - long rw_buffer_size; - cl_command_queue_properties cq_properties; - void(*regenhash)(struct work *); - cl_int(*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint); - void(*gen_hash)(const unsigned char *, unsigned int, unsigned char *); - void(*set_compile_options)(build_kernel_data *, struct cgpu_info *, algorithm_t *); -} algorithm_settings_t; - static algorithm_settings_t algos[] = { // kernels starting from this will have difficulty calculated by using litecoin algorithm #define A_SCRYPT(a) \ @@ -895,7 +878,6 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa ALGO_ALIAS("nist5", "talkcoin-mod"); ALGO_ALIAS("keccak", "maxcoin"); ALGO_ALIAS("whirlpool", "whirlcoin"); - ALGO_ALIAS("whirlpoolx", "whirlpoolx"); ALGO_ALIAS("Lyra2RE", "lyra2re"); ALGO_ALIAS("lyra2", "lyra2re"); @@ -957,8 +939,7 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor) } } -bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2) +bool cmp_algorithm(const algorithm_t* algo1, const algorithm_t* algo2) { - // return (strcmp(algo1->name, algo2->name) == 0) && (algo1->nfactor == algo2->nfactor); return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor)); } diff --git a/algorithm.h b/algorithm.h index 4884aa27..b2527d17 100644 --- a/algorithm.h +++ b/algorithm.h @@ -9,6 +9,7 @@ #include #include +#include "ocl/build_kernel.h" // For the build_kernel_data type typedef enum { ALGO_UNK, @@ -25,8 +26,8 @@ typedef enum { ALGO_NIST, ALGO_FRESH, ALGO_WHIRL, - ALGO_WHIRLPOOLX, ALGO_NEOSCRYPT, + ALGO_WHIRLPOOLX, ALGO_LYRA2RE, ALGO_PLUCK } algorithm_type_t; @@ -67,6 +68,28 @@ typedef struct _algorithm_t { void(*set_compile_options)(struct _build_kernel_data *, struct cgpu_info *, struct _algorithm_t *); } algorithm_t; +typedef struct _algorithm_settings_t +{ + const char *name; + algorithm_type_t type; + const char *kernelfile; + double diff_multiplier1; + double diff_multiplier2; + double share_diff_multiplier; + uint32_t xintensity_shift; + uint32_t intensity_shift; + uint32_t found_idx; + unsigned long long diff_numerator; + uint32_t diff1targ; + size_t n_extra_kernels; + long rw_buffer_size; + cl_command_queue_properties cq_properties; + void (*regenhash)(struct work *); + cl_int (*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint); + void (*gen_hash)(const unsigned char *, unsigned int, unsigned char *); + void (*set_compile_options)(build_kernel_data *, struct cgpu_info *, algorithm_t *); +} algorithm_settings_t; + /* Set default parameters based on name. */ void set_algorithm(algorithm_t* algo, const char* name); @@ -74,6 +97,6 @@ void set_algorithm(algorithm_t* algo, const char* name); void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor); /* Compare two algorithm parameters */ -bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2); +bool cmp_algorithm(const algorithm_t* algo1, const algorithm_t* algo2); #endif /* ALGORITHM_H */ diff --git a/ocl.c b/ocl.c index 1ea198a2..8174e2ab 100644 --- a/ocl.c +++ b/ocl.c @@ -146,16 +146,6 @@ static cl_int create_opencl_context(cl_context *context, cl_platform_id *platfor return status; } -static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_context *context, cl_device_id *device, cl_command_queue_properties cq_properties) -{ - cl_int status; - *command_queue = clCreateCommandQueue(*context, *device, - cq_properties, &status); - if (status != CL_SUCCESS) /* Try again without OOE enable */ - *command_queue = clCreateCommandQueue(*context, *device, 0, &status); - return status; -} - static float get_opencl_version(cl_device_id device) { /* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */ @@ -193,27 +183,56 @@ static bool get_opencl_bit_align_support(cl_device_id *device) return !!find; } -_clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm) +static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_context *context, cl_device_id *device, const void *cq_properties) { - _clState *clState = (_clState *)calloc(1, sizeof(_clState)); - struct cgpu_info *cgpu = &gpus[gpu]; - cl_platform_id platform = NULL; - char pbuff[256]; - build_kernel_data *build_data = (build_kernel_data *)alloca(sizeof(struct _build_kernel_data)); - cl_uint preferred_vwidth; - cl_device_id *devices; - cl_uint numDevices; - cl_int status; + cl_int status; + + if(get_opencl_version(*device) < 2.0) { + *command_queue = clCreateCommandQueue(*context, *device, *((const cl_command_queue_properties *)cq_properties), &status); + + // Didn't work, try again with no properties. + if (status != CL_SUCCESS) { + *command_queue = clCreateCommandQueue(*context, *device, 0, &status); + } + } + else { + *command_queue = clCreateCommandQueueWithProperties(*context, *device, (const cl_queue_properties *)cq_properties, &status); + + // Didn't work, same deal. + if (status != CL_SUCCESS) { + *command_queue = clCreateCommandQueueWithProperties(*context, *device, 0, &status); + } + } + + return status; +} +_clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm) +{ + cl_int status = 0; + size_t compute_units = 0; + cl_platform_id platform = NULL; + struct cgpu_info *cgpu = &gpus[gpu]; + _clState *clState = (_clState *)calloc(1, sizeof(_clState)); + cl_uint preferred_vwidth, slot = 0, cpnd = 0, numDevices = clDevicesNum(); + cl_device_id *devices = (cl_device_id *)alloca(numDevices * sizeof(cl_device_id)); + build_kernel_data *build_data = (build_kernel_data *)alloca(sizeof(struct _build_kernel_data)); + char **pbuff = (char **)alloca(sizeof(char *) * numDevices), filename[256]; + + // sanity check if (!get_opencl_platform(opt_platform_id, &platform)) { return NULL; } - numDevices = clDevicesNum(); + if (numDevices <= 0) { + return NULL; + } - if (numDevices <= 0) return NULL; + if (gpu >= numDevices) { + applog(LOG_ERR, "Invalid GPU %i", gpu); + return NULL; + } - devices = (cl_device_id *)alloca(numDevices*sizeof(cl_device_id)); /* Now, get the device list data */ @@ -225,34 +244,33 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_INFO, "List of devices:"); - unsigned int i; - for (i = 0; i < numDevices; i++) { - status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); - if (status != CL_SUCCESS) { - applog(LOG_ERR, "Error %d: Getting Device Info", status); + for (int i = 0; i < numDevices; ++i) { + size_t tmpsize; + if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &tmpsize) != CL_SUCCESS) { + applog(LOG_ERR, "Error while getting the length of the name for GPU #%d.", i); return NULL; } - applog(LOG_INFO, "\t%i\t%s", i, pbuff); - - if (i == gpu) { - applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); - strncpy(name, pbuff, nameSize); + // Does the size include the NULL terminator? Who knows, just add one, it's faster than looking it up. + pbuff[i] = (char *)alloca(sizeof(char) * (tmpsize + 1)); + if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(char) * tmpsize, pbuff[i], NULL) != CL_SUCCESS) { + applog(LOG_ERR, "Error while attempting to get device information."); + return NULL; } - } - - if (gpu >= numDevices) { - applog(LOG_ERR, "Invalid GPU %i", gpu); - return NULL; - } + applog(LOG_INFO, "\t%i\t%s", i, pbuff[i]); + } + + applog(LOG_INFO, "Selected %d: %s", gpu, pbuff[gpu]); + strncpy(name, pbuff[gpu], nameSize); + status = create_opencl_context(&clState->context, &platform); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status); return NULL; } - status = create_opencl_command_queue(&clState->commandQueue, &clState->context, &devices[gpu], cgpu->algorithm.cq_properties); + status = create_opencl_command_queue(&clState->commandQueue, &clState->context, &devices[gpu], (const void *)&(cgpu->algorithm.cq_properties)); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status); return NULL; @@ -274,7 +292,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size)); - size_t compute_units = 0; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), (void *)&compute_units, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_COMPUTE_UNITS", status); @@ -282,8 +299,10 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } // AMD architechture got 64 compute shaders per compute unit. // Source: http://www.amd.com/us/Documents/GCN_Architecture_whitepaper.pdf - clState->compute_shaders = compute_units * 64; - applog(LOG_DEBUG, "Max shaders calculated %d", (int)(clState->compute_shaders)); + clState->compute_shaders = compute_units << 6; + applog(LOG_INFO, "Maximum work size for this GPU (%d) is %d.", gpu, clState->max_work_size); + applog(LOG_INFO, "Your GPU (#%d) has %d compute units, and all AMD cards in the 7 series or newer (GCN cards) \ + have 64 shaders per compute unit - this means it has %d shaders.", gpu, compute_units, clState->compute_shaders); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL); if (status != CL_SUCCESS) { @@ -297,12 +316,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg * would have otherwise created. The filename is: * name + g + lg + lookup_gap + tc + thread_concurrency + nf + nfactor + w + work_size + l + sizeof(long) + .bin */ - char filename[255]; - char strbuf[32]; - - sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name)); - strcpy(filename, strbuf); + sprintf(filename, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name)); applog(LOG_DEBUG, "Using source file %s", filename); /* For some reason 2 vectors is still better even if the card says @@ -326,10 +341,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg clState->goffset = true; - if (cgpu->work_size && cgpu->work_size <= clState->max_work_size) - clState->wsize = cgpu->work_size; - else - clState->wsize = 256; + clState->wsize = (cgpu->work_size && cgpu->work_size <= clState->max_work_size) ? cgpu->work_size : 256; if (!cgpu->opt_lg) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); @@ -536,38 +548,32 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg cgpu->thread_concurrency = cgpu->opt_tc; } - cl_uint slot, cpnd; - - slot = cpnd = 0; - build_data->context = clState->context; build_data->device = &devices[gpu]; // Build information strcpy(build_data->source_filename, filename); - strcpy(build_data->platform, name); - strcpy(build_data->sgminer_path, sgminer_path); - if (opt_kernel_path && *opt_kernel_path) { - build_data->kernel_path = opt_kernel_path; - } - else { - build_data->kernel_path = NULL; - } + strcpy(build_data->platform, name); + strcpy(build_data->sgminer_path, sgminer_path); + build_data->kernel_path = (*opt_kernel_path) ? opt_kernel_path : NULL; build_data->work_size = clState->wsize; build_data->has_bit_align = clState->hasBitAlign; - build_data->opencl_version = get_opencl_version(devices[gpu]); build_data->patch_bfi = needs_bfi_patch(build_data); - strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name)); - strcat(build_data->binary_filename, name); - if (clState->goffset) + strcpy(build_data->binary_filename, filename); + build_data->binary_filename[strlen(filename) - 3] = 0x00; // And one NULL terminator, cutting off the .cl suffix. + strcat(build_data->binary_filename, pbuff[gpu]); + + if (clState->goffset) { strcat(build_data->binary_filename, "g"); + } set_base_compiler_options(build_data); - if (algorithm->set_compile_options) + if (algorithm->set_compile_options) { algorithm->set_compile_options(build_data, cgpu, algorithm); + } strcat(build_data->binary_filename, ".bin"); applog(LOG_DEBUG, "Using binary file %s", build_data->binary_filename); @@ -576,8 +582,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg if (!(clState->program = load_opencl_binary_kernel(build_data))) { applog(LOG_NOTICE, "Building binary %s", build_data->binary_filename); - if (!(clState->program = build_opencl_kernel(build_data, filename))) + if (!(clState->program = build_opencl_kernel(build_data, filename))) { return NULL; + } if (save_opencl_kernel(build_data, clState->program)) { /* Program needs to be rebuilt, because the binary was patched */ diff --git a/ocl.h b/ocl.h index 272246da..502119e0 100644 --- a/ocl.h +++ b/ocl.h @@ -10,7 +10,7 @@ #include #endif -#include "miner.h" +#include "algorithm.h" typedef struct __clState { cl_context context; @@ -21,6 +21,7 @@ typedef struct __clState { cl_program program; cl_mem outputBuffer; cl_mem CLbuffer0; + cl_mem MidstateBuf; cl_mem padbuffer8; unsigned char cldata[80]; bool hasBitAlign; diff --git a/ocl/binary_kernel.c b/ocl/binary_kernel.c index 3843b459..4fd77875 100644 --- a/ocl/binary_kernel.c +++ b/ocl/binary_kernel.c @@ -1,5 +1,7 @@ #include "binary_kernel.h" +#include "miner.h" #include +#include cl_program load_opencl_binary_kernel(build_kernel_data *data) { diff --git a/ocl/build_kernel.c b/ocl/build_kernel.c index dc98319b..29a99e18 100644 --- a/ocl/build_kernel.c +++ b/ocl/build_kernel.c @@ -1,5 +1,7 @@ +#include #include "build_kernel.h" #include "patch_kernel.h" +#include "miner.h" static char *file_contents(const char *filename, int *length) { diff --git a/ocl/build_kernel.h b/ocl/build_kernel.h index 841ee017..92de074a 100644 --- a/ocl/build_kernel.h +++ b/ocl/build_kernel.h @@ -1,8 +1,14 @@ #ifndef BUILD_KERNEL_H #define BUILD_KERNEL_H -#include "ocl.h" #include +#include "logging.h" + +#ifdef __APPLE_CC__ +#include +#else +#include +#endif typedef struct _build_kernel_data { char source_filename[255];