Browse Source

wolf's improvements #2

windows
troky 10 years ago
parent
commit
fe62dc75fc
  1. 33
      algorithm.c
  2. 27
      algorithm.h
  3. 133
      ocl.c
  4. 3
      ocl.h
  5. 2
      ocl/binary_kernel.c
  6. 2
      ocl/build_kernel.c
  7. 8
      ocl/build_kernel.h

33
algorithm.c

@ -655,10 +655,14 @@ static cl_int queue_whirlpoolx_kernel(struct __clState *clState, struct _dev_blk
tmp[0] = 0; tmp[0] = 0;
whirlpool_round(midblock, tmp); whirlpool_round(midblock, tmp);
for (int x = 0; x < 8; ++x) midblock[x] ^= key[x]; for (int x = 0; x < 8; ++x) {
midblock[x] ^= key[x];
}
} }
for (int i = 0; i < 8; ++i) midblock[i] ^= ((uint64_t *)(clState->cldata))[i]; for (int i = 0; i < 8; ++i) {
midblock[i] ^= ((uint64_t *)(clState->cldata))[i];
}
status = clSetKernelArg(clState->kernel, 0, sizeof(cl_ulong8), (cl_ulong8 *)&midblock); status = clSetKernelArg(clState->kernel, 0, sizeof(cl_ulong8), (cl_ulong8 *)&midblock);
status |= clSetKernelArg(clState->kernel, 1, sizeof(cl_ulong), (void *)(((uint64_t *)clState->cldata) + 8)); status |= clSetKernelArg(clState->kernel, 1, sizeof(cl_ulong), (void *)(((uint64_t *)clState->cldata) + 8));
@ -732,27 +736,6 @@ static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_un
return status; return status;
} }
/*
 * One row of per-algorithm settings; the `algos[]` table below is an array
 * of these records.
 * NOTE(review): field order matters if the table uses positional
 * initialisers - confirm before reordering anything here.
 */
typedef struct _algorithm_settings_t {
const char *name; /* Human-readable identifier */
algorithm_type_t type; /* Common algorithm type (an algorithm_type_t value) */
const char *kernelfile; /* Alternate kernel file */
/* Difficulty scaling factors - exact semantics are not visible here; TODO confirm. */
double diff_multiplier1;
double diff_multiplier2;
double share_diff_multiplier;
/* Presumably shifts applied when turning (x)intensity into a thread count - verify. */
uint32_t xintensity_shift;
uint32_t intensity_shift;
uint32_t found_idx;
unsigned long long diff_numerator;
uint32_t diff1targ;
size_t n_extra_kernels;
long rw_buffer_size;
/* OpenCL command-queue property bitfield for this algorithm. */
cl_command_queue_properties cq_properties;
/* Per-algorithm callbacks; their contracts are defined by the callers, not here. */
void(*regenhash)(struct work *);
cl_int(*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint);
void(*gen_hash)(const unsigned char *, unsigned int, unsigned char *);
void(*set_compile_options)(build_kernel_data *, struct cgpu_info *, algorithm_t *);
} algorithm_settings_t;
static algorithm_settings_t algos[] = { static algorithm_settings_t algos[] = {
// kernels starting from this will have difficulty calculated by using litecoin algorithm // kernels starting from this will have difficulty calculated by using litecoin algorithm
#define A_SCRYPT(a) \ #define A_SCRYPT(a) \
@ -895,7 +878,6 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
ALGO_ALIAS("nist5", "talkcoin-mod"); ALGO_ALIAS("nist5", "talkcoin-mod");
ALGO_ALIAS("keccak", "maxcoin"); ALGO_ALIAS("keccak", "maxcoin");
ALGO_ALIAS("whirlpool", "whirlcoin"); ALGO_ALIAS("whirlpool", "whirlcoin");
ALGO_ALIAS("whirlpoolx", "whirlpoolx");
ALGO_ALIAS("Lyra2RE", "lyra2re"); ALGO_ALIAS("Lyra2RE", "lyra2re");
ALGO_ALIAS("lyra2", "lyra2re"); ALGO_ALIAS("lyra2", "lyra2re");
@ -957,8 +939,7 @@ void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor)
} }
} }
bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2) bool cmp_algorithm(const algorithm_t* algo1, const algorithm_t* algo2)
{ {
// return (strcmp(algo1->name, algo2->name) == 0) && (algo1->nfactor == algo2->nfactor);
return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor)); return (!safe_cmp(algo1->name, algo2->name) && !safe_cmp(algo1->kernelfile, algo2->kernelfile) && (algo1->nfactor == algo2->nfactor));
} }

27
algorithm.h

@ -9,6 +9,7 @@
#include <inttypes.h> #include <inttypes.h>
#include <stdbool.h> #include <stdbool.h>
#include "ocl/build_kernel.h" // For the build_kernel_data type
typedef enum { typedef enum {
ALGO_UNK, ALGO_UNK,
@ -25,8 +26,8 @@ typedef enum {
ALGO_NIST, ALGO_NIST,
ALGO_FRESH, ALGO_FRESH,
ALGO_WHIRL, ALGO_WHIRL,
ALGO_WHIRLPOOLX,
ALGO_NEOSCRYPT, ALGO_NEOSCRYPT,
ALGO_WHIRLPOOLX,
ALGO_LYRA2RE, ALGO_LYRA2RE,
ALGO_PLUCK ALGO_PLUCK
} algorithm_type_t; } algorithm_type_t;
@ -67,6 +68,28 @@ typedef struct _algorithm_t {
void(*set_compile_options)(struct _build_kernel_data *, struct cgpu_info *, struct _algorithm_t *); void(*set_compile_options)(struct _build_kernel_data *, struct cgpu_info *, struct _algorithm_t *);
} algorithm_t; } algorithm_t;
/*
 * Per-algorithm settings record, one per supported algorithm.
 * NOTE(review): presumably consumed by set_algorithm() to fill an
 * algorithm_t - confirm against algorithm.c. Do not reorder fields without
 * checking for positional initialisers.
 */
typedef struct _algorithm_settings_t
{
/* Human-readable identifier. */
const char *name;
/* Common algorithm type (one of the algorithm_type_t enumerators). */
algorithm_type_t type;
/* Alternate kernel file. */
const char *kernelfile;
/* Difficulty scaling factors - exact semantics not visible in this header; TODO confirm. */
double diff_multiplier1;
double diff_multiplier2;
double share_diff_multiplier;
/* Presumably shifts used when deriving thread counts from (x)intensity - verify. */
uint32_t xintensity_shift;
uint32_t intensity_shift;
uint32_t found_idx;
unsigned long long diff_numerator;
uint32_t diff1targ;
size_t n_extra_kernels;
long rw_buffer_size;
/* OpenCL command-queue property bitfield for this algorithm. */
cl_command_queue_properties cq_properties;
/* Per-algorithm callbacks; their contracts are defined by the callers. */
void (*regenhash)(struct work *);
cl_int (*queue_kernel)(struct __clState *, struct _dev_blk_ctx *, cl_uint);
void (*gen_hash)(const unsigned char *, unsigned int, unsigned char *);
void (*set_compile_options)(build_kernel_data *, struct cgpu_info *, algorithm_t *);
} algorithm_settings_t;
/* Set default parameters based on name. */ /* Set default parameters based on name. */
void set_algorithm(algorithm_t* algo, const char* name); void set_algorithm(algorithm_t* algo, const char* name);
@ -74,6 +97,6 @@ void set_algorithm(algorithm_t* algo, const char* name);
void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor); void set_algorithm_nfactor(algorithm_t* algo, const uint8_t nfactor);
/* Compare two algorithm parameters */ /* Compare two algorithm parameters */
bool cmp_algorithm(algorithm_t* algo1, algorithm_t* algo2); bool cmp_algorithm(const algorithm_t* algo1, const algorithm_t* algo2);
#endif /* ALGORITHM_H */ #endif /* ALGORITHM_H */

133
ocl.c

@ -146,16 +146,6 @@ static cl_int create_opencl_context(cl_context *context, cl_platform_id *platfor
return status; return status;
} }
/*
 * Create a command queue for *device inside *context and store it in
 * *command_queue.
 *
 * The queue is first requested with the caller's cq_properties; if that
 * request fails, a second attempt is made with no properties at all.
 * Returns the status reported by the last clCreateCommandQueue() call.
 */
static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_context *context, cl_device_id *device, cl_command_queue_properties cq_properties)
{
  cl_int rc;
  cl_command_queue queue = clCreateCommandQueue(*context, *device, cq_properties, &rc);

  if (rc != CL_SUCCESS) {
    /* The requested properties were rejected: retry with a plain queue. */
    queue = clCreateCommandQueue(*context, *device, 0, &rc);
  }

  *command_queue = queue;
  return rc;
}
static float get_opencl_version(cl_device_id device) static float get_opencl_version(cl_device_id device)
{ {
/* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */ /* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
@ -193,27 +183,56 @@ static bool get_opencl_bit_align_support(cl_device_id *device)
return !!find; return !!find;
} }
/*
 * Create a command queue for *device inside *context and store it in
 * *command_queue.
 *
 * cq_properties points to a cl_command_queue_properties bitfield (this is
 * what initCl() passes); a NULL pointer is treated as "no properties".
 *
 * Devices reporting OpenCL < 2.0 use clCreateCommandQueue(), which takes the
 * bitfield directly. Newer devices use clCreateCommandQueueWithProperties(),
 * which instead expects a 0-terminated list of name/value pairs, so the
 * bitfield is wrapped in a { CL_QUEUE_PROPERTIES, value, 0 } list rather
 * than having its address reinterpreted as such a list.
 *
 * In both cases a failed first attempt is retried with no properties.
 * Returns the status reported by the last creation call.
 */
static cl_int create_opencl_command_queue(cl_command_queue *command_queue, cl_context *context, cl_device_id *device, const void *cq_properties)
{
  cl_int status;
  const cl_command_queue_properties props =
    cq_properties ? *((const cl_command_queue_properties *)cq_properties) : 0;

  if (get_opencl_version(*device) < 2.0) {
    *command_queue = clCreateCommandQueue(*context, *device, props, &status);

    // Didn't work, try again with no properties.
    if (status != CL_SUCCESS) {
      *command_queue = clCreateCommandQueue(*context, *device, 0, &status);
    }
  }
  else {
    // Name/value pairs, terminated by a single 0 entry.
    const cl_queue_properties prop_list[] = {
      CL_QUEUE_PROPERTIES, (cl_queue_properties)props,
      0
    };

    *command_queue = clCreateCommandQueueWithProperties(*context, *device, prop_list, &status);

    // Didn't work, same deal: a NULL list requests the default queue.
    if (status != CL_SUCCESS) {
      *command_queue = clCreateCommandQueueWithProperties(*context, *device, NULL, &status);
    }
  }

  return status;
}
_clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm) _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *algorithm)
{ {
_clState *clState = (_clState *)calloc(1, sizeof(_clState)); cl_int status = 0;
struct cgpu_info *cgpu = &gpus[gpu]; size_t compute_units = 0;
cl_platform_id platform = NULL; cl_platform_id platform = NULL;
char pbuff[256]; struct cgpu_info *cgpu = &gpus[gpu];
_clState *clState = (_clState *)calloc(1, sizeof(_clState));
cl_uint preferred_vwidth, slot = 0, cpnd = 0, numDevices = clDevicesNum();
cl_device_id *devices = (cl_device_id *)alloca(numDevices * sizeof(cl_device_id));
build_kernel_data *build_data = (build_kernel_data *)alloca(sizeof(struct _build_kernel_data)); build_kernel_data *build_data = (build_kernel_data *)alloca(sizeof(struct _build_kernel_data));
cl_uint preferred_vwidth; char **pbuff = (char **)alloca(sizeof(char *) * numDevices), filename[256];
cl_device_id *devices;
cl_uint numDevices;
cl_int status;
// sanity check
if (!get_opencl_platform(opt_platform_id, &platform)) { if (!get_opencl_platform(opt_platform_id, &platform)) {
return NULL; return NULL;
} }
numDevices = clDevicesNum(); if (numDevices <= 0) {
return NULL;
}
if (numDevices <= 0) return NULL; if (gpu >= numDevices) {
applog(LOG_ERR, "Invalid GPU %i", gpu);
return NULL;
}
devices = (cl_device_id *)alloca(numDevices*sizeof(cl_device_id));
/* Now, get the device list data */ /* Now, get the device list data */
@ -225,34 +244,33 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
applog(LOG_INFO, "List of devices:"); applog(LOG_INFO, "List of devices:");
unsigned int i; for (int i = 0; i < numDevices; ++i) {
for (i = 0; i < numDevices; i++) { size_t tmpsize;
status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &tmpsize) != CL_SUCCESS) {
if (status != CL_SUCCESS) { applog(LOG_ERR, "Error while getting the length of the name for GPU #%d.", i);
applog(LOG_ERR, "Error %d: Getting Device Info", status);
return NULL; return NULL;
} }
applog(LOG_INFO, "\t%i\t%s", i, pbuff); // Does the size include the NULL terminator? Who knows, just add one, it's faster than looking it up.
pbuff[i] = (char *)alloca(sizeof(char) * (tmpsize + 1));
if (i == gpu) { if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(char) * tmpsize, pbuff[i], NULL) != CL_SUCCESS) {
applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); applog(LOG_ERR, "Error while attempting to get device information.");
strncpy(name, pbuff, nameSize); return NULL;
}
} }
if (gpu >= numDevices) { applog(LOG_INFO, "\t%i\t%s", i, pbuff[i]);
applog(LOG_ERR, "Invalid GPU %i", gpu);
return NULL;
} }
applog(LOG_INFO, "Selected %d: %s", gpu, pbuff[gpu]);
strncpy(name, pbuff[gpu], nameSize);
status = create_opencl_context(&clState->context, &platform); status = create_opencl_context(&clState->context, &platform);
if (status != CL_SUCCESS) { if (status != CL_SUCCESS) {
applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status); applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status);
return NULL; return NULL;
} }
status = create_opencl_command_queue(&clState->commandQueue, &clState->context, &devices[gpu], cgpu->algorithm.cq_properties); status = create_opencl_command_queue(&clState->commandQueue, &clState->context, &devices[gpu], (const void *)&(cgpu->algorithm.cq_properties));
if (status != CL_SUCCESS) { if (status != CL_SUCCESS) {
applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status); applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
return NULL; return NULL;
@ -274,7 +292,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
} }
applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size)); applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size));
size_t compute_units = 0;
status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), (void *)&compute_units, NULL); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), (void *)&compute_units, NULL);
if (status != CL_SUCCESS) { if (status != CL_SUCCESS) {
applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_COMPUTE_UNITS", status); applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_COMPUTE_UNITS", status);
@ -282,8 +299,10 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
} }
// AMD architecture got 64 compute shaders per compute unit. // AMD architecture got 64 compute shaders per compute unit.
// Source: http://www.amd.com/us/Documents/GCN_Architecture_whitepaper.pdf // Source: http://www.amd.com/us/Documents/GCN_Architecture_whitepaper.pdf
clState->compute_shaders = compute_units * 64; clState->compute_shaders = compute_units << 6;
applog(LOG_DEBUG, "Max shaders calculated %d", (int)(clState->compute_shaders)); applog(LOG_INFO, "Maximum work size for this GPU (%d) is %d.", gpu, clState->max_work_size);
applog(LOG_INFO, "Your GPU (#%d) has %d compute units, and all AMD cards in the 7 series or newer (GCN cards) \
have 64 shaders per compute unit - this means it has %d shaders.", gpu, compute_units, clState->compute_shaders);
status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
if (status != CL_SUCCESS) { if (status != CL_SUCCESS) {
@ -297,12 +316,8 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
* would have otherwise created. The filename is: * would have otherwise created. The filename is:
* name + g + lg + lookup_gap + tc + thread_concurrency + nf + nfactor + w + work_size + l + sizeof(long) + .bin * name + g + lg + lookup_gap + tc + thread_concurrency + nf + nfactor + w + work_size + l + sizeof(long) + .bin
*/ */
char filename[255];
char strbuf[32];
sprintf(strbuf, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
strcpy(filename, strbuf);
sprintf(filename, "%s.cl", (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name));
applog(LOG_DEBUG, "Using source file %s", filename); applog(LOG_DEBUG, "Using source file %s", filename);
/* For some reason 2 vectors is still better even if the card says /* For some reason 2 vectors is still better even if the card says
@ -326,10 +341,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
clState->goffset = true; clState->goffset = true;
if (cgpu->work_size && cgpu->work_size <= clState->max_work_size) clState->wsize = (cgpu->work_size && cgpu->work_size <= clState->max_work_size) ? cgpu->work_size : 256;
clState->wsize = cgpu->work_size;
else
clState->wsize = 256;
if (!cgpu->opt_lg) { if (!cgpu->opt_lg) {
applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
@ -536,10 +548,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
cgpu->thread_concurrency = cgpu->opt_tc; cgpu->thread_concurrency = cgpu->opt_tc;
} }
cl_uint slot, cpnd;
slot = cpnd = 0;
build_data->context = clState->context; build_data->context = clState->context;
build_data->device = &devices[gpu]; build_data->device = &devices[gpu];
@ -547,27 +555,25 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
strcpy(build_data->source_filename, filename); strcpy(build_data->source_filename, filename);
strcpy(build_data->platform, name); strcpy(build_data->platform, name);
strcpy(build_data->sgminer_path, sgminer_path); strcpy(build_data->sgminer_path, sgminer_path);
if (opt_kernel_path && *opt_kernel_path) {
build_data->kernel_path = opt_kernel_path;
}
else {
build_data->kernel_path = NULL;
}
build_data->kernel_path = (*opt_kernel_path) ? opt_kernel_path : NULL;
build_data->work_size = clState->wsize; build_data->work_size = clState->wsize;
build_data->has_bit_align = clState->hasBitAlign; build_data->has_bit_align = clState->hasBitAlign;
build_data->opencl_version = get_opencl_version(devices[gpu]); build_data->opencl_version = get_opencl_version(devices[gpu]);
build_data->patch_bfi = needs_bfi_patch(build_data); build_data->patch_bfi = needs_bfi_patch(build_data);
strcpy(build_data->binary_filename, (!empty_string(cgpu->algorithm.kernelfile) ? cgpu->algorithm.kernelfile : cgpu->algorithm.name)); strcpy(build_data->binary_filename, filename);
strcat(build_data->binary_filename, name); build_data->binary_filename[strlen(filename) - 3] = 0x00; // And one NULL terminator, cutting off the .cl suffix.
if (clState->goffset) strcat(build_data->binary_filename, pbuff[gpu]);
if (clState->goffset) {
strcat(build_data->binary_filename, "g"); strcat(build_data->binary_filename, "g");
}
set_base_compiler_options(build_data); set_base_compiler_options(build_data);
if (algorithm->set_compile_options) if (algorithm->set_compile_options) {
algorithm->set_compile_options(build_data, cgpu, algorithm); algorithm->set_compile_options(build_data, cgpu, algorithm);
}
strcat(build_data->binary_filename, ".bin"); strcat(build_data->binary_filename, ".bin");
applog(LOG_DEBUG, "Using binary file %s", build_data->binary_filename); applog(LOG_DEBUG, "Using binary file %s", build_data->binary_filename);
@ -576,8 +582,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
if (!(clState->program = load_opencl_binary_kernel(build_data))) { if (!(clState->program = load_opencl_binary_kernel(build_data))) {
applog(LOG_NOTICE, "Building binary %s", build_data->binary_filename); applog(LOG_NOTICE, "Building binary %s", build_data->binary_filename);
if (!(clState->program = build_opencl_kernel(build_data, filename))) if (!(clState->program = build_opencl_kernel(build_data, filename))) {
return NULL; return NULL;
}
if (save_opencl_kernel(build_data, clState->program)) { if (save_opencl_kernel(build_data, clState->program)) {
/* Program needs to be rebuilt, because the binary was patched */ /* Program needs to be rebuilt, because the binary was patched */

3
ocl.h

@ -10,7 +10,7 @@
#include <CL/cl.h> #include <CL/cl.h>
#endif #endif
#include "miner.h" #include "algorithm.h"
typedef struct __clState { typedef struct __clState {
cl_context context; cl_context context;
@ -21,6 +21,7 @@ typedef struct __clState {
cl_program program; cl_program program;
cl_mem outputBuffer; cl_mem outputBuffer;
cl_mem CLbuffer0; cl_mem CLbuffer0;
cl_mem MidstateBuf;
cl_mem padbuffer8; cl_mem padbuffer8;
unsigned char cldata[80]; unsigned char cldata[80];
bool hasBitAlign; bool hasBitAlign;

2
ocl/binary_kernel.c

@ -1,5 +1,7 @@
#include "binary_kernel.h" #include "binary_kernel.h"
#include "miner.h"
#include <sys/stat.h> #include <sys/stat.h>
#include <stdio.h>
cl_program load_opencl_binary_kernel(build_kernel_data *data) cl_program load_opencl_binary_kernel(build_kernel_data *data)
{ {

2
ocl/build_kernel.c

@ -1,5 +1,7 @@
#include <stdio.h>
#include "build_kernel.h" #include "build_kernel.h"
#include "patch_kernel.h" #include "patch_kernel.h"
#include "miner.h"
static char *file_contents(const char *filename, int *length) static char *file_contents(const char *filename, int *length)
{ {

8
ocl/build_kernel.h

@ -1,8 +1,14 @@
#ifndef BUILD_KERNEL_H #ifndef BUILD_KERNEL_H
#define BUILD_KERNEL_H #define BUILD_KERNEL_H
#include "ocl.h"
#include <stdbool.h> #include <stdbool.h>
#include "logging.h"
#ifdef __APPLE_CC__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
typedef struct _build_kernel_data { typedef struct _build_kernel_data {
char source_filename[255]; char source_filename[255];

Loading…
Cancel
Save