Browse Source

Use a buffer of up to 512 * 4 integers when retrieving work from the GPU.

This allows each local thread id to have one slot to put any positive results into, thus making overlapping results far less likely.
Thus races will be much rarer, allowing more threads.
It should also pick up blocks close to each other more reliably and hopefully decrease the number of rejects and opencl errors.
Do the search over the buffer entirely in a separate thread to allow the GPU to stay as busy as possible.
Detach threads from themselves to prevent unlucky even where dereferencing occurs by freeing the data that stores the thread info.
nfactor-troky
Con Kolivas 14 years ago
parent
commit
2b6e841673
  1. 36
      cpu-miner.c
  2. 37
      findnonce.c
  3. 6
      findnonce.h
  4. 2
      ocl.c
  5. 63
      phatk.cl
  6. 63
      poclbm.cl

36
cpu-miner.c

@ -676,6 +676,7 @@ static void *submit_work(void *userdata) @@ -676,6 +676,7 @@ static void *submit_work(void *userdata)
err_out:
workio_cmd_free(wc);
out:
pthread_detach(pthread_self());
free(sd);
return NULL;
}
@ -696,7 +697,6 @@ static bool submit_work_async(struct thr_info *thr, const struct work *work_in) @@ -696,7 +697,6 @@ static bool submit_work_async(struct thr_info *thr, const struct work *work_in)
applog(LOG_ERR, "Failed to create submit_thread");
return false;
}
pthread_detach(sd->pth);
return true;
}
@ -900,12 +900,8 @@ static void *gpuminer_thread(void *userdata) @@ -900,12 +900,8 @@ static void *gpuminer_thread(void *userdata)
{
struct thr_info *mythr = userdata;
struct timeval tv_start, diff;
int thr_id = mythr->id;
uint32_t res[128], blank_res[128];
cl_kernel *kernel;
memset(res, 0, BUFFERSIZE);
memset(blank_res, 0, BUFFERSIZE);
const int thr_id = mythr->id;
uint32_t *res, *blank_res;
size_t globalThreads[1];
size_t localThreads[1];
@ -913,7 +909,7 @@ static void *gpuminer_thread(void *userdata) @@ -913,7 +909,7 @@ static void *gpuminer_thread(void *userdata)
cl_int status;
_clState *clState = clStates[thr_id];
kernel = &clState->kernel;
const cl_kernel *kernel = &clState->kernel;
struct work *work = malloc(sizeof(struct work));
unsigned const int threads = 1 << (15 + scan_intensity);
@ -921,6 +917,14 @@ static void *gpuminer_thread(void *userdata) @@ -921,6 +917,14 @@ static void *gpuminer_thread(void *userdata)
unsigned const int hashes = threads * vectors;
unsigned int hashes_done = 0;
res = calloc(BUFFERSIZE, 1);
blank_res = calloc(BUFFERSIZE, 1);
if (!res || !blank_res) {
applog(LOG_ERR, "Failed to calloc in gpuminer_thread");
goto out;
}
gettimeofday(&tv_start, NULL);
globalThreads[0] = threads;
localThreads[0] = clState->work_size;
@ -966,21 +970,17 @@ static void *gpuminer_thread(void *userdata) @@ -966,21 +970,17 @@ static void *gpuminer_thread(void *userdata)
{ applog(LOG_ERR, "Error: clSetKernelArg of nonce failed."); goto out; }
}
/* 127 is used as a flag to say nonces exist */
if (unlikely(res[127])) {
/* MAXBUFFERS entry is used as a flag to say nonces exist */
if (res[MAXBUFFERS]) {
/* Clear the buffer again */
status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
BUFFERSIZE, blank_res, 0, NULL, NULL);
if (unlikely(status != CL_SUCCESS))
{ applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); goto out; }
for (i = 0; i < 127; i++) {
if (res[i]) {
if (opt_debug)
applog(LOG_DEBUG, "GPU %d found something?", gpu_from_thr_id(thr_id));
postcalc_hash_async(mythr, work, res[i]);
} else
break;
}
if (opt_debug)
applog(LOG_DEBUG, "GPU %d found something?", gpu_from_thr_id(thr_id));
postcalc_hash_async(mythr, work, res);
memset(res, 0, BUFFERSIZE);
clFinish(clState->commandQueue);
}

37
findnonce.c

@ -138,7 +138,7 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) { @@ -138,7 +138,7 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
struct pc_data {
struct thr_info *thr;
struct work work;
uint32_t start;
uint32_t res[MAXBUFFERS];
pthread_t pth;
};
@ -148,13 +148,28 @@ static void *postcalc_hash(void *userdata) @@ -148,13 +148,28 @@ static void *postcalc_hash(void *userdata)
struct thr_info *thr = pcd->thr;
dev_blk_ctx *blk = &pcd->work.blk;
struct work *work = &pcd->work;
uint32_t start = pcd->start;
uint32_t start;
cl_uint A, B, C, D, E, F, G, H;
cl_uint W[16];
cl_uint nonce;
cl_uint best_g = ~0;
uint32_t end = start + 1026;
cl_uint best_g;
uint32_t end;
int entry = 0;
cycle:
while (entry < MAXBUFFERS) {
if (pcd->res[entry]) {
start = pcd->res[entry++];
break;
}
entry++;
}
if (entry == MAXBUFFERS)
goto out;
best_g = ~0;
end = start + 1026;
for (nonce = start; nonce != end; nonce+=1) {
A = blk->cty_a; B = blk->cty_b;
@ -189,7 +204,7 @@ static void *postcalc_hash(void *userdata) @@ -189,7 +204,7 @@ static void *postcalc_hash(void *userdata)
if (unlikely(H == 0xA41F32E7)) {
if (unlikely(submit_nonce(thr, work, nonce) == false)) {
applog(LOG_ERR, "Failed to submit work, exiting");
goto out;
break;
}
G += 0x1f83d9ab;
@ -199,17 +214,22 @@ static void *postcalc_hash(void *userdata) @@ -199,17 +214,22 @@ static void *postcalc_hash(void *userdata)
best_g = G;
}
}
out:
if (unlikely(best_g == ~0)) {
if (opt_debug)
applog(LOG_DEBUG, "No best_g found! Error in OpenCL code?");
hw_errors++;
thr->cgpu->hw_errors++;
}
if (entry < MAXBUFFERS)
goto cycle;
out:
pthread_detach(pthread_self());
free(pcd);
return NULL;
}
void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t start)
void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res)
{
struct pc_data *pcd = malloc(sizeof(struct pc_data));
if (unlikely(!pcd)) {
@ -219,11 +239,10 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t start @@ -219,11 +239,10 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t start
pcd->thr = thr;
memcpy(&pcd->work, work, sizeof(struct work));
pcd->start = start;
memcpy(&pcd->res, res, BUFFERSIZE);
if (pthread_create(&pcd->pth, NULL, postcalc_hash, (void *)pcd)) {
applog(LOG_ERR, "Failed to create postcalc_hash thread");
return;
}
pthread_detach(pcd->pth);
}

6
findnonce.h

@ -3,8 +3,10 @@ @@ -3,8 +3,10 @@
#include "miner.h"
#define MAXTHREADS (0xFFFFFFFEULL)
#define BUFFERSIZE (sizeof(uint32_t) * 128)
/* Maximum worksize 512 * maximum vectors 4 plus one flag entry */
#define MAXBUFFERS (4 * 512)
#define BUFFERSIZE (sizeof(uint32_t) * (MAXBUFFERS + 1))
extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data);
extern void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t start);
extern void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res);
#endif /*__FINDNONCE_H__*/

2
ocl.c

@ -513,7 +513,7 @@ retry: @@ -513,7 +513,7 @@ retry:
return NULL;
}
clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, sizeof(uint32_t) * 128, NULL, &status);
clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, BUFFERSIZE, NULL, &status);
if(status != CL_SUCCESS) {
applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)");
return NULL;

63
phatk.cl

@ -141,7 +141,7 @@ void search( const uint state0, const uint state1, const uint state2, const uint @@ -141,7 +141,7 @@ void search( const uint state0, const uint state1, const uint state2, const uint
W[19] = P1(19) + P2(19) + P3(19);
W[18] = P1(18) + P3(18) + P4(18);
W[20] = P2(20) + P3(20) + P4(20);
uint it;
uint it = get_local_id(0);
#ifdef VECTORS4
W[3] = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3);
@ -363,59 +363,70 @@ void search( const uint state0, const uint state1, const uint state2, const uint @@ -363,59 +363,70 @@ void search( const uint state0, const uint state1, const uint state2, const uint
partround(64 + 60);
Vals[7] += H[7];
#define MAXBUFFERS (4 * 512)
#if defined(VECTORS4) || defined(VECTORS2)
if (Vals[7].x == 0)
{
for (it = 0; it != 127; it++) {
if (!output[it]) {
output[it] = W[3].x;
output[127] = 1;
break;
// Unlikely event there is something here already !
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = W[3].x;
output[MAXBUFFERS] = 1;
}
if (Vals[7].y == 0)
{
for (it = 0; it != 127; it++) {
if (!output[it]) {
output[it] = W[3].y;
output[127] = 1;
break;
it += 512;
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = W[3].y;
output[MAXBUFFERS] = 1;
}
#ifdef VECTORS4
if (Vals[7].z == 0)
{
for (it = 0; it != 127; it++) {
if (!output[it]) {
output[it] = W[3].z;
output[127] = 1;
break;
it += 1024;
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = W[3].z;
output[MAXBUFFERS] = 1;
}
if (Vals[7].w == 0)
{
for (it = 0; it != 127; it++) {
if (!output[it]) {
output[it] = W[3].w;
output[127] = 1;
break;
it += 1536;
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = W[3].w;
output[MAXBUFFERS] = 1;
}
#endif
#else
if (Vals[7] == 0)
{
for (it = 0; it != 127; it++) {
if (!output[it]) {
output[it] = W[3];
output[127] = 1;
break;
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = W[3];
output[MAXBUFFERS] = 1;
}
#endif

63
poclbm.cl

@ -79,7 +79,7 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c @@ -79,7 +79,7 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
u W[24];
u Vals[8];
u nonce;
u it;
uint it = get_local_id(0);
#ifdef VECTORS4
nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3);
@ -627,59 +627,70 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c @@ -627,59 +627,70 @@ __kernel void search( const uint state0, const uint state1, const uint state2, c
Vals[7]+=0x5be0cd19U;
#define MAXBUFFERS (4 * 512)
#if defined(VECTORS4) || defined(VECTORS2)
if (Vals[7].x == 0)
{
for (it.x = 0; it.x != 127; it.x++) {
if (!output[it.x]) {
output[it.x] = nonce.x;
output[127] = 1;
break;
// Unlikely event there is something here already !
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = nonce.x;
output[MAXBUFFERS] = 1;
}
if (Vals[7].y == 0)
{
for (it.y = 0; it.y != 127; it.y++) {
if (!output[it.y]) {
output[it.y] = nonce.y;
output[127] = 1;
break;
it += 512;
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = nonce.y;
output[MAXBUFFERS] = 1;
}
#ifdef VECTORS4
if (Vals[7].z == 0)
{
for (it.z = 0; it.z != 127; it.z++) {
if (!output[it.z]) {
output[it.z] = nonce.z;
output[127] = 1;
break;
it += 1024;
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = nonce.z;
output[MAXBUFFERS] = 1;
}
if (Vals[7].w == 0)
{
for (it.w = 0; it.w != 127; it.w++) {
if (!output[it.w]) {
output[it.w] = nonce.w;
output[127] = 1;
break;
it += 1536;
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = nonce.w;
output[MAXBUFFERS] = 1;
}
#endif
#else
if (Vals[7] == 0)
{
for (it = 0; it != 127; it++) {
if (!output[it]) {
output[it] = nonce;
output[127] = 1;
break;
if (output[it]) {
for (it = 0; it < MAXBUFFERS; it++) {
if (!output[it])
break;
}
}
output[it] = nonce;
output[MAXBUFFERS] = 1;
}
#endif
}
Loading…
Cancel
Save