From cac54f30b8df8cb50cedeb3a5e3841438a7f8e39 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 15 Aug 2011 20:25:18 +1000 Subject: [PATCH 1/7] Revert "When pinging a sick cpu, flush finish and then ping it in a separate thread in the hope it recovers, but without blocking code elsewhere." This reverts commit a466942fd880d157cc0b5968805b2159f556fc20. --- main.c | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/main.c b/main.c index ed06c096..bacae1c2 100644 --- a/main.c +++ b/main.c @@ -3431,46 +3431,10 @@ static void *reinit_gpu(void *userdata) return NULL; } - -static void *ping_gputhread(void *userdata) -{ - struct cgpu_info *cgpu = (struct cgpu_info *)userdata; - int gpu = cgpu->cpu_gpu; - struct thr_info *thr; - _clState *clState; - int thr_id; - - for (thr_id = 0; thr_id < gpu_threads; thr_id ++) { - if (dev_from_id(thr_id) != gpu) - continue; - - thr = &thr_info[thr_id]; - clState = clStates[thr_id]; - tq_push(thr->q, &ping); - applog(LOG_WARNING, "Attempting to flush command queue of thread %d", thr_id); - clFlush(clState->commandQueue); - clFinish(clState->commandQueue); - tq_push(thr->q, &ping); - } - - return NULL; -} - -static void ping_gpu(struct cgpu_info *cgpu) -{ - pthread_t ping_thread; - - if (unlikely(pthread_create(&ping_thread, NULL, ping_gputhread, (void *)cgpu))) - applog(LOG_ERR, "Failed to create ping thread"); -} #else static void *reinit_gpu(void *userdata) { } - -static void ping_gpu(struct cgpu_info *cgpu) -{ -} #endif static void reinit_device(struct cgpu_info *cgpu) @@ -3583,7 +3547,7 @@ static void *watchdog_thread(void *userdata) gpus[gpu].status = LIFE_SICK; applog(LOG_ERR, "Thread %d idle for more than 60 seconds, GPU %d declared SICK!", i, gpu); /* Sent it a ping, it might respond */ - ping_gpu(thr->cgpu); + tq_push(thr->q, &ping); } else if (now.tv_sec - thr->last.tv_sec > 300 && gpus[i].status == LIFE_SICK) { gpus[gpu].status = LIFE_DEAD; applog(LOG_ERR, "Thread %d idle for more than 5 minutes, GPU %d declared DEAD!", i, gpu); From 7c50bb02ab9d9b079718c27e6461100ddafb6a78 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 15 Aug 2011 20:26:27 +1000 Subject: [PATCH 2/7] Revert "Display last initialised time in gpu management info." This reverts commit 177e07aed32e6b38f615f52d3243620625ff93e0. --- main.c | 11 +---------- miner.h | 2 -- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/main.c b/main.c index bacae1c2..18374e07 100644 --- a/main.c +++ b/main.c @@ -1970,7 +1970,6 @@ retry: gpu, cgpu->rolling, cgpu->total_mhashes / total_secs, cgpu->getworks, cgpu->accepted, cgpu->rejected, cgpu->hw_errors, cgpu->efficiency, cgpu->utility); - wlog("Last initialised: %s\n", cgpu->init); for (i = 0; i < mining_threads; i++) { thr = &thr_info[i]; if (thr->cgpu != cgpu) @@ -3378,9 +3377,8 @@ static void *reinit_gpu(void *userdata) struct cgpu_info *cgpu = (struct cgpu_info *)userdata; int gpu = cgpu->cpu_gpu; struct thr_info *thr; - struct timeval now; - _clState *clState; int thr_id; + _clState *clState; /* Send threads message to stop */ gpu_devices[gpu] = false; @@ -3418,9 +3416,6 @@ static void *reinit_gpu(void *userdata) applog(LOG_WARNING, "Thread %d restarted", thr_id); } - gettimeofday(&now, NULL); - get_datestamp(cgpu->init, &now); - /* Try to re-enable it */ gpu_devices[gpu] = true; for (thr_id = 0; thr_id < gpu_threads; thr_id ++) { @@ -4053,7 +4048,6 @@ int main (int argc, char *argv[]) for (j = 0; j < nDevs * opt_g_threads; j++) { int gpu = j % nDevs; struct cgpu_info *cgpu; - struct timeval now; gpus[gpu].is_gpu = 1; gpus[gpu].cpu_gpu = gpu; @@ -4081,12 +4075,9 @@ int main (int argc, char *argv[]) if (!clStates[i]) { applog(LOG_ERR, "Failed to init GPU thread %d", i); gpu_devices[i] = false; - strcat(cgpu->init, "Never"); continue; } applog(LOG_INFO, "initCl() finished. Found %s", name); - gettimeofday(&now, NULL); - get_datestamp(cgpu->init, &now); if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr))) quit(1, "thread %d create failed", i); diff --git a/miner.h b/miner.h index f26df4dd..ac9f6165 100644 --- a/miner.h +++ b/miner.h @@ -157,8 +157,6 @@ struct cgpu_info { unsigned int vwidth; size_t max_work_size; size_t work_size; - - char init[40]; }; struct thr_info { From b1289a015936a962f3cfa52f4e04c9cf52efecc1 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 15 Aug 2011 20:26:46 +1000 Subject: [PATCH 3/7] Revert "Move the non cl_ variables into the cgpu info struct to allow creating a new cl state on reinit, preserving known GPU variables." This reverts commit 28880d0dc7c601ee4479921502b66e913e38e36d. --- main.c | 15 ++++++--------- miner.h | 5 ----- ocl.c | 51 +++++++++++++++++++-------------------------------- ocl.h | 6 ++++-- 4 files changed, 29 insertions(+), 48 deletions(-) diff --git a/main.c b/main.c index 18374e07..17e1ca93 100644 --- a/main.c +++ b/main.c @@ -2966,7 +2966,6 @@ static void *gpuminer_thread(void *userdata) uint32_t *res, *blank_res; double gpu_ms_average = 7; int gpu = dev_from_id(thr_id); - struct cgpu_info *cgpu = mythr->cgpu; size_t globalThreads[1]; size_t localThreads[1]; @@ -2978,7 +2977,7 @@ static void *gpuminer_thread(void *userdata) struct work *work = make_work(); unsigned int threads; - unsigned const int vectors = cgpu->vwidth; + unsigned const int vectors = clState->preferred_vwidth; unsigned int hashes; unsigned int hashes_done = 0; @@ -3015,7 +3014,7 @@ static void *gpuminer_thread(void *userdata) } gettimeofday(&tv_start, NULL); - localThreads[0] = cgpu->work_size; + localThreads[0] = clState->work_size; set_threads_hashes(vectors, &threads, &hashes, &globalThreads[0], localThreads[0]); @@ -3029,7 +3028,7 @@ static void *gpuminer_thread(void *userdata) if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); goto out; } - cgpu->status = LIFE_WELL; + mythr->cgpu->status = LIFE_WELL; if (opt_debug) applog(LOG_DEBUG, "Popping ping in gpuminer thread"); @@ -3156,7 +3155,7 @@ static void *gpuminer_thread(void *userdata) } if (unlikely(!gpu_devices[gpu])) { applog(LOG_WARNING, "Thread %d being disabled", thr_id); - mythr->rolling = cgpu->rolling = 0; + mythr->rolling = mythr->cgpu->rolling = 0; if (opt_debug) applog(LOG_DEBUG, "Popping wakeup ping in gpuminer thread"); @@ -4047,15 +4046,13 @@ int main (int argc, char *argv[]) /* start GPU mining threads */ for (j = 0; j < nDevs * opt_g_threads; j++) { int gpu = j % nDevs; - struct cgpu_info *cgpu; gpus[gpu].is_gpu = 1; gpus[gpu].cpu_gpu = gpu; thr = &thr_info[i]; thr->id = i; - cgpu = &gpus[gpu]; - thr->cgpu = cgpu; + thr->cgpu = &gpus[gpu]; thr->q = tq_new(); if (!thr->q) @@ -4071,7 +4068,7 @@ int main (int argc, char *argv[]) } applog(LOG_INFO, "Init GPU thread %i", i); - clStates[i] = initCl(cgpu, name, sizeof(name)); + clStates[i] = initCl(gpu, name, sizeof(name)); if (!clStates[i]) { applog(LOG_ERR, "Failed to init GPU thread %d", i); gpu_devices[i] = false; diff --git a/miner.h b/miner.h index ac9f6165..4a706511 100644 --- a/miner.h +++ b/miner.h @@ -152,11 +152,6 @@ struct cgpu_info { double efficiency; double utility; enum alive status; - - int hasBitAlign; - unsigned int vwidth; - size_t max_work_size; - size_t work_size; }; struct thr_info { diff --git a/ocl.c b/ocl.c index 21f6bd90..873bfa4c 100644 --- a/ocl.c +++ b/ocl.c @@ -267,16 +267,8 @@ void patch_opcodes(char *w, unsigned remaining) _clState *initCQ(_clState *clState, unsigned int gpu) { cl_int status = 0; - cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; - - clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)"); - return NULL; - } - /* create a cl program executable for the device specified */ + /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); if (status != CL_SUCCESS) { @@ -320,9 +312,8 @@ _clState *initCQ(_clState *clState, unsigned int gpu) return clState; } -_clState *initCl(struct cgpu_info *cgpu, char *name, size_t nameSize) +_clState *initCl(unsigned int gpu, char *name, size_t nameSize) { - unsigned int gpu = cgpu->cpu_gpu; int patchbfi = 0; cl_int status = 0; size_t nDevices; @@ -367,7 +358,7 @@ _clState *initCl(struct cgpu_info *cgpu, char *name, size_t nameSize) } find = strstr(extensions, camo); if (find) - cgpu->hasBitAlign = patchbfi = 1; + clState->hasBitAlign = patchbfi = 1; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL); if (status != CL_SUCCESS) { @@ -377,27 +368,26 @@ _clState *initCl(struct cgpu_info *cgpu, char *name, size_t nameSize) if (opt_debug) applog(LOG_DEBUG, "Preferred vector width reported %d", clState->preferred_vwidth); - status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&cgpu->max_work_size, NULL); + status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE"); return NULL; } if (opt_debug) - applog(LOG_DEBUG, "Max work group size reported %d", cgpu->max_work_size); + applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size); /* For some reason 2 vectors is still better even if the card says * otherwise, and many cards lie about their max so use 256 as max * unless explicitly set on the command line */ - cgpu->vwidth = clState->preferred_vwidth; if (clState->preferred_vwidth > 1) - cgpu->vwidth = 2; + clState->preferred_vwidth = 2; if (opt_vectors) - cgpu->vwidth = opt_vectors; - if (opt_worksize && opt_worksize <= cgpu->max_work_size) - cgpu->work_size = opt_worksize; + clState->preferred_vwidth = opt_vectors; + if (opt_worksize && opt_worksize <= clState->max_work_size) + clState->work_size = opt_worksize; else - cgpu->work_size = (cgpu->max_work_size <= 256 ? cgpu->max_work_size : 256) / - cgpu->vwidth; + clState->work_size = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / + clState->preferred_vwidth; /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would @@ -409,7 +399,7 @@ _clState *initCl(struct cgpu_info *cgpu, char *name, size_t nameSize) char filename[16]; if (chosen_kernel == KL_NONE) { - if (cgpu->hasBitAlign) + if (clState->hasBitAlign) chosen_kernel = KL_PHATK; else chosen_kernel = KL_POCLBM; @@ -452,14 +442,14 @@ _clState *initCl(struct cgpu_info *cgpu, char *name, size_t nameSize) } strcat(binaryfilename, name); - if (cgpu->hasBitAlign) + if (clState->hasBitAlign) strcat(binaryfilename, "bitalign"); strcat(binaryfilename, "v"); - sprintf(numbuf, "%d", cgpu->vwidth); + sprintf(numbuf, "%d", clState->preferred_vwidth); strcat(binaryfilename, numbuf); strcat(binaryfilename, "w"); - sprintf(numbuf, "%d", (int)cgpu->work_size); + sprintf(numbuf, "%d", (int)clState->work_size); strcat(binaryfilename, numbuf); strcat(binaryfilename, "long"); sprintf(numbuf, "%d", (int)sizeof(long)); @@ -515,7 +505,7 @@ build: memcpy(source, rawsource, pl); /* Patch the source file with the preferred_vwidth */ - if (cgpu->vwidth > 1) { + if (clState->preferred_vwidth > 1) { char *find = strstr(source, "VECTORSX"); if (unlikely(!find)) { @@ -523,7 +513,7 @@ build: return NULL; } find += 7; // "VECTORS" - if (cgpu->vwidth == 2) + if (clState->preferred_vwidth == 2) strncpy(find, "2", 1); else strncpy(find, "4", 1); @@ -532,7 +522,7 @@ build: } /* Patch the source file defining BITALIGN */ - if (cgpu->hasBitAlign) { + if (clState->hasBitAlign) { char *find = strstr(source, "BITALIGNX"); if (unlikely(!find)) { @@ -690,11 +680,8 @@ built: free(binaries); free(binary_sizes); - /* We throw everything out now and create the real context we're using in initCQ */ - clReleaseContext(clState->context); - applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT patching, %d vectors and worksize %d", - filename, patchbfi ? "" : "out", cgpu->vwidth, cgpu->work_size); + filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size); return initCQ(clState, gpu); } diff --git a/ocl.h b/ocl.h index 2189fd46..a95f9726 100644 --- a/ocl.h +++ b/ocl.h @@ -7,7 +7,6 @@ #else #include #endif -#include "miner.h" typedef struct { cl_context context; @@ -15,13 +14,16 @@ typedef struct { cl_command_queue commandQueue; cl_program program; cl_mem outputBuffer; + int hasBitAlign; cl_uint preferred_vwidth; + size_t max_work_size; + size_t work_size; } _clState; extern char *file_contents(const char *filename, int *length); extern int clDevicesNum(); extern int preinit_devices(void); extern _clState *initCQ(_clState *clState, unsigned int gpu); -extern _clState *initCl(struct cgpu_info *cgpu, char *name, size_t nameSize); +extern _clState *initCl(unsigned int gpu, char *name, size_t nameSize); #endif /* HAVE_OPENCL */ #endif /* __OCL_H__ */ From cf543507c6f2c5d96b8e2f7adf461d06441b3f8d Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 15 Aug 2011 20:27:02 +1000 Subject: [PATCH 4/7] Revert "Preinitialise the devices only once on startup." This reverts commit 071a0ad2f156ab492ebea6c5a60a1e49a62466de. --- main.c | 3 - ocl.c | 199 ++++++++++++++++++++++++++++----------------------------- ocl.h | 2 +- 3 files changed, 97 insertions(+), 107 deletions(-) diff --git a/main.c b/main.c index 17e1ca93..095ac2f3 100644 --- a/main.c +++ b/main.c @@ -4040,9 +4040,6 @@ int main (int argc, char *argv[]) #ifdef HAVE_OPENCL i = 0; - if (nDevs > 0) - preinit_devices(); - /* start GPU mining threads */ for (j = 0; j < nDevs * opt_g_threads; j++) { int gpu = j % nDevs; diff --git a/ocl.c b/ocl.c index 873bfa4c..5fde3736 100644 --- a/ocl.c +++ b/ocl.c @@ -52,8 +52,6 @@ char *file_contents(const char *filename, int *length) return (char*)buffer; } -static cl_uint numDevices; - int clDevicesNum() { cl_int status = 0; @@ -113,95 +111,6 @@ int clDevicesNum() { return numDevices; } -static cl_platform_id platform = NULL; -static cl_device_id *devices; - -int preinit_devices(void) -{ - cl_int status; - cl_uint numPlatforms; - int i; - - status = clGetPlatformIDs(0, NULL, &numPlatforms); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)"); - return -1; - } - - if (numPlatforms > 0) - { - cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); - status = clGetPlatformIDs(numPlatforms, platforms, NULL); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)"); - return -1; - } - - for(i = 0; i < numPlatforms; ++i) - { - char pbuff[100]; - status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)"); - free(platforms); - return -1; - } - platform = platforms[i]; - if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) - { - break; - } - } - free(platforms); - } - - if (platform == NULL) { - perror("NULL platform found!\n"); - return -1; - } - - status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Error: Getting Device IDs (num)"); - return -1; - } - - if (numDevices > 0 ) { - devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); - - /* Now, get the device list data */ - - status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Error: Getting Device IDs (list)"); - return -1; - } - - applog(LOG_INFO, "List of devices:"); - - unsigned int i; - for(i=0; idevices; /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); @@ -316,32 +226,115 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) { int patchbfi = 0; cl_int status = 0; - size_t nDevices; + unsigned int i; _clState *clState = calloc(1, sizeof(_clState)); - cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; + cl_uint numPlatforms; + cl_platform_id platform = NULL; + status = clGetPlatformIDs(0, NULL, &numPlatforms); + if (status != CL_SUCCESS) + { + applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)"); + return NULL; + } - clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); + if (numPlatforms > 0) + { + cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + if (status != CL_SUCCESS) + { + applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)"); + return NULL; + } + + for(i = 0; i < numPlatforms; ++i) + { + char pbuff[100]; + status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); + if (status != CL_SUCCESS) + { + applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)"); + free(platforms); + return NULL; + } + platform = platforms[i]; + if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) + { + break; + } + } + free(platforms); + } + + if (platform == NULL) { + perror("NULL platform found!\n"); + return NULL; + } + + size_t nDevices; + cl_uint numDevices; + status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { - applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)"); + applog(LOG_ERR, "Error: Getting Device IDs (num)"); return NULL; } - if (gpu < numDevices) { - char pbuff[100]; - status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, &nDevices); + cl_device_id *devices; + if (numDevices > 0 ) { + devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); + clState->devices = devices; + + /* Now, get the device list data */ + + status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { - applog(LOG_ERR, "Error: Getting Device Info"); + applog(LOG_ERR, "Error: Getting Device IDs (list)"); return NULL; } - applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); - strncpy(name, pbuff, nameSize); - } else { - applog(LOG_ERR, "Invalid GPU %i", gpu); + applog(LOG_INFO, "List of devices:"); + + unsigned int i; + for(i=0; icontext = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); + if (status != CL_SUCCESS) + { + applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)"); return NULL; } diff --git a/ocl.h b/ocl.h index a95f9726..0960a4c0 100644 --- a/ocl.h +++ b/ocl.h @@ -18,11 +18,11 @@ typedef struct { cl_uint preferred_vwidth; size_t max_work_size; size_t work_size; + cl_device_id *devices; } _clState; extern char *file_contents(const char *filename, int *length); extern int clDevicesNum(); -extern int preinit_devices(void); extern _clState *initCQ(_clState *clState, unsigned int gpu); extern _clState *initCl(unsigned int gpu, char *name, size_t nameSize); #endif /* HAVE_OPENCL */ From 42d49ffdc73288a81abff81c3de0a9a0548f7f69 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 15 Aug 2011 20:28:25 +1000 Subject: [PATCH 5/7] Revert "Restart threads by abstracting out the clcontext initialisation and using that instead of probing all cards." This reverts commit 8f186e61e250e71bd606cabb52795eaa0c9ad423. --- main.c | 24 +++++++++++---- ocl.c | 93 ++++++++++++++++++++++++++-------------------------------- ocl.h | 2 -- 3 files changed, 60 insertions(+), 59 deletions(-) diff --git a/main.c b/main.c index 095ac2f3..bc11af57 100644 --- a/main.c +++ b/main.c @@ -3376,22 +3376,30 @@ static void *reinit_gpu(void *userdata) struct cgpu_info *cgpu = (struct cgpu_info *)userdata; int gpu = cgpu->cpu_gpu; struct thr_info *thr; + char name[256]; int thr_id; _clState *clState; /* Send threads message to stop */ gpu_devices[gpu] = false; + sleep(5); for (thr_id = 0; thr_id < gpu_threads; thr_id ++) { if (dev_from_id(thr_id) != gpu) continue; + clState = clStates[thr_id]; + /* Send it a command. If it responds we can restart */ + applog(LOG_WARNING, "Attempting to send GPU command"); + clFlush(clState->commandQueue); + clFinish(clState->commandQueue); + thr = &thr_info[thr_id]; thr->rolling = thr->cgpu->rolling = 0; if (!pthread_cancel(*thr->pth)) { applog(LOG_WARNING, "Thread still exists, killing it off"); } else - applog(LOG_WARNING, "Thread no longer exists!"); + applog(LOG_WARNING, "Thread no longer exists"); /* Lose this ram cause we may get stuck here! */ //tq_freeze(thr->q); @@ -3400,13 +3408,17 @@ static void *reinit_gpu(void *userdata) if (!thr->q) quit(1, "Failed to tq_new in reinit_gpu"); - /* Create a new clstate */ - applog(LOG_WARNING, "Attempting to create a new clState"); - clState = initCQ(clStates[thr_id], gpu); - /* Lose this ram cause we may dereference in the dying thread! */ //free(clState); - applog(LOG_WARNING, "Command successful, attempting to create new thread"); + applog(LOG_WARNING, "Command successful, attempting to reinit device"); + + applog(LOG_INFO, "Reinit GPU thread %d", thr_id); + clState = initCl(gpu, name, sizeof(name)); + if (!clState) { + applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id); + return NULL; + } + applog(LOG_INFO, "initCl() finished. Found %s", name); if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr))) { applog(LOG_ERR, "thread %d create failed", thr_id); diff --git a/ocl.c b/ocl.c index 5fde3736..45f70955 100644 --- a/ocl.c +++ b/ocl.c @@ -173,55 +173,6 @@ void patch_opcodes(char *w, unsigned remaining) } } -_clState *initCQ(_clState *clState, unsigned int gpu) -{ - cl_int status = 0; - cl_device_id *devices = clState->devices; - - /* create a cl program executable for all the devices specified */ - status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Error: Building Program (clBuildProgram)"); - size_t logSize; - status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); - - char *log = malloc(logSize); - status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); - applog(LOG_INFO, "%s", log); - return NULL; - } - - /* get a kernel object handle for a kernel with the given name */ - clState->kernel = clCreateKernel(clState->program, "search", &status); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)"); - return NULL; - } - - ///////////////////////////////////////////////////////////////// - // Create an OpenCL command queue - ///////////////////////////////////////////////////////////////// - clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], - CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); - if (status != CL_SUCCESS) /* Try again without OOE enable */ - clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); - if (status != CL_SUCCESS) - { - applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)"); - return NULL; - } - - clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, BUFFERSIZE, NULL, &status); - if (status != CL_SUCCESS) { - applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)"); - return NULL; - } - - return clState; -} - _clState *initCl(unsigned int gpu, char *name, size_t nameSize) { int patchbfi = 0; @@ -285,7 +236,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize) cl_device_id *devices; if (numDevices > 0 ) { devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); - clState->devices = devices; /* Now, get the device list data */ @@ -676,7 +626,48 @@ built: applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT patching, %d vectors and worksize %d", filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size); - return initCQ(clState, gpu); + /* create a cl program executable for all the devices specified */ + status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); + if (status != CL_SUCCESS) + { + applog(LOG_ERR, "Error: Building Program (clBuildProgram)"); + size_t logSize; + status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); + + char *log = malloc(logSize); + status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); + applog(LOG_INFO, "%s", log); + return NULL; + } + + /* get a kernel object handle for a kernel with the given name */ + clState->kernel = clCreateKernel(clState->program, "search", &status); + if (status != CL_SUCCESS) + { + applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)"); + return NULL; + } + + ///////////////////////////////////////////////////////////////// + // Create an OpenCL command queue + ///////////////////////////////////////////////////////////////// + clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); + if (status != CL_SUCCESS) /* Try again without OOE enable */ + clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); + if (status != CL_SUCCESS) + { + applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)"); + return NULL; + } + + clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, BUFFERSIZE, NULL, &status); + if (status != CL_SUCCESS) { + applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)"); + return NULL; + } + + return clState; } #endif /* HAVE_OPENCL */ diff --git a/ocl.h b/ocl.h index 0960a4c0..3c2a5cee 100644 --- a/ocl.h +++ b/ocl.h @@ -18,12 +18,10 @@ typedef struct { cl_uint preferred_vwidth; size_t max_work_size; size_t work_size; - cl_device_id *devices; } _clState; extern char *file_contents(const char *filename, int *length); extern int clDevicesNum(); -extern _clState *initCQ(_clState *clState, unsigned int gpu); extern _clState *initCl(unsigned int gpu, char *name, size_t nameSize); #endif /* HAVE_OPENCL */ #endif /* __OCL_H__ */ From cfe8534c39dd867ed1c35da6ec24ccb66e2be015 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 15 Aug 2011 22:07:28 +1000 Subject: [PATCH 6/7] Partial restart of threads has proven to be unsuccessful so reinstate device re-initialisation to restart GPUs. Do this by having a reinit thread that is told via a queue which device to restart. If this thread fails to return from opencl code, it should not interrupt the workings of other devices. --- main.c | 102 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 36 deletions(-) diff --git a/main.c b/main.c index bc11af57..858623f2 100644 --- a/main.c +++ b/main.c @@ -184,6 +184,8 @@ int longpoll_thr_id; static int stage_thr_id; static int watchdog_thr_id; static int input_thr_id; +static int gpur_thr_id; +static int cpur_thr_id; static int total_threads; struct work_restart *work_restart = NULL; @@ -3340,6 +3342,7 @@ static void restart_longpoll(void) static void *reinit_cpu(void *userdata) { + pthread_detach(pthread_self()); #if 0 struct cgpu_info *cgpu = (struct cgpu_info *)userdata; int cpu = cgpu->cpu_gpu; @@ -3371,35 +3374,52 @@ static void *reinit_cpu(void *userdata) } #ifdef HAVE_OPENCL +/* We have only one thread that ever re-initialises GPUs, thus if any GPU + * init command fails due to a completely wedged GPU, the thread will never + * return, unable to harm other GPUs. If it does return, it means we only had + * a soft failure and then the reinit_gpu thread is ready to tackle another + * GPU */ static void *reinit_gpu(void *userdata) { - struct cgpu_info *cgpu = (struct cgpu_info *)userdata; - int gpu = cgpu->cpu_gpu; + struct thr_info *mythr = userdata; + struct cgpu_info *cgpu; struct thr_info *thr; char name[256]; int thr_id; - _clState *clState; + int gpu; + + pthread_detach(pthread_self()); + +select_cgpu: + cgpu = tq_pop(mythr->q, NULL); + if (!cgpu) + goto out; + + if (clDevicesNum() != nDevs) { + applog(LOG_WARNING, "Hardware not reporting same number of active devices, will not attempt to restart GPU"); + goto out; + } - /* Send threads message to stop */ + gpu = cgpu->cpu_gpu; gpu_devices[gpu] = false; - sleep(5); for (thr_id = 0; thr_id < gpu_threads; thr_id ++) { if (dev_from_id(thr_id) != gpu) continue; - clState = clStates[thr_id]; - /* Send it a command. If it responds we can restart */ - applog(LOG_WARNING, "Attempting to send GPU command"); - clFlush(clState->commandQueue); - clFinish(clState->commandQueue); - thr = &thr_info[thr_id]; thr->rolling = thr->cgpu->rolling = 0; if (!pthread_cancel(*thr->pth)) { - applog(LOG_WARNING, "Thread still exists, killing it off"); + applog(LOG_WARNING, "Thread %d still exists, killing it off", thr_id); } else - applog(LOG_WARNING, "Thread no longer exists"); + applog(LOG_WARNING, "Thread %d no longer exists", thr_id); + } + + for (thr_id = 0; thr_id < gpu_threads; thr_id ++) { + if (dev_from_id(thr_id) != gpu) + continue; + + thr = &thr_info[thr_id]; /* Lose this ram cause we may get stuck here! */ //tq_freeze(thr->q); @@ -3410,13 +3430,12 @@ static void *reinit_gpu(void *userdata) /* Lose this ram cause we may dereference in the dying thread! */ //free(clState); - applog(LOG_WARNING, "Command successful, attempting to reinit device"); applog(LOG_INFO, "Reinit GPU thread %d", thr_id); - clState = initCl(gpu, name, sizeof(name)); - if (!clState) { + clStates[thr_id] = initCl(gpu, name, sizeof(name)); + if (!clStates[thr_id]) { applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id); - return NULL; + goto out; } applog(LOG_INFO, "initCl() finished. Found %s", name); @@ -3427,14 +3446,17 @@ static void *reinit_gpu(void *userdata) applog(LOG_WARNING, "Thread %d restarted", thr_id); } - /* Try to re-enable it */ gpu_devices[gpu] = true; for (thr_id = 0; thr_id < gpu_threads; thr_id ++) { + if (dev_from_id(thr_id) != gpu) + continue; + thr = &thr_info[thr_id]; - if (dev_from_id(thr_id) == gpu) - tq_push(thr->q, &ping); + tq_push(thr->q, &ping); } + goto select_cgpu; +out: return NULL; } #else @@ -3445,16 +3467,10 @@ static void *reinit_gpu(void *userdata) static void reinit_device(struct cgpu_info *cgpu) { - pthread_t resus_thread; - void *reinit; - if (cgpu->is_gpu) - reinit = reinit_gpu; + tq_push(thr_info[gpur_thr_id].q, cgpu); else - reinit = reinit_cpu; - - if (unlikely(pthread_create(&resus_thread, NULL, reinit, (void *)cgpu))) - applog(LOG_ERR, "Failed to create reinit thread"); + tq_push(thr_info[cpur_thr_id].q, cgpu); } /* Determine which are the first threads belonging to a device and if they're @@ -3811,7 +3827,7 @@ static void fork_monitor() int main (int argc, char *argv[]) { - unsigned int i, j = 0, x, y, pools_active = 0; + unsigned int i, x, y, pools_active = 0; struct sigaction handler; struct thr_info *thr; char name[256]; @@ -3969,7 +3985,7 @@ int main (int argc, char *argv[]) mining_threads = opt_n_threads + gpu_threads; - total_threads = mining_threads + 5; + total_threads = mining_threads + 7; work_restart = calloc(total_threads, sizeof(*work_restart)); if (!work_restart) quit(1, "Failed to calloc work_restart"); @@ -4050,11 +4066,9 @@ int main (int argc, char *argv[]) quit(0, "No pools active! Exiting."); #ifdef HAVE_OPENCL - i = 0; - /* start GPU mining threads */ - for (j = 0; j < nDevs * opt_g_threads; j++) { - int gpu = j % nDevs; + for (i = 0; i < nDevs * opt_g_threads; i++) { + int gpu = i % nDevs; gpus[gpu].is_gpu = 1; gpus[gpu].cpu_gpu = gpu; @@ -4087,8 +4101,6 @@ int main (int argc, char *argv[]) if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr))) quit(1, "thread %d create failed", i); - - i++; } applog(LOG_INFO, "%d gpu miner threads started", gpu_threads); @@ -4134,6 +4146,24 @@ int main (int argc, char *argv[]) quit(1, "input thread create failed"); pthread_detach(*thr->pth); + /* Create reinit cpu thread */ + cpur_thr_id = mining_threads + 5; + thr = &thr_info[cpur_thr_id]; + thr->q = tq_new(); + if (!thr->q) + quit(1, "tq_new failed for cpur_thr_id"); + if (thr_info_create(thr, NULL, reinit_cpu, thr)) + quit(1, "reinit_cpu thread create failed"); + + /* Create reinit gpu thread */ + gpur_thr_id = mining_threads + 6; + thr = &thr_info[gpur_thr_id]; + thr->q = tq_new(); + if (!thr->q) + quit(1, "tq_new failed for gpur_thr_id"); + if (thr_info_create(thr, NULL, reinit_gpu, thr)) + quit(1, "reinit_gpu thread create failed"); + /* main loop - simply wait for workio thread to exit */ pthread_join(*thr_info[work_thr_id].pth, NULL); applog(LOG_INFO, "workio thread dead, exiting."); From 4e48561a13d73722d95aa084f435b0ead3a32b5d Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Mon, 15 Aug 2011 22:21:17 +1000 Subject: [PATCH 7/7] Rework the last-initialised time displayed. --- main.c | 11 ++++++++++- miner.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/main.c b/main.c index 858623f2..15b3da28 100644 --- a/main.c +++ b/main.c @@ -1972,6 +1972,7 @@ retry: gpu, cgpu->rolling, cgpu->total_mhashes / total_secs, cgpu->getworks, cgpu->accepted, cgpu->rejected, cgpu->hw_errors, cgpu->efficiency, cgpu->utility); + wlog("Last initialised: %s\n", cgpu->init); for (i = 0; i < mining_threads; i++) { thr = &thr_info[i]; if (thr->cgpu != cgpu) @@ -3384,6 +3385,7 @@ static void *reinit_gpu(void *userdata) struct thr_info *mythr = userdata; struct cgpu_info *cgpu; struct thr_info *thr; + struct timeval now; char name[256]; int thr_id; int gpu; @@ -3446,6 +3448,9 @@ select_cgpu: applog(LOG_WARNING, "Thread %d restarted", thr_id); } + gettimeofday(&now, NULL); + get_datestamp(cgpu->init, &now); + gpu_devices[gpu] = true; for (thr_id = 0; thr_id < gpu_threads; thr_id ++) { if (dev_from_id(thr_id) != gpu) @@ -4069,13 +4074,15 @@ int main (int argc, char *argv[]) /* start GPU mining threads */ for (i = 0; i < nDevs * opt_g_threads; i++) { int gpu = i % nDevs; + struct cgpu_info *cgpu; + struct timeval now; gpus[gpu].is_gpu = 1; gpus[gpu].cpu_gpu = gpu; thr = &thr_info[i]; thr->id = i; - thr->cgpu = &gpus[gpu]; + cgpu = thr->cgpu = &gpus[gpu]; thr->q = tq_new(); if (!thr->q) @@ -4098,6 +4105,8 @@ int main (int argc, char *argv[]) continue; } applog(LOG_INFO, "initCl() finished. Found %s", name); + gettimeofday(&now, NULL); + get_datestamp(cgpu->init, &now); if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr))) quit(1, "thread %d create failed", i); diff --git a/miner.h b/miner.h index 4a706511..b1034ba4 100644 --- a/miner.h +++ b/miner.h @@ -152,6 +152,7 @@ struct cgpu_info { double efficiency; double utility; enum alive status; + char init[40]; }; struct thr_info {