From c2214091aee94a2afff763f6e1259b603e7eea19 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 10 Oct 2015 02:12:43 +0200 Subject: [PATCH] benchmark: free last memory leaks on algo switch remains my original lyra2 implementation to fix... (cuda_lyra2.cu) I guess some kind of memory overflow force the driver to allocate memory... but was unable to free it without device reset. --- Algo256/cuda_fugue256.cu | 35 +++++++++++++++++++++++------------ Algo256/cuda_groestl256.cu | 25 +++++++++++++++++-------- ccminer.cpp | 1 + scrypt-jane.cpp | 3 +++ scrypt.cpp | 3 +++ skein.cu | 6 ++++-- util.cpp | 27 +++++++-------------------- x13/cuda_x13_fugue512.cu | 13 +++++++++---- x15/cuda_x15_whirlpool.cu | 4 ++-- 9 files changed, 69 insertions(+), 48 deletions(-) diff --git a/Algo256/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu index 66d1c35..a5b6628 100644 --- a/Algo256/cuda_fugue256.cu +++ b/Algo256/cuda_fugue256.cu @@ -10,14 +10,15 @@ uint32_t *d_fugue256_hashoutput[MAX_GPUS]; static uint32_t *d_resultNonce[MAX_GPUS]; +static unsigned int* d_textures[MAX_GPUS][8]; __constant__ uint32_t GPUstate[30]; // Single GPU __constant__ uint32_t pTarget[8]; // Single GPU -texture mixTab0Tex; -texture mixTab1Tex; -texture mixTab2Tex; -texture mixTab3Tex; +static texture mixTab0Tex; +static texture mixTab1Tex; +static texture mixTab2Tex; +static texture mixTab3Tex; #if USE_SHARED #define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) @@ -707,28 +708,30 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp } } -#define texDef(texname, texmem, texsource, texsize) \ +#define texDef(id, texname, texmem, texsource, texsize) { \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } - + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} __host__ void fugue256_cpu_init(int thr_id, uint32_t threads) { // Kopiere die Hash-Tabellen in den GPU-Speicher - texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); - texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); - texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); - texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); + texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); + texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); + texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); + texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); + cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads); cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); } @@ -737,6 +740,14 @@ void fugue256_cpu_free(int thr_id) { cudaFree(d_fugue256_hashoutput[thr_id]); cudaFree(d_resultNonce[thr_id]); + + cudaUnbindTexture(mixTab0Tex); + cudaUnbindTexture(mixTab1Tex); + cudaUnbindTexture(mixTab2Tex); + cudaUnbindTexture(mixTab3Tex); + + for (int i=0; i<4; i++) + cudaFree(d_textures[thr_id][i]); } __host__ diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu index b14a0f2..f65b5c2 100644 --- a/Algo256/cuda_groestl256.cu +++ b/Algo256/cuda_groestl256.cu @@ -67,14 +67,14 @@ __constant__ uint32_t pTarget[8]; #define T3dn(x) tex1Dfetch(t3dn2, x) #endif -texture t0up2; -texture t0dn2; -texture t1up2; -texture t1dn2; -texture t2up2; -texture t2dn2; -texture t3up2; -texture t3dn2; +static texture t0up2; +static texture t0dn2; +static texture t1up2; +static texture t1dn2; +static texture t2up2; +static texture t2dn2; +static texture t3up2; +static texture t3dn2; #define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ t[d0] = T0up(B32_0(a[b0])) \ @@ -283,6 +283,15 @@ void groestl256_cpu_init(int thr_id, uint32_t threads) __host__ void groestl256_cpu_free(int thr_id) { + cudaUnbindTexture(t0up2); + cudaUnbindTexture(t0dn2); + cudaUnbindTexture(t1up2); + cudaUnbindTexture(t1dn2); + cudaUnbindTexture(t2up2); + cudaUnbindTexture(t2dn2); + cudaUnbindTexture(t3up2); + cudaUnbindTexture(t3dn2); + for (int i=0; i<8; i++) cudaFree(d_textures[thr_id][i]); diff --git a/ccminer.cpp b/ccminer.cpp index eda3a35..86b0f68 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1632,6 +1632,7 @@ bool algo_switch_next(int thr_id) stats_purge_all(); global_hashrate = 0; + thr_hashrates[thr_id] = 0; // reset for minmax64 if (thr_id == 0) applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]); diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp index 8023fe9..4620fdf 100644 --- a/scrypt-jane.cpp +++ b/scrypt-jane.cpp @@ -433,6 +433,9 @@ void free_scrypt_jane(int thr_id) { int dev_id = device_map[thr_id]; + if (!init[thr_id]) + return; + cudaSetDevice(dev_id); cudaDeviceSynchronize(); cudaDeviceReset(); // well, simple way to free ;) diff --git a/scrypt.cpp b/scrypt.cpp index e447583..9483cd7 100644 --- a/scrypt.cpp +++ b/scrypt.cpp @@ -692,6 +692,9 @@ void free_scrypt(int thr_id) { int dev_id = device_map[thr_id]; + if (!init[thr_id]) + return; + // trivial way to free all... cudaSetDevice(dev_id); cudaDeviceSynchronize(); diff --git a/skein.cu b/skein.cu index 8577e2f..b7b46ec 100644 --- a/skein.cu +++ b/skein.cu @@ -376,7 +376,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no if (sm5) { skeincoin_init(thr_id); } else { - cudaMalloc(&d_hash[thr_id], throughput * 64U); + cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput); quark_skein512_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput); CUDA_SAFE_CALL(cudaDeviceSynchronize()); @@ -475,7 +475,9 @@ extern "C" void free_skeincoin(int thr_id) cudaSetDevice(device_map[thr_id]); - if (!sm5) { + if (sm5) + skeincoin_free(thr_id); + else { cudaFree(d_hash[thr_id]); cuda_check_cpu_free(thr_id); } diff --git a/util.cpp b/util.cpp index f721f24..13d1735 100644 --- a/util.cpp +++ b/util.cpp @@ -1829,33 +1829,20 @@ void do_gpu_tests(void) unsigned long done; char s[128] = { '\0' }; uchar buf[160]; - uint32_t tgt[8] = { 0 }; + struct work work; + memset(&work, 0, sizeof(work)); opt_tracegpu = true; work_restart = (struct work_restart*) malloc(sizeof(struct work_restart)); work_restart[0].restart = 1; - tgt[7] = 0xffff; - - //memset(buf, 0, sizeof buf); - //scanhash_skeincoin(0, (uint32_t*)buf, tgt, 1, &done); - - //memset(buf, 0, sizeof buf); - //memcpy(buf, zrtest, 80); - //scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done); + work.target[7] = 0xffff; //struct timeval tv; - //memset(buf, 0, sizeof buf); - //scanhash_scrypt_jane(0, (uint32_t*)buf, tgt, NULL, 1, &done, &tv, &tv); - - memset(buf, 0, sizeof buf); - scanhash_lyra2(0, (uint32_t*)buf, tgt, 1, &done); - - //memset(buf, 0, sizeof buf); - // buf[0] = 1; buf[64] = 2; // for endian tests - //scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14); + //memset(work.data, 0, sizeof(work.data)); + //scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv); - //memset(buf, 0, sizeof buf); - //scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84 + memset(work.data, 0, sizeof(work.data)); + scanhash_lyra2(0, &work, 1, &done); free(work_restart); work_restart = NULL; diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu index d243935..1f3cbcb 100644 --- a/x13/cuda_x13_fugue512.cu +++ b/x13/cuda_x13_fugue512.cu @@ -46,10 +46,10 @@ static unsigned int* d_textures[MAX_GPUS][4]; #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x)))) #define mixtab3(x) (*((uint32_t*)mixtabs + (768+(x)))) -texture mixTab0Tex; -texture mixTab1Tex; -texture mixTab2Tex; -texture mixTab3Tex; +static texture mixTab0Tex; +static texture mixTab1Tex; +static texture mixTab2Tex; +static texture mixTab3Tex; static const uint32_t mixtab0_cpu[] = { SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7), @@ -685,6 +685,11 @@ void x13_fugue512_cpu_init(int thr_id, uint32_t threads) __host__ void x13_fugue512_cpu_free(int thr_id) { + cudaUnbindTexture(mixTab0Tex); + cudaUnbindTexture(mixTab1Tex); + cudaUnbindTexture(mixTab2Tex); + cudaUnbindTexture(mixTab3Tex); + for (int i=0; i<4; i++) cudaFree(d_textures[thr_id][i]); } diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu index c6c98fd..49f6867 100644 --- a/x15/cuda_x15_whirlpool.cu +++ b/x15/cuda_x15_whirlpool.cu @@ -14,8 +14,8 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint32_t pTarget[8]; -uint32_t *d_wnounce[MAX_GPUS]; -uint32_t *d_WNonce[MAX_GPUS]; +static uint32_t *d_wnounce[MAX_GPUS]; +static uint32_t *d_WNonce[MAX_GPUS]; #define USE_ALL_TABLES 1