From 922c2a5cd7cf1a153995b4d7b3bdec0ed8a5bb3a Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 8 Oct 2015 21:31:16 +0200 Subject: [PATCH] algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) --- Algo256/bmw.cu | 4 ++- Algo256/cuda_bmw.cu | 6 ++++ Algo256/cuda_groestl256.cu | 27 +++++++++++------- JHA/cuda_jha_compactionTest.cu | 17 ++++++++++-- JHA/jackpotcoin.cu | 6 +++- cuda_nist5.cu | 2 ++ cuda_skeincoin.cu | 7 ++++- heavy/cuda_blake512.cu | 16 ++++++++--- heavy/cuda_combine.cu | 8 +++++- heavy/cuda_groestl512.cu | 43 +++++++++++++++++++---------- heavy/cuda_hefty1.cu | 11 ++++++-- heavy/cuda_keccak512.cu | 10 +++++-- heavy/cuda_sha256.cu | 13 +++++++-- heavy/heavy.cu | 10 +++++-- heavy/heavy.h | 6 ++++ lyra2/lyra2RE.cu | 3 ++ lyra2/lyra2REv2.cu | 3 ++ neoscrypt/cuda_neoscrypt.cu | 2 +- pentablake.cu | 4 +-- quark/cuda_quark_groestl512.cu | 8 ++++++ quark/cuda_quark_groestl512_sm20.cu | 32 ++++++++++++++------- quark/quarkcoin.cu | 2 ++ qubit/qubit.cu | 9 +++--- scrypt-jane.cpp | 24 ++++++++++++++-- scrypt.cpp | 27 +++++++++++++++--- scrypt/blake.cu | 14 ++++++++-- scrypt/salsa_kernel.cu | 5 ++-- skein.cu | 10 +++++-- skein2.cpp | 12 ++++---- x11/c11.cu | 4 +++ x11/cuda_x11_simd512.cu | 7 +++++ x11/fresh.cu | 2 ++ x11/s3.cu | 2 ++ x11/x11.cu | 5 ++++ x13/cuda_x13_fugue512.cu | 31 +++++++++++++++------ x13/x13.cu | 8 +++++- x15/x14.cu | 14 ++++++++-- x15/x15.cu | 7 +++++ x17/x17.cu | 7 +++++ zr5.cu | 2 ++ 40 files changed, 337 insertions(+), 93 deletions(-) diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu index 2951482..9b91252 100644 --- a/Algo256/bmw.cu +++ b/Algo256/bmw.cu @@ -13,6 +13,7 @@ extern "C" { static uint32_t *d_hash[MAX_GPUS]; extern void bmw256_midstate_init(int thr_id, uint32_t threads); +extern void bmw256_midstate_free(int thr_id); extern void bmw256_setBlock_80(int thr_id, void *pdata); extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap); @@ -111,8 +112,9 @@ extern "C" void free_bmw(int thr_id) cudaSetDevice(device_map[thr_id]); cudaFree(d_hash[thr_id]); - + bmw256_midstate_free(thr_id); cuda_check_cpu_free(thr_id); + init[thr_id] = false; cudaDeviceSynchronize(); diff --git a/Algo256/cuda_bmw.cu b/Algo256/cuda_bmw.cu index b7a4f99..c4638b9 100644 --- a/Algo256/cuda_bmw.cu +++ b/Algo256/cuda_bmw.cu @@ -372,3 +372,9 @@ void bmw256_midstate_init(int thr_id, uint32_t threads) { cudaMalloc(&d_midstate[thr_id], sizeof(sph_bmw256_context)); } + +__host__ +void bmw256_midstate_free(int thr_id) +{ + cudaFree(d_midstate[thr_id]); +} diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu index b1ffa4f..b14a0f2 100644 --- a/Algo256/cuda_groestl256.cu +++ b/Algo256/cuda_groestl256.cu @@ -7,6 +7,7 @@ static uint32_t *h_GNonces[MAX_GPUS]; static uint32_t *d_GNonces[MAX_GPUS]; +static unsigned int* d_textures[MAX_GPUS][8]; __constant__ uint32_t pTarget[8]; @@ -249,28 +250,31 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *out } } -#define texDef(texname, texmem, texsource, texsize) \ +#define texDef(id, texname, texmem, texsource, texsize) { \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} __host__ void groestl256_cpu_init(int thr_id, uint32_t threads) { // Texturen mit obigem Makro initialisieren - texDef(t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256); - texDef(t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256); - texDef(t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256); - texDef(t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256); - texDef(t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256); - texDef(t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256); - texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256); - texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256); + texDef(0, t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256); + texDef(1, t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256); + texDef(2, t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256); + texDef(3, t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256); + texDef(4, t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256); + texDef(5, t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256); + texDef(6, t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256); + texDef(7, t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256); cudaMalloc(&d_GNonces[thr_id], 2*sizeof(uint32_t)); cudaMallocHost(&h_GNonces[thr_id], 2*sizeof(uint32_t)); @@ -279,6 +283,9 @@ void groestl256_cpu_init(int thr_id, uint32_t threads) __host__ void groestl256_cpu_free(int thr_id) { + for (int i=0; i<8; i++) + cudaFree(d_textures[thr_id][i]); + cudaFree(d_GNonces[thr_id]); cudaFreeHost(h_GNonces[thr_id]); } diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu index a1d1ec3..35a13b7 100644 --- a/JHA/cuda_jha_compactionTest.cu +++ b/JHA/cuda_jha_compactionTest.cu @@ -33,8 +33,9 @@ __device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFunction[MAX_GPUS]; -// Setup-Funktionen -__host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads) +// Setup-Function +__host__ +void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads) { cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t)); @@ -51,6 +52,18 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) } +__host__ +void jackpot_compactTest_cpu_free(int thr_id) +{ + cudaFree(d_tempBranch1Nonces[thr_id]); + cudaFree(d_numValid[thr_id]); + + cudaFree(d_partSum[0][thr_id]); + cudaFree(d_partSum[1][thr_id]); + + cudaFreeHost(h_numValid[thr_id]); +} + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 /** * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 674a388..558233e 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -21,6 +21,7 @@ extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -29,6 +30,7 @@ extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads); +extern void jackpot_compactTest_cpu_free(int thr_id); extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, @@ -264,9 +266,11 @@ extern "C" void free_jackpot(int thr_id) cudaFree(d_branch1Nonces[thr_id]); cudaFree(d_branch2Nonces[thr_id]); cudaFree(d_branch3Nonces[thr_id]); - cudaFree(d_jackpotNonces[thr_id]); + quark_groestl512_cpu_free(thr_id); + jackpot_compactTest_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/cuda_nist5.cu b/cuda_nist5.cu index c4c390d..44bfe60 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -19,6 +19,7 @@ extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t st extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -173,6 +174,7 @@ extern "C" void free_nist5(int thr_id) cudaFree(d_hash[thr_id]); + quark_groestl512_cpu_free(thr_id); cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/cuda_skeincoin.cu b/cuda_skeincoin.cu index 5a6e8a0..8c91c5e 100644 --- a/cuda_skeincoin.cu +++ b/cuda_skeincoin.cu @@ -708,6 +708,12 @@ __host__ void skeincoin_init(int thr_id) { cuda_get_arch(thr_id); + CUDA_SAFE_CALL(cudaMalloc(&d_found[thr_id], 2 * sizeof(uint32_t))); +} + +__host__ +void skeincoin_free(int thr_id) { + cudaFree(d_found[thr_id]); } __host__ @@ -716,7 +722,6 @@ void skeincoin_setBlock_80(int thr_id, void *pdata) uint64_t message[16]; memcpy(&message[0], pdata, 80); - CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 2 * sizeof(uint32_t))); cudaMemcpyToSymbol(c_message16, &message[8], 16, 0, cudaMemcpyHostToDevice); precalc(message); diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu index 22f413b..f3d22fa 100644 --- a/heavy/cuda_blake512.cu +++ b/heavy/cuda_blake512.cu @@ -191,8 +191,9 @@ template __global__ void blake512_gpu_hash(uint32_t threads, uin // ---------------------------- END CUDA blake512 functions ------------------------------------ -// Setup-Funktionen -__host__ void blake512_cpu_init(int thr_id, uint32_t threads) +// Setup Function +__host__ +void blake512_cpu_init(int thr_id, uint32_t threads) { // Kopiere die Hash-Tabellen in den GPU-Speicher cudaMemcpyToSymbol( c_sigma, @@ -211,12 +212,19 @@ __host__ void blake512_cpu_init(int thr_id, uint32_t threads) 0, cudaMemcpyHostToDevice); // Speicher für alle Ergebnisse belegen - CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], (size_t) 64 * threads)); +} + +__host__ +void blake512_cpu_free(int thr_id) +{ + cudaFree(d_hash5output[thr_id]); } static int BLOCKSIZE = 84; -__host__ void blake512_cpu_setBlock(void *pdata, int len) +__host__ +void blake512_cpu_setBlock(void *pdata, int len) // data muss 84-Byte haben! // heftyHash hat 32-Byte { diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu index 3365cf1..64a9561 100644 --- a/heavy/cuda_combine.cu +++ b/heavy/cuda_combine.cu @@ -124,7 +124,13 @@ __host__ void combine_cpu_init(int thr_id, uint32_t threads) { // Speicher für alle Ergebnisse belegen - CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], (size_t) 32 * threads)); +} + +__host__ +void combine_cpu_free(int thr_id) +{ + cudaFree(d_hashoutput[thr_id]); } __host__ diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu index 0e503d3..ef4c2c1 100644 --- a/heavy/cuda_groestl512.cu +++ b/heavy/cuda_groestl512.cu @@ -9,6 +9,7 @@ // globaler Speicher für alle HeftyHashes aller Threads extern uint32_t *heavy_heftyHashes[MAX_GPUS]; extern uint32_t *heavy_nonceVector[MAX_GPUS]; +static unsigned int *d_textures[MAX_GPUS][8]; // globaler Speicher für unsere Ergebnisse uint32_t *d_hash4output[MAX_GPUS]; @@ -730,36 +731,50 @@ template __global__ void groestl512_gpu_hash(uint32_t threads, u } } -#define texDef(texname, texmem, texsource, texsize) \ +#define texDef(id, texname, texmem, texsource, texsize) { \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} -// Setup-Funktionen -__host__ void groestl512_cpu_init(int thr_id, uint32_t threads) +// Setup Function +__host__ +void groestl512_cpu_init(int thr_id, uint32_t threads) { // Texturen mit obigem Makro initialisieren - texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); + texDef(0, t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); + texDef(1, t0dn, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); + texDef(2, t1up, d_T1up, T1up_cpu, sizeof(uint32_t)*256); + texDef(3, t1dn, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); + texDef(4, t2up, d_T2up, T2up_cpu, sizeof(uint32_t)*256); + texDef(5, t2dn, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); + texDef(6, t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256); + texDef(7, t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); + cudaMalloc(&d_hash4output[thr_id], (size_t) 64 * threads); +} + +__host__ +void groestl512_cpu_free(int thr_id) +{ + for (int i=0; i <8; i++) + cudaFree(d_textures[thr_id][i]); + + cudaFree(d_hash4output[thr_id]); } static int BLOCKSIZE = 84; -__host__ void groestl512_cpu_setBlock(void *data, int len) +__host__ +void groestl512_cpu_setBlock(void *data, int len) // data muss 80/84-Byte haben! // heftyHash hat 32-Byte { diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu index d8a65a8..4c83441 100644 --- a/heavy/cuda_hefty1.cu +++ b/heavy/cuda_hefty1.cu @@ -31,7 +31,8 @@ uint32_t hefty_cpu_hashTable[] = { 0x510e527fUL, 0x9b05688cUL, 0x1f83d9abUL, - 0x5be0cd19UL }; + 0x5be0cd19UL +}; uint32_t hefty_cpu_constantTable[] = { 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, @@ -316,7 +317,13 @@ void hefty_cpu_init(int thr_id, uint32_t threads) sizeof(uint32_t) * 64 ); // Speicher für alle Hefty1 hashes belegen - CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], (size_t) 32 * threads)); +} + +__host__ +void hefty_cpu_free(int thr_id) +{ + cudaFree(heavy_heftyHashes[thr_id]); } __host__ diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu index 7415848..3e94f16 100644 --- a/heavy/cuda_keccak512.cu +++ b/heavy/cuda_keccak512.cu @@ -185,7 +185,7 @@ template __global__ void keccak512_gpu_hash(uint32_t threads, ui // ---------------------------- END CUDA keccak512 functions ------------------------------------ -__host__ +__host__ void keccak512_cpu_init(int thr_id, uint32_t threads) { // Kopiere die Hash-Tabellen in den GPU-Speicher @@ -195,7 +195,13 @@ void keccak512_cpu_init(int thr_id, uint32_t threads) 0, cudaMemcpyHostToDevice); // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads); + cudaMalloc(&d_hash3output[thr_id], (size_t) 64 * threads); +} + +__host__ +void keccak512_cpu_free(int thr_id) +{ + cudaFree(d_hash3output[thr_id]); } // ----------------BEGIN keccak512 CPU version from scrypt-jane code -------------------- diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu index d0305da..7441621 100644 --- a/heavy/cuda_sha256.cu +++ b/heavy/cuda_sha256.cu @@ -160,8 +160,9 @@ template __global__ void sha256_gpu_hash(uint32_t threads, uint3 } } -// Setup-Funktionen -__host__ void sha256_cpu_init(int thr_id, uint32_t threads) +// Setup Function +__host__ +void sha256_cpu_init(int thr_id, uint32_t threads) { // Kopiere die Hash-Tabellen in den GPU-Speicher cudaMemcpyToSymbol( sha256_gpu_constantTable, @@ -169,7 +170,13 @@ __host__ void sha256_cpu_init(int thr_id, uint32_t threads) sizeof(uint32_t) * 64 ); // Speicher für alle Ergebnisse belegen - cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); + cudaMalloc(&d_hash2output[thr_id], (size_t) 8 * sizeof(uint32_t) * threads); +} + +__host__ +void sha256_cpu_free(int thr_id) +{ + cudaFree(d_hash2output[thr_id]); } static int BLOCKSIZE = 84; diff --git a/heavy/heavy.cu b/heavy/heavy.cu index 605c81f..37eb8b8 100644 --- a/heavy/heavy.cu +++ b/heavy/heavy.cu @@ -178,7 +178,7 @@ int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned l { uint16_t *ext = (uint16_t *)&pdata[20]; - if (opt_vote > maxvote) { + if (opt_vote > maxvote && !opt_benchmark) { applog(LOG_WARNING, "Your block reward vote (%hu) exceeds " "the maxvote reported by the pool (%hu).", opt_vote, maxvote); @@ -310,12 +310,18 @@ extern "C" void free_heavy(int thr_id) cudaFree(heavy_nonceVector[thr_id]); - // todo: free sub algos vectors + blake512_cpu_free(thr_id); + groestl512_cpu_free(thr_id); + hefty_cpu_free(thr_id); + keccak512_cpu_free(thr_id); + sha256_cpu_free(thr_id); + combine_cpu_free(thr_id); init[thr_id] = false; cudaDeviceSynchronize(); } + __host__ void heavycoin_hash(uchar* output, const uchar* input, int len) { diff --git a/heavy/heavy.h b/heavy/heavy.h index 59f3913..50affc4 100644 --- a/heavy/heavy.h +++ b/heavy/heavy.h @@ -4,27 +4,33 @@ void blake512_cpu_init(int thr_id, uint32_t threads); void blake512_cpu_setBlock(void *pdata, int len); void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); +void blake512_cpu_free(int thr_id); void groestl512_cpu_init(int thr_id, uint32_t threads); void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); void groestl512_cpu_setBlock(void *data, int len); void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); +void groestl512_cpu_free(int thr_id); void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce); void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len); void hefty_cpu_init(int thr_id, uint32_t threads); +void hefty_cpu_free(int thr_id); void keccak512_cpu_init(int thr_id, uint32_t threads); void keccak512_cpu_setBlock(void *data, int len); void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce); +void keccak512_cpu_free(int thr_id); void sha256_cpu_init(int thr_id, uint32_t threads); void sha256_cpu_setBlock(void *data, int len); void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce); void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy); +void sha256_cpu_free(int thr_id); void combine_cpu_init(int thr_id, uint32_t threads); void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash); +void combine_cpu_free(int thr_id); #endif diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index 8475eb4..5a9cb71 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -17,6 +17,7 @@ extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const extern void blake256_cpu_setBlock_80(uint32_t *pdata); extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void keccak256_cpu_init(int thr_id, uint32_t threads); +extern void keccak256_cpu_free(int thr_id); extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void skein256_cpu_init(int thr_id, uint32_t threads); @@ -174,7 +175,9 @@ extern "C" void free_lyra2(int thr_id) cudaFree(d_hash[thr_id]); + keccak256_cpu_free(thr_id); groestl256_cpu_free(thr_id); + init[thr_id] = false; cudaDeviceSynchronize(); diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index f327b46..15327ec 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -19,6 +19,7 @@ extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const extern void blake256_cpu_setBlock_80(uint32_t *pdata); extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void keccak256_cpu_init(int thr_id, uint32_t threads); +extern void keccak256_cpu_free(int thr_id); extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); @@ -183,6 +184,8 @@ extern "C" void free_lyra2v2(int thr_id) cudaFree(d_matrix[thr_id]); bmw256_cpu_free(thr_id); + keccak256_cpu_free(thr_id); + init[thr_id] = false; cudaDeviceSynchronize(); diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index 79d14d6..e7f4b21 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -730,7 +730,7 @@ void neoscrypt_cpu_init(int thr_id, uint32_t threads) { cuda_get_arch(thr_id); cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t)); - CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], threads * 256 * SHIFT)); + CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], (size_t) 256 * SHIFT * threads)); cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice); } diff --git a/pentablake.cu b/pentablake.cu index ca909b5..5d19a9c 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -381,7 +381,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n if (active_gpus > 1) { cudaSetDevice(device_map[thr_id]); } - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t))); @@ -452,7 +452,7 @@ void free_pentablake(int thr_id) cudaSetDevice(device_map[thr_id]); cudaFree(d_hash[thr_id]); - cudaFree(h_resNounce[thr_id]); + cudaFreeHost(h_resNounce[thr_id]); cudaFree(d_resNounce[thr_id]); init[thr_id] = false; diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 2a946b9..456b009 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -141,6 +141,14 @@ void quark_groestl512_cpu_init(int thr_id, uint32_t threads) quark_groestl512_sm20_init(thr_id, threads); } +__host__ +void quark_groestl512_cpu_free(int thr_id) +{ + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) + quark_groestl512_sm20_free(thr_id); +} + __host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { diff --git a/quark/cuda_quark_groestl512_sm20.cu b/quark/cuda_quark_groestl512_sm20.cu index ea710c8..f368594 100644 --- a/quark/cuda_quark_groestl512_sm20.cu +++ b/quark/cuda_quark_groestl512_sm20.cu @@ -5,6 +5,8 @@ #define MAXWELL_OR_FERMI 0 #define USE_SHARED 1 +static unsigned int *d_textures[MAX_GPUS][8]; + // #define SPH_C32(x) ((uint32_t)(x ## U)) // #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) @@ -274,28 +276,38 @@ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32 #endif } -#define texDef(texname, texmem, texsource, texsize) \ +#define texDef(id, texname, texmem, texsource, texsize) { \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} __host__ void quark_groestl512_sm20_init(int thr_id, uint32_t threads) { // Texturen mit obigem Makro initialisieren - texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); - texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); - texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); - texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); - texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); - texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); - texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); - texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); + texDef(0, t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); + texDef(1, t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256); + texDef(2, t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256); + texDef(3, t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256); + texDef(4, t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256); + texDef(5, t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256); + texDef(6, t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); + texDef(7, t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); +} + +__host__ +void quark_groestl512_sm20_free(int thr_id) +{ + for (int i=0; i<8; i++) + cudaFree(d_textures[thr_id][i]); } __host__ diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index e03b51b..772fd65 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -30,6 +30,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -270,6 +271,7 @@ extern "C" void free_quark(int thr_id) cudaFree(d_branch2Nonces[thr_id]); cudaFree(d_branch3Nonces[thr_id]); + quark_groestl512_cpu_free(thr_id); quark_compactTest_cpu_free(thr_id); cuda_check_cpu_free(thr_id); diff --git a/qubit/qubit.cu b/qubit/qubit.cu index 3d40ded..7837892 100644 --- a/qubit/qubit.cu +++ b/qubit/qubit.cu @@ -28,14 +28,11 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads); -extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, - uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, - int order); extern "C" void qubithash(void *state, const void *input) { @@ -96,7 +93,7 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, x11_simd512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); cuda_check_cpu_init(thr_id, throughput); @@ -167,6 +164,8 @@ extern "C" void free_qubit(int thr_id) cudaFree(d_hash[thr_id]); + x11_simd512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp index f65388e..099501a 100644 --- a/scrypt-jane.cpp +++ b/scrypt-jane.cpp @@ -426,10 +426,17 @@ unsigned char GetNfactor(unsigned int nTimestamp) return Nfactor; } +static bool init[MAX_GPUS] = { 0 }; + // cleanup void free_scrypt_jane(int thr_id) { - // todo ? + int dev_id = device_map[thr_id]; + + cudaSetDevice(dev_id); + cudaDeviceReset(); // well, simple way to free ;) + + init[thr_id] = false; } #define bswap_32x4(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ @@ -467,7 +474,18 @@ int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsi s_Nfactor = Nfactor; } - int throughput = cuda_throughput(thr_id); + static __thread int throughput = 0; + if(!init[thr_id]) { + int dev_id = device_map[thr_id]; + + cudaSetDevice(dev_id); + cudaDeviceReset(); + cudaSetDevice(dev_id); + throughput = cuda_throughput(thr_id); + applog(LOG_INFO, "GPU #%d: cuda throughput is %d", dev_id, throughput); + + init[thr_id] = true; + } if(throughput == 0) return -1; @@ -602,7 +620,7 @@ int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsi if (memcmp(thash, &hash[cur][8*i], 32) == 0) { - bn_store_hash_target_ratio(thash, ptarget, work); + work_set_target_ratio(work, thash); *hashes_done = n - pdata[19]; pdata[19] = tmp_nonce; scrypt_free(&Vbuf); diff --git a/scrypt.cpp b/scrypt.cpp index d13b0bd..62c5a58 100644 --- a/scrypt.cpp +++ b/scrypt.cpp @@ -685,10 +685,18 @@ static int lastFactor = 0; static void computeGold(uint32_t* const input, uint32_t *reference, uchar *scratchpad); +static bool init[MAX_GPUS] = { 0 }; + // cleanup void free_scrypt(int thr_id) { - // todo ? + int dev_id = device_map[thr_id]; + + // trivial way to free all... + cudaSetDevice(dev_id); + cudaDeviceReset(); + + init[thr_id] = false; } // Scrypt proof of work algorithm @@ -701,9 +709,20 @@ int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned int result = 0; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; - int throughput = cuda_throughput(thr_id); + static __thread int throughput = 0; + + if (!init[thr_id]) { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + cudaDeviceReset(); + cudaSetDevice(dev_id); + throughput = cuda_throughput(thr_id); + applog(LOG_INFO, "GPU #%d: cuda throughput is %d", dev_id, throughput); + + init[thr_id] = true; + } - if(throughput == 0) + if (throughput == 0) return -1; gettimeofday(tv_start, NULL); @@ -912,7 +931,7 @@ int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned device_map[thr_id], device_name[thr_id], i, cur); } else { *hashes_done = n - pdata[19]; - bn_store_hash_target_ratio(refhash, ptarget, work); + work_set_target_ratio(work, refhash); pdata[19] = nonce[cur] + i; result = 1; goto byebye; diff --git a/scrypt/blake.cu b/scrypt/blake.cu index 09ed4e6..5e8443a 100644 --- a/scrypt/blake.cu +++ b/scrypt/blake.cu @@ -404,10 +404,10 @@ void cuda_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool static std::map context_good[2]; +static bool init[MAX_GPUS] = { 0 }; + bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]) { - static bool init[MAX_GPUS] = { 0 }; - if (!init[thr_id]) { // allocate pinned host memory for good hashes @@ -441,3 +441,13 @@ void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32 cudaMemcpyDeviceToHost, context_streams[stream][thr_id])); } } + +void default_free_blake256(int thr_id) +{ + if (init[thr_id]) { + cudaFree(context_good[0][thr_id]); + cudaFree(context_good[1][thr_id]); + init[thr_id] = false; + } +} + diff --git a/scrypt/salsa_kernel.cu b/scrypt/salsa_kernel.cu index a22a863..9fb3c44 100644 --- a/scrypt/salsa_kernel.cu +++ b/scrypt/salsa_kernel.cu @@ -144,9 +144,8 @@ int cuda_throughput(int thr_id) cuCtxCreate( &ctx, CU_CTX_SCHED_YIELD, device_map[thr_id] ); cuCtxSetCurrent(ctx); #else - checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield)); checkCudaErrors(cudaSetDevice(device_map[thr_id])); - // checkCudaErrors(cudaFree(0)); + checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleYield)); #endif KernelInterface *kernel; @@ -256,7 +255,7 @@ inline int _ConvertSMVer2Cores(int major, int minor) } // If we don't find the values, we default use the previous one to run properly - applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128); + applog(LOG_WARNING, "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM", major, minor, 128); return 128; } diff --git a/skein.cu b/skein.cu index cfeef37..8577e2f 100644 --- a/skein.cu +++ b/skein.cu @@ -11,12 +11,14 @@ #include static uint32_t *d_hash[MAX_GPUS]; +static __thread bool sm5 = true; extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void skein512_cpu_setBlock_80(void *pdata); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); extern void skeincoin_init(int thr_id); +extern void skeincoin_free(int thr_id); extern void skeincoin_setBlock_80(int thr_id, void *pdata); extern uint32_t skeincoin_hash_sm5(int thr_id, uint32_t threads, uint32_t startNounce, int swap, uint64_t target64, uint32_t *secNonce); @@ -355,7 +357,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no const uint32_t first_nonce = pdata[19]; const int swap = 1; - bool sm5 = (device_sm[device_map[thr_id]] >= 500); + sm5 = (device_sm[device_map[thr_id]] >= 500); bool checkSecnonce = (have_stratum || have_longpoll) && !sm5; uint32_t throughput = device_intensity(thr_id, __func__, 1U << 20); @@ -473,9 +475,11 @@ extern "C" void free_skeincoin(int thr_id) cudaSetDevice(device_map[thr_id]); - cudaFree(d_hash[thr_id]); + if (!sm5) { + cudaFree(d_hash[thr_id]); + cuda_check_cpu_free(thr_id); + } - cuda_check_cpu_free(thr_id); init[thr_id] = false; cudaDeviceSynchronize(); diff --git a/skein2.cpp b/skein2.cpp index d2974dd..ef0c930 100644 --- a/skein2.cpp +++ b/skein2.cpp @@ -16,6 +16,7 @@ extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void skein512_cpu_setBlock_80(void *pdata); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); void skein2hash(void *output, const void *input) @@ -38,6 +39,7 @@ static bool init[MAX_GPUS] = { 0 }; int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { + int dev_id = device_map[thr_id]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -50,9 +52,9 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); - cudaMalloc(&d_hash[thr_id], throughput * 64U); + cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput); quark_skein512_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput); @@ -92,7 +94,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned work_set_target_ratio(work, vhash64); if (secNonce != 0) { if (!opt_quiet) - applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", device_map[thr_id], swab32(secNonce)); + applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce)); endiandata[19] = secNonce; skein2hash(vhash64, endiandata); @@ -104,7 +106,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned pdata[19] = swab32(foundNonce); return res; } else { - applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce); + applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", dev_id, foundNonce); } } @@ -135,4 +137,4 @@ void free_skein2(int thr_id) init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file +} diff --git a/x11/c11.cu b/x11/c11.cu index c59c594..c3a0848 100644 --- a/x11/c11.cu +++ b/x11/c11.cu @@ -32,6 +32,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -50,6 +51,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -255,6 +257,8 @@ extern "C" void free_c11(int thr_id) cudaSetDevice(device_map[thr_id]); cudaFree(d_hash[thr_id]); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 326ed12..797fa49 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -672,6 +672,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads) return 0; } +__host__ +void x11_simd512_cpu_free(int thr_id) +{ + cudaFree(d_temp4[thr_id]); + cudaFree(d_state[thr_id]); +} + __host__ void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { diff --git a/x11/fresh.cu b/x11/fresh.cu index 81ecac9..49af70b 100644 --- a/x11/fresh.cu +++ b/x11/fresh.cu @@ -21,6 +21,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -163,6 +164,7 @@ extern "C" void free_fresh(int thr_id) cudaSetDevice(device_map[thr_id]); cudaFree(d_hash[thr_id]); + x11_simd512_cpu_free(thr_id); cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/x11/s3.cu b/x11/s3.cu index c3fc8d3..e8e522c 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -21,6 +21,7 @@ extern void x11_shavite512_setBlock_80(void *pdata); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -146,6 +147,7 @@ extern "C" void free_s3(int thr_id) cudaSetDevice(device_map[thr_id]); cudaFree(d_hash[thr_id]); + x11_simd512_cpu_free(thr_id); cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/x11/x11.cu b/x11/x11.cu index 3ac4158..ba061e0 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -32,6 +32,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -50,6 +51,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -254,6 +256,9 @@ extern "C" void free_x11(int thr_id) cudaFree(d_hash[thr_id]); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu index ea26f17..d243935 100644 --- a/x13/cuda_x13_fugue512.cu +++ b/x13/cuda_x13_fugue512.cu @@ -38,6 +38,9 @@ * @author phm */ +// store allocated textures device addresses +static unsigned int* d_textures[MAX_GPUS][4]; + #define mixtab0(x) (*((uint32_t*)mixtabs + ( (x)))) #define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x)))) #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x)))) @@ -657,25 +660,37 @@ __global__ void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce, } } -#define texDef(texname, texmem, texsource, texsize) \ +#define texDef(id, texname, texmem, texsource, texsize) { \ unsigned int *texmem; \ cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ texname.normalized = 0; \ texname.filterMode = cudaFilterModePoint; \ texname.addressMode[0] = cudaAddressModeClamp; \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ - cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} + +__host__ +void x13_fugue512_cpu_init(int thr_id, uint32_t threads) +{ + texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); + texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); + texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); + texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); +} -__host__ void x13_fugue512_cpu_init(int thr_id, uint32_t threads) +__host__ +void x13_fugue512_cpu_free(int thr_id) { - texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); - texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); - texDef(mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256); - texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); + for (int i=0; i<4; i++) + cudaFree(d_textures[thr_id][i]); } -__host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; diff --git a/x13/x13.cu b/x13/x13.cu index a2727c2..5833e71 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -34,6 +34,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -52,6 +53,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -61,7 +63,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); - +extern void x13_fugue512_cpu_free(int thr_id); // X13 CPU Hash extern "C" void x13hash(void *output, const void *input) @@ -256,6 +258,10 @@ extern "C" void free_x13(int thr_id) cudaFree(d_hash[thr_id]); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/x15/x14.cu b/x15/x14.cu index 81b6183..195ace9 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -38,6 +38,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -56,6 +57,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -65,6 +67,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -185,9 +188,10 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, x13_fugue512_cpu_init(thr_id, throughput); x14_shabal512_cpu_init(thr_id, throughput); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - cuda_check_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + init[thr_id] = true; } @@ -260,8 +264,12 @@ extern "C" void free_x14(int thr_id) cudaFree(d_hash[thr_id]); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file +} diff --git a/x15/x15.cu b/x15/x15.cu index 92b6efc..5808770 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -39,6 +39,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -57,6 +58,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -66,6 +68,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -275,6 +278,10 @@ extern "C" void free_x15(int thr_id) cudaFree(d_hash[thr_id]); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/x17/x17.cu b/x17/x17.cu index 086b5e8..ee42522 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -42,6 +42,7 @@ extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -60,6 +61,7 @@ extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern int x11_simd512_cpu_init(int thr_id, uint32_t threads); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_simd512_cpu_free(int thr_id); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -69,6 +71,7 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t star extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -297,6 +300,10 @@ extern "C" void free_x17(int thr_id) cudaFree(d_hash[thr_id]); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/zr5.cu b/zr5.cu index 5ee62a0..93fe4c7 100644 --- a/zr5.cu +++ b/zr5.cu @@ -319,6 +319,7 @@ extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_groestl512_cpu_free(int thr_id); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -491,6 +492,7 @@ extern "C" void free_zr5(int thr_id) cudaFree(d_txs[thr_id]); + quark_groestl512_cpu_free(thr_id); cuda_check_cpu_free(thr_id); init[thr_id] = false;