Browse Source

various fixes for SM 2.1 and the benchmark

X11+ algos and quark are not compatible for the moment

but these ones are :

Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB):

   blakecoin :     159090.5 kH/s,     1 MB,  1048576 thr.
       blake :      70208.9 kH/s,     1 MB,  1048576 thr.
         bmw :     122802.6 kH/s,    65 MB,  2097152 thr.
        deep :       3533.6 kH/s,    33 MB,   524288 thr.
    fugue256 :      43177.9 kH/s,    17 MB,   524288 thr.
       heavy :       4118.2 kH/s,   147 MB,   524032 thr.
      keccak :      18673.1 kH/s,   129 MB,  2097152 thr.
       luffa :      28816.0 kH/s,   257 MB,  4194304 thr.
       lyra2 :        213.7 kH/s,   570 MB,    65536 thr.
    mjollnir :       3895.6 kH/s,   147 MB,   524032 thr.
       nist5 :       1101.4 kH/s,    67 MB,  1048576 thr.
       penta :        501.6 kH/s,    21 MB,   327680 thr.
       skein :       5432.4 kH/s,    65 MB,  1048576 thr.
      skein2 :       6788.9 kH/s,    33 MB,   524288 thr.
   whirlpool :        688.5 kH/s,    33 MB,   524288 thr.
         zr5 :        122.5 kH/s,    86 MB,   262144 thr.
2upstream
Tanguy Pruvot 9 years ago
parent
commit
5bf1f98200
  1. 10
      Algo256/blake256.cu
  2. 9
      Algo256/bmw.cu
  3. 2
      Algo256/cuda_blake256.cu
  4. 8
      Algo256/keccak256.cu
  5. 15
      JHA/jackpotcoin.cu
  6. 15
      bench.cpp
  7. 2
      cuda_groestlcoin.cu
  8. 1
      lyra2/cuda_lyra2.cu
  9. 11
      pentablake.cu
  10. 5
      quark/quarkcoin.cu
  11. 10
      qubit/luffa.cu
  12. 2
      util.cpp
  13. 8
      x15/whirlpoolx.cu
  14. 8
      zr5.cu

10
Algo256/blake256.cu

@ -10,9 +10,11 @@ @@ -10,9 +10,11 @@
extern "C" {
#include "sph/sph_blake.h"
//extern int blake256_rounds;
}
#include <stdint.h>
#include <memory.h>
}
/* threads per block and throughput (intensity) */
#define TPB 128
@ -467,10 +469,10 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non @@ -467,10 +469,10 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
#endif
return rc;
}
else if (opt_debug) {
else if (vhashcpu[7] > ptarget[7] && opt_debug) {
applog_hash((uchar*)ptarget);
applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget);
applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
}
}
@ -493,7 +495,7 @@ extern "C" void free_blake256(int thr_id) @@ -493,7 +495,7 @@ extern "C" void free_blake256(int thr_id)
if (!init[thr_id])
return;
cudaSetDevice(device_map[thr_id]);
cudaThreadSynchronize();
cudaFreeHost(h_resNonce[thr_id]);
cudaFree(d_resNonce[thr_id]);

9
Algo256/bmw.cu

@ -87,7 +87,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u @@ -87,7 +87,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
return 1;
}
else {
applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
}
}
@ -110,13 +110,12 @@ extern "C" void free_bmw(int thr_id) @@ -110,13 +110,12 @@ extern "C" void free_bmw(int thr_id)
if (!init[thr_id])
return;
cudaSetDevice(device_map[thr_id]);
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
bmw256_midstate_free(thr_id);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}
init[thr_id] = false;
}

2
Algo256/cuda_blake256.cu

@ -207,7 +207,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin @@ -207,7 +207,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
input[3] = startNonce + thread;
blake256_compress2nd(h, input, 640);
#pragma unroll
#pragma unroll
for (int i = 0; i<4; i++) {
Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1]));
}

8
Algo256/keccak256.cu

@ -80,7 +80,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no @@ -80,7 +80,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
return 1;
}
else {
applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
}
}
@ -101,12 +101,12 @@ extern "C" void free_keccak256(int thr_id) @@ -101,12 +101,12 @@ extern "C" void free_keccak256(int thr_id)
if (!init[thr_id])
return;
cudaSetDevice(device_map[thr_id]);
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
keccak256_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}
init[thr_id] = false;
}

15
JHA/jackpotcoin.cu

@ -95,6 +95,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc @@ -95,6 +95,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@ -104,7 +105,13 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc @@ -104,7 +105,13 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
cuda_get_arch(thr_id);
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
proper_exit(EXIT_CODE_CUDA_ERROR);
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
@ -214,6 +221,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc @@ -214,6 +221,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
CUDA_LOG_ERROR();
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != UINT32_MAX)
{
uint32_t vhash64[8];
@ -239,15 +247,14 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc @@ -239,15 +247,14 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
pdata[19] = foundNonce;
return res;
} else {
applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!",
device_map[thr_id], foundNonce);
gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
}
}
if ((uint64_t) pdata[19] + throughput > max_nonce) {
*hashes_done = pdata[19] - first_nonce;
pdata[19] = max_nonce;
break;
return 0;
}
pdata[19] += throughput;

15
bench.cpp

@ -97,6 +97,21 @@ bool bench_algo_switch_next(int thr_id) @@ -97,6 +97,21 @@ bool bench_algo_switch_next(int thr_id)
if (algo == ALGO_C11) algo++; // same as x11
if (algo == ALGO_DMD_GR) algo++; // same as groestl
if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool
if (device_sm[dev_id] && device_sm[dev_id] < 300) {
// incompatible SM 2.1 kernels...
if (algo == ALGO_FRESH) algo++;
if (algo == ALGO_GROESTL) algo++;
if (algo == ALGO_MYR_GR) algo++;
if (algo == ALGO_JACKPOT) algo++;
if (algo == ALGO_LYRA2v2) algo++;
if (algo == ALGO_NEOSCRYPT) algo++;
if (algo == ALGO_QUARK) algo++;
if (algo == ALGO_QUBIT) algo++;
if (algo == ALGO_S3) algo++; // to check...
while (algo >= ALGO_X11 && algo <= ALGO_X17) algo++;
if (algo == ALGO_WHIRLPOOLX) algo++;
}
// and unwanted ones...
if (algo == ALGO_SCRYPT) algo++;
if (algo == ALGO_SCRYPT_JANE) algo++;

2
cuda_groestlcoin.cu

@ -153,7 +153,7 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo @@ -153,7 +153,7 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
int dev_id = device_map[thr_id];
if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
printf("Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
proper_exit(EXIT_CODE_CUDA_ERROR);
}

1
lyra2/cuda_lyra2.cu

@ -261,6 +261,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6 @@ -261,6 +261,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
if (device_sm[dev_id] == 500) tpb = TPB50;
if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not)
if (device_sm[dev_id] <= 300) tpb = TPB30;
dim3 grid((threads + tpb - 1) / tpb);

11
pentablake.cu

@ -423,7 +423,6 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n @@ -423,7 +423,6 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
pentablakehash(vhash, endiandata);
if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio)
work_set_target_ratio(work, vhash);
applog(LOG_NOTICE, "GPU found more than one result yippee!");
pdata[21] = extra_results[0];
extra_results[0] = UINT32_MAX;
rc++;
@ -431,7 +430,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n @@ -431,7 +430,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
pdata[19] = foundNonce;
return rc;
} else {
applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
}
}
@ -449,13 +448,13 @@ void free_pentablake(int thr_id) @@ -449,13 +448,13 @@ void free_pentablake(int thr_id)
if (!init[thr_id])
return;
cudaSetDevice(device_map[thr_id]);
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
cudaFreeHost(h_resNounce[thr_id]);
cudaFree(d_resNounce[thr_id]);
init[thr_id] = false;
cudaDeviceSynchronize();
}
init[thr_id] = false;
}

5
quark/quarkcoin.cu

@ -262,8 +262,7 @@ extern "C" void free_quark(int thr_id) @@ -262,8 +262,7 @@ extern "C" void free_quark(int thr_id)
if (!init[thr_id])
return;
cudaSetDevice(device_map[thr_id]);
cudaDeviceSynchronize();
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
@ -278,4 +277,4 @@ extern "C" void free_quark(int thr_id) @@ -278,4 +277,4 @@ extern "C" void free_quark(int thr_id)
init[thr_id] = false;
cudaDeviceSynchronize();
}
}

10
qubit/luffa.cu

@ -45,8 +45,10 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, @@ -45,8 +45,10 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1) // to reduce cpu usage...
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
//if (opt_cudaschedule == -1) // to reduce cpu usage...
// cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
@ -103,12 +105,12 @@ extern "C" void free_luffa(int thr_id) @@ -103,12 +105,12 @@ extern "C" void free_luffa(int thr_id)
if (!init[thr_id])
return;
cudaDeviceSynchronize();
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
cuda_check_cpu_free(thr_id);
cudaThreadSynchronize();
cudaDeviceSynchronize();
init[thr_id] = false;
}

2
util.cpp

@ -1856,7 +1856,7 @@ static uint32_t zrtest[20] = { @@ -1856,7 +1856,7 @@ static uint32_t zrtest[20] = {
void do_gpu_tests(void)
{
#ifdef _DEBUG
#if 1 //def _DEBUG
unsigned long done;
char s[128] = { '\0' };
uchar buf[160];

8
x15/whirlpoolx.cu

@ -9,7 +9,7 @@ extern "C" { @@ -9,7 +9,7 @@ extern "C" {
#include "miner.h"
#include "cuda_helper.h"
static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_hash[MAX_GPUS] = { 0 };
extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads);
extern void whirlpoolx_cpu_free(int thr_id);
@ -54,7 +54,7 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc @@ -54,7 +54,7 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc
if (!init[thr_id]) {
cudaSetDevice(device_map[thr_id]);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
whirlpoolx_cpu_init(thr_id, throughput);
@ -84,7 +84,7 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc @@ -84,7 +84,7 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc
pdata[19] = foundNonce;
return 1;
} else {
applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
}
}
@ -115,4 +115,4 @@ extern "C" void free_whirlx(int thr_id) @@ -115,4 +115,4 @@ extern "C" void free_whirlx(int thr_id)
init[thr_id] = false;
cudaDeviceSynchronize();
}
}

8
zr5.cu

@ -419,6 +419,10 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work, @@ -419,6 +419,10 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,
}
zr5_final_round(thr_id, throughput);
// do not scan results on interuption
if (work_restart[thr_id].restart)
return -1;
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != UINT32_MAX)
{
@ -455,7 +459,7 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work, @@ -455,7 +459,7 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,
}
return res;
} else {
applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
pdata[19]++;
pdata[0] = oldp0;
@ -497,4 +501,4 @@ extern "C" void free_zr5(int thr_id) @@ -497,4 +501,4 @@ extern "C" void free_zr5(int thr_id)
init[thr_id] = false;
cudaDeviceSynchronize();
}
}

Loading…
Cancel
Save