Browse Source

windows: add support for SM 2.1, drop SM 3.5 (x86)

Mostly to do compatibility tests; SM 2.1 support is very limited

SM 3.0 code should run on SM 3.5 (only a few cards use this arch)

As I can't test SM 3.5, it's best to let users do their own tests...
master
Tanguy Pruvot 9 years ago
parent
commit
4868c412b0
  1. 2
      Algo256/blake256.cu
  2. 3
      README.txt
  3. 1
      bench.cpp
  4. 2
      ccminer.vcxproj
  5. 1
      lyra2/cuda_lyra2v2_sm3.cuh
  6. 5
      lyra2/lyra2REv2.cu

2
Algo256/blake256.cu

@ -409,9 +409,9 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
} }
if (!init[thr_id]) { if (!init[thr_id]) {
if (active_gpus > 1)
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
CUDA_LOG_ERROR(); CUDA_LOG_ERROR();
cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)); cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t));
cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)); cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t));
CUDA_LOG_ERROR(); CUDA_LOG_ERROR();

3
README.txt

@ -229,7 +229,8 @@ features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
Under Dev... v1.7 Under Dev... v1.7
Improve lyra2 (v1) cuda implementation Add windows support for SM 2.1 and drop SM 3.5 (x86)
Improve lyra2 (v1/v2) cuda implementation
Restore whirlpool algo (and whirlcoin variant) Restore whirlpool algo (and whirlcoin variant)
Prepare algo switch ability Prepare algo switch ability
Add --benchmark alone to run a benchmark for all algos Add --benchmark alone to run a benchmark for all algos

1
bench.cpp

@ -96,6 +96,7 @@ bool bench_algo_switch_next(int thr_id)
// skip some duplicated algos // skip some duplicated algos
if (algo == ALGO_C11) algo++; // same as x11 if (algo == ALGO_C11) algo++; // same as x11
if (algo == ALGO_DMD_GR) algo++; // same as groestl if (algo == ALGO_DMD_GR) algo++; // same as groestl
if (algo == ALGO_MJOLLNIR) algo++; // same as heavy
if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool
if (device_sm[dev_id] && device_sm[dev_id] < 300) { if (device_sm[dev_id] && device_sm[dev_id] < 300) {

2
ccminer.vcxproj

@ -158,7 +158,7 @@
<MaxRegCount>80</MaxRegCount> <MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep> <Keep>true</Keep>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_35,sm_35</CodeGeneration> <CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Optimization>O2</Optimization> <Optimization>O2</Optimization>
</CudaCompile> </CudaCompile>

1
lyra2/cuda_lyra2v2_sm3.cuh

@ -17,7 +17,6 @@
#define Ncol 4 #define Ncol 4
#define vectype ulonglong4 #define vectype ulonglong4
#define u64type uint64_t
#define memshift 4 #define memshift 4
__device__ vectype *DMatrix; __device__ vectype *DMatrix;

5
lyra2/lyra2REv2.cu

@ -82,7 +82,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id]; int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18; int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18;
uint32_t throughput = cuda_default_throughput(dev_id, 1U << intensity); uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
@ -100,6 +100,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
keccak256_cpu_init(thr_id,throughput); keccak256_cpu_init(thr_id,throughput);
skein256_cpu_init(thr_id, throughput); skein256_cpu_init(thr_id, throughput);
bmw256_cpu_init(thr_id, throughput); bmw256_cpu_init(thr_id, throughput);
CUDA_LOG_ERROR();
// DMatrix (780Ti may prefer 16 instead of 12, cf djm34) // DMatrix (780Ti may prefer 16 instead of 12, cf djm34)
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], (size_t)12 * sizeof(uint64_t) * 4 * 4 * throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], (size_t)12 * sizeof(uint64_t) * 4 * 4 * throughput));
@ -118,7 +119,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
uint32_t endiandata[20]; uint32_t endiandata[20];
for (int k=0; k < 20; k++) for (int k=0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); be32enc(&endiandata[k], pdata[k]);
blake256_cpu_setBlock_80(pdata); blake256_cpu_setBlock_80(pdata);
bmw256_setTarget(ptarget); bmw256_setTarget(ptarget);

Loading…
Cancel
Save