Browse Source

cleanup: use unsigned throughput parameters

Yes, its a big commit, was waiting 1.6 to do that...
Sorry for your possible merge issues ;)
2upstream
Tanguy Pruvot 10 years ago
parent
commit
e6112e878d
  1. 4
      Algo256/blake256.cu
  2. 4
      Algo256/cuda_blake256.cu
  3. 12
      Algo256/cuda_fugue256.cu
  4. 14
      Algo256/cuda_groestl256.cu
  5. 18
      Algo256/cuda_keccak256.cu
  6. 14
      Algo256/cuda_skein256.cu
  7. 6
      Algo256/keccak256.cu
  8. 12
      JHA/cuda_jha_compactionTest.cu
  9. 10
      JHA/cuda_jha_keccak512.cu
  10. 30
      JHA/jackpotcoin.cu
  11. 24
      cuda_checkhash.cu
  12. 4
      cuda_fugue256.h
  13. 10
      cuda_groestlcoin.cu
  14. 4
      cuda_groestlcoin.h
  15. 6
      cuda_helper.h
  16. 14
      cuda_myriadgroestl.cu
  17. 24
      cuda_nist5.cu
  18. 2
      fuguecoin.cpp
  19. 10
      heavy/cuda_blake512.cu
  20. 10
      heavy/cuda_combine.cu
  21. 12
      heavy/cuda_groestl512.cu
  22. 12
      heavy/cuda_hefty1.cu
  23. 12
      heavy/cuda_keccak512.cu
  24. 12
      heavy/cuda_sha256.cu
  25. 4
      heavy/heavy.cu
  26. 32
      heavy/heavy.h
  27. 8
      lyra2/cuda_lyra2.cu
  28. 20
      lyra2/lyra2RE.cu
  29. 4
      myriadgroestl.cpp
  30. 22
      pentablake.cu
  31. 40
      quark/animecoin.cu
  32. 6
      quark/cuda_bmw512.cu
  33. 10
      quark/cuda_jh512.cu
  34. 18
      quark/cuda_quark_blake512.cu
  35. 14
      quark/cuda_quark_compactionTest.cu
  36. 14
      quark/cuda_quark_groestl512.cu
  37. 10
      quark/cuda_quark_groestl512_sm20.cu
  38. 14
      quark/cuda_quark_keccak512.cu
  39. 14
      quark/cuda_skein512.cu
  40. 40
      quark/quarkcoin.cu
  41. 18
      qubit/deep.cu
  42. 8
      qubit/doom.cu
  43. 28
      qubit/qubit.cu
  44. 18
      qubit/qubit_luffa512.cu
  45. 4
      scrypt.c
  46. 10
      x11/cuda_x11_cubehash512.cu
  47. 10
      x11/cuda_x11_echo.cu
  48. 10
      x11/cuda_x11_luffa512.cu
  49. 10
      x11/cuda_x11_luffa512_Cubehash.cu
  50. 18
      x11/cuda_x11_shavite512.cu
  51. 24
      x11/cuda_x11_simd512.cu
  52. 22
      x11/fresh.cu
  53. 16
      x11/s3.cu
  54. 50
      x11/x11.cu
  55. 10
      x13/cuda_x13_fugue512.cu
  56. 10
      x13/cuda_x13_hamsi512.cu
  57. 56
      x13/x13.cu
  58. 10
      x15/cuda_x14_shabal512.cu
  59. 20
      x15/cuda_x15_whirlpool.cu
  60. 12
      x15/whirlpool.cu
  61. 60
      x15/x14.cu
  62. 64
      x15/x15.cu
  63. 10
      x17/cuda_x17_haval512.cu
  64. 10
      x17/cuda_x17_sha512.cu
  65. 72
      x17/x17.cu

4
Algo256/blake256.cu

@ -245,7 +245,7 @@ __host__
uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget, uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget,
const uint32_t crcsum, const int8_t rounds) const uint32_t crcsum, const int8_t rounds)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
uint32_t result = UINT32_MAX; uint32_t result = UINT32_MAX;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
@ -331,7 +331,7 @@ __host__
static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget, static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint64_t highTarget,
const int8_t rounds) const int8_t rounds)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
uint32_t result = UINT32_MAX; uint32_t result = UINT32_MAX;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

4
Algo256/cuda_blake256.cu

@ -218,7 +218,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
__host__ __host__
void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -243,7 +243,7 @@ void blake256_cpu_setBlock_80(uint32_t *pdata)
} }
__host__ __host__
void blake256_cpu_init(int thr_id, int threads) void blake256_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);

12
Algo256/cuda_fugue256.cu

@ -548,7 +548,7 @@ __global__ void __launch_bounds__(256)
#else #else
__global__ void __global__ void
#endif #endif
fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
{ {
#if USE_SHARED #if USE_SHARED
extern __shared__ char mixtabs[]; extern __shared__ char mixtabs[];
@ -561,7 +561,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
__syncthreads(); __syncthreads();
#endif #endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
/* Nimm den State und verarbeite das letztenByte (die Nounce) */ /* Nimm den State und verarbeite das letztenByte (die Nounce) */
@ -718,7 +718,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
void fugue256_cpu_init(int thr_id, int threads) void fugue256_cpu_init(int thr_id, uint32_t threads)
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
@ -751,12 +751,12 @@ __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
} }
__host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce) __host__ void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
{ {
#if USE_SHARED #if USE_SHARED
const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
#else #else
const int threadsperblock = 512; // so einstellen wie gewünscht ;-) const uint32_t threadsperblock = 512; // so einstellen wie gewünscht ;-)
#endif #endif
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

14
Algo256/cuda_groestl256.cu

@ -105,7 +105,7 @@ extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[]; extern uint32_t T3dn_cpu[];
__device__ __forceinline__ __device__ __forceinline__
void groestl256_perm_P(int thread,uint32_t *a, char *mixtabs) void groestl256_perm_P(uint32_t thread,uint32_t *a, char *mixtabs)
{ {
#pragma unroll 10 #pragma unroll 10
for (int r = 0; r<10; r++) for (int r = 0; r<10; r++)
@ -136,7 +136,7 @@ void groestl256_perm_P(int thread,uint32_t *a, char *mixtabs)
} }
__device__ __forceinline__ __device__ __forceinline__
void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs) void groestl256_perm_Q(uint32_t thread, uint32_t *a, char *mixtabs)
{ {
#pragma unroll #pragma unroll
for (int r = 0; r<10; r++) for (int r = 0; r<10; r++)
@ -175,7 +175,7 @@ void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs)
} }
__global__ __launch_bounds__(256,1) __global__ __launch_bounds__(256,1)
void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector) void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector)
{ {
#if USE_SHARED #if USE_SHARED
extern __shared__ char mixtabs[]; extern __shared__ char mixtabs[];
@ -194,7 +194,7 @@ void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHa
__syncthreads(); __syncthreads();
#endif #endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// GROESTL // GROESTL
@ -259,7 +259,7 @@ void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHa
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
__host__ __host__
void groestl256_cpu_init(int thr_id, int threads) void groestl256_cpu_init(int thr_id, uint32_t threads)
{ {
// Texturen mit obigem Makro initialisieren // Texturen mit obigem Makro initialisieren
@ -277,11 +277,11 @@ void groestl256_cpu_init(int thr_id, int threads)
} }
__host__ __host__
uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
uint32_t result = 0xffffffff; uint32_t result = 0xffffffff;
cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t)); cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

18
Algo256/cuda_keccak256.cu

@ -170,9 +170,9 @@ static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
#endif #endif
__global__ __launch_bounds__(128,5) __global__ __launch_bounds__(128,5)
void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
@ -210,11 +210,11 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash,
} }
__host__ __host__
uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{ {
uint32_t result = UINT32_MAX; uint32_t result = UINT32_MAX;
cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t)); cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 128; const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -232,9 +232,9 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui
} }
__global__ __launch_bounds__(256,3) __global__ __launch_bounds__(256,3)
void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash) void keccak256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */ #if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
@ -272,9 +272,9 @@ void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHa
} }
__host__ __host__
void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -293,7 +293,7 @@ void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
} }
__host__ __host__
void keccak256_cpu_init(int thr_id, int threads) void keccak256_cpu_init(int thr_id, uint32_t threads)
{ {
CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants, CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice)); sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));

14
Algo256/cuda_skein256.cu

@ -96,9 +96,9 @@ void Round_8_512v35(uint2 *ks, uint2 *ts,
__global__ __launch_bounds__(256,3) __global__ __launch_bounds__(256,3)
void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash) void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint2 h[9]; uint2 h[9];
@ -214,9 +214,9 @@ void Round_8_512v30(uint64_t *ks, uint64_t *ts, uint64_t &p0, uint64_t &p1, uint
} }
__global__ __launch_bounds__(256, 3) __global__ __launch_bounds__(256, 3)
void skein256_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outputHash) void skein256_gpu_hash_32_v30(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint64_t h[9]; uint64_t h[9];
@ -293,15 +293,15 @@ void skein256_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outpu
} }
__host__ __host__
void skein256_cpu_init(int thr_id, int threads) void skein256_cpu_init(int thr_id, uint32_t threads)
{ {
//empty //empty
} }
__host__ __host__
void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

6
Algo256/keccak256.cu

@ -16,9 +16,9 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void keccak256_cpu_init(int thr_id, int threads); extern void keccak256_cpu_init(int thr_id, uint32_t threads);
extern void keccak256_setBlock_80(void *pdata,const void *ptarget); extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
extern uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
// CPU Hash // CPU Hash
extern "C" void keccak256_hash(void *state, const void *input) extern "C" void keccak256_hash(void *state, const void *input)
@ -42,7 +42,7 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t throughput = device_intensity(thr_id, __func__, 1U << 21); // 256*256*8*4 uint32_t throughput = device_intensity(thr_id, __func__, 1U << 21); // 256*256*8*4
throughput = min(throughput, (max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0005; ((uint32_t*)ptarget)[7] = 0x0005;

12
JHA/cuda_jha_compactionTest.cu

@ -30,7 +30,7 @@ __device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d
cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFunction[MAX_GPUS]; cuda_compactTestFunction_t h_JackpotTrueFunction[MAX_GPUS], h_JackpotFalseFunction[MAX_GPUS];
// Setup-Funktionen // Setup-Funktionen
__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) __host__ void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
@ -56,7 +56,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
#endif #endif
// Die Summenfunktion (vom NVIDIA SDK) // Die Summenfunktion (vom NVIDIA SDK)
__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {
extern __shared__ uint32_t sums[]; extern __shared__ uint32_t sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@ -185,7 +185,7 @@ __global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_su
} }
// Der Scatter // Der Scatter
__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) __global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
uint32_t actNounce = id; uint32_t actNounce = id;
@ -234,7 +234,7 @@ __host__ static uint32_t jackpot_compactTest_roundUpExp(uint32_t val)
return mask; return mask;
} }
__host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm, __host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
uint32_t *d_nonces1, cuda_compactTestFunction_t function, uint32_t *d_nonces1, cuda_compactTestFunction_t function,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{ {
@ -290,7 +290,7 @@ __host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads,
} }
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm, __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
uint32_t *d_nonces1, uint32_t *d_nonces2, uint32_t *d_nonces1, uint32_t *d_nonces2,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{ {
@ -329,7 +329,7 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, ui
*/ */
} }
__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, __host__ void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1, uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2, uint32_t *d_nonces2, size_t *nrm2,
int order) int order)

10
JHA/cuda_jha_keccak512.cu

@ -100,9 +100,9 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
} }
} }
__global__ void jackpot_keccak512_gpu_hash(int threads, uint32_t startNounce, uint64_t *g_hash) __global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
@ -145,7 +145,7 @@ __global__ void jackpot_keccak512_gpu_hash(int threads, uint32_t startNounce, ui
} }
// Setup-Funktionen // Setup-Funktionen
__host__ void jackpot_keccak512_cpu_init(int thr_id, int threads) __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants, cudaMemcpyToSymbol( c_keccak_round_constants,
@ -522,9 +522,9 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
} }
__host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order) __host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

30
JHA/jackpotcoin.cu

@ -12,29 +12,29 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void jackpot_keccak512_cpu_init(int thr_id, int threads); extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen); extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
extern void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void jackpot_compactTest_cpu_init(int thr_id, int threads); extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1, uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2, uint32_t *d_nonces2, size_t *nrm2,
int order); int order);
extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
static uint32_t *d_jackpotNonces[MAX_GPUS]; static uint32_t *d_jackpotNonces[MAX_GPUS];
@ -93,8 +93,8 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int throughput = (int) device_intensity(thr_id, __func__, 1U << 20); uint32_t throughput = device_intensity(thr_id, __func__, 1U << 20);
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000f; ((uint32_t*)ptarget)[7] = 0x000f;

24
cuda_checkhash.cu

@ -15,7 +15,7 @@ static uint32_t* h_resNonces[MAX_GPUS];
static uint32_t* d_resNonces[MAX_GPUS]; static uint32_t* d_resNonces[MAX_GPUS];
__host__ __host__
void cuda_check_cpu_init(int thr_id, int threads) void cuda_check_cpu_init(int thr_id, uint32_t threads)
{ {
CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t))); CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t)));
CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8*sizeof(uint32_t))); CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8*sizeof(uint32_t)));
@ -71,9 +71,9 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3
} }
__global__ __launch_bounds__(512, 4) __global__ __launch_bounds__(512, 4)
void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// shl 4 = *16 x 4 (uint32) = 64 bytes // shl 4 = *16 x 4 (uint32) = 64 bytes
@ -88,11 +88,11 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32
} }
__host__ __host__
uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash) uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash)
{ {
cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 512; const uint32_t threadsperblock = 512;
dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -109,7 +109,7 @@ uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t
__global__ __launch_bounds__(512, 4) __global__ __launch_bounds__(512, 4)
void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t *inpHash = &hash[thread << 4]; uint32_t *inpHash = &hash[thread << 4];
@ -122,11 +122,11 @@ void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *res
} }
__host__ __host__
uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce) uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)
{ {
uint32_t rescnt, result = 0; uint32_t rescnt, result = 0;
const int threadsperblock = 512; const uint32_t threadsperblock = 512;
dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -152,9 +152,9 @@ uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, ui
/* --------------------------------------------------------------------------------------------- */ /* --------------------------------------------------------------------------------------------- */
__global__ __global__
void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) void cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = g_nonceVector[thread]; uint32_t nounce = g_nonceVector[thread];
@ -175,12 +175,12 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no
} }
__host__ __host__
uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{ {
uint32_t result = 0xffffffff; uint32_t result = 0xffffffff;
cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

4
cuda_fugue256.h

@ -1,8 +1,8 @@
#ifndef _CUDA_FUGUE512_H #ifndef _CUDA_FUGUE512_H
#define _CUDA_FUGUE512_H #define _CUDA_FUGUE512_H
void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *outputHashes, uint32_t *nounce); void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce);
void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn); void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void fugue256_cpu_init(int thr_id, int threads); void fugue256_cpu_init(int thr_id, uint32_t threads);
#endif #endif

10
cuda_groestlcoin.cu

@ -23,11 +23,11 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
#define SWAB32(x) cuda_swab32(x) #define SWAB32(x) cuda_swab32(x)
__global__ __launch_bounds__(256, 4) __global__ __launch_bounds__(256, 4)
void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce) void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
{ {
#if __CUDA_ARCH__ >= 300 #if __CUDA_ARCH__ >= 300
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
if (thread < threads) if (thread < threads)
{ {
// GROESTL // GROESTL
@ -95,7 +95,7 @@ void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resN
} }
// Setup-Funktionen // Setup-Funktionen
__host__ void groestlcoin_cpu_init(int thr_id, int threads) __host__ void groestlcoin_cpu_init(int thr_id, uint32_t threads)
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
@ -130,9 +130,9 @@ __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
sizeof(uint32_t) * 8 ); sizeof(uint32_t) * 8 );
} }
__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) __host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{ {
int threadsperblock = 256; uint32_t threadsperblock = 256;
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl

4
cuda_groestlcoin.h

@ -1,8 +1,8 @@
#ifndef _CUDA_GROESTLCOIN_H #ifndef _CUDA_GROESTLCOIN_H
#define _CUDA_GROESTLCOIN_H #define _CUDA_GROESTLCOIN_H
void groestlcoin_cpu_init(int thr_id, int threads); void groestlcoin_cpu_init(int thr_id, uint32_t threads);
void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn); void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
#endif #endif

6
cuda_helper.h

@ -21,10 +21,10 @@ extern "C" short device_map[MAX_GPUS];
extern "C" long device_sm[MAX_GPUS]; extern "C" long device_sm[MAX_GPUS];
// common functions // common functions
extern void cuda_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
extern void cuda_check_cpu_setTarget(const void *ptarget); extern void cuda_check_cpu_setTarget(const void *ptarget);
extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash); extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);
extern uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce); extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
extern __device__ __device_builtin__ void __syncthreads(void); extern __device__ __device_builtin__ void __syncthreads(void);

14
cuda_myriadgroestl.cu

@ -222,11 +222,11 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
} }
__global__ void __launch_bounds__(256, 4) __global__ void __launch_bounds__(256, 4)
myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer) myriadgroestl_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
{ {
#if __CUDA_ARCH__ >= 300 #if __CUDA_ARCH__ >= 300
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4; uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
if (thread < threads) if (thread < threads)
{ {
// GROESTL // GROESTL
@ -259,10 +259,10 @@ __global__ void __launch_bounds__(256, 4)
} }
__global__ void __global__ void
myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer) myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
{ {
#if __CUDA_ARCH__ >= 300 #if __CUDA_ARCH__ >= 300
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
@ -302,7 +302,7 @@ __global__ void
} }
// Setup-Funktionen // Setup-Funktionen
__host__ void myriadgroestl_cpu_init(int thr_id, int threads) __host__ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
@ -357,9 +357,9 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn
sizeof(uint32_t) * 8 ); sizeof(uint32_t) * 8 );
} }
__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) __host__ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{ {
int threadsperblock = 256; uint32_t threadsperblock = 256;
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl

24
cuda_nist5.cu

@ -13,21 +13,21 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
// Original nist5hash Funktion aus einem miner Quelltext // Original nist5hash Funktion aus einem miner Quelltext
@ -72,8 +72,8 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int throughput = (int) device_intensity(thr_id, __func__, 1 << 20); // 256*256*16 uint32_t throughput = device_intensity(thr_id, __func__, 1 << 20); // 256*256*16
throughput = min(throughput, (int) (max_nonce - first_nonce)); throughput = min(throughput, (max_nonce - first_nonce));
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00FF; ((uint32_t*)ptarget)[7] = 0x00FF;

2
fuguecoin.cpp

@ -27,7 +27,7 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt
{ {
uint32_t start_nonce = pdata[19]++; uint32_t start_nonce = pdata[19]++;
int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19; int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
int throughput = (int) device_intensity(thr_id, __func__, 1 << intensity); // 256*256*8 uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 256*256*8
throughput = min(throughput, max_nonce - start_nonce); throughput = min(throughput, max_nonce - start_nonce);
if (opt_benchmark) if (opt_benchmark)

10
heavy/cuda_blake512.cu

@ -118,9 +118,9 @@ template <int BLOCKSIZE> __device__ void blake512_compress( uint64_t *h, const u
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
} }
template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) template <int BLOCKSIZE> __global__ void blake512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
@ -192,7 +192,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
// ---------------------------- END CUDA blake512 functions ------------------------------------ // ---------------------------- END CUDA blake512 functions ------------------------------------
// Setup-Funktionen // Setup-Funktionen
__host__ void blake512_cpu_init(int thr_id, int threads) __host__ void blake512_cpu_init(int thr_id, uint32_t threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_sigma, cudaMemcpyToSymbol( c_sigma,
@ -238,9 +238,9 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len)
BLOCKSIZE = len; BLOCKSIZE = len;
} }
__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) __host__ void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

10
heavy/cuda_combine.cu

@ -102,9 +102,9 @@ static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint
} }
__global__ __global__
void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector) void combine_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = nonceVector[thread]; uint32_t nounce = nonceVector[thread];
@ -121,18 +121,18 @@ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t
} }
__host__ __host__
void combine_cpu_init(int thr_id, int threads) void combine_cpu_init(int thr_id, uint32_t threads)
{ {
// Speicher für alle Ergebnisse belegen // Speicher für alle Ergebnisse belegen
CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads)); CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
} }
__host__ __host__
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash) void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash)
{ {
// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten // diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
const int threadsperblock = 128; const uint32_t threadsperblock = 128;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

12
heavy/cuda_groestl512.cu

@ -669,9 +669,9 @@ __device__ void groestl512_perm_Q(uint32_t *a)
} }
} }
template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t message[32]; uint32_t message[32];
@ -741,7 +741,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
// Setup-Funktionen // Setup-Funktionen
__host__ void groestl512_cpu_init(int thr_id, int threads) __host__ void groestl512_cpu_init(int thr_id, uint32_t threads)
{ {
// Texturen mit obigem Makro initialisieren // Texturen mit obigem Makro initialisieren
texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256); texDef(t0up, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
@ -794,16 +794,16 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
BLOCKSIZE = len; BLOCKSIZE = len;
} }
__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) __host__ void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy)
{ {
// Hefty1 Hashes kopieren (eigentlich nur zum debuggen) // Hefty1 Hashes kopieren (eigentlich nur zum debuggen)
if (copy) if (copy)
CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
} }
__host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) __host__ void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce)
{ {
const int threadsperblock = 128; const uint32_t threadsperblock = 128;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

12
heavy/cuda_hefty1.cu

@ -207,7 +207,7 @@ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
} }
__global__ __global__
void hefty_gpu_hash(int threads, uint32_t startNounce, uint32_t *outputHash) void hefty_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *outputHash)
{ {
#if USE_SHARED #if USE_SHARED
extern __shared__ unsigned char heftytab[]; extern __shared__ unsigned char heftytab[];
@ -219,7 +219,7 @@ void hefty_gpu_hash(int threads, uint32_t startNounce, uint32_t *outputHash)
__syncthreads(); __syncthreads();
#endif #endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
@ -306,7 +306,7 @@ void hefty_gpu_hash(int threads, uint32_t startNounce, uint32_t *outputHash)
} }
__host__ __host__
void hefty_cpu_init(int thr_id, int threads) void hefty_cpu_init(int thr_id, uint32_t threads)
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
@ -320,7 +320,7 @@ void hefty_cpu_init(int thr_id, int threads)
} }
__host__ __host__
void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len) void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len)
// data muss 80/84-Byte haben! // data muss 80/84-Byte haben!
{ {
// Nachricht expandieren und setzen // Nachricht expandieren und setzen
@ -390,9 +390,9 @@ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
} }
__host__ __host__
void hefty_cpu_hash(int thr_id, int threads, int startNounce) void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce)
{ {
int threadsperblock = 256; uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

12
heavy/cuda_keccak512.cu

@ -137,9 +137,9 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
} }
// Die Hash-Funktion // Die Hash-Funktion
template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
@ -186,7 +186,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
// ---------------------------- END CUDA keccak512 functions ------------------------------------ // ---------------------------- END CUDA keccak512 functions ------------------------------------
__host__ __host__
void keccak512_cpu_init(int thr_id, int threads) void keccak512_cpu_init(int thr_id, uint32_t threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants, cudaMemcpyToSymbol( c_keccak_round_constants,
@ -252,7 +252,7 @@ void keccak512_cpu_setBlock(void *data, int len)
} }
__host__ __host__
void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy)
{ {
// Hefty1 Hashes kopieren // Hefty1 Hashes kopieren
if (copy) if (copy)
@ -261,9 +261,9 @@ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int
} }
__host__ __host__
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce)
{ {
const int threadsperblock = 128; const uint32_t threadsperblock = 128;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

12
heavy/cuda_sha256.cu

@ -42,9 +42,9 @@ uint32_t sha256_cpu_constantTable[] = {
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) #define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
// Die Hash-Funktion // Die Hash-Funktion
template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) template <int BLOCKSIZE> __global__ void sha256_gpu_hash(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// bestimme den aktuellen Zähler // bestimme den aktuellen Zähler
@ -161,7 +161,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
} }
// Setup-Funktionen // Setup-Funktionen
__host__ void sha256_cpu_init(int thr_id, int threads) __host__ void sha256_cpu_init(int thr_id, uint32_t threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( sha256_gpu_constantTable, cudaMemcpyToSymbol( sha256_gpu_constantTable,
@ -248,7 +248,7 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
BLOCKSIZE = len; BLOCKSIZE = len;
} }
__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) __host__ void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy)
{ {
// Hefty1 Hashes kopieren // Hefty1 Hashes kopieren
if (copy) if (copy)
@ -256,9 +256,9 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
//else cudaThreadSynchronize(); //else cudaThreadSynchronize();
} }
__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) __host__ void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

4
heavy/heavy.cu

@ -136,8 +136,8 @@ int scanhash_heavy(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
// CUDA will process thousands of threads. // CUDA will process thousands of threads.
int throughput = (int) device_intensity(thr_id, __func__, (1U << 19) - 256); uint32_t throughput = device_intensity(thr_id, __func__, (1U << 19) - 256);
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
int rc = 0; int rc = 0;
uint32_t *hash = NULL; uint32_t *hash = NULL;

32
heavy/heavy.h

@ -1,30 +1,30 @@
#ifndef _CUDA_HEAVY_H #ifndef _CUDA_HEAVY_H
#define _CUDA_HEAVY_H #define _CUDA_HEAVY_H
void blake512_cpu_init(int thr_id, int threads); void blake512_cpu_init(int thr_id, uint32_t threads);
void blake512_cpu_setBlock(void *pdata, int len); void blake512_cpu_setBlock(void *pdata, int len);
void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce); void blake512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
void groestl512_cpu_init(int thr_id, int threads); void groestl512_cpu_init(int thr_id, uint32_t threads);
void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); void groestl512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
void groestl512_cpu_setBlock(void *data, int len); void groestl512_cpu_setBlock(void *data, int len);
void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); void groestl512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
void hefty_cpu_hash(int thr_id, int threads, int startNounce); void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce);
void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len); void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len);
void hefty_cpu_init(int thr_id, int threads); void hefty_cpu_init(int thr_id, uint32_t threads);
void keccak512_cpu_init(int thr_id, int threads); void keccak512_cpu_init(int thr_id, uint32_t threads);
void keccak512_cpu_setBlock(void *data, int len); void keccak512_cpu_setBlock(void *data, int len);
void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); void keccak512_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); void keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
void sha256_cpu_init(int thr_id, int threads); void sha256_cpu_init(int thr_id, uint32_t threads);
void sha256_cpu_setBlock(void *data, int len); void sha256_cpu_setBlock(void *data, int len);
void sha256_cpu_hash(int thr_id, int threads, int startNounce); void sha256_cpu_hash(int thr_id, uint32_t threads, int startNounce);
void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); void sha256_cpu_copyHeftyHash(int thr_id, uint32_t threads, void *heftyHashes, int copy);
void combine_cpu_init(int thr_id, int threads); void combine_cpu_init(int thr_id, uint32_t threads);
void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash); void combine_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *hash);
#endif #endif

8
lyra2/cuda_lyra2.cu

@ -117,9 +117,9 @@ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut,
} }
__global__ __launch_bounds__(TPB, 1) __global__ __launch_bounds__(TPB, 1)
void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash) void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint2 state[16]; uint2 state[16];
@ -209,9 +209,9 @@ void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
} }
__host__ __host__
void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

20
lyra2/lyra2RE.cu

@ -11,19 +11,19 @@ extern "C" {
static _ALIGN(64) uint64_t *d_hash[MAX_GPUS]; static _ALIGN(64) uint64_t *d_hash[MAX_GPUS];
extern void blake256_cpu_init(int thr_id, int threads); extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void blake256_cpu_setBlock_80(uint32_t *pdata); extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_cpu_init(int thr_id, int threads); extern void keccak256_cpu_init(int thr_id, uint32_t threads);
extern void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, int threads); extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void groestl256_setTarget(const void *ptarget); extern void groestl256_setTarget(const void *ptarget);
extern uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order); extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
extern void groestl256_cpu_init(int thr_id, int threads); extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern "C" void lyra2_hash(void *state, const void *input) extern "C" void lyra2_hash(void *state, const void *input)
{ {
@ -63,8 +63,8 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 18 : 17; int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 18 : 17;
int throughput = (int) device_intensity(thr_id, __func__, 1U << intensity); // 18=256*256*4; uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity); // 18=256*256*4;
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;

4
myriadgroestl.cpp

@ -7,9 +7,9 @@
#include "miner.h" #include "miner.h"
void myriadgroestl_cpu_init(int thr_id, int threads); void myriadgroestl_cpu_init(int thr_id, uint32_t threads);
void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn); void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce); void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
#define SWAP32(x) \ #define SWAP32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \

22
pentablake.cu

@ -173,9 +173,9 @@ void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0)
} }
__global__ __global__
void pentablake_gpu_hash_80(int threads, const uint32_t startNounce, void *outputHash) void pentablake_gpu_hash_80(uint32_t threads, const uint32_t startNounce, void *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint64_t h[8]; uint64_t h[8];
@ -213,9 +213,9 @@ void pentablake_gpu_hash_80(int threads, const uint32_t startNounce, void *outpu
} }
__host__ __host__
void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce, uint32_t *d_outputHash, int order) void pentablake_cpu_hash_80(int thr_id, uint32_t threads, const uint32_t startNounce, uint32_t *d_outputHash, int order)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -228,9 +228,9 @@ void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce,
__global__ __global__
void pentablake_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash) void pentablake_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
@ -271,9 +271,9 @@ void pentablake_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash)
} }
__host__ __host__
void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) void pentablake_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -321,7 +321,7 @@ void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t
__host__ static __host__ static
uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order) uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
uint32_t result = UINT32_MAX; uint32_t result = UINT32_MAX;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
@ -370,8 +370,8 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20]; uint32_t endiandata[20];
int rc = 0; int rc = 0;
int throughput = (int) device_intensity(thr_id, __func__, 128U * 2560); // 18.5 uint32_t throughput = device_intensity(thr_id, __func__, 128U * 2560); // 18.5
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000F; ((uint32_t*)ptarget)[7] = 0x000F;

40
quark/animecoin.cu

@ -17,37 +17,37 @@ static uint32_t *d_branch1Nonces[MAX_GPUS];
static uint32_t *d_branch2Nonces[MAX_GPUS]; static uint32_t *d_branch2Nonces[MAX_GPUS];
static uint32_t *d_branch3Nonces[MAX_GPUS]; static uint32_t *d_branch3Nonces[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
extern void quark_bmw512_cpu_setBlock_80(void *pdata); extern void quark_bmw512_cpu_setBlock_80(void *pdata);
extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order); extern void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1, uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2, uint32_t *d_nonces2, size_t *nrm2,
int order); int order);
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1, uint32_t *d_nonces1, size_t *nrm1,
int order); int order);
extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
/* CPU Hash */ /* CPU Hash */
extern "C" void animehash(void *state, const void *input) extern "C" void animehash(void *state, const void *input)
@ -166,8 +166,8 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
unsigned long *hashes_done) unsigned long *hashes_done)
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int throughput = (int) device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00000f; ((uint32_t*)ptarget)[7] = 0x00000f;

6
quark/cuda_bmw512.cu

@ -436,7 +436,7 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *
} }
__host__ __host__
void quark_bmw512_cpu_init(int thr_id, int threads) void quark_bmw512_cpu_init(int thr_id, uint32_t threads)
{ {
} }
@ -453,7 +453,7 @@ void quark_bmw512_cpu_setBlock_80(void *pdata)
} }
__host__ __host__
void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const uint32_t threadsperblock = 32; const uint32_t threadsperblock = 32;
@ -465,7 +465,7 @@ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uin
} }
__host__ __host__
void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order) void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
{ {
const uint32_t threadsperblock = 128; const uint32_t threadsperblock = 128;

10
quark/cuda_jh512.cu

@ -331,9 +331,9 @@ __device__ __forceinline__ void JHHash(const uint32_t *data, uint32_t *hashval)
// Die Hash-Funktion // Die Hash-Funktion
__global__ __launch_bounds__(256, 3) __global__ __launch_bounds__(256, 3)
void quark_jh512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -347,7 +347,7 @@ void quark_jh512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash
// Setup-Funktionen // Setup-Funktionen
__host__ void quark_jh512_cpu_init(int thr_id, int threads) __host__ void quark_jh512_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyToSymbol( c_E8_bitslice_roundconstant, cudaMemcpyToSymbol( c_E8_bitslice_roundconstant,
@ -361,9 +361,9 @@ __host__ void quark_jh512_cpu_init(int thr_id, int threads)
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
} }
__host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

18
quark/cuda_quark_blake512.cu

@ -122,9 +122,9 @@ static const uint64_t d_constHashPadding[8] = {
}; };
__global__ __launch_bounds__(256, 4) __global__ __launch_bounds__(256, 4)
void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
#if USE_SHUFFLE #if USE_SHUFFLE
const int warpID = threadIdx.x & 0x0F; // 16 warps const int warpID = threadIdx.x & 0x0F; // 16 warps
@ -186,9 +186,9 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
} }
} }
__global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) __global__ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint64_t buf[16]; uint64_t buf[16];
@ -236,7 +236,7 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
// ---------------------------- END CUDA quark_blake512 functions ------------------------------------ // ---------------------------- END CUDA quark_blake512 functions ------------------------------------
// Setup-Funktionen // Setup-Funktionen
__host__ void quark_blake512_cpu_init(int thr_id, int threads) __host__ void quark_blake512_cpu_init(int thr_id, uint32_t threads)
{ {
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
CUDA_CALL_OR_RET( cudaMemcpyToSymbol(c_sigma, CUDA_CALL_OR_RET( cudaMemcpyToSymbol(c_sigma,
@ -263,9 +263,9 @@ __host__ void quark_blake512_cpu_setBlock_80(void *pdata)
); );
} }
__host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order) __host__ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
@ -280,9 +280,9 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
__host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) __host__ void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

14
quark/cuda_quark_compactionTest.cu

@ -28,7 +28,7 @@ __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_Qua
cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS]; cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];
// Setup-Funktionen // Setup-Funktionen
__host__ void quark_compactTest_cpu_init(int thr_id, int threads) __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
@ -54,7 +54,7 @@ __host__ void quark_compactTest_cpu_init(int thr_id, int threads)
#endif #endif
// Die Summenfunktion (vom NVIDIA SDK) // Die Summenfunktion (vom NVIDIA SDK)
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {
extern __shared__ uint32_t sums[]; extern __shared__ uint32_t sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@ -183,7 +183,7 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums
} }
// Der Scatter // Der Scatter
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) __global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
uint32_t actNounce = id; uint32_t actNounce = id;
@ -232,7 +232,7 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
return mask; return mask;
} }
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm, __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
uint32_t *d_nonces1, cuda_compactTestFunction_t function, uint32_t *d_nonces1, cuda_compactTestFunction_t function,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{ {
@ -288,7 +288,7 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, ui
} }
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm, __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
uint32_t *d_nonces1, uint32_t *d_nonces2, uint32_t *d_nonces1, uint32_t *d_nonces2,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{ {
@ -327,7 +327,7 @@ __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint
*/ */
} }
__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1, uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2, uint32_t *d_nonces2, size_t *nrm2,
int order) int order)
@ -344,7 +344,7 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st
*nrm2 = (size_t)h_numValid[thr_id][1]; *nrm2 = (size_t)h_numValid[thr_id][1];
} }
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, __host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1, uint32_t *d_nonces1, size_t *nrm1,
int order) int order)
{ {

14
quark/cuda_quark_groestl512.cu

@ -16,11 +16,11 @@
#include "quark/cuda_quark_groestl512_sm20.cu" #include "quark/cuda_quark_groestl512_sm20.cu"
__global__ __launch_bounds__(TPB, THF) __global__ __launch_bounds__(TPB, THF)
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector) void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector)
{ {
#if __CUDA_ARCH__ >= 300 #if __CUDA_ARCH__ >= 300
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
if (thread < threads) if (thread < threads)
{ {
// GROESTL // GROESTL
@ -62,10 +62,10 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
} }
__global__ void __launch_bounds__(TPB, THF) __global__ void __launch_bounds__(TPB, THF)
quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) quark_doublegroestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{ {
#if __CUDA_ARCH__ >= 300 #if __CUDA_ARCH__ >= 300
int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2; uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
if (thread < threads) if (thread < threads)
{ {
// GROESTL // GROESTL
@ -124,13 +124,13 @@ __global__ void __launch_bounds__(TPB, THF)
__host__ void quark_groestl512_cpu_init(int thr_id, int threads) __host__ void quark_groestl512_cpu_init(int thr_id, uint32_t threads)
{ {
if (device_sm[device_map[thr_id]] < 300) if (device_sm[device_map[thr_id]] < 300)
quark_groestl512_sm20_init(thr_id, threads); quark_groestl512_sm20_init(thr_id, threads);
} }
__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
int threadsperblock = TPB; int threadsperblock = TPB;
@ -154,7 +154,7 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int factor = THF; const int factor = THF;
int threadsperblock = TPB; int threadsperblock = TPB;

10
quark/cuda_quark_groestl512_sm20.cu

@ -202,7 +202,7 @@ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs)
#endif #endif
__global__ __global__
void quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{ {
#if __CUDA_ARCH__ < 300 #if __CUDA_ARCH__ < 300
extern __shared__ char mixtabs[]; extern __shared__ char mixtabs[];
@ -221,7 +221,7 @@ void quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g
__syncthreads(); __syncthreads();
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// GROESTL // GROESTL
@ -285,7 +285,7 @@ void quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
__host__ __host__
void quark_groestl512_sm20_init(int thr_id, int threads) void quark_groestl512_sm20_init(int thr_id, uint32_t threads)
{ {
// Texturen mit obigem Makro initialisieren // Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
@ -299,7 +299,7 @@ void quark_groestl512_sm20_init(int thr_id, int threads)
} }
__host__ __host__
void quark_groestl512_sm20_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void quark_groestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
int threadsperblock = 512; int threadsperblock = 512;
@ -314,7 +314,7 @@ void quark_groestl512_sm20_hash_64(int thr_id, int threads, uint32_t startNounce
} }
__host__ __host__
void quark_doublegroestl512_sm20_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void quark_doublegroestl512_sm20_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
int threadsperblock = 512; int threadsperblock = 512;

14
quark/cuda_quark_keccak512.cu

@ -94,9 +94,9 @@ static void keccak_block(uint2 *s)
} }
__global__ __global__
void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -193,9 +193,9 @@ static void keccak_block_v30(uint64_t *s, const uint32_t *in)
} }
__global__ __global__
void quark_keccak512_gpu_hash_64_v30(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -232,7 +232,7 @@ void quark_keccak512_gpu_hash_64_v30(int threads, uint32_t startNounce, uint64_t
} }
__host__ __host__
void quark_keccak512_cpu_init(int thr_id, int threads) void quark_keccak512_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyToSymbol( d_keccak_round_constants, cudaMemcpyToSymbol( d_keccak_round_constants,
host_keccak_round_constants, host_keccak_round_constants,
@ -241,9 +241,9 @@ void quark_keccak512_cpu_init(int thr_id, int threads)
} }
__host__ __host__
void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

14
quark/cuda_skein512.cu

@ -350,9 +350,9 @@ uint64_t skein_rotl64(const uint64_t x, const int offset)
__global__ __global__
void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector) void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// Skein // Skein
@ -450,9 +450,9 @@ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t * co
} }
__global__ __global__
void quark_skein512_gpu_hash_64_v30(int threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector) void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// Skein // Skein
@ -550,14 +550,14 @@ void quark_skein512_gpu_hash_64_v30(int threads, uint32_t startNounce, uint64_t
} }
__host__ __host__
void quark_skein512_cpu_init(int thr_id, int threads) void quark_skein512_cpu_init(int thr_id, uint32_t threads)
{ {
} }
__host__ __host__
void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

40
quark/quarkcoin.cu

@ -19,37 +19,37 @@ static uint32_t *d_branch1Nonces[MAX_GPUS];
static uint32_t *d_branch2Nonces[MAX_GPUS]; static uint32_t *d_branch2Nonces[MAX_GPUS];
static uint32_t *d_branch3Nonces[MAX_GPUS]; static uint32_t *d_branch3Nonces[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1, uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2, uint32_t *d_nonces2, size_t *nrm2,
int order); int order);
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1, uint32_t *d_nonces1, size_t *nrm1,
int order); int order);
extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
// Original Quarkhash Funktion aus einem miner Quelltext // Original Quarkhash Funktion aus einem miner Quelltext
extern "C" void quarkhash(void *state, const void *input) extern "C" void quarkhash(void *state, const void *input)
@ -137,8 +137,8 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int throughput = (int) device_intensity(thr_id, __func__, 1 << 20); // 256*4096 uint32_t throughput = device_intensity(thr_id, __func__, 1 << 20); // 256*4096
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00F; ((uint32_t*)ptarget)[7] = 0x00F;

18
qubit/deep.cu

@ -16,17 +16,17 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void qubit_luffa512_cpu_init(int thr_id, int threads); extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
extern void qubit_luffa512_cpu_setBlock_80(void *pdata); extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
extern void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget); extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget);
extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x11_cubehash512_cpu_init(int thr_id, int threads); extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads);
extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern "C" void deephash(void *state, const void *input) extern "C" void deephash(void *state, const void *input)
{ {
@ -60,8 +60,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20]; uint32_t endiandata[20];
int throughput = (int) device_intensity(thr_id, __func__, 1U << 19); // 256*256*8 uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8
throughput = min(throughput, (int) (max_nonce - first_nonce)); throughput = min(throughput, (max_nonce - first_nonce));
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000f; ((uint32_t*)ptarget)[7] = 0x0000f;

8
qubit/doom.cu

@ -12,11 +12,11 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void qubit_luffa512_cpu_init(int thr_id, int threads); extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
extern void qubit_luffa512_cpu_setBlock_80(void *pdata); extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
extern void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget); extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget);
extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void doomhash(void *state, const void *input) extern void doomhash(void *state, const void *input)
{ {
@ -41,7 +41,7 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20]; uint32_t endiandata[20];
uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); // 256*256*8*8 uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); // 256*256*8*8
throughput = min(throughput, (max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000f; ((uint32_t*)ptarget)[7] = 0x0000f;

28
qubit/qubit.cu

@ -16,24 +16,24 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void qubit_luffa512_cpu_init(int thr_id, int threads); extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
extern void qubit_luffa512_cpu_setBlock_80(void *pdata); extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
extern void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x11_cubehash512_cpu_init(int thr_id, int threads); extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads);
extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern int x11_simd512_cpu_init(int thr_id, int threads); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
int order); int order);
@ -80,8 +80,8 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
{ {
uint32_t endiandata[20]; uint32_t endiandata[20];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int throughput = (int) device_intensity(thr_id, __func__, 1U << 19); // 256*256*8 uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;

18
qubit/qubit_luffa512.cu

@ -352,9 +352,9 @@ void finalization512(hashState *state, uint32_t *b)
/***************************************************/ /***************************************************/
// Die Hash-Funktion // Die Hash-Funktion
__global__ __global__
void qubit_luffa512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
@ -382,9 +382,9 @@ void qubit_luffa512_gpu_hash_80(int threads, uint32_t startNounce, void *outputH
} }
__global__ __global__
void qubit_luffa512_gpu_finalhash_80(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
@ -445,7 +445,7 @@ void qubit_luffa512_gpu_finalhash_80(int threads, uint32_t startNounce, void *ou
} }
__host__ __host__
void qubit_luffa512_cpu_init(int thr_id, int threads) void qubit_luffa512_cpu_init(int thr_id, uint32_t threads)
{ {
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice));
@ -454,11 +454,11 @@ void qubit_luffa512_cpu_init(int thr_id, int threads)
} }
__host__ __host__
uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash,int order) uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order)
{ {
uint32_t result = UINT32_MAX; uint32_t result = UINT32_MAX;
cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t)); cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t));
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -478,9 +478,9 @@ uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t start
} }
__host__ __host__
void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash,int order) void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

4
scrypt.c

@ -701,8 +701,8 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata,
uint32_t midstate[8]; uint32_t midstate[8];
uint32_t n = pdata[19] - 1; uint32_t n = pdata[19] - 1;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
int throughput = scrypt_best_throughput(); uint32_t throughput = scrypt_best_throughput();
int i; uint32_t i;
#if HAVE_SHA256_4WAY #if HAVE_SHA256_4WAY
if (sha256_use_4way()) if (sha256_use_4way())

10
x11/cuda_x11_cubehash512.cu

@ -256,9 +256,9 @@ static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
/***************************************************/ /***************************************************/
// GPU Hash Function // GPU Hash Function
__global__ __global__
void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -289,14 +289,14 @@ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_
// Setup-Funktionen // Setup-Funktionen
__host__ __host__
void x11_cubehash512_cpu_init(int thr_id, int threads) void x11_cubehash512_cpu_init(int thr_id, uint32_t threads)
{ {
} }
__host__ __host__
void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

10
x11/cuda_x11_echo.cu

@ -279,13 +279,13 @@ void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
} }
__global__ __launch_bounds__(128, 7) /* will force 72 registers */ __global__ __launch_bounds__(128, 7) /* will force 72 registers */
void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
__shared__ uint32_t sharedMemory[1024]; __shared__ uint32_t sharedMemory[1024];
echo_gpu_init(sharedMemory); echo_gpu_init(sharedMemory);
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -298,15 +298,15 @@ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash
} }
__host__ __host__
void x11_echo512_cpu_init(int thr_id, int threads) void x11_echo512_cpu_init(int thr_id, uint32_t threads)
{ {
aes_cpu_init(thr_id); aes_cpu_init(thr_id);
} }
__host__ __host__
void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 128; const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

10
x11/cuda_x11_luffa512.cu

@ -335,9 +335,9 @@ void finalization512(hashState *state, uint32_t *b)
/***************************************************/ /***************************************************/
// Die Hash-Funktion // Die Hash-Funktion
__global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) __global__ void x11_luffa512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -358,15 +358,15 @@ __global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint
// Setup Function // Setup Function
__host__ __host__
void x11_luffa512_cpu_init(int thr_id, int threads) void x11_luffa512_cpu_init(int thr_id, uint32_t threads)
{ {
CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice)); CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice));
CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice)); CUDA_CALL_OR_RET(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice));
} }
__host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

10
x11/cuda_x11_luffa512_Cubehash.cu

@ -587,9 +587,9 @@ static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
/***************************************************/ /***************************************************/
// Hash Function // Hash Function
__global__ __global__
void x11_luffaCubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -625,16 +625,16 @@ void x11_luffaCubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_
// Setup // Setup
__host__ __host__
void x11_luffaCubehash512_cpu_init(int thr_id, int threads) void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice);
} }
__host__ __host__
void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

18
x11/cuda_x11_shavite512.cu

@ -1341,13 +1341,13 @@ void shavite_gpu_init(uint32_t *sharedMemory)
// GPU Hash // GPU Hash
__global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */ __global__ __launch_bounds__(TPB, 7) /* 64 registers with 128,8 - 72 regs with 128,7 */
void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
__shared__ uint32_t sharedMemory[1024]; __shared__ uint32_t sharedMemory[1024];
shavite_gpu_init(sharedMemory); shavite_gpu_init(sharedMemory);
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -1392,13 +1392,13 @@ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_h
} }
__global__ __launch_bounds__(TPB, 7) __global__ __launch_bounds__(TPB, 7)
void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
{ {
__shared__ uint32_t sharedMemory[1024]; __shared__ uint32_t sharedMemory[1024];
shavite_gpu_init(sharedMemory); shavite_gpu_init(sharedMemory);
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
const uint32_t nounce = startNounce + thread; const uint32_t nounce = startNounce + thread;
@ -1434,9 +1434,9 @@ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputH
} }
__host__ __host__
void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -1446,9 +1446,9 @@ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, u
} }
__host__ __host__
void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -1458,7 +1458,7 @@ void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, u
} }
__host__ __host__
void x11_shavite512_cpu_init(int thr_id, int threads) void x11_shavite512_cpu_init(int thr_id, uint32_t threads)
{ {
aes_cpu_init(thr_id); aes_cpu_init(thr_id);
} }

24
x11/cuda_x11_simd512.cu

@ -578,7 +578,7 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
/***************************************************/ /***************************************************/
__global__ __launch_bounds__(TPB, 4) __global__ __launch_bounds__(TPB, 4)
void x11_simd512_gpu_expand_64(int threads, uint32_t *g_hash, uint4 *g_temp4) void x11_simd512_gpu_expand_64(uint32_t threads, uint32_t *g_hash, uint4 *g_temp4)
{ {
int threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) / 8; int threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) / 8;
if (threadBloc < threads) if (threadBloc < threads)
@ -600,9 +600,9 @@ void x11_simd512_gpu_expand_64(int threads, uint32_t *g_hash, uint4 *g_temp4)
} }
__global__ __launch_bounds__(TPB, 1) __global__ __launch_bounds__(TPB, 1)
void x11_simd512_gpu_compress1_64(int threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) void x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t *Hash = &g_hash[thread * 16]; uint32_t *Hash = &g_hash[thread * 16];
@ -611,9 +611,9 @@ void x11_simd512_gpu_compress1_64(int threads, uint32_t *g_hash, uint4 *g_fft4,
} }
__global__ __launch_bounds__(TPB, 1) __global__ __launch_bounds__(TPB, 1)
void x11_simd512_gpu_compress2_64(int threads, uint4 *g_fft4, uint32_t *g_state) void x11_simd512_gpu_compress2_64(uint32_t threads, uint4 *g_fft4, uint32_t *g_state)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
Compression2(thread, g_fft4, g_state); Compression2(thread, g_fft4, g_state);
@ -621,9 +621,9 @@ void x11_simd512_gpu_compress2_64(int threads, uint4 *g_fft4, uint32_t *g_state)
} }
__global__ __launch_bounds__(TPB, 2) __global__ __launch_bounds__(TPB, 2)
void x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t *Hash = &g_hash[thread * 16]; uint32_t *Hash = &g_hash[thread * 16];
@ -633,9 +633,9 @@ void x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t *g_hash, uint4 *g
} }
__global__ __launch_bounds__(TPB, 2) __global__ __launch_bounds__(TPB, 2)
void x11_simd512_gpu_final_64(int threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t *Hash = &g_hash[thread * 16]; uint32_t *Hash = &g_hash[thread * 16];
@ -644,7 +644,7 @@ void x11_simd512_gpu_final_64(int threads, uint32_t *g_hash, uint4 *g_fft4, uint
} }
__host__ __host__
int x11_simd512_cpu_init(int thr_id, int threads) int x11_simd512_cpu_init(int thr_id, uint32_t threads)
{ {
CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */ CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err); CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
@ -671,9 +671,9 @@ int x11_simd512_cpu_init(int thr_id, int threads)
} }
__host__ __host__
void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = TPB; const uint32_t threadsperblock = TPB;
dim3 block(threadsperblock); dim3 block(threadsperblock);
dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 grid((threads + threadsperblock-1) / threadsperblock);

22
x11/fresh.cu

@ -14,19 +14,19 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_setBlock_80(void *pdata); extern void x11_shavite512_setBlock_80(void *pdata);
extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern int x11_simd512_cpu_init(int thr_id, int threads); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
int order); int order);
@ -77,8 +77,8 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20]; uint32_t endiandata[20];
int throughput = (int) device_intensity(thr_id, __func__, 1 << 19); uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19);
throughput = min(throughput, (int) (max_nonce - first_nonce)); throughput = min(throughput, (max_nonce - first_nonce));
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00ff; ((uint32_t*)ptarget)[7] = 0x00ff;

16
x11/s3.cu

@ -15,15 +15,15 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x11_shavite512_setBlock_80(void *pdata); extern void x11_shavite512_setBlock_80(void *pdata);
extern int x11_simd512_cpu_init(int thr_id, int threads); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
/* CPU HASH */ /* CPU HASH */
extern "C" void s3hash(void *output, const void *input) extern "C" void s3hash(void *output, const void *input)
@ -62,8 +62,8 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
// reduce by one the intensity on windows // reduce by one the intensity on windows
intensity--; intensity--;
#endif #endif
int throughput = (int) device_intensity(thr_id, __func__, 1 << intensity); uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity);
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0xF; ((uint32_t*)ptarget)[7] = 0xF;

50
x11/x11.cu

@ -22,40 +22,40 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_luffaCubehash512_cpu_init(int thr_id, int threads); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads);
extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern int x11_simd512_cpu_init(int thr_id, int threads); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
// X11 CPU Hash // X11 CPU Hash
@ -133,8 +133,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19;
int throughput = (int) device_intensity(thr_id, __func__, 1 << intensity); // 19=256*256*8; uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 19=256*256*8;
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x5; ((uint32_t*)ptarget)[7] = 0x5;

10
x13/cuda_x13_fugue512.cu

@ -553,7 +553,7 @@ static const uint32_t mixtab0_cpu[] = {
/***************************************************/ /***************************************************/
// Die Hash-Funktion // Die Hash-Funktion
__global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) __global__ void x13_fugue512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
extern __shared__ char mixtabs[]; extern __shared__ char mixtabs[];
@ -565,7 +565,7 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
__syncthreads(); __syncthreads();
int i; int i;
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -667,7 +667,7 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \ { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
__host__ void x13_fugue512_cpu_init(int thr_id, int threads) __host__ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
{ {
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256); texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
@ -675,9 +675,9 @@ __host__ void x13_fugue512_cpu_init(int thr_id, int threads)
texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256); texDef(mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
} }
__host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

10
x13/cuda_x13_hamsi512.cu

@ -587,9 +587,9 @@ static const uint32_t T512[64][16] = {
}; };
__global__ __global__
void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -678,7 +678,7 @@ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_has
} }
__host__ __host__
void x13_hamsi512_cpu_init(int thr_id, int threads) void x13_hamsi512_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyToSymbol(d_alpha_n, alpha_n, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_alpha_n, alpha_n, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_alpha_f, alpha_f, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_alpha_f, alpha_f, sizeof(uint32_t)*32, 0, cudaMemcpyHostToDevice);
@ -686,9 +686,9 @@ void x13_hamsi512_cpu_init(int thr_id, int threads)
} }
__host__ __host__
void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 128; const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

56
x13/x13.cu

@ -26,45 +26,45 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_luffaCubehash512_cpu_init(int thr_id, int threads); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads);
extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern int x11_simd512_cpu_init(int thr_id, int threads); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_hamsi512_cpu_init(int thr_id, int threads); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_init(int thr_id, int threads); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
// X13 CPU Hash // X13 CPU Hash
@ -152,8 +152,8 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19;
int throughput = (int) device_intensity(thr_id, __func__, 1 << intensity); // 19=256*256*8; uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity); // 19=256*256*8;
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000f; ((uint32_t*)ptarget)[7] = 0x000f;

10
x15/cuda_x14_shabal512.cu

@ -361,11 +361,11 @@ static const uint32_t d_C512[] = {
/***************************************************/ /***************************************************/
// GPU Hash Function // GPU Hash Function
__global__ void x14_shabal512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) __global__ void x14_shabal512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
__syncthreads(); __syncthreads();
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
@ -453,14 +453,14 @@ __global__ void x14_shabal512_gpu_hash_64(int threads, uint32_t startNounce, uin
} }
} }
__host__ void x14_shabal512_cpu_init(int thr_id, int threads) __host__ void x14_shabal512_cpu_init(int thr_id, uint32_t threads)
{ {
} }
// #include <stdio.h> // #include <stdio.h>
__host__ void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

20
x15/cuda_x15_whirlpool.cu

@ -2285,7 +2285,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
__global__ __global__
void oldwhirlpool_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
{ {
__shared__ uint64_t sharedMemory[2048]; __shared__ uint64_t sharedMemory[2048];
@ -2302,7 +2302,7 @@ void oldwhirlpool_gpu_hash_80(int threads, uint32_t startNounce, void *outputHas
#endif #endif
} }
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
@ -2381,7 +2381,7 @@ void oldwhirlpool_gpu_hash_80(int threads, uint32_t startNounce, void *outputHas
} }
__global__ __global__
void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
__shared__ uint64_t sharedMemory[2048]; __shared__ uint64_t sharedMemory[2048];
@ -2398,7 +2398,7 @@ void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_ha
#endif #endif
} }
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
@ -2457,7 +2457,7 @@ void x15_whirlpool_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_ha
} }
__global__ __global__
void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce) void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce)
{ {
__shared__ uint64_t sharedMemory[2048]; __shared__ uint64_t sharedMemory[2048];
@ -2475,7 +2475,7 @@ void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t *
#endif #endif
} }
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
@ -2544,7 +2544,7 @@ void oldwhirlpool_gpu_finalhash_64(int threads, uint32_t startNounce, uint64_t *
} }
__host__ __host__
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode) extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode)
{ {
switch (mode) { switch (mode) {
case 0: /* x15 with rotated T1-T7 (based on T0) */ case 0: /* x15 with rotated T1-T7 (based on T0) */
@ -2586,7 +2586,7 @@ extern void x15_whirlpool_cpu_free(int thr_id)
} }
__host__ __host__
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 grid((threads + threadsperblock-1) / threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -2599,7 +2599,7 @@ extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNou
} }
__host__ __host__
extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
uint32_t result = 0xffffffff; uint32_t result = 0xffffffff;
@ -2621,7 +2621,7 @@ extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, int threads, uint32_t
} }
__host__ __host__
void whirlpool512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{ {
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 grid((threads + threadsperblock-1) / threadsperblock);

12
x15/whirlpool.cu

@ -11,12 +11,12 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode); extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode);
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget); extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget);
extern void whirlpool512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
// CPU Hash function // CPU Hash function
@ -57,8 +57,8 @@ extern "C" int scanhash_whc(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20]; uint32_t endiandata[20];
int throughput = (int) device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8;
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;

60
x15/x14.cu

@ -29,48 +29,48 @@ extern "C" {
// Memory for the hash functions // Memory for the hash functions
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_luffaCubehash512_cpu_init(int thr_id, int threads); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads);
extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern int x11_simd512_cpu_init(int thr_id, int threads); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_hamsi512_cpu_init(int thr_id, int threads); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_init(int thr_id, int threads); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x14_shabal512_cpu_init(int thr_id, int threads); extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
// X14 CPU Hash function // X14 CPU Hash function
@ -164,8 +164,8 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20]; uint32_t endiandata[20];
int throughput = (int) device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8;
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000f; ((uint32_t*)ptarget)[7] = 0x000f;

64
x15/x15.cu

@ -30,52 +30,52 @@ extern "C" {
// Memory for the hash functions // Memory for the hash functions
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_luffaCubehash512_cpu_init(int thr_id, int threads); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads);
extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern int x11_simd512_cpu_init(int thr_id, int threads); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_hamsi512_cpu_init(int thr_id, int threads); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_init(int thr_id, int threads); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x14_shabal512_cpu_init(int thr_id, int threads); extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode); extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode);
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_free(int thr_id); extern void x15_whirlpool_cpu_free(int thr_id);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
// X15 CPU Hash function // X15 CPU Hash function
@ -174,8 +174,8 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20]; uint32_t endiandata[20];
int throughput = (int) device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8;
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00FF; ((uint32_t*)ptarget)[7] = 0x00FF;

10
x17/cuda_x17_haval512.cu

@ -291,9 +291,9 @@ static const uint32_t c_initVector[8] = {
__global__ __global__
void x17_haval256_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x17_haval256_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
@ -378,15 +378,15 @@ void x17_haval256_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_has
} }
__host__ __host__
void x17_haval256_cpu_init(int thr_id, int threads) void x17_haval256_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyToSymbol(initVector,c_initVector,sizeof(c_initVector),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(initVector,c_initVector,sizeof(c_initVector),0, cudaMemcpyHostToDevice);
} }
__host__ __host__
void x17_haval256_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);

10
x17/cuda_x17_sha512.cu

@ -149,9 +149,9 @@ uint64_t Tone(const uint64_t* sharedMemory, uint64_t r[8], uint64_t W[80], uint3
} }
__global__ __global__
void x17_sha512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
@ -215,16 +215,16 @@ void x17_sha512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash,
} }
__host__ __host__
void x17_sha512_cpu_init(int thr_id, int threads) void x17_sha512_cpu_init(int thr_id, uint32_t threads)
{ {
cudaMemcpyToSymbol(K_512,K512,80*sizeof(uint64_t),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(K_512,K512,80*sizeof(uint64_t),0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(H_512,H512,sizeof(H512),0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(H_512,H512,sizeof(H512),0, cudaMemcpyHostToDevice);
} }
__host__ __host__
void x17_sha512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);

72
x17/x17.cu

@ -33,57 +33,57 @@ extern "C"
static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_hash[MAX_GPUS];
extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata); extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads); extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads); extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads); extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_luffaCubehash512_cpu_init(int thr_id, int threads); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads);
extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_init(int thr_id, int threads); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern int x11_simd512_cpu_init(int thr_id, int threads); extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads); extern void x11_echo512_cpu_init(int thr_id, uint32_t threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_hamsi512_cpu_init(int thr_id, int threads); extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_init(int thr_id, int threads); extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x14_shabal512_cpu_init(int thr_id, int threads); extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int flag); extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x17_sha512_cpu_init(int thr_id, int threads); extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
extern void x17_sha512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x17_haval256_cpu_init(int thr_id, int threads); extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
extern void x17_haval256_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, extern void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
int order); int order);
@ -192,8 +192,8 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int throughput = (int) device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8; uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8;
throughput = min(throughput, (int)(max_nonce - first_nonce)); throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00ff; ((uint32_t*)ptarget)[7] = 0x00ff;

Loading…
Cancel
Save