mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-15 01:00:19 +00:00
e50556b637
small fixes to handle better the multi thread per gpu explicitly report than quark is not compatible with SM 2.1 (compact shuffle)
336 lines
10 KiB
Plaintext
336 lines
10 KiB
Plaintext
#include <memory.h>
|
|
|
|
#define SPH_C32(x) ((uint32_t)(x ## U))
|
|
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
|
|
|
#include "cuda_helper.h"
|
|
|
|
static uint32_t *h_GNonces[MAX_GPUS];
|
|
static uint32_t *d_GNonces[MAX_GPUS];
|
|
static unsigned int* d_textures[MAX_GPUS][8];
|
|
|
|
__constant__ uint32_t pTarget[8];
|
|
|
|
#define C32e(x) \
|
|
((SPH_C32(x) >> 24) \
|
|
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
|
|
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
|
|
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
|
|
|
|
#define PC32up(j, r) ((uint32_t)((j) + (r)))
|
|
#define PC32dn(j, r) 0
|
|
#define QC32up(j, r) 0xFFFFFFFF
|
|
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
|
|
|
|
#define B32_0(x) __byte_perm(x, 0, 0x4440)
|
|
//((x) & 0xFF)
|
|
#define B32_1(x) __byte_perm(x, 0, 0x4441)
|
|
//(((x) >> 8) & 0xFF)
|
|
#define B32_2(x) __byte_perm(x, 0, 0x4442)
|
|
//(((x) >> 16) & 0xFF)
|
|
#define B32_3(x) __byte_perm(x, 0, 0x4443)
|
|
//((x) >> 24)
|
|
|
|
#define MAXWELL_OR_FERMI 1
|
|
#if MAXWELL_OR_FERMI
|
|
#define USE_SHARED 1
|
|
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
|
|
#if USE_SHARED
|
|
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
|
|
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
|
|
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
|
|
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
|
|
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
|
|
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
|
|
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
|
|
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
|
|
#else
|
|
#define T0up(x) tex1Dfetch(t0up2, x)
|
|
#define T0dn(x) tex1Dfetch(t0dn2, x)
|
|
#define T1up(x) tex1Dfetch(t1up2, x)
|
|
#define T1dn(x) tex1Dfetch(t1dn2, x)
|
|
#define T2up(x) tex1Dfetch(t2up2, x)
|
|
#define T2dn(x) tex1Dfetch(t2dn2, x)
|
|
#define T3up(x) tex1Dfetch(t3up2, x)
|
|
#define T3dn(x) tex1Dfetch(t3dn2, x)
|
|
#endif
|
|
#else
|
|
#define USE_SHARED 1
|
|
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
|
|
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
|
|
#define T0dn(x) tex1Dfetch(t0dn2, x)
|
|
#define T1up(x) tex1Dfetch(t1up2, x)
|
|
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
|
|
#define T2up(x) tex1Dfetch(t2up2, x)
|
|
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
|
|
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
|
|
#define T3dn(x) tex1Dfetch(t3dn2, x)
|
|
#endif
|
|
|
|
static texture<unsigned int, 1, cudaReadModeElementType> t0up2;
|
|
static texture<unsigned int, 1, cudaReadModeElementType> t0dn2;
|
|
static texture<unsigned int, 1, cudaReadModeElementType> t1up2;
|
|
static texture<unsigned int, 1, cudaReadModeElementType> t1dn2;
|
|
static texture<unsigned int, 1, cudaReadModeElementType> t2up2;
|
|
static texture<unsigned int, 1, cudaReadModeElementType> t2dn2;
|
|
static texture<unsigned int, 1, cudaReadModeElementType> t3up2;
|
|
static texture<unsigned int, 1, cudaReadModeElementType> t3dn2;
|
|
|
|
#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
|
|
t[d0] = T0up(B32_0(a[b0])) \
|
|
^ T1up(B32_1(a[b1])) \
|
|
^ T2up(B32_2(a[b2])) \
|
|
^ T3up(B32_3(a[b3])) \
|
|
^ T0dn(B32_0(a[b4])) \
|
|
^ T1dn(B32_1(a[b5])) \
|
|
^ T2dn(B32_2(a[b6])) \
|
|
^ T3dn(B32_3(a[b7])); \
|
|
t[d1] = T0dn(B32_0(a[b0])) \
|
|
^ T1dn(B32_1(a[b1])) \
|
|
^ T2dn(B32_2(a[b2])) \
|
|
^ T3dn(B32_3(a[b3])) \
|
|
^ T0up(B32_0(a[b4])) \
|
|
^ T1up(B32_1(a[b5])) \
|
|
^ T2up(B32_2(a[b6])) \
|
|
^ T3up(B32_3(a[b7])); \
|
|
} while (0)
|
|
|
|
|
|
extern uint32_t T0up_cpu[];
|
|
extern uint32_t T0dn_cpu[];
|
|
extern uint32_t T1up_cpu[];
|
|
extern uint32_t T1dn_cpu[];
|
|
extern uint32_t T2up_cpu[];
|
|
extern uint32_t T2dn_cpu[];
|
|
extern uint32_t T3up_cpu[];
|
|
extern uint32_t T3dn_cpu[];
|
|
|
|
__device__ __forceinline__
|
|
void groestl256_perm_P(uint32_t thread,uint32_t *a, char *mixtabs)
|
|
{
|
|
#pragma unroll 10
|
|
for (int r = 0; r<10; r++)
|
|
{
|
|
uint32_t t[16];
|
|
|
|
a[0x0] ^= PC32up(0x00, r);
|
|
a[0x2] ^= PC32up(0x10, r);
|
|
a[0x4] ^= PC32up(0x20, r);
|
|
a[0x6] ^= PC32up(0x30, r);
|
|
a[0x8] ^= PC32up(0x40, r);
|
|
a[0xA] ^= PC32up(0x50, r);
|
|
a[0xC] ^= PC32up(0x60, r);
|
|
a[0xE] ^= PC32up(0x70, r);
|
|
RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF);
|
|
RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1);
|
|
RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3);
|
|
RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5);
|
|
RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7);
|
|
RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9);
|
|
RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB);
|
|
RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD);
|
|
|
|
#pragma unroll 16
|
|
for (int k = 0; k<16; k++)
|
|
a[k] = t[k];
|
|
}
|
|
}
|
|
|
|
__device__ __forceinline__
|
|
void groestl256_perm_Q(uint32_t thread, uint32_t *a, char *mixtabs)
|
|
{
|
|
#pragma unroll
|
|
for (int r = 0; r<10; r++)
|
|
{
|
|
uint32_t t[16];
|
|
|
|
a[0x0] ^= QC32up(0x00, r);
|
|
a[0x1] ^= QC32dn(0x00, r);
|
|
a[0x2] ^= QC32up(0x10, r);
|
|
a[0x3] ^= QC32dn(0x10, r);
|
|
a[0x4] ^= QC32up(0x20, r);
|
|
a[0x5] ^= QC32dn(0x20, r);
|
|
a[0x6] ^= QC32up(0x30, r);
|
|
a[0x7] ^= QC32dn(0x30, r);
|
|
a[0x8] ^= QC32up(0x40, r);
|
|
a[0x9] ^= QC32dn(0x40, r);
|
|
a[0xA] ^= QC32up(0x50, r);
|
|
a[0xB] ^= QC32dn(0x50, r);
|
|
a[0xC] ^= QC32up(0x60, r);
|
|
a[0xD] ^= QC32dn(0x60, r);
|
|
a[0xE] ^= QC32up(0x70, r);
|
|
a[0xF] ^= QC32dn(0x70, r);
|
|
RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD);
|
|
RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF);
|
|
RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1);
|
|
RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3);
|
|
RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5);
|
|
RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7);
|
|
RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9);
|
|
RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB);
|
|
|
|
#pragma unroll
|
|
for (int k = 0; k<16; k++)
|
|
a[k] = t[k];
|
|
}
|
|
}
|
|
|
|
__global__ __launch_bounds__(256,1)
|
|
void groestl256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *resNonces)
|
|
{
|
|
#if USE_SHARED
|
|
extern __shared__ char mixtabs[];
|
|
|
|
if (threadIdx.x < 256) {
|
|
*((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(t0up2, threadIdx.x);
|
|
*((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(t0dn2, threadIdx.x);
|
|
*((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(t1up2, threadIdx.x);
|
|
*((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(t1dn2, threadIdx.x);
|
|
*((uint32_t*)mixtabs + (1024 + threadIdx.x)) = tex1Dfetch(t2up2, threadIdx.x);
|
|
*((uint32_t*)mixtabs + (1280 + threadIdx.x)) = tex1Dfetch(t2dn2, threadIdx.x);
|
|
*((uint32_t*)mixtabs + (1536 + threadIdx.x)) = tex1Dfetch(t3up2, threadIdx.x);
|
|
*((uint32_t*)mixtabs + (1792 + threadIdx.x)) = tex1Dfetch(t3dn2, threadIdx.x);
|
|
}
|
|
|
|
__syncthreads();
|
|
#endif
|
|
|
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
|
if (thread < threads)
|
|
{
|
|
// GROESTL
|
|
uint32_t message[16];
|
|
uint32_t state[16];
|
|
|
|
#pragma unroll
|
|
for (int k = 0; k<4; k++)
|
|
LOHI(message[2*k], message[2*k+1], outputHash[k*threads+thread]);
|
|
|
|
#pragma unroll
|
|
for (int k = 9; k<15; k++)
|
|
message[k] = 0;
|
|
|
|
message[8] = 0x80;
|
|
message[15] = 0x01000000;
|
|
|
|
#pragma unroll 16
|
|
for (int u = 0; u<16; u++)
|
|
state[u] = message[u];
|
|
|
|
state[15] ^= 0x10000;
|
|
|
|
// Perm
|
|
|
|
#if USE_SHARED
|
|
groestl256_perm_P(thread, state, mixtabs);
|
|
state[15] ^= 0x10000;
|
|
groestl256_perm_Q(thread, message, mixtabs);
|
|
#else
|
|
groestl256_perm_P(thread, state, NULL);
|
|
state[15] ^= 0x10000;
|
|
groestl256_perm_P(thread, message, NULL);
|
|
#endif
|
|
#pragma unroll 16
|
|
for (int u = 0; u<16; u++) state[u] ^= message[u];
|
|
#pragma unroll 16
|
|
for (int u = 0; u<16; u++) message[u] = state[u];
|
|
#if USE_SHARED
|
|
groestl256_perm_P(thread, message, mixtabs);
|
|
#else
|
|
groestl256_perm_P(thread, message, NULL);
|
|
#endif
|
|
state[14] ^= message[14];
|
|
state[15] ^= message[15];
|
|
|
|
uint32_t nonce = startNounce + thread;
|
|
if (state[15] <= pTarget[7]) {
|
|
atomicMin(&resNonces[1], resNonces[0]);
|
|
atomicMin(&resNonces[0], nonce);
|
|
}
|
|
}
|
|
}
|
|
|
|
#define texDef(id, texname, texmem, texsource, texsize) { \
|
|
unsigned int *texmem; \
|
|
cudaMalloc(&texmem, texsize); \
|
|
d_textures[thr_id][id] = texmem; \
|
|
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
|
|
texname.normalized = 0; \
|
|
texname.filterMode = cudaFilterModePoint; \
|
|
texname.addressMode[0] = cudaAddressModeClamp; \
|
|
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
|
|
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
|
|
} \
|
|
}
|
|
|
|
__host__
|
|
void groestl256_cpu_init(int thr_id, uint32_t threads)
|
|
{
|
|
// Texturen mit obigem Makro initialisieren
|
|
texDef(0, t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
|
|
texDef(1, t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
|
|
texDef(2, t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256);
|
|
texDef(3, t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256);
|
|
texDef(4, t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256);
|
|
texDef(5, t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256);
|
|
texDef(6, t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
|
|
texDef(7, t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);
|
|
|
|
cudaMalloc(&d_GNonces[thr_id], 2*sizeof(uint32_t));
|
|
cudaMallocHost(&h_GNonces[thr_id], 2*sizeof(uint32_t));
|
|
}
|
|
|
|
__host__
|
|
void groestl256_cpu_free(int thr_id)
|
|
{
|
|
for (int i=0; i<8; i++)
|
|
cudaFree(d_textures[thr_id][i]);
|
|
|
|
cudaFree(d_GNonces[thr_id]);
|
|
cudaFreeHost(h_GNonces[thr_id]);
|
|
}
|
|
|
|
__host__
|
|
uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
|
|
{
|
|
uint32_t result = UINT32_MAX;
|
|
cudaMemset(d_GNonces[thr_id], 0xff, 2*sizeof(uint32_t));
|
|
const uint32_t threadsperblock = 256;
|
|
|
|
// berechne wie viele Thread Blocks wir brauchen
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
|
dim3 block(threadsperblock);
|
|
|
|
#if USE_SHARED
|
|
size_t shared_size = 8 * 256 * sizeof(uint32_t);
|
|
#else
|
|
size_t shared_size = 0;
|
|
#endif
|
|
groestl256_gpu_hash_32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonces[thr_id]);
|
|
|
|
MyStreamSynchronize(NULL, order, thr_id);
|
|
|
|
// get first found nonce
|
|
cudaMemcpy(h_GNonces[thr_id], d_GNonces[thr_id], 1*sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
|
result = *h_GNonces[thr_id];
|
|
|
|
return result;
|
|
}
|
|
|
|
__host__
|
|
uint32_t groestl256_getSecNonce(int thr_id, int num)
|
|
{
|
|
uint32_t results[2];
|
|
memset(results, 0xFF, sizeof(results));
|
|
cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost);
|
|
if (results[1] == results[0])
|
|
return UINT32_MAX;
|
|
return results[num];
|
|
}
|
|
|
|
__host__
|
|
void groestl256_setTarget(const void *pTargetIn)
|
|
{
|
|
cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice);
|
|
}
|