mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-23 13:04:13 +00:00
x15: use djm34 code with asm xor64 + my rot64
some optimizations could be done later, after whirlcoin integration
This commit is contained in:
parent
194fda87c1
commit
4bc23048b5
189
cuda_helper.h
189
cuda_helper.h
@ -113,9 +113,135 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
|
|||||||
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
|
(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
/*********************************************************************/
|
||||||
#if __CUDA_ARCH__ >= 350
|
// Macro to catch CUDA errors in CUDA runtime calls
|
||||||
__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
|
#define CUDA_SAFE_CALL(call) \
|
||||||
|
do { \
|
||||||
|
cudaError_t err = call; \
|
||||||
|
if (cudaSuccess != err) { \
|
||||||
|
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
|
||||||
|
__FILE__, __LINE__, cudaGetErrorString(err) ); \
|
||||||
|
exit(EXIT_FAILURE); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
// device asm for whirpool
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t xor1(uint64_t a, uint64_t b)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a) ,"l"(b));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// device asm for whirpool
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("{\n\t"
|
||||||
|
" .reg .u64 t1;\n\t"
|
||||||
|
"xor.b64 t1, %2, %3;\n\t"
|
||||||
|
"xor.b64 %0, %1, t1;\n\t"
|
||||||
|
"}"
|
||||||
|
: "=l"(result) : "l"(a) ,"l"(b),"l"(c));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// device asm for whirpool
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h));
|
||||||
|
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f));
|
||||||
|
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
|
||||||
|
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
|
||||||
|
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
|
||||||
|
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
|
||||||
|
asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// device asm for whirpool
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("{\n\t"
|
||||||
|
".reg .u64 m,n;\n\t"
|
||||||
|
"xor.b64 m, %2,%3;\n\t"
|
||||||
|
"and.b64 n, m,%1;\n\t"
|
||||||
|
"xor.b64 %0, n,%3;\n\t"
|
||||||
|
"}\n\t"
|
||||||
|
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// device asm for whirpool
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t sph_t64(uint64_t x)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("{\n\t"
|
||||||
|
"and.b64 %0,%1,0xFFFFFFFFFFFFFFFF;\n\t"
|
||||||
|
"}\n\t"
|
||||||
|
: "=l"(result) : "l"(x));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// device asm for whirpool
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("{\n\t"
|
||||||
|
".reg .u64 m,n,o;\n\t"
|
||||||
|
"and.b64 m, %1, %2;\n\t"
|
||||||
|
" or.b64 n, %1, %2;\n\t"
|
||||||
|
"and.b64 o, n, %3;\n\t"
|
||||||
|
" or.b64 %0, m, o ;\n\t"
|
||||||
|
"}\n\t"
|
||||||
|
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// device asm for whirpool
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t shr_t64(uint64_t x, uint32_t n)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("{\n\t"
|
||||||
|
".reg .u64 m;\n\t"
|
||||||
|
"shr.b64 m,%1,%2;\n\t"
|
||||||
|
"and.b64 %0,m,0xFFFFFFFFFFFFFFFF;\n\t"
|
||||||
|
"}\n\t"
|
||||||
|
: "=l"(result) : "l"(x), "r"(n));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// device asm for whirpool
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t shl_t64(uint64_t x, uint32_t n)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("{\n\t"
|
||||||
|
".reg .u64 m;\n\t"
|
||||||
|
"shl.b64 m,%1,%2;\n\t"
|
||||||
|
"and.b64 %0,m,0xFFFFFFFFFFFFFFFF;\n\t"
|
||||||
|
"}\n\t"
|
||||||
|
: "=l"(result) : "l"(x), "r"(n));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// 64-bit ROTATE RIGHT
|
||||||
|
#ifdef DJM_SM35_ROT64
|
||||||
|
/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t ROTR64(const uint64_t value, const int offset) {
|
||||||
uint2 result;
|
uint2 result;
|
||||||
if(offset < 32) {
|
if(offset < 32) {
|
||||||
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||||
@ -126,13 +252,32 @@ __device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offse
|
|||||||
}
|
}
|
||||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||||
}
|
}
|
||||||
|
#elif __CUDA_ARCH__ >= 120
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t ROTR64(const uint64_t x, const int offset)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("{\n\t"
|
||||||
|
".reg .b64 lhs, rhs;\n\t"
|
||||||
|
".reg .u32 amt2;\n\t"
|
||||||
|
"shr.b64 lhs, %1, %2;\n\t"
|
||||||
|
"sub.u32 amt2, 64, %2;\n\t"
|
||||||
|
"shl.b64 rhs, %1, amt2;\n\t"
|
||||||
|
"add.u64 %0, lhs, rhs;\n\t"
|
||||||
|
"}\n\t"
|
||||||
|
: "=l"(result) : "l"(x), "r"(offset));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
|
/* host */
|
||||||
|
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
|
// 64-bit ROTATE LEFT
|
||||||
#if __CUDA_ARCH__ >= 350
|
#ifdef DJM_SM35_ROT64
|
||||||
__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
|
/* complicated sm >= 3.5 one, to bench */
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t ROTL64(const uint64_t value, const int offset) {
|
||||||
uint2 result;
|
uint2 result;
|
||||||
if(offset >= 32) {
|
if(offset >= 32) {
|
||||||
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
|
||||||
@ -143,19 +288,25 @@ __device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offse
|
|||||||
}
|
}
|
||||||
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
return __double_as_longlong(__hiloint2double(result.y, result.x));
|
||||||
}
|
}
|
||||||
|
#elif __CUDA_ARCH__ >= 120
|
||||||
|
__device__ __forceinline__
|
||||||
|
uint64_t ROTL64(const uint64_t x, const int offset)
|
||||||
|
{
|
||||||
|
uint64_t result;
|
||||||
|
asm("{\n\t"
|
||||||
|
".reg .b64 lhs, rhs;\n\t"
|
||||||
|
".reg .u32 amt2;\n\t"
|
||||||
|
"shl.b64 lhs, %1, %2;\n\t"
|
||||||
|
"sub.u32 amt2, 64, %2;\n\t"
|
||||||
|
"shr.b64 rhs, %1, amt2;\n\t"
|
||||||
|
"add.u64 %0, lhs, rhs;\n\t"
|
||||||
|
"}\n\t"
|
||||||
|
: "=l"(result) : "l"(x), "r"(offset));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
/* host */
|
||||||
|
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Macro to catch CUDA errors in CUDA runtime calls
|
|
||||||
#define CUDA_SAFE_CALL(call) \
|
|
||||||
do { \
|
|
||||||
cudaError_t err = call; \
|
|
||||||
if (cudaSuccess != err) { \
|
|
||||||
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
|
|
||||||
__FILE__, __LINE__, cudaGetErrorString(err) ); \
|
|
||||||
exit(EXIT_FAILURE); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
#endif // #ifndef CUDA_HELPER_H
|
#endif // #ifndef CUDA_HELPER_H
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -80,8 +80,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
|
|||||||
extern void x14_shabal512_cpu_init(int thr_id, int threads);
|
extern void x14_shabal512_cpu_init(int thr_id, int threads);
|
||||||
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
extern void x15_whirlpool_cpu_init(int thr_id, int threads);
|
extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);
|
||||||
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
extern void x15_whirlpool_cpu_free(int thr_id);
|
||||||
|
|
||||||
extern void cuda_check_cpu_init(int thr_id, int threads);
|
extern void cuda_check_cpu_init(int thr_id, int threads);
|
||||||
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
extern void cuda_check_cpu_setTarget(const void *ptarget);
|
||||||
@ -228,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
|
|||||||
x13_hamsi512_cpu_init(thr_id, throughput);
|
x13_hamsi512_cpu_init(thr_id, throughput);
|
||||||
x13_fugue512_cpu_init(thr_id, throughput);
|
x13_fugue512_cpu_init(thr_id, throughput);
|
||||||
x14_shabal512_cpu_init(thr_id, throughput);
|
x14_shabal512_cpu_init(thr_id, throughput);
|
||||||
x15_whirlpool_cpu_init(thr_id, throughput);
|
x15_whirlpool_cpu_init(thr_id, throughput, 0);
|
||||||
|
|
||||||
cuda_check_cpu_init(thr_id, throughput);
|
cuda_check_cpu_init(thr_id, throughput);
|
||||||
init[thr_id] = true;
|
init[thr_id] = true;
|
||||||
@ -276,6 +277,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
|
|||||||
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
|
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
|
||||||
pdata[19] = foundNonce;
|
pdata[19] = foundNonce;
|
||||||
*hashes_done = foundNonce - first_nonce + 1;
|
*hashes_done = foundNonce - first_nonce + 1;
|
||||||
|
x15_whirlpool_cpu_free(thr_id);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
else if (vhash64[7] > Htarg) {
|
else if (vhash64[7] > Htarg) {
|
||||||
@ -291,5 +293,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
|
|||||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
||||||
|
|
||||||
*hashes_done = pdata[19] - first_nonce + 1;
|
*hashes_done = pdata[19] - first_nonce + 1;
|
||||||
|
|
||||||
|
x15_whirlpool_cpu_free(thr_id);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user