mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-08 22:07:56 +00:00
cuda_helper: rename correctly hiword/loword functions
This commit is contained in:
parent
b35a6742fe
commit
2f541065fb
@ -12,7 +12,7 @@ extern "C" {
|
|||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
|
|
||||||
static __device__ uint64_t cuda_swab32ll(uint64_t x) {
|
static __device__ uint64_t cuda_swab32ll(uint64_t x) {
|
||||||
return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x)));
|
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
|
||||||
}
|
}
|
||||||
|
|
||||||
__constant__ static uint32_t c_data[20];
|
__constant__ static uint32_t c_data[20];
|
||||||
|
@ -199,7 +199,7 @@ void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputH
|
|||||||
if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
|
if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
|
||||||
else keccak_gpu_state[i] = 0;
|
else keccak_gpu_state[i] = 0;
|
||||||
}
|
}
|
||||||
keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
|
keccak_gpu_state[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
|
||||||
keccak_gpu_state[10] = 0x0000000000000001;
|
keccak_gpu_state[10] = 0x0000000000000001;
|
||||||
keccak_gpu_state[16] = 0x8000000000000000;
|
keccak_gpu_state[16] = 0x8000000000000000;
|
||||||
|
|
||||||
|
@ -81,12 +81,12 @@ __device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// das Hi Word in einem 64 Bit Typen ersetzen
|
// das Hi Word in einem 64 Bit Typen ersetzen
|
||||||
__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
|
__device__ __forceinline__ uint64_t REPLACE_HIDWORD(const uint64_t &x, const uint32_t &y) {
|
||||||
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U);
|
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U);
|
||||||
}
|
}
|
||||||
|
|
||||||
// das Lo Word in einem 64 Bit Typen ersetzen
|
// das Lo Word in einem 64 Bit Typen ersetzen
|
||||||
__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
|
__device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uint32_t &y) {
|
||||||
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
|
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,7 +105,7 @@ __device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// das Lo Word aus einem 64 Bit Typen extrahieren
|
// das Lo Word aus einem 64 Bit Typen extrahieren
|
||||||
__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
|
__device__ __forceinline__ uint32_t _LODWORD(const uint64_t &x) {
|
||||||
#if __CUDA_ARCH__ >= 130
|
#if __CUDA_ARCH__ >= 130
|
||||||
return (uint32_t)__double2loint(__longlong_as_double(x));
|
return (uint32_t)__double2loint(__longlong_as_double(x));
|
||||||
#else
|
#else
|
||||||
@ -114,7 +114,7 @@ __device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// das Hi Word aus einem 64 Bit Typen extrahieren
|
// das Hi Word aus einem 64 Bit Typen extrahieren
|
||||||
__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
|
__device__ __forceinline__ uint32_t _HIDWORD(const uint64_t &x) {
|
||||||
#if __CUDA_ARCH__ >= 130
|
#if __CUDA_ARCH__ >= 130
|
||||||
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
||||||
#else
|
#else
|
||||||
@ -128,7 +128,7 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
|
|||||||
// Input: 77665544 33221100
|
// Input: 77665544 33221100
|
||||||
// Output: 00112233 44556677
|
// Output: 00112233 44556677
|
||||||
uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123);
|
uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123);
|
||||||
return (result << 32) | __byte_perm(_HIWORD(x), 0, 0x0123);
|
return (result << 32) | __byte_perm(_HIDWORD(x), 0, 0x0123);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
/* host */
|
/* host */
|
||||||
@ -483,8 +483,8 @@ void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) {
|
|||||||
asm("mov.b64 {%0,%1},%2; \n\t"
|
asm("mov.b64 {%0,%1},%2; \n\t"
|
||||||
: "=r"(lo), "=r"(hi) : "l"(x));
|
: "=r"(lo), "=r"(hi) : "l"(x));
|
||||||
#else
|
#else
|
||||||
lo = _LOWORD(x);
|
lo = _LODWORD(x);
|
||||||
hi = _HIWORD(x);
|
hi = _HIDWORD(x);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -149,20 +149,20 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(uint32_t threads, uin
|
|||||||
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
|
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
|
||||||
|
|
||||||
// die Nounce durch die thread-spezifische ersetzen
|
// die Nounce durch die thread-spezifische ersetzen
|
||||||
buf[9] = REPLACE_HIWORD(buf[9], nounce);
|
buf[9] = REPLACE_HIDWORD(buf[9], nounce);
|
||||||
|
|
||||||
uint32_t *hefty = heftyHashes + 8 * hashPosition;
|
uint32_t *hefty = heftyHashes + 8 * hashPosition;
|
||||||
if (BLOCKSIZE == 84) {
|
if (BLOCKSIZE == 84) {
|
||||||
// den thread-spezifischen Hefty1 hash einsetzen
|
// den thread-spezifischen Hefty1 hash einsetzen
|
||||||
// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
|
// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
|
||||||
buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
|
buf[10] = REPLACE_HIDWORD(buf[10], hefty[0]);
|
||||||
buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
|
buf[11] = REPLACE_LODWORD(buf[11], hefty[1]);
|
||||||
buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
|
buf[11] = REPLACE_HIDWORD(buf[11], hefty[2]);
|
||||||
buf[12] = REPLACE_LOWORD(buf[12], hefty[3]);
|
buf[12] = REPLACE_LODWORD(buf[12], hefty[3]);
|
||||||
buf[12] = REPLACE_HIWORD(buf[12], hefty[4]);
|
buf[12] = REPLACE_HIDWORD(buf[12], hefty[4]);
|
||||||
buf[13] = REPLACE_LOWORD(buf[13], hefty[5]);
|
buf[13] = REPLACE_LODWORD(buf[13], hefty[5]);
|
||||||
buf[13] = REPLACE_HIWORD(buf[13], hefty[6]);
|
buf[13] = REPLACE_HIDWORD(buf[13], hefty[6]);
|
||||||
buf[14] = REPLACE_LOWORD(buf[14], hefty[7]);
|
buf[14] = REPLACE_LODWORD(buf[14], hefty[7]);
|
||||||
}
|
}
|
||||||
else if (BLOCKSIZE == 80) {
|
else if (BLOCKSIZE == 80) {
|
||||||
buf[10] = MAKE_ULONGLONG(hefty[0], hefty[1]);
|
buf[10] = MAKE_ULONGLONG(hefty[0], hefty[1]);
|
||||||
|
@ -199,8 +199,8 @@ void pentablake_gpu_hash_80(uint32_t threads, const uint32_t startNounce, void *
|
|||||||
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
|
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (uint32_t i=0; i < 8; i++) {
|
for (uint32_t i=0; i < 8; i++) {
|
||||||
outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
|
outHash[2*i] = cuda_swab32( _HIDWORD(h[i]) );
|
||||||
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
|
outHash[2*i+1] = cuda_swab32( _LODWORD(h[i]) );
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
|
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
|
||||||
@ -258,8 +258,8 @@ void pentablake_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_
|
|||||||
uint32_t *outHash = (uint32_t*)&g_hash[thread<<3];
|
uint32_t *outHash = (uint32_t*)&g_hash[thread<<3];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (int i=0; i < 8; i++) {
|
for (int i=0; i < 8; i++) {
|
||||||
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
|
outHash[2*i+0] = cuda_swab32( _HIDWORD(h[i]) );
|
||||||
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
|
outHash[2*i+1] = cuda_swab32( _LODWORD(h[i]) );
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
uint64_t *outHash = &g_hash[thread<<3];
|
uint64_t *outHash = &g_hash[thread<<3];
|
||||||
|
@ -417,7 +417,7 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *
|
|||||||
message[i] = vectorize(c_PaddedMessage80[i]);
|
message[i] = vectorize(c_PaddedMessage80[i]);
|
||||||
|
|
||||||
// die Nounce durch die thread-spezifische ersetzen
|
// die Nounce durch die thread-spezifische ersetzen
|
||||||
message[9].y = cuda_swab32(nounce); //REPLACE_HIWORD(message[9], cuda_swab32(nounce));
|
message[9].y = cuda_swab32(nounce); //REPLACE_HIDWORD(message[9], cuda_swab32(nounce));
|
||||||
|
|
||||||
// Compression 1
|
// Compression 1
|
||||||
Compression512(message, h);
|
Compression512(message, h);
|
||||||
|
@ -232,7 +232,7 @@ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_
|
|||||||
for(int i=0;i<16;i++)
|
for(int i=0;i<16;i++)
|
||||||
message[i] = c_PaddedMessage80[i];
|
message[i] = c_PaddedMessage80[i];
|
||||||
|
|
||||||
message[9] = REPLACE_HIWORD(message[9], cuda_swab32(nounce));
|
message[9] = REPLACE_HIDWORD(message[9], cuda_swab32(nounce));
|
||||||
|
|
||||||
// Compression 1
|
// Compression 1
|
||||||
Compression512_30(message, h);
|
Compression512_30(message, h);
|
||||||
|
@ -166,8 +166,8 @@ void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t
|
|||||||
uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U];
|
uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U];
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (int i=0; i < 8; i++) {
|
for (int i=0; i < 8; i++) {
|
||||||
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
|
outHash[2*i+0] = cuda_swab32( _HIDWORD(h[i]) );
|
||||||
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
|
outHash[2*i+1] = cuda_swab32( _LODWORD(h[i]) );
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
uint64_t *outHash = &g_hash[hashPosition * 8U];
|
uint64_t *outHash = &g_hash[hashPosition * 8U];
|
||||||
@ -210,8 +210,8 @@ void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
|
|||||||
uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U);
|
uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U);
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (uint32_t i=0; i < 8; i++) {
|
for (uint32_t i=0; i < 8; i++) {
|
||||||
outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
|
outHash[2*i] = cuda_swab32( _HIDWORD(h[i]) );
|
||||||
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
|
outHash[2*i+1] = cuda_swab32( _LODWORD(h[i]) );
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
uint64_t *outHash = (uint64_t*)outputHash + (thread * 8U);
|
uint64_t *outHash = (uint64_t*)outputHash + (thread * 8U);
|
||||||
|
@ -613,7 +613,7 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
|
|||||||
t2 = vectorize(c_PaddedMessage80[18]);
|
t2 = vectorize(c_PaddedMessage80[18]);
|
||||||
|
|
||||||
uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread;
|
uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread;
|
||||||
uint2 nonce2 = make_uint2(_LOWORD(c_PaddedMessage80[9]), nonce);
|
uint2 nonce2 = make_uint2(_LODWORD(c_PaddedMessage80[9]), nonce);
|
||||||
|
|
||||||
uint2 p[8];
|
uint2 p[8];
|
||||||
p[0] = vectorize(c_PaddedMessage80[8]);
|
p[0] = vectorize(c_PaddedMessage80[8]);
|
||||||
@ -714,7 +714,7 @@ void skein512_gpu_hash_80_sm3(uint32_t threads, uint32_t startNounce, uint64_t *
|
|||||||
h7 = c_PaddedMessage80[7] ^ p[7];
|
h7 = c_PaddedMessage80[7] ^ p[7];
|
||||||
|
|
||||||
uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread;
|
uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread;
|
||||||
uint64_t nonce64 = MAKE_ULONGLONG(_LOWORD(c_PaddedMessage80[9]), nonce);
|
uint64_t nonce64 = MAKE_ULONGLONG(_LODWORD(c_PaddedMessage80[9]), nonce);
|
||||||
|
|
||||||
// skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16
|
// skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16
|
||||||
p[0] = c_PaddedMessage80[8];
|
p[0] = c_PaddedMessage80[8];
|
||||||
|
@ -367,7 +367,7 @@ void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
|
|||||||
for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
|
for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
|
||||||
|
|
||||||
// die Nounce durch die thread-spezifische ersetzen
|
// die Nounce durch die thread-spezifische ersetzen
|
||||||
buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce));
|
buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce));
|
||||||
|
|
||||||
|
|
||||||
hashState state;
|
hashState state;
|
||||||
@ -398,7 +398,7 @@ void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, voi
|
|||||||
for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
|
for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
|
||||||
|
|
||||||
// Tested nonce
|
// Tested nonce
|
||||||
buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce));
|
buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce));
|
||||||
|
|
||||||
hashState state;
|
hashState state;
|
||||||
#pragma unroll 40
|
#pragma unroll 40
|
||||||
|
@ -421,7 +421,7 @@ void whirlpoolx_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *resNo
|
|||||||
uint64_t tmp[8];
|
uint64_t tmp[8];
|
||||||
uint32_t nounce = startNounce + thread;
|
uint32_t nounce = startNounce + thread;
|
||||||
|
|
||||||
n[1] = xor1(REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)),c_xtra[0]);
|
n[1] = xor1(REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)),c_xtra[0]);
|
||||||
|
|
||||||
uint32_t* n32 = (uint32_t*)&n[0];
|
uint32_t* n32 = (uint32_t*)&n[0];
|
||||||
n[0]=sharedMemory[__byte_perm(n32[3], 0, 0x4443) + 1792];
|
n[0]=sharedMemory[__byte_perm(n32[3], 0, 0x4443) + 1792];
|
||||||
|
@ -2337,7 +2337,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
|
|||||||
/// round 2 ///////
|
/// round 2 ///////
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
n[0] = c_PaddedMessage80[8]; //read data
|
n[0] = c_PaddedMessage80[8]; //read data
|
||||||
n[1] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); //whirlpool
|
n[1] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); //whirlpool
|
||||||
n[2] = 0x0000000000000080; //whirlpool
|
n[2] = 0x0000000000000080; //whirlpool
|
||||||
n[3] = 0;
|
n[3] = 0;
|
||||||
n[4] = 0;
|
n[4] = 0;
|
||||||
@ -2359,7 +2359,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
|
|||||||
}
|
}
|
||||||
|
|
||||||
state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]);
|
state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]);
|
||||||
state[1] = xor3(state[1], n[1], REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce)) );
|
state[1] = xor3(state[1], n[1], REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)) );
|
||||||
state[2] = xor3(state[2], n[2], 0x0000000000000080);
|
state[2] = xor3(state[2], n[2], 0x0000000000000080);
|
||||||
state[3] = xor1(state[3], n[3]);
|
state[3] = xor1(state[3], n[3]);
|
||||||
state[4] = xor1(state[4], n[4]);
|
state[4] = xor1(state[4], n[4]);
|
||||||
|
Loading…
Reference in New Issue
Block a user