mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-10 14:57:53 +00:00
lbry: small changes for second build
sha512/ripemd swab changes; this was preview 2
This commit is contained in:
parent
5703a74e22
commit
940c1b3a2f
@ -37,6 +37,18 @@ static __constant__ uint32_t c_IV[5] = {
|
||||
0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u
|
||||
};
|
||||
|
||||
/**
 * Three-input 32-bit XOR: returns a ^ b ^ c.
 *
 * When compiled for __CUDA_ARCH__ >= 500 with CUDA_VERSION >= 7050 the
 * result is produced by a single lop3.b32 instruction using lookup table
 * 0x96 (0xF0 ^ 0xCC ^ 0xAA); every other configuration uses the plain
 * C expression.
 */
__device__ __forceinline__
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
	uint32_t folded;
	// immLut 0x96 = 0xF0 ^ 0xCC ^ 0xAA, i.e. the truth table of a ^ b ^ c
	asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b"
		: "=r"(folded)
		: "r"(a), "r"(b), "r"(c));
	return folded;
#else
	return a ^ b ^ c;
#endif
}
|
||||
|
||||
//__host__
|
||||
//uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) {
|
||||
// return c ^ (a | !b);
|
||||
@ -83,7 +95,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
|
||||
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
|
||||
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
|
||||
#else
|
||||
#define F1(x, y, z) xor3(x,y,z)
|
||||
#define F1(x, y, z) xor3b(x,y,z)
|
||||
#define F2(x, y, z) xandx(x,y,z)
|
||||
#define F3(x, y, z) xornot64(x,y,z)
|
||||
#define F4(x, y, z) xandx(z,x,y)
|
||||
@ -305,59 +317,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
|
||||
h[0] = tmp; \
|
||||
}
|
||||
|
||||
#if 0
|
||||
/**
 * RIPEMD-160 of a 32-byte message, one message per thread.
 *
 * Each thread addresses g_hash at 64-bit index (thread * 8 + byteOffset/8),
 * reads eight 32-bit words from there, pads them into a single 16-word
 * block (0x80 end marker in word 8, bit length 0x100 in word 14), runs
 * RIPEMD160_ROUND_BODY starting from c_IV and stores the five digest words
 * back at the same location. With PAD_ZEROS defined, words 5..7 of the
 * slot are cleared as well.
 *
 * threads    - number of messages to process; threads with an index at or
 *              beyond this value do nothing
 * g_hash     - input/output buffer, updated in place
 * byteOffset - offset (in bytes, divided by 8 to index uint64_t) added to
 *              every thread's slot
 */
__global__
void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread >= threads)
		return;

	uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]);

	// Message block: 8 data words followed by padding.
	uint32_t in[16];
	for (int w = 0; w < 8; w++)
		in[w] = hash[w];

	#pragma unroll
	for (int w = 9; w < 16; w++)
		in[w] = 0;

	in[8]  = 0x80;  // end-of-message marker
	in[14] = 0x100; // size in bits

	// Chaining state starts from the constant IV.
	uint32_t h[5];
	#pragma unroll
	for (int k = 0; k < 5; k++)
		h[k] = c_IV[k];

	RIPEMD160_ROUND_BODY(in, h);

	#pragma unroll
	for (int k = 0; k < 5; k++)
		hash[k] = h[k];

#ifdef PAD_ZEROS
	// 20 bytes hash on 32 or 64 bytes output space
	hash[5] = 0;
	hash[6] = 0;
	hash[7] = 0;
#endif
}
|
||||
|
||||
/**
 * Host-side launcher for lbry_ripemd160_gpu_hash_32.
 *
 * Launches the kernel on `stream` with 128 threads per block and enough
 * blocks to cover all `threads` work items.
 *
 * thr_id     - not used by this function
 * threads    - number of hashes to process
 * g_Hash     - device buffer, passed to the kernel as uint64_t*
 * byteOffset - forwarded unchanged to the kernel
 * stream     - CUDA stream the kernel is enqueued on
 */
__host__
void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream)
{
	const uint32_t threadsperblock = 128;

	// Nothing to do; a zero-sized grid is an invalid launch configuration.
	if (threads == 0)
		return;

	// Round up: a truncating division would silently skip the last
	// (threads % threadsperblock) items. The kernel checks its thread index
	// against `threads`, so the surplus threads of the last block are idle.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	lbry_ripemd160_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash, byteOffset);
}
|
||||
#endif
|
||||
|
||||
__global__
|
||||
//__launch_bounds__(256,6)
|
||||
void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
@ -366,6 +326,7 @@ void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
|
||||
uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]);
|
||||
|
||||
uint32_t in[16];
|
||||
#pragma unroll
|
||||
for (int i=0; i<8; i++)
|
||||
in[i] = (hash[i]);
|
||||
in[8] = 0x80;
|
||||
|
@ -160,20 +160,6 @@ uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
__device__ __forceinline__
|
||||
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
|
||||
uint32_t result;
|
||||
asm("{ .reg .u32 t1; // xor3b \n\t"
|
||||
"xor.b32 t1, %2, %3;\n\t"
|
||||
"xor.b32 %0, %1, t1;"
|
||||
"}"
|
||||
: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
|
||||
return result;
|
||||
}
|
||||
#define xor3b(a,b,c) (a ^ b ^ c)
|
||||
*/
|
||||
|
||||
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
|
||||
{
|
||||
uint32_t r1 = ROTR32(x,2);
|
||||
@ -220,6 +206,13 @@ __device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b,
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Splits a 64-bit value into a uint2 with its 32-bit halves exchanged:
 * the first word unpacked by mov.b64 goes to .y and the second to .x.
 */
__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) {
	uint32_t first, second;
	asm("mov.b64 {%0,%1},%2; \n\t"
		: "=r"(first), "=r"(second)
		: "l"(v));

	uint2 swapped;
	swapped.x = second;
	swapped.y = first;
	return swapped;
}
|
||||
|
||||
__device__
|
||||
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
|
||||
uint32_t in, const uint32_t Kshared)
|
||||
@ -393,7 +386,8 @@ void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512)
|
||||
uint2* output = (uint2*) input;
|
||||
#pragma unroll
|
||||
for (int i=0;i<4;i++) {
|
||||
output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
|
||||
//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
|
||||
output[i] = vectorizeswap(((uint64_t*)buf)[i]);
|
||||
}
|
||||
#ifdef PAD_ZEROS
|
||||
#pragma unroll
|
||||
@ -447,8 +441,8 @@ void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce
|
||||
uint2* output = (uint2*) (&outputHash[thread * 8U]);
|
||||
#pragma unroll
|
||||
for (int i=0;i<4;i++) {
|
||||
output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
|
||||
//output[i] = vectorize(((uint64_t*)buf)[i]);
|
||||
// //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
|
||||
output[i] = vectorizeswap(((uint64_t*)buf)[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
//#define USE_ROT_ASM_OPT 0
|
||||
#include <cuda_helper.h>
|
||||
|
||||
static __constant__ uint64_t K_512[80];
|
||||
@ -36,8 +37,6 @@ static const uint64_t K512[80] = {
|
||||
//#undef xor3
|
||||
//#define xor3(a,b,c) (a^b^c)
|
||||
|
||||
//#undef
|
||||
|
||||
static __device__ __forceinline__
|
||||
uint64_t bsg5_0(const uint64_t x)
|
||||
{
|
||||
@ -111,26 +110,17 @@ __global__
|
||||
void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
//if (thread < threads)
|
||||
{
|
||||
uint64_t *pHash = &g_hash[thread * 8U];
|
||||
|
||||
uint64_t W[80];
|
||||
uint64_t r[8];
|
||||
|
||||
uint64_t IV512[8] = {
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++)
|
||||
r[i] = IV512[i];
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; i++) {
|
||||
// 32 bytes input
|
||||
W[i] = cuda_swab64(pHash[i]);
|
||||
W[i] = pHash[i];
|
||||
//W[i] = cuda_swab64(pHash[i]); // made in sha256
|
||||
}
|
||||
|
||||
W[4] = 0x8000000000000000; // end tag
|
||||
@ -140,13 +130,23 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
|
||||
|
||||
W[15] = 0x100; // 256 bits
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 16; i < 80; i++) W[i] = 0;
|
||||
//#pragma unroll
|
||||
//for (int i = 16; i < 78; i++) W[i] = 0;
|
||||
|
||||
#pragma unroll 64
|
||||
#pragma unroll
|
||||
for (int i = 16; i < 80; i++)
|
||||
W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16];
|
||||
|
||||
const uint64_t IV512[8] = {
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
uint64_t r[8];
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++)
|
||||
r[i] = IV512[i];
|
||||
|
||||
#pragma unroll 10
|
||||
for (int i = 0; i < 10; i++) {
|
||||
#pragma unroll 8
|
||||
@ -168,7 +168,7 @@ void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStr
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
size_t shared_size = 80*8;
|
||||
size_t shared_size = 0;
|
||||
lbry_sha512_gpu_hash_32 <<<grid, block, shared_size, stream>>> (threads, (uint64_t*)d_hash);
|
||||
}
|
||||
|
||||
|
@ -106,7 +106,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
|
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
|
||||
if (opt_benchmark) {
|
||||
ptarget[7] = 0xff;
|
||||
ptarget[7] = 0xf;
|
||||
}
|
||||
|
||||
if (!init[thr_id]){
|
||||
|
Loading…
Reference in New Issue
Block a user