Browse Source

lbry: small changes for second build

sha512/ripemd swab

this was preview 2
2upstream
Tanguy Pruvot 8 years ago
parent
commit
940c1b3a2f
  1. 67
      lbry/cuda_ripemd160.cu
  2. 28
      lbry/cuda_sha256_lbry.cu
  3. 36
      lbry/cuda_sha512_lbry.cu
  4. 2
      lbry/lbry.cu

67
lbry/cuda_ripemd160.cu

@@ -37,6 +37,18 @@ static __constant__ uint32_t c_IV[5] = {
0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u
};
__device__ __forceinline__
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
// Three-way XOR (a ^ b ^ c).
// On SM 5.0+ built with CUDA 7.5+, emit a single LOP3 instruction: the
// immediate 0x96 is the 3-input truth table for XOR (0xF0 ^ 0xCC ^ 0xAA).
// Older architectures/toolkits fall back to the plain C expression.
uint32_t result;
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b" //0x96 = 0xF0 ^ 0xCC ^ 0xAA
: "=r"(result) : "r"(a), "r"(b),"r"(c));
#else
result = a^b^c;
#endif
return result;
}
//__host__
//uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) {
// return c ^ (a | !b);
@@ -83,7 +95,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
#else
#define F1(x, y, z) xor3(x,y,z)
#define F1(x, y, z) xor3b(x,y,z)
#define F2(x, y, z) xandx(x,y,z)
#define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y)
@@ -305,59 +317,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
h[0] = tmp; \
}
#if 0
__global__
void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset)
{
	// One RIPEMD-160 of a fixed 32-byte input per thread, hashed in place:
	// the thread reads 8 words at g_hash[thread*8 + byteOffset/8] and writes
	// the 5-word digest back to the same location.
	const uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
	if (tid >= threads)
		return;

	uint32_t *out = (uint32_t*) (&g_hash[tid * 8U + byteOffset/8]);

	// Build the single padded 512-bit message block: 8 words of data,
	// the 0x80 end marker, zero fill, and the bit length (0x100 = 256)
	// in word 14.
	uint32_t msg[16];
	for (int i = 0; i < 8; i++)
		msg[i] = out[i];
	msg[8] = 0x80;
	#pragma unroll
	for (int i = 9; i < 16; i++)
		msg[i] = 0;
	msg[14] = 0x100; // message size in bits

	// Start from the RIPEMD-160 IV and run the compression function once.
	uint32_t state[5];
	#pragma unroll
	for (int i = 0; i < 5; i++)
		state[i] = c_IV[i];

	RIPEMD160_ROUND_BODY(msg, state);

	#pragma unroll
	for (int i = 0; i < 5; i++)
		out[i] = state[i];
#ifdef PAD_ZEROS
	// 20 bytes hash on 32 or 64 bytes output space
	out[5] = 0;
	out[6] = 0;
	out[7] = 0;
#endif
}
__host__
void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream)
{
	// Launch one thread per 32-byte input; the kernel bounds-checks `threads`.
	const uint32_t threadsperblock = 128;

	// Round the grid size up (ceil-div) so a thread count that is not a
	// multiple of the block size still covers every input; the original
	// `threads/threadsperblock` silently dropped the tail. This matches the
	// launch computation used by lbry_sha512_hash_32 in this same change set.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	lbry_ripemd160_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash, byteOffset);
}
#endif
__global__
//__launch_bounds__(256,6)
void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -366,6 +326,7 @@ void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]);
uint32_t in[16];
#pragma unroll
for (int i=0; i<8; i++)
in[i] = (hash[i]);
in[8] = 0x80;

28
lbry/cuda_sha256_lbry.cu

@@ -160,20 +160,6 @@ uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
return result;
}
/*
__device__ __forceinline__
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
uint32_t result;
asm("{ .reg .u32 t1; // xor3b \n\t"
"xor.b32 t1, %2, %3;\n\t"
"xor.b32 %0, %1, t1;"
"}"
: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
return result;
}
#define xor3b(a,b,c) (a ^ b ^ c)
*/
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
uint32_t r1 = ROTR32(x,2);
@@ -220,6 +206,13 @@ __device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b,
return result;
}
__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) {
// Split a 64-bit value into a uint2 with the two 32-bit halves swapped
// relative to a plain vectorize(): PTX `mov.b64 {lo,hi}, v` unpacks the low
// word into the first operand, which is bound to result.y here — so
// result.y = low 32 bits of v and result.x = high 32 bits of v.
uint2 result;
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(result.y), "=r"(result.x) : "l"(v));
return result;
}
__device__
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
@@ -393,7 +386,8 @@ void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512)
uint2* output = (uint2*) input;
#pragma unroll
for (int i=0;i<4;i++) {
output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
output[i] = vectorizeswap(((uint64_t*)buf)[i]);
}
#ifdef PAD_ZEROS
#pragma unroll
@@ -447,8 +441,8 @@ void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce
uint2* output = (uint2*) (&outputHash[thread * 8U]);
#pragma unroll
for (int i=0;i<4;i++) {
output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
//output[i] = vectorize(((uint64_t*)buf)[i]);
// //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
output[i] = vectorizeswap(((uint64_t*)buf)[i]);
}
}
}

36
lbry/cuda_sha512_lbry.cu

@@ -6,6 +6,7 @@
#include <stdint.h>
#include <memory.h>
//#define USE_ROT_ASM_OPT 0
#include <cuda_helper.h>
static __constant__ uint64_t K_512[80];
@@ -36,8 +37,6 @@ static const uint64_t K512[80] = {
//#undef xor3
//#define xor3(a,b,c) (a^b^c)
//#undef
static __device__ __forceinline__
uint64_t bsg5_0(const uint64_t x)
{
@@ -111,26 +110,17 @@ __global__
void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
//if (thread < threads)
{
uint64_t *pHash = &g_hash[thread * 8U];
uint64_t W[80];
uint64_t r[8];
uint64_t IV512[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
#pragma unroll
for (int i = 0; i < 8; i++)
r[i] = IV512[i];
#pragma unroll
for (int i = 0; i < 4; i++) {
// 32 bytes input
W[i] = cuda_swab64(pHash[i]);
W[i] = pHash[i];
//W[i] = cuda_swab64(pHash[i]); // made in sha256
}
W[4] = 0x8000000000000000; // end tag
@@ -140,13 +130,23 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
W[15] = 0x100; // 256 bits
#pragma unroll
for (int i = 16; i < 80; i++) W[i] = 0;
//#pragma unroll
//for (int i = 16; i < 78; i++) W[i] = 0;
#pragma unroll 64
#pragma unroll
for (int i = 16; i < 80; i++)
W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16];
const uint64_t IV512[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
uint64_t r[8];
#pragma unroll
for (int i = 0; i < 8; i++)
r[i] = IV512[i];
#pragma unroll 10
for (int i = 0; i < 10; i++) {
#pragma unroll 8
@@ -168,7 +168,7 @@ void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStr
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 80*8;
size_t shared_size = 0;
lbry_sha512_gpu_hash_32 <<<grid, block, shared_size, stream>>> (threads, (uint64_t*)d_hash);
}

2
lbry/lbry.cu

@@ -106,7 +106,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) {
ptarget[7] = 0xff;
ptarget[7] = 0xf;
}
if (!init[thr_id]){

Loading…
Cancel
Save