Browse Source

lbry: small changes for second build

sha512/ripemd swab

this was preview 2
2upstream
Tanguy Pruvot 8 years ago
parent
commit
940c1b3a2f
  1. 67
      lbry/cuda_ripemd160.cu
  2. 28
      lbry/cuda_sha256_lbry.cu
  3. 36
      lbry/cuda_sha512_lbry.cu
  4. 2
      lbry/lbry.cu

67
lbry/cuda_ripemd160.cu

@ -37,6 +37,18 @@ static __constant__ uint32_t c_IV[5] = {
0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u
}; };
// Three-input XOR on 32-bit words: returns a ^ b ^ c.
// When built for __CUDA_ARCH__ >= 500 with CUDA_VERSION >= 7050 the value is
// produced by one lop3.b32 instruction whose truth table 0x96 is
// 0xF0 ^ 0xCC ^ 0xAA; every other build uses the plain C xor operators.
__device__ __forceinline__
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
	uint32_t out;
	asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b"
		: "=r"(out) : "r"(a), "r"(b), "r"(c));
	return out;
#else
	return (a ^ b) ^ c;
#endif
}
//__host__ //__host__
//uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) { //uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) {
// return c ^ (a | !b); // return c ^ (a | !b);
@ -83,7 +95,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) #define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z))) #define F5(x, y, z) ((x) ^ ((y) | ~(z)))
#else #else
#define F1(x, y, z) xor3(x,y,z) #define F1(x, y, z) xor3b(x,y,z)
#define F2(x, y, z) xandx(x,y,z) #define F2(x, y, z) xandx(x,y,z)
#define F3(x, y, z) xornot64(x,y,z) #define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y) #define F4(x, y, z) xandx(z,x,y)
@ -305,59 +317,7 @@ uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
h[0] = tmp; \ h[0] = tmp; \
} }
#if 0
// RIPEMD-160 over a 32-byte message, one message per thread, result written in place.
//   threads    - number of messages; threads with a larger global index do nothing
//   g_hash     - buffer indexed in 64-bit words; thread t uses the words starting
//                at t * 8 + byteOffset/8 (i.e. a 64-byte stride per thread)
//   byteOffset - offset in bytes inside each thread's slot, converted to 64-bit
//                words by the integer division by 8
// The first 5 32-bit words of the addressed slot are overwritten with the digest;
// words 5..7 are cleared only when PAD_ZEROS is defined.
__global__
void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset)
{
// flat 1D global thread index
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// view this thread's slot as 32-bit words; it is both input and output
uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]);
// single 16-word (64-byte) message block
uint32_t in[16];
// words 0..7: the 32 bytes of input, copied unchanged
for (int i=0; i<8; i++)
in[i] = (hash[i]);
// word 8: padding marker placed right after the message
in[8] = 0x80;
#pragma unroll
for (int i=9;i<16;i++) in[i] = 0;
in[14] = 0x100; // size in bits
// working state, seeded from the constant initial values c_IV
uint32_t h[5];
#pragma unroll
for (int i=0; i<5; i++)
h[i] = c_IV[i];
// process the block; the macro is defined outside this view and updates h
RIPEMD160_ROUND_BODY(in, h);
// store the 5-word (20-byte) digest over the start of the input slot
#pragma unroll
for (int i=0; i<5; i++)
hash[i] = h[i];
#ifdef PAD_ZEROS
// 20 bytes hash on 32 or 64 bytes output space
hash[5] = 0;
hash[6] = 0;
hash[7] = 0;
#endif
}
}
// Host launcher for lbry_ripemd160_gpu_hash_32.
//   thr_id     - not used by this function
//   threads    - number of hashes to compute; 0 is a no-op
//   g_Hash     - device buffer, passed to the kernel as uint64_t*
//   byteOffset - forwarded unchanged to the kernel
//   stream     - CUDA stream the kernel is enqueued on
// Launches blocks of 128 threads with no dynamic shared memory.
__host__
void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream)
{
	const uint32_t threadsperblock = 128;

	// A zero-sized grid is an invalid launch configuration, so skip the launch.
	if (threads == 0)
		return;

	// Round the block count up so that a thread count which is not a multiple
	// of the block size still covers every item; the kernel discards the
	// surplus threads with its own `thread < threads` check.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	lbry_ripemd160_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash, byteOffset);
}
#endif
__global__ __global__
//__launch_bounds__(256,6)
void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash) void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
{ {
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
@ -366,6 +326,7 @@ void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]); uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]);
uint32_t in[16]; uint32_t in[16];
#pragma unroll
for (int i=0; i<8; i++) for (int i=0; i<8; i++)
in[i] = (hash[i]); in[i] = (hash[i]);
in[8] = 0x80; in[8] = 0x80;

28
lbry/cuda_sha256_lbry.cu

@ -160,20 +160,6 @@ uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
return result; return result;
} }
/*
__device__ __forceinline__
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
uint32_t result;
asm("{ .reg .u32 t1; // xor3b \n\t"
"xor.b32 t1, %2, %3;\n\t"
"xor.b32 %0, %1, t1;"
"}"
: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
return result;
}
#define xor3b(a,b,c) (a ^ b ^ c)
*/
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{ {
uint32_t r1 = ROTR32(x,2); uint32_t r1 = ROTR32(x,2);
@ -220,6 +206,13 @@ __device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b,
return result; return result;
} }
// Splits a 64-bit value into a uint2 with the two 32-bit halves exchanged:
// .x receives the upper 32 bits of v and .y receives the lower 32 bits.
__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) {
	uint2 halves;
	halves.x = (uint32_t)(v >> 32);	// high word
	halves.y = (uint32_t)v;			// low word
	return halves;
}
__device__ __device__
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared) uint32_t in, const uint32_t Kshared)
@ -393,7 +386,8 @@ void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512)
uint2* output = (uint2*) input; uint2* output = (uint2*) input;
#pragma unroll #pragma unroll
for (int i=0;i<4;i++) { for (int i=0;i<4;i++) {
output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
output[i] = vectorizeswap(((uint64_t*)buf)[i]);
} }
#ifdef PAD_ZEROS #ifdef PAD_ZEROS
#pragma unroll #pragma unroll
@ -447,8 +441,8 @@ void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce
uint2* output = (uint2*) (&outputHash[thread * 8U]); uint2* output = (uint2*) (&outputHash[thread * 8U]);
#pragma unroll #pragma unroll
for (int i=0;i<4;i++) { for (int i=0;i<4;i++) {
output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); // //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
//output[i] = vectorize(((uint64_t*)buf)[i]); output[i] = vectorizeswap(((uint64_t*)buf)[i]);
} }
} }
} }

36
lbry/cuda_sha512_lbry.cu

@ -6,6 +6,7 @@
#include <stdint.h> #include <stdint.h>
#include <memory.h> #include <memory.h>
//#define USE_ROT_ASM_OPT 0
#include <cuda_helper.h> #include <cuda_helper.h>
static __constant__ uint64_t K_512[80]; static __constant__ uint64_t K_512[80];
@ -36,8 +37,6 @@ static const uint64_t K512[80] = {
//#undef xor3 //#undef xor3
//#define xor3(a,b,c) (a^b^c) //#define xor3(a,b,c) (a^b^c)
//#undef
static __device__ __forceinline__ static __device__ __forceinline__
uint64_t bsg5_0(const uint64_t x) uint64_t bsg5_0(const uint64_t x)
{ {
@ -111,26 +110,17 @@ __global__
void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash) void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
{ {
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) //if (thread < threads)
{ {
uint64_t *pHash = &g_hash[thread * 8U]; uint64_t *pHash = &g_hash[thread * 8U];
uint64_t W[80]; uint64_t W[80];
uint64_t r[8];
uint64_t IV512[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
#pragma unroll
for (int i = 0; i < 8; i++)
r[i] = IV512[i];
#pragma unroll #pragma unroll
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
// 32 bytes input // 32 bytes input
W[i] = cuda_swab64(pHash[i]); W[i] = pHash[i];
//W[i] = cuda_swab64(pHash[i]); // made in sha256
} }
W[4] = 0x8000000000000000; // end tag W[4] = 0x8000000000000000; // end tag
@ -140,13 +130,23 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
W[15] = 0x100; // 256 bits W[15] = 0x100; // 256 bits
#pragma unroll //#pragma unroll
for (int i = 16; i < 80; i++) W[i] = 0; //for (int i = 16; i < 78; i++) W[i] = 0;
#pragma unroll 64 #pragma unroll
for (int i = 16; i < 80; i++) for (int i = 16; i < 80; i++)
W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16]; W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16];
const uint64_t IV512[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
uint64_t r[8];
#pragma unroll
for (int i = 0; i < 8; i++)
r[i] = IV512[i];
#pragma unroll 10 #pragma unroll 10
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
#pragma unroll 8 #pragma unroll 8
@ -168,7 +168,7 @@ void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStr
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
size_t shared_size = 80*8; size_t shared_size = 0;
lbry_sha512_gpu_hash_32 <<<grid, block, shared_size, stream>>> (threads, (uint64_t*)d_hash); lbry_sha512_gpu_hash_32 <<<grid, block, shared_size, stream>>> (threads, (uint64_t*)d_hash);
} }

2
lbry/lbry.cu

@ -106,7 +106,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) { if (opt_benchmark) {
ptarget[7] = 0xff; ptarget[7] = 0xf;
} }
if (!init[thr_id]){ if (!init[thr_id]){

Loading…
Cancel
Save