Browse Source

quark/blake512 opt. pointed by sp without asm

indeed, the pragma unroll doesnt always make things faster

asm part... to check later
2upstream
Tanguy Pruvot 10 years ago
parent
commit
c859041993
  1. 26
      quark/cuda_quark_blake512.cu
  2. 1
      x11/x11.cu
  3. 1
      x13/x13.cu
  4. 1
      x15/x14.cu
  5. 1
      x15/x15.cu
  6. 1
      x17/x17.cu

26
quark/cuda_quark_blake512.cu

@ -59,18 +59,19 @@ const uint64_t c_u512[16] = @@ -59,18 +59,19 @@ const uint64_t c_u512[16] =
v[b] = ROTR( v[b] ^ v[c], 11); \
}
__device__ static
__device__ __forceinline__
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
{
uint64_t v[16], m[16], i;
uint64_t v[16];
uint64_t m[16];
#pragma unroll 16
for( i = 0; i < 16; i++) {
#pragma unroll
for(int i=0; i < 16; i++) {
m[i] = cuda_swab64(block[i]);
}
#pragma unroll 8
for (i = 0; i < 8; i++)
//#pragma unroll 8
for(int i=0; i < 8; i++)
v[i] = h[i];
v[ 8] = u512[0];
@ -83,7 +84,7 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t @@ -83,7 +84,7 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
v[15] = u512[7];
//#pragma unroll 16
for( i = 0; i < 16; ++i )
for(int i=0; i < 16; i++)
{
/* column step */
G( 0, 4, 8, 12, 0 );
@ -97,9 +98,14 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t @@ -97,9 +98,14 @@ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t
G( 3, 4, 9, 14, 14 );
}
#pragma unroll 16
for( i = 0; i < 16; ++i )
h[i % 8] ^= v[i];
h[0] ^= v[0] ^ v[8];
h[1] ^= v[1] ^ v[9];
h[2] ^= v[2] ^ v[10];
h[3] ^= v[3] ^ v[11];
h[4] ^= v[4] ^ v[12];
h[5] ^= v[5] ^ v[13];
h[6] ^= v[6] ^ v[14];
h[7] ^= v[7] ^ v[15];
}
// Hash-Padding

1
x11/x11.cu

@ -25,7 +25,6 @@ static uint32_t *d_hash[8]; @@ -25,7 +25,6 @@ static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

1
x13/x13.cu

@ -29,7 +29,6 @@ static uint32_t *d_hash[8]; @@ -29,7 +29,6 @@ static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

1
x15/x14.cu

@ -32,7 +32,6 @@ static uint32_t *d_hash[8]; @@ -32,7 +32,6 @@ static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

1
x15/x15.cu

@ -33,7 +33,6 @@ static uint32_t *d_hash[8]; @@ -33,7 +33,6 @@ static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

1
x17/x17.cu

@ -36,7 +36,6 @@ static uint32_t *d_hash[8]; @@ -36,7 +36,6 @@ static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

Loading…
Cancel
Save