lyra2(v1): use a common uint2x4 include

lyrav2 still need more definitions (uint16)
2015-10-23 14:43:22 +02:00 · 2015-10-23 14:43:22 +02:00 · 3b7ef923c7
commit 3b7ef923c7
parent 82a7e62b30
8 changed files with 20 additions and 272 deletions
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -307,7 +307,7 @@
    <ClInclude Include="cpuminer-config.h" />
    <ClInclude Include="cuda_groestlcoin.h" />
    <ClInclude Include="cuda_helper.h" />
-    <ClInclude Include="cuda_vector.h" />
+    <ClInclude Include="cuda_vector_uint2x4.h" />
    <ClInclude Include="elist.h" />
    <ClInclude Include="heavy\heavy.h" />
    <ClInclude Include="hefty1.h" />
@ -526,4 +526,4 @@
  <Target Name="AfterClean">
    <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
  </Target>
-</Project>
+</Project>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -338,7 +338,7 @@
    <ClInclude Include="cuda_helper.h">
      <Filter>Header Files\CUDA</Filter>
    </ClInclude>
-    <ClInclude Include="cuda_vector.h">
+    <ClInclude Include="cuda_vector_uint2x4.h">
      <Filter>Header Files\CUDA</Filter>
    </ClInclude>
    <ClInclude Include="sph\sph_hamsi.h">
@ -695,4 +695,4 @@
      <Filter>Ressources</Filter>
    </Text>
  </ItemGroup>
-</Project>
+</Project>
--- a/cuda_vector.h
+++ b/cuda_vector.h
@ -1,248 +0,0 @@
-#ifndef CUDA_VECTOR_H
-#define CUDA_VECTOR_H
-
-
-///////////////////////////////////////////////////////////////////////////////////
-#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
-#define __LDG_PTR   "l"
-#else
-#define __LDG_PTR   "r"
-#endif
-
-#include "cuda_helper.h"
-
-//typedef __device_builtin__ struct ulong16 ulong16;
-
-typedef struct __align__(32) uint8
-{
-	unsigned int s0, s1, s2, s3, s4, s5, s6, s7;
-} uint8;
-
-typedef struct __align__(64) uint16
-{
-	union {
-		struct {unsigned int  s0, s1, s2, s3, s4, s5, s6, s7;};
-		uint8 lo;
-	};
-	union {
-		struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;};
-		uint8 hi;
-	};
-} uint16;
-
-
-static __inline__ __host__ __device__ uint16 make_uint16(
-	unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7,
-	unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf)
-{
-	uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
-	t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf;
-	return t;
-}
-
-static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b)
-{
-	uint16 t; t.lo=a; t.hi=b; return t;
-}
-
-static __inline__ __host__ __device__ uint8 make_uint8(
-	unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7)
-{
-	uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7;
-	return t;
-}
-
-
-static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
-static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-
-
-static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
-static __forceinline__ __device__ uint4 operator+ (uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-
-
-static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); }
-static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); }
-static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); }
-
-
-static __forceinline__ __device__  __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); }
-
-static __forceinline__ __device__  __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); }
-
-static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) {
-	return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7,
-		a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf);
-}
-
-static __forceinline__ __device__  __host__ uint16 operator+ (const uint16 &a, const uint16 &b) {
-	return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7,
-		a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf);
-}
-
-static __forceinline__ __device__ void operator^= (uint4 &a, uint4 b) { a = a ^ b; }
-static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; }
-static __forceinline__ __device__  __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; }
-static __forceinline__ __device__  __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; }
-
-
-static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; }
-static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; }
-
-static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
-static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
-static __forceinline__ __device__  __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
-static __forceinline__ __device__  __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
-
-#if __CUDA_ARCH__ < 320
-
-#define rotateL ROTL32
-#define rotateR ROTR32
-
-#else
-
-static __forceinline__ __device__ uint32_t rotateL(uint32_t vec4, uint32_t shift)
-{
-	uint32_t ret;
-	asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
-	return ret;
-}
-
-static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift)
-{
-	uint32_t ret;
-	asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(vec4), "r"(vec4), "r"(shift));
-	return ret;
-}
-
-static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr)
-{
-	uint8 test;
-	asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(test.s0), "=r"(test.s1), "=r"(test.s2), "=r"(test.s3) : __LDG_PTR(ptr));
-	asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3},[%4+16];" : "=r"(test.s4), "=r"(test.s5), "=r"(test.s6), "=r"(test.s7) : __LDG_PTR(ptr));
-	return (test);
-}
-
-static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr)
-{
-	uint32_t test;
-	asm volatile ("ld.global.nc.u32 {%0},[%1];" : "=r"(test) : __LDG_PTR(ptr));
-	return (test);
-}
-
-static __device__ __inline__ uint32_t __ldgtoint64(const uint8_t *ptr)
-{
-	uint64_t test;
-	asm volatile ("ld.global.nc.u64 {%0},[%1];" : "=l"(test) : __LDG_PTR(ptr));
-	return (test);
-}
-
-
-static __device__ __inline__ uint32_t __ldgtoint_unaligned(const uint8_t *ptr)
-{
-	uint32_t test;
-	asm volatile ("{\n\t"
-		".reg .u8 a,b,c,d; \n\t"
-	"ld.global.nc.u8 a,[%1]; \n\t"
-	"ld.global.nc.u8 b,[%1+1]; \n\t"
-	"ld.global.nc.u8 c,[%1+2]; \n\t"
-	"ld.global.nc.u8 d,[%1+3]; \n\t"
-	"mov.b32 %0,{a,b,c,d}; }\n\t"
-		: "=r"(test) : __LDG_PTR(ptr));
-	return (test);
-}
-
-static __device__ __inline__ uint64_t __ldgtoint64_unaligned(const uint8_t *ptr)
-{
-	uint64_t test;
-	asm volatile ("{\n\t"
-		".reg .u8 a,b,c,d,e,f,g,h; \n\t"
-		".reg .u32 i,j; \n\t"
-		"ld.global.nc.u8 a,[%1]; \n\t"
-		"ld.global.nc.u8 b,[%1+1]; \n\t"
-		"ld.global.nc.u8 c,[%1+2]; \n\t"
-		"ld.global.nc.u8 d,[%1+3]; \n\t"
-		"ld.global.nc.u8 e,[%1+4]; \n\t"
-		"ld.global.nc.u8 f,[%1+5]; \n\t"
-		"ld.global.nc.u8 g,[%1+6]; \n\t"
-		"ld.global.nc.u8 h,[%1+7]; \n\t"
-		 "mov.b32 i,{a,b,c,d}; \n\t"
-         "mov.b32 j,{e,f,g,h}; \n\t"
-		 "mov.b64 %0,{i,j}; }\n\t"
-		: "=l"(test) : __LDG_PTR(ptr));
-	return (test);
-}
-
-
-static __device__ __inline__ uint64_t __ldgtoint64_trunc(const uint8_t *ptr)
-{
-	uint32_t zero = 0;
-	uint64_t test;
-	asm volatile ("{\n\t"
-		".reg .u8 a,b,c,d; \n\t"
-		".reg .u32 i; \n\t"
-		"ld.global.nc.u8 a,[%1]; \n\t"
-		"ld.global.nc.u8 b,[%1+1]; \n\t"
-		"ld.global.nc.u8 c,[%1+2]; \n\t"
-		"ld.global.nc.u8 d,[%1+3]; \n\t"
-		"mov.b32 i,{a,b,c,d}; \n\t"
-		"mov.b64 %0,{i,%1}; }\n\t"
-		: "=l"(test) : __LDG_PTR(ptr), "r"(zero));
-	return (test);
-}
-
-
-
-static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr)
-{
-	uint32_t test;
-	asm("{\n\t"
-		".reg .u8 e,b,c,d; \n\t"
-		"ld.global.nc.u8 e,[%1]; \n\t"
-		"ld.global.nc.u8 b,[%1+1]; \n\t"
-		"ld.global.nc.u8 c,[%1+2]; \n\t"
-		"ld.global.nc.u8 d,[%1+3]; \n\t"
-		"mov.b32 %0,{e,b,c,d}; }\n\t"
-		: "=r"(test) : __LDG_PTR(ptr));
-	return (test);
-}
-
-#endif
-
-
-static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
-{
-	uint8 vec;
-	vec.s0 = cuda_swab32(buf[0].s0);
-	vec.s1 = cuda_swab32(buf[0].s1);
-	vec.s2 = cuda_swab32(buf[0].s2);
-	vec.s3 = cuda_swab32(buf[0].s3);
-	vec.s4 = cuda_swab32(buf[0].s4);
-	vec.s5 = cuda_swab32(buf[0].s5);
-	vec.s6 = cuda_swab32(buf[0].s6);
-	vec.s7 = cuda_swab32(buf[0].s7);
-	return vec;
-}
-
-static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
-{
-	uint16 vec;
-	vec.s0 = cuda_swab32(buf[0].s0);
-	vec.s1 = cuda_swab32(buf[0].s1);
-	vec.s2 = cuda_swab32(buf[0].s2);
-	vec.s3 = cuda_swab32(buf[0].s3);
-	vec.s4 = cuda_swab32(buf[0].s4);
-	vec.s5 = cuda_swab32(buf[0].s5);
-	vec.s6 = cuda_swab32(buf[0].s6);
-	vec.s7 = cuda_swab32(buf[0].s7);
-	vec.s8 = cuda_swab32(buf[0].s8);
-	vec.s9 = cuda_swab32(buf[0].s9);
-	vec.sa = cuda_swab32(buf[0].sa);
-	vec.sb = cuda_swab32(buf[0].sb);
-	vec.sc = cuda_swab32(buf[0].sc);
-	vec.sd = cuda_swab32(buf[0].sd);
-	vec.se = cuda_swab32(buf[0].se);
-	vec.sf = cuda_swab32(buf[0].sf);
-	return vec;
-}
-#endif // #ifndef CUDA_VECTOR_H
--- a/quark/cuda_vector_uint2x4.h
+++ b/quark/cuda_vector_uint2x4.h
@ -1,5 +1,3 @@
-/* todo: strip to keep only uint2x4 */
-
 #ifndef CUDA_VECTOR_UINT2x4_H
 #define CUDA_VECTOR_UINT2x4_H

@ -12,18 +10,15 @@

 #include "cuda_helper.h"

-#if __CUDA_ARCH__ < 320 && !defined(__ldg4)
-#define __ldg4(x) (*(x))
-#define ldg4(ptr, ret) { ret = (*(ptr)); }
-#endif
-
 typedef struct __align__(16) uint2x4 {
 	uint2 x, y, z, w;
 } uint2x4;

+
 static __inline__ __device__ uint2x4 make_uint2x4(uint2 s0, uint2 s1, uint2 s2, uint2 s3)
 {
-	uint2x4 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3;
+	uint2x4 t;
+	t.x = s0; t.y = s1; t.z = s2; t.w = s3;
 	return t;
 }

@ -45,21 +40,23 @@ static __forceinline__ __device__ void operator+= (uint2x4 &a, const uint2x4 &b)
 static __device__ __inline__ uint2x4 __ldg4(const uint2x4 *ptr)
 {
 	uint2x4 ret;
-	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"    : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr));
 	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr));
 	return ret;
 }

 static __device__ __inline__ void ldg4(const uint2x4 *ptr, uint2x4 *ret)
 {
-	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr));
-	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"     : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];"  : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr));
 	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];"  : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr));
-	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];"  : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr));
 	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];"  : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr));
-	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];"  : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr));
 }
-
+#elif !defined(__ldg4)
+#define __ldg4(x) (*(x))
+#define ldg4(ptr, ret) { *(ret) = (*(ptr)); }
 #endif

 #endif // H
--- a/lyra2/cuda_lyra2.cu
+++ b/lyra2/cuda_lyra2.cu
@ -18,9 +18,8 @@

 #if !defined(__CUDA_ARCH__) ||  __CUDA_ARCH__ >= 500

-#include "cuda_lyra2_vectors.h"
+#include "cuda_vector_uint2x4.h"

-#define uint2x4 uint28
 #define memshift 3

 #define Ncol 8
--- a/lyra2/cuda_lyra2_vectors.h
+++ b/lyra2/cuda_lyra2_vectors.h
@ -81,7 +81,8 @@ typedef struct __align__(256) ulonglong16 {

 typedef struct __align__(16) uint28 {
 	uint2 x, y, z, w;
-} uint28;
+} uint2x4;
+typedef uint2x4 uint28; /* name deprecated */

 typedef struct __builtin_align__(32) uint48 {
 		uint4 s0,s1;
--- a/lyra2/cuda_lyra2v2.cu
+++ b/lyra2/cuda_lyra2v2.cu
@ -18,7 +18,6 @@

 #define Nrow 4
 #define Ncol 4
-#define uint2x4 uint28
 #define memshift 3

 __device__ uint2x4 *DMatrix;
--- a/lyra2/cuda_lyra2v2_sm3.cuh
+++ b/lyra2/cuda_lyra2v2_sm3.cuh
@ -171,8 +171,8 @@ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outpu
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

 	vectype state[4];
-	uint28 blake2b_IV[2];
-	uint28 padding[2];
+	vectype blake2b_IV[2];
+	vectype padding[2];

 	if (threadIdx.x == 0) {