x15: use djm34 code with asm xor64 + my rot64

some optimizations could be done later, after whirlcoin integration
2025-08-26 13:51:51 +00:00 · 2014-08-20 05:44:02 +02:00 · 2014-08-20 05:44:02 +02:00 · 4bc23048b5
commit 4bc23048b5
parent 194fda87c1
3 changed files with 1799 additions and 201 deletions
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -113,40 +113,7 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
 			(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
 #endif

-// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
-#if __CUDA_ARCH__ >= 350
-__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset < 32) {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTR64(x, n)        (((x) >> (n)) | ((x) << (64 - (n))))
-#endif
-
-// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
-#if __CUDA_ARCH__ >= 350
-__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
-	uint2 result;
-	if(offset >= 32) {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-	} else {
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-	}
-	return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
-
+/*********************************************************************/
 // Macro to catch CUDA errors in CUDA runtime calls
 #define CUDA_SAFE_CALL(call)                                          \
 do {                                                                  \
@ -158,4 +125,188 @@ do {                                                                  \
 	}                                                                 \
 } while (0)

+/*********************************************************************/
+
+// device asm for whirlpool: 64-bit XOR via a single PTX xor.b64, returns a ^ b
+__device__ __forceinline__
+uint64_t xor1(uint64_t a, uint64_t b)
+{
+	uint64_t result;
+	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a) ,"l"(b));
+	return result;
+}
+
+// device asm for whirlpool: three-input 64-bit XOR, returns a ^ b ^ c
+__device__ __forceinline__
+uint64_t xor3(uint64_t a, uint64_t b, uint64_t c)
+{
+	uint64_t result;
+	asm("{\n\t"
+		" .reg .u64 t1;\n\t"
+		"xor.b64 t1, %2, %3;\n\t" // t1 = b ^ c
+		"xor.b64 %0, %1, t1;\n\t" // result = a ^ t1
+		"}"
+	: "=l"(result) : "l"(a) ,"l"(b),"l"(c));
+	return result;
+}
+
+// device asm for whirlpool: eight-input 64-bit XOR, returns a ^ b ^ c ^ d ^ e ^ f ^ g ^ h
+__device__ __forceinline__
+uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h)
+{
+	uint64_t result;
+	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h)); // seed the accumulator with g ^ h
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); // then fold in f, e, d, c, b, a one at a time
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
+	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
+	return result;
+}
+
+// device asm for whirlpool: returns ((b ^ c) & a) ^ c, i.e. per bit: b where a is set, c where a is clear
+__device__ __forceinline__
+uint64_t xandx(uint64_t a, uint64_t b, uint64_t c)
+{
+	uint64_t result;
+	asm("{\n\t"
+		".reg .u64 m,n;\n\t"
+		"xor.b64 m, %2,%3;\n\t" // m = b ^ c
+		"and.b64 n, m,%1;\n\t" // n = m & a
+		"xor.b64 %0, n,%3;\n\t" // result = n ^ c
+		"}\n\t"
+	: "=l"(result) : "l"(a), "l"(b), "l"(c));
+	return result;
+}
+
+// device asm for whirlpool: returns x & 0xFFFFFFFFFFFFFFFF (an all-ones mask, so a 64-bit x comes back unchanged)
+__device__ __forceinline__
+uint64_t sph_t64(uint64_t x)
+{
+	uint64_t result;
+	asm("{\n\t"
+		"and.b64 %0,%1,0xFFFFFFFFFFFFFFFF;\n\t"
+		"}\n\t"
+	: "=l"(result) : "l"(x));
+	return result;
+}
+
+// device asm for whirlpool: returns (a & b) | ((a | b) & c), i.e. each result bit is set when at least two of a, b, c have it set
+__device__ __forceinline__
+uint64_t andor(uint64_t a, uint64_t b, uint64_t c)
+{
+	uint64_t result;
+	asm("{\n\t"
+		".reg .u64 m,n,o;\n\t"
+		"and.b64 m,  %1, %2;\n\t" // m = a & b
+		" or.b64 n,  %1, %2;\n\t" // n = a | b
+		"and.b64 o,   n, %3;\n\t" // o = n & c
+		" or.b64 %0,  m, o ;\n\t" // result = m | o
+		"}\n\t"
+	: "=l"(result) : "l"(a), "l"(b), "l"(c));
+	return result;
+}
+
+// device asm for whirlpool: returns (x >> n) & 0xFFFFFFFFFFFFFFFF; the all-ones mask does not change the shifted value
+__device__ __forceinline__
+uint64_t shr_t64(uint64_t x, uint32_t n)
+{
+	uint64_t result;
+	asm("{\n\t"
+		".reg .u64 m;\n\t"
+		"shr.b64 m,%1,%2;\n\t" // m = x >> n (untyped .b64 shift)
+		"and.b64 %0,m,0xFFFFFFFFFFFFFFFF;\n\t"
+		"}\n\t"
+	: "=l"(result) : "l"(x), "r"(n));
+	return result;
+}
+
+// device asm for whirlpool: returns (x << n) & 0xFFFFFFFFFFFFFFFF; the all-ones mask does not change the shifted value
+__device__ __forceinline__
+uint64_t shl_t64(uint64_t x, uint32_t n)
+{
+	uint64_t result;
+	asm("{\n\t"
+		".reg .u64 m;\n\t"
+		"shl.b64 m,%1,%2;\n\t" // m = x << n
+		"and.b64 %0,m,0xFFFFFFFFFFFFFFFF;\n\t"
+		"}\n\t"
+	: "=l"(result) : "l"(x), "r"(n));
+	return result;
+}
+
+
+// 64-bit ROTATE RIGHT: three variants selected at preprocessing time (opt-in funnel shift, PTX shift/add, plain C macro)
+#ifdef DJM_SM35_ROT64
+/* funnel-shifter variant (shf.r.wrap on the two 32-bit halves), to bench. NOTE(review): previously guarded by __CUDA_ARCH__ >= 350; confirm DJM_SM35_ROT64 is only defined for such builds */
+__device__ __forceinline__
+uint64_t ROTR64(const uint64_t value, const int offset) {
+	uint2 result;
+	if(offset < 32) {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+	} else {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+	}
+	return __double_as_longlong(__hiloint2double(result.y, result.x)); // reassemble: result.y is the high word, result.x the low word
+}
+#elif __CUDA_ARCH__ >= 120
+__device__ __forceinline__
+uint64_t ROTR64(const uint64_t x, const int offset)
+{
+	uint64_t result;
+	asm("{\n\t"
+		".reg .b64 lhs, rhs;\n\t"
+		".reg .u32 amt2;\n\t"
+		"shr.b64 lhs, %1, %2;\n\t" // lhs = x >> offset
+		"sub.u32 amt2, 64, %2;\n\t" // amt2 = 64 - offset
+		"shl.b64 rhs, %1, amt2;\n\t" // rhs = x << amt2
+		"add.u64 %0, lhs, rhs;\n\t" // for 0 < offset < 64 lhs and rhs share no set bits, so add gives the same result as or
+		"}\n\t"
+	: "=l"(result) : "l"(x), "r"(offset));
+	return result;
+}
+#else
+/* host pass (or __CUDA_ARCH__ < 120): plain C shift/or macro; n = 0 would shift by 64, which is undefined in C */
+#define ROTR64(x, n)  (((x) >> (n)) | ((x) << (64 - (n))))
+#endif
+
+// 64-bit ROTATE LEFT: three variants selected at preprocessing time (opt-in funnel shift, PTX shift/add, plain C macro)
+#ifdef DJM_SM35_ROT64
+/* funnel-shifter variant (shf.l.wrap on the two 32-bit halves), to bench. NOTE(review): previously guarded by __CUDA_ARCH__ >= 350; confirm DJM_SM35_ROT64 is only defined for such builds */
+__device__ __forceinline__
+uint64_t ROTL64(const uint64_t value, const int offset) {
+	uint2 result;
+	if(offset >= 32) {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+	} else {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+	}
+	return  __double_as_longlong(__hiloint2double(result.y, result.x)); // reassemble: result.y is the high word, result.x the low word
+}
+#elif __CUDA_ARCH__ >= 120
+__device__ __forceinline__
+uint64_t ROTL64(const uint64_t x, const int offset)
+{
+	uint64_t result;
+	asm("{\n\t"
+		".reg .b64 lhs, rhs;\n\t"
+		".reg .u32 amt2;\n\t"
+		"shl.b64 lhs, %1, %2;\n\t" // lhs = x << offset
+		"sub.u32 amt2, 64, %2;\n\t" // amt2 = 64 - offset
+		"shr.b64 rhs, %1, amt2;\n\t" // rhs = x >> amt2
+		"add.u64 %0, lhs, rhs;\n\t" // for 0 < offset < 64 lhs and rhs share no set bits, so add gives the same result as or
+		"}\n\t"
+	: "=l"(result) : "l"(x), "r"(offset));
+	return result;
+}
+#else
+/* host pass (or __CUDA_ARCH__ < 120): plain C shift/or macro; n = 0 would shift by 64, which is undefined in C */
+#define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
+#endif
+
 #endif // #ifndef CUDA_HELPER_H
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -80,8 +80,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
 extern void x14_shabal512_cpu_init(int thr_id, int threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void x15_whirlpool_cpu_init(int thr_id, int threads);
+extern void x15_whirlpool_cpu_init(int thr_id, int threads, int mode);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x15_whirlpool_cpu_free(int thr_id);

 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
@ -228,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);
-		x15_whirlpool_cpu_init(thr_id, throughput);
+		x15_whirlpool_cpu_init(thr_id, throughput, 0);

 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
@ -276,6 +277,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				pdata[19] = foundNonce;
 				*hashes_done = foundNonce - first_nonce + 1;
+				x15_whirlpool_cpu_free(thr_id);
 				return 1;
 			}
 			else if (vhash64[7] > Htarg) {
@ -291,5 +293,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

 	*hashes_done = pdata[19] - first_nonce + 1;
+
+	x15_whirlpool_cpu_free(thr_id);
 	return 0;
 }