From be478bd72576ec9ee7ca37580a8e362c8a04d442 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Tue, 12 May 2015 02:35:56 +0200
Subject: [PATCH] groestl: tabs to space + arch check

---
 cuda_groestlcoin.cu | 213 ++++++++++++++++++++++----------------------
 1 file changed, 108 insertions(+), 105 deletions(-)

diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index 6d57fe8..27da418 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -4,9 +4,9 @@
 #include
 #include "cuda_helper.h"
-#include
-// global memory for all HeftyHashes of all threads
+#include "miner.h"
+
 __constant__ uint32_t pTarget[8]; // Single GPU
 __constant__ uint32_t groestlcoin_gpu_msg[32];
@@ -24,135 +24,138 @@ __global__ __launch_bounds__(256, 4)
 void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
 {
 #if __CUDA_ARCH__ >= 300
-	// divide by 4, because 4 threads together compute one hash
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
-	if (thread < threads)
-	{
-		// GROESTL
-		uint32_t paddedInput[8];
-
-		#pragma unroll 8
-		for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
-
-		uint32_t nounce = startNounce + thread;
-		if ((threadIdx.x % 4) == 3)
-			paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19
-
-		uint32_t msgBitsliced[8];
-		to_bitslice_quad(paddedInput, msgBitsliced);
-
-		uint32_t state[8];
-		for (int round=0; round<2; round++)
-		{
-			groestl512_progressMessage_quad(state, msgBitsliced);
-
-			if (round < 1)
-			{
-				// chaining of the two rounds, including padding
-				msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000);
-				msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
-				msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
-				msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
-				msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
-				msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
-				msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
-				msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010);
-			}
-		}
-
-		// only the first of every 4 threads gets the result hash
-		uint32_t out_state[16];
-		from_bitslice_quad(state, out_state);
-
-		if (threadIdx.x % 4 == 0)
-		{
-			int i, position = -1;
-			bool rc = true;
-
-			#pragma unroll 8
-			for (i = 7; i >= 0; i--) {
-				if (out_state[i] > pTarget[i]) {
-					if(position < i) {
-						position = i;
-						rc = false;
-					}
-				}
-				if (out_state[i] < pTarget[i]) {
-					if(position < i) {
-						position = i;
-						rc = true;
-					}
-				}
-			}
-
-			if(rc == true)
-				if(resNounce[0] > nounce)
-					resNounce[0] = nounce;
-		}
-	}
+    // divide by 4, because 4 threads together compute one hash
+    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
+    if (thread < threads)
+    {
+        // GROESTL
+        uint32_t paddedInput[8];
+
+        #pragma unroll 8
+        for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
+
+        uint32_t nounce = startNounce + thread;
+        if ((threadIdx.x % 4) == 3)
+            paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19
+
+        uint32_t msgBitsliced[8];
+        to_bitslice_quad(paddedInput, msgBitsliced);
+
+        uint32_t state[8];
+        for (int round=0; round<2; round++)
+        {
+            groestl512_progressMessage_quad(state, msgBitsliced);
+
+            if (round < 1)
+            {
+                // chaining of the two rounds, including padding
+                msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000);
+                msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
+                msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
+                msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
+                msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
+                msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
+                msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
+                msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010);
+            }
+        }
+
+        // only the first of every 4 threads gets the result hash
+        uint32_t out_state[16];
+        from_bitslice_quad(state, out_state);
+
+        if (threadIdx.x % 4 == 0)
+        {
+            int i, position = -1;
+            bool rc = true;
+
+            #pragma unroll 8
+            for (i = 7; i >= 0; i--) {
+                if (out_state[i] > pTarget[i]) {
+                    if(position < i) {
+                        position = i;
+                        rc = false;
+                    }
+                }
+                if (out_state[i] < pTarget[i]) {
+                    if(position < i) {
+                        position = i;
+                        rc = true;
+                    }
+                }
+            }
+
+            if(rc == true)
+                if(resNounce[0] > nounce)
+                    resNounce[0] = nounce;
+        }
+    }
 #endif
 }
 
 __host__ void groestlcoin_cpu_init(int thr_id, uint32_t threads)
 {
-	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
+    // to check if the binary supports SM3+
+    cuda_get_arch(thr_id);
+
+    cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
 
 __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
-	// expand and set the message
-	uint32_t msgBlock[32];
+    uint32_t msgBlock[32];
 
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
-	memcpy(&msgBlock[0], data, 80);
+    memset(msgBlock, 0, sizeof(uint32_t) * 32);
+    memcpy(&msgBlock[0], data, 80);
 
-	// extend the message to the full message block (padding)
-	// our message is 80 bytes long
-	msgBlock[20] = 0x80;
-	msgBlock[31] = 0x01000000;
+    // extend the message to the full message block (padding)
+    // our message is 80 bytes long
+    msgBlock[20] = 0x80;
+    msgBlock[31] = 0x01000000;
 
-	// groestl512 needs no CPU code for this (the single round is
-	// executed on the GPU)
+    // groestl512 needs no CPU code for this (the single round is
+    // executed on the GPU)
 
-	// set the block header (the correct nonce and the Hefty hash are still missing in it)
-	cudaMemcpyToSymbol( groestlcoin_gpu_msg,
-						msgBlock,
-						128);
+    // set the block header (the correct nonce and the Hefty hash are still missing in it)
+    cudaMemcpyToSymbol( groestlcoin_gpu_msg,
+                        msgBlock,
+                        128);
 
-	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	cudaMemcpyToSymbol( pTarget,
-						pTargetIn,
-						sizeof(uint32_t) * 8 );
+    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+    cudaMemcpyToSymbol( pTarget,
+                        pTargetIn,
+                        sizeof(uint32_t) * 8 );
 }
 
 __host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
-	uint32_t threadsperblock = 256;
+    uint32_t threadsperblock = 256;
 
-	// Compute 3.0 uses the register-optimized quad variant with warp shuffle
-	// with the quad functions we now need 4 threads per hash, hence factor 4 for the block count
-	int factor = 4;
+    // Compute 3.0 uses the register-optimized quad variant with warp shuffle
+    // with the quad functions we now need 4 threads per hash, hence factor 4 for the block count
+    int factor = 4;
 
-	// compute how many thread blocks we need
-	dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
-	dim3 block(threadsperblock);
+    // compute how many thread blocks we need
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
+    dim3 block(threadsperblock);
 
-	// size of the dynamic shared memory area
-	size_t shared_size = 0;
+    // size of the dynamic shared memory area
+    size_t shared_size = 0;
 
-	if (device_sm[device_map[thr_id]] < 300) {
-		printf("Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
-		return;
-	}
+    int dev_id = device_map[thr_id];
+    if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
+        printf("Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
+        proper_exit(EXIT_CODE_CUDA_ERROR);
+    }
 
-	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+    groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
 
-	// strategic sleep call to lower the CPU load
-	MyStreamSynchronize(NULL, 0, thr_id);
+    // strategic sleep call to lower the CPU load
+    MyStreamSynchronize(NULL, 0, thr_id);
 
-	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+    cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
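
Note on the arch check this patch adds: the new test in groestlcoin_cpu_hash compares device_sm[dev_id] (the compute capability of the physical GPU) with cuda_arch[dev_id], which is presumably populated by the cuda_get_arch(thr_id) call added in groestlcoin_cpu_init (per its "// to check if the binary supports SM3+" comment), and proper_exit(EXIT_CODE_CUDA_ERROR) aborts cleanly instead of silently returning. The sketch below shows one way such a hardware-versus-binary check can be built in plain CUDA; the names probe_arch, my_device_sm, my_compiled_arch and check_sm30_support are hypothetical stand-ins for illustration, not ccminer's implementation.

#include <cstdio>
#include <cuda_runtime.h>

static int my_device_sm[16];     // compute capability of the physical GPU (e.g. 500)
static int my_compiled_arch[16]; // __CUDA_ARCH__ of the GPU code that actually runs

// Tiny kernel that reports the architecture its own code was compiled for.
__global__ void probe_arch(int *out)
{
#ifdef __CUDA_ARCH__
	*out = __CUDA_ARCH__;
#else
	*out = 0;
#endif
}

// Returns false when either the hardware or the embedded GPU code is below SM 3.0.
static bool check_sm30_support(int dev_id)
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, dev_id);
	my_device_sm[dev_id] = prop.major * 100 + prop.minor * 10;

	cudaSetDevice(dev_id);
	int *d_arch = nullptr, h_arch = 0;
	cudaMalloc(&d_arch, sizeof(int));
	probe_arch<<<1, 1>>>(d_arch);   // fails to launch if the binary has no matching code
	cudaMemcpy(&h_arch, d_arch, sizeof(int), cudaMemcpyDeviceToHost);
	cudaFree(d_arch);
	my_compiled_arch[dev_id] = h_arch;

	if (my_device_sm[dev_id] < 300 || my_compiled_arch[dev_id] < 300) {
		printf("SM 3.0 required (device %d, binary %d)\n",
			my_device_sm[dev_id], my_compiled_arch[dev_id]);
		return false;
	}
	return true;
}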
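
The grid sizing in groestlcoin_cpu_hash follows from the quad kernel using four cooperating CUDA threads per hash: with 256 threads per block and factor = 4, the launch provides at least 4 * threads CUDA threads, and the kernel maps them back with thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4. A short sketch of the same sizing convention; the kernel and wrapper names (quad_demo_kernel, launch_quad) are made up for illustration.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Toy kernel: four consecutive CUDA threads cooperate on one item,
// mirroring the /4 indexing used by groestlcoin_gpu_hash_quad.
__global__ void quad_demo_kernel(uint32_t nitems, uint32_t startNounce)
{
	uint32_t item = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
	uint32_t lane = threadIdx.x % 4;
	if (item < nitems && lane == 0 && item == 0)
		printf("first item, nonce %u\n", startNounce + item);
}

// Grid sizing as in groestlcoin_cpu_hash: 256 threads per block,
// factor 4 because each hash/item needs 4 CUDA threads.
static void launch_quad(uint32_t nitems, uint32_t startNounce)
{
	const uint32_t threadsperblock = 256;
	const int factor = 4;

	dim3 grid(factor * ((nitems + threadsperblock - 1) / threadsperblock));
	dim3 block(threadsperblock);

	quad_demo_kernel<<<grid, block, 0>>>(nitems, startNounce);
	cudaDeviceSynchronize();
}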
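
The result test inside groestlcoin_gpu_hash_quad (unchanged by this whitespace cleanup) is a 256-bit "hash <= target" comparison done on eight 32-bit words, scanning from the most significant word (index 7) downward; only lane 0 of each group of four threads runs it, and the smallest qualifying nonce wins through the resNounce[0] > nounce update. A minimal host-side sketch of that comparison, using the hypothetical helper name hash_below_target (not part of the patch):

#include <cstdint>

// Returns true when the 256-bit hash is less than or equal to the target.
// Both values are stored as eight 32-bit words, least significant word
// first, which is why the scan starts at index 7.
static bool hash_below_target(const uint32_t hash[8], const uint32_t target[8])
{
	for (int i = 7; i >= 0; i--) {
		if (hash[i] > target[i])
			return false;  // first differing word decides: hash is above the target
		if (hash[i] < target[i])
			return true;   // hash is below the target
	}
	return true; // all words equal: a hash equal to the target still qualifies
}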