ccminer-gostd-lite/x11/cuda_x11_echo.cu

#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdint.h>
#include <memory.h>

// das Hi Word aus einem 64 Bit Typen extrahieren
#if 0
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
	return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
	return (uint32_t)(x >> 32);
#endif
}

// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
	return (uint32_t)__double2loint(__longlong_as_double(x));
#else
	return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
#endif

#define SPH_C64(x)    ((uint64_t)(x ## ULL))
#define SPH_C32(x)    ((uint32_t)(x ## U))

// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

#include "cuda_x11_aes.cu"

__device__ __forceinline__ void AES_2ROUND(
	const uint32_t* __restrict__ sharedMemory,
	uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
	uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3)
{
	uint32_t y0, y1, y2, y3;
	
	aes_round(sharedMemory,
		x0, x1, x2, x3,
		k0,
		y0, y1, y2, y3);

	aes_round(sharedMemory,
		y0, y1, y2, y3,
		x0, x1, x2, x3);

	// hier werden wir ein carry brauchen (oder auch nicht)
	k0++;
}

__device__ __forceinline__ void cuda_echo_round(
	const uint32_t *sharedMemory,
	uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3,
	uint32_t *W, int round)
{
	// W hat 16*4 als Abma<EFBFBD>e

	// Big Sub Words
#pragma unroll 16
	for(int i=0;i<16;i++)
	{
		int idx = i<<2; // *4
		AES_2ROUND(sharedMemory,
			W[idx+0], W[idx+1], W[idx+2], W[idx+3],
			k0, k1, k2, k3);
	}

	// Shift Rows
#pragma unroll 4
	for(int i=0;i<4;i++)
	{
		uint32_t t;

		/// 1, 5, 9, 13
		t = W[4 + i];
		W[4 + i] = W[20 + i];
		W[20 + i] = W[36 + i];
		W[36 + i] = W[52 + i];
		W[52 + i] = t;

		// 2, 6, 10, 14
		t = W[8 + i];
		W[8 + i] = W[40 + i];
		W[40 + i] = t;
		t = W[24 + i];
		W[24 + i] = W[56 + i];
		W[56 + i] = t;

		// 15, 11, 7, 3
		t = W[60 + i];
		W[60 + i] = W[44 + i];
		W[44 + i] = W[28 + i];
		W[28 + i] = W[12 + i];
		W[12 + i] = t;
	}

	// Mix Columns
#pragma unroll 4
	for(int i=0;i<4;i++) // Schleife <EFBFBD>ber je 2*uint32_t
	{
#pragma unroll 4
		for(int j=0;j<4;j++) // Schleife <EFBFBD>ber die elemnte
		{
			int idx = j<<2; // j*4

			uint32_t a = W[ ((idx + 0)<<2) + i];
			uint32_t b = W[ ((idx + 1)<<2) + i];
			uint32_t c = W[ ((idx + 2)<<2) + i];
			uint32_t d = W[ ((idx + 3)<<2) + i];

			uint32_t ab = a ^ b;
			uint32_t bc = b ^ c;
			uint32_t cd = c ^ d;

			uint32_t t;
			t = ((ab & 0x80808080) >> 7);
			uint32_t abx = t<<4 ^ t<<3 ^ t<<1 ^ t;
			t = ((bc & 0x80808080) >> 7);
			uint32_t bcx = t<<4 ^ t<<3 ^ t<<1 ^ t;
			t = ((cd & 0x80808080) >> 7);
			uint32_t cdx = t<<4 ^ t<<3 ^ t<<1 ^ t;

			abx ^= ((ab & 0x7F7F7F7F) << 1);
			bcx ^= ((bc & 0x7F7F7F7F) << 1);
			cdx ^= ((cd & 0x7F7F7F7F) << 1);

			W[ ((idx + 0)<<2) + i] = abx ^ bc ^ d;
			W[ ((idx + 1)<<2) + i] = bcx ^ a ^ cd;
			W[ ((idx + 2)<<2) + i] = cdx ^ ab ^ d;
			W[ ((idx + 3)<<2) + i] = abx ^ bcx ^ cdx ^ ab ^ c;
		}
	}
}

__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
	__shared__ uint32_t sharedMemory[1024];

	aes_gpu_init(sharedMemory);

	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

        int hashPosition = nounce - startNounce;
        uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];

		uint32_t W[64];
		uint32_t k0 = 512, k1 = 0, k2 = 0, k3 = 0; // K0 = bitlen
		/* Initialisierung */
#pragma unroll 8
		for(int i=0;i<32;i+=4)
		{
			W[i + 0] = 512;
			W[i + 1] = 0;
			W[i + 2] = 0;
			W[i + 3] = 0;
		}

		// kopiere 32-byte gro<EFBFBD>en hash
#pragma unroll 16
		for(int i=0;i<16;i++)
			W[i+32] = Hash[i];
		W[48] = 0x80; // fest
#pragma unroll 10
		for(int i=49;i<59;i++)
			W[i] = 0;
		W[59] = 0x02000000; // fest
		W[60] = k0; // bitlen
		W[61] = k1;
		W[62] = k2;
		W[63] = k3;
		
		for(int i=0;i<10;i++)
		{
			cuda_echo_round(sharedMemory, k0, k1, k2, k3, W, i);
		}

#pragma unroll 8
		for(int i=0;i<32;i+=4)
		{
			W[i  ] ^= W[32 + i    ] ^ 512;
			W[i+1] ^= W[32 + i + 1];
			W[i+2] ^= W[32 + i + 2];
			W[i+3] ^= W[32 + i + 3];
		}

#pragma unroll 16
		for(int i=0;i<16;i++)
			W[i] ^= Hash[i];

		// tsiv: I feel	iffy about removing	this, but it seems to break	the	full hash
		// fortunately for X11 the flipped bit lands outside the first 32 bytes	used as	the	final X11 hash
		// try chaining	more algos after echo (X13)	and	boom
		//W[8] ^= 0x10;

		W[27] ^= 0x02000000;
		W[28] ^= k0;

#pragma unroll 16
		for(int i=0;i<16;i++)
			Hash[i] = W[i];
    }
}

// Setup-Funktionen
__host__ void x11_echo512_cpu_init(int thr_id, int threads)
{
	aes_cpu_init();
}

__host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
    const int threadsperblock = 256;

    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid((threads + threadsperblock-1)/threadsperblock);
    dim3 block(threadsperblock);

    // Gr<EFBFBD><EFBFBD>e des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;

//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

    x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
}
-												v1.0 - Yo, I heard y'all like X11

											
										
										
											11 years ago
+								#include <cuda.h>
-												Implement x14 (cuda + cpu functions)

Project was updated for VS2013 and CUDA SDK 6.5

add also a --cputest function to dump cpu hash results

TODO: x15 is not fully functional, but first loop seems ok

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#include <cuda_runtime.h>
-												v1.0 - Yo, I heard y'all like X11

											
										
										
											11 years ago
+								#include "device_launch_parameters.h"
 								#include <stdio.h>
-												Implement x14 (cuda + cpu functions)

Project was updated for VS2013 and CUDA SDK 6.5

add also a --cputest function to dump cpu hash results

TODO: x15 is not fully functional, but first loop seems ok

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#include <stdint.h>
-												v1.0 - Yo, I heard y'all like X11

											
										
										
											11 years ago
+								#include <memory.h>
 								// das Hi Word aus einem 64 Bit Typen extrahieren
-												Implement x14 (cuda + cpu functions)

Project was updated for VS2013 and CUDA SDK 6.5

add also a --cputest function to dump cpu hash results

TODO: x15 is not fully functional, but first loop seems ok

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#if 0
-												v1.0 - Yo, I heard y'all like X11

											
										
										
											11 years ago
+								static __device__ uint32_t HIWORD(const uint64_t &x) {
 								#if __CUDA_ARCH__ >= 130
 									return (uint32_t)__double2hiint(__longlong_as_double(x));
 								#else
 									return (uint32_t)(x >> 32);
 								#endif
 								}
 								// das Lo Word aus einem 64 Bit Typen extrahieren
 								static __device__ uint32_t LOWORD(const uint64_t &x) {
 								#if __CUDA_ARCH__ >= 130
 									return (uint32_t)__double2loint(__longlong_as_double(x));
 								#else
 									return (uint32_t)(x & 0xFFFFFFFFULL);
 								#endif
 								}
-												Implement x14 (cuda + cpu functions)

Project was updated for VS2013 and CUDA SDK 6.5

add also a --cputest function to dump cpu hash results

TODO: x15 is not fully functional, but first loop seems ok

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								#endif
-												v1.0 - Yo, I heard y'all like X11

											
										
										
											11 years ago
 								#define SPH_C64(x)    ((uint64_t)(x ## ULL))
 								#define SPH_C32(x)    ((uint32_t)(x ## U))
 								// aus heavy.cu
 								extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 								#include "cuda_x11_aes.cu"
 								__device__ __forceinline__ void AES_2ROUND(
 									const uint32_t* __restrict__ sharedMemory,
 									uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
 									uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3)
 								{
 									uint32_t y0, y1, y2, y3;
 									aes_round(sharedMemory,
 										x0, x1, x2, x3,
 										k0,
 										y0, y1, y2, y3);
 									aes_round(sharedMemory,
 										y0, y1, y2, y3,
 										x0, x1, x2, x3);
 									// hier werden wir ein carry brauchen (oder auch nicht)
 									k0++;
 								}
 								__device__ __forceinline__ void cuda_echo_round(
 									const uint32_t *sharedMemory,
 									uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3,
 									uint32_t *W, int round)
 								{
 									// W hat 16*4 als Abma<EFBFBD>e
 									// Big Sub Words
 								#pragma unroll 16
 									for(int i=0;i<16;i++)
 									{
 										int idx = i<<2; // *4
 										AES_2ROUND(sharedMemory,
 											W[idx+0], W[idx+1], W[idx+2], W[idx+3],
 											k0, k1, k2, k3);
 									}
 									// Shift Rows
 								#pragma unroll 4
 									for(int i=0;i<4;i++)
 									{
 										uint32_t t;
 										/// 1, 5, 9, 13
 										t = W[4 + i];
 										W[4 + i] = W[20 + i];
 										W[20 + i] = W[36 + i];
 										W[36 + i] = W[52 + i];
 										W[52 + i] = t;
 										// 2, 6, 10, 14
 										t = W[8 + i];
 										W[8 + i] = W[40 + i];
 										W[40 + i] = t;
 										t = W[24 + i];
 										W[24 + i] = W[56 + i];
 										W[56 + i] = t;
 										// 15, 11, 7, 3
 										t = W[60 + i];
 										W[60 + i] = W[44 + i];
 										W[44 + i] = W[28 + i];
 										W[28 + i] = W[12 + i];
 										W[12 + i] = t;
 									}
 									// Mix Columns
 								#pragma unroll 4
 									for(int i=0;i<4;i++) // Schleife <EFBFBD>ber je 2*uint32_t
 									{
 								#pragma unroll 4
 										for(int j=0;j<4;j++) // Schleife <EFBFBD>ber die elemnte
 										{
 											int idx = j<<2; // j*4
 											uint32_t a = W[ ((idx + 0)<<2) + i];
 											uint32_t b = W[ ((idx + 1)<<2) + i];
 											uint32_t c = W[ ((idx + 2)<<2) + i];
 											uint32_t d = W[ ((idx + 3)<<2) + i];
 											uint32_t ab = a ^ b;
 											uint32_t bc = b ^ c;
 											uint32_t cd = c ^ d;
 											uint32_t t;
 											t = ((ab & 0x80808080) >> 7);
 											uint32_t abx = t<<4 ^ t<<3 ^ t<<1 ^ t;
 											t = ((bc & 0x80808080) >> 7);
 											uint32_t bcx = t<<4 ^ t<<3 ^ t<<1 ^ t;
 											t = ((cd & 0x80808080) >> 7);
 											uint32_t cdx = t<<4 ^ t<<3 ^ t<<1 ^ t;
 											abx ^= ((ab & 0x7F7F7F7F) << 1);
 											bcx ^= ((bc & 0x7F7F7F7F) << 1);
 											cdx ^= ((cd & 0x7F7F7F7F) << 1);
 											W[ ((idx + 0)<<2) + i] = abx ^ bc ^ d;
 											W[ ((idx + 1)<<2) + i] = bcx ^ a ^ cd;
 											W[ ((idx + 2)<<2) + i] = cdx ^ ab ^ d;
 											W[ ((idx + 3)<<2) + i] = abx ^ bcx ^ cdx ^ ab ^ c;
 										}
 									}
 								}
 								__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 								{
 									__shared__ uint32_t sharedMemory[1024];
 									aes_gpu_init(sharedMemory);
 									int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 								    if (thread < threads)
 								    {
 								        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 								        int hashPosition = nounce - startNounce;
 								        uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
 										uint32_t W[64];
 										uint32_t k0 = 512, k1 = 0, k2 = 0, k3 = 0; // K0 = bitlen
 										/* Initialisierung */
 								#pragma unroll 8
 										for(int i=0;i<32;i+=4)
 										{
 											W[i + 0] = 512;
 											W[i + 1] = 0;
 											W[i + 2] = 0;
 											W[i + 3] = 0;
 										}
 										// kopiere 32-byte gro<EFBFBD>en hash
 								#pragma unroll 16
 										for(int i=0;i<16;i++)
 											W[i+32] = Hash[i];
 										W[48] = 0x80; // fest
 								#pragma unroll 10
 										for(int i=49;i<59;i++)
 											W[i] = 0;
 										W[59] = 0x02000000; // fest
 										W[60] = k0; // bitlen
 										W[61] = k1;
 										W[62] = k2;
 										W[63] = k3;
 										for(int i=0;i<10;i++)
 										{
 											cuda_echo_round(sharedMemory, k0, k1, k2, k3, W, i);
 										}
 								#pragma unroll 8
 										for(int i=0;i<32;i+=4)
 										{
 											W[i  ] ^= W[32 + i    ] ^ 512;
 											W[i+1] ^= W[32 + i + 1];
 											W[i+2] ^= W[32 + i + 2];
 											W[i+3] ^= W[32 + i + 3];
 										}
 								#pragma unroll 16
 										for(int i=0;i<16;i++)
 											W[i] ^= Hash[i];
-												adding third party X13 and Diamond Groestl code contributions.

											
										
										
											11 years ago
+										// tsiv: I feel	iffy about removing	this, but it seems to break	the	full hash
 										// fortunately for X11 the flipped bit lands outside the first 32 bytes	used as	the	final X11 hash
 										// try chaining	more algos after echo (X13)	and	boom
 										//W[8] ^= 0x10;
-												v1.0 - Yo, I heard y'all like X11

											
										
										
											11 years ago
+										W[27] ^= 0x02000000;
 										W[28] ^= k0;
 								#pragma unroll 16
 										for(int i=0;i<16;i++)
 											Hash[i] = W[i];
 								    }
 								}
 								// Setup-Funktionen
 								__host__ void x11_echo512_cpu_init(int thr_id, int threads)
 								{
 									aes_cpu_init();
 								}
 								__host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 								{
 								    const int threadsperblock = 256;
 								    // berechne wie viele Thread Blocks wir brauchen
 								    dim3 grid((threads + threadsperblock-1)/threadsperblock);
 								    dim3 block(threadsperblock);
 								    // Gr<EFBFBD><EFBFBD>e des dynamischen Shared Memory Bereichs
 								    size_t shared_size = 0;
-												adding third party X13 and Diamond Groestl code contributions.

											
										
										
											11 years ago
+								//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-												v1.0 - Yo, I heard y'all like X11

											
										
										
											11 years ago
+								    x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 								    MyStreamSynchronize(NULL, order, thr_id);
 								}