// ccminer/x11/cuda_x11_cubehash512.cu

#include "cuda_helper.h"

typedef unsigned char BitSequence;

#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */

/*
 * LROT(x, bits): rotate the 32-bit unsigned value x left by `bits`.
 *
 * When __CUDA_ARCH__ is below 350 (an undefined __CUDA_ARCH__ evaluates to 0
 * inside #if) the rotate is a portable shift/or; otherwise it maps to the
 * __funnelshift_l intrinsic.
 *
 * Both parameters are fully parenthesized so that compound arguments such as
 * LROT(a | b, n) expand with the intended precedence. Note that x is
 * evaluated twice, so it must be free of side effects, and the shift form
 * requires 0 < bits < 32.
 */
#if __CUDA_ARCH__ < 350
#define LROT(x,bits) (((x) << (bits)) | ((x) >> (32 - (bits))))
#else
#define LROT(x, bits) __funnelshift_l((x), (x), (bits))
#endif

/* Left rotations by 7 and by 11 bits, the two rotate amounts used by rrounds(). */
#define ROTATEUPWARDS7(a)  LROT(a,7)
#define ROTATEUPWARDS11(a) LROT(a,11)

/*
 * Exchange two uint32_t lvalues through a temporary.
 * Expands to a braced block (not a do/while), which is why the call sites in
 * this file invoke it without a trailing semicolon.
 */
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }

/*
 * Initial 32-word CubeHash state, copied word by word into x[][][][][] by
 * Init().
 * NOTE(review): presumably the precomputed result of the disabled (#if 0)
 * setup path in Init() for h=512, b=32, r=16 -- confirm against a reference
 * implementation.
 */
__device__ __constant__
static const uint32_t c_IV_512[32] = {
	0x2AEA2A61, 0x50F494D4, 0x2D538B8B,
	0x4167D83E, 0x3FEE2313, 0xC701CF8C,
	0xCC39968E, 0x50AC5695, 0x4D42C787,
	0xA647A8B3, 0x97CF0BEF, 0x825B4537,
	0xEEF864D2, 0xF22090C4, 0xD0E5CD33,
	0xA23911AE, 0xFCD398D9, 0x148FE485,
	0x1B017BEF, 0xB6444532, 0x6A536159,
	0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
	0xD65C8A2B, 0xA5A70E75, 0xB1C62456,
	0xBC796576, 0x1921C8F7, 0xE7989AF1,
	0x7795D246, 0xD43E3B44
};

/*
 * Run CUBEHASH_ROUNDS rounds of the CubeHash permutation over the 32-word
 * state x, in place.
 *
 * Every round executes the ten quoted steps below in their original order.
 * The add and rotate steps act on each (j,k,l,m) index independently, so each
 * add/rotate pair shares one loop nest; the swaps use an explicit temporary.
 */
__device__ __forceinline__
static void rrounds(uint32_t x[2][2][2][2][2])
{
//#pragma unroll 16
    for (int round = 0; round < CUBEHASH_ROUNDS; ++round)
    {
        /* "add x_0jklm into x_1jklm modulo 2^32" */
        /* "rotate x_0jklm upwards by 7 bits" */
#pragma unroll 2
        for (int j = 0; j < 2; ++j) {
#pragma unroll 2
            for (int k = 0; k < 2; ++k) {
#pragma unroll 2
                for (int l = 0; l < 2; ++l) {
#pragma unroll 2
                    for (int m = 0; m < 2; ++m) {
                        const uint32_t lo = x[0][j][k][l][m];
                        x[1][j][k][l][m] += lo;
                        x[0][j][k][l][m] = ROTATEUPWARDS7(lo);
                    }
                }
            }
        }

        /* "swap x_00klm with x_01klm" */
#pragma unroll 2
        for (int k = 0; k < 2; ++k) {
#pragma unroll 2
            for (int l = 0; l < 2; ++l) {
#pragma unroll 2
                for (int m = 0; m < 2; ++m) {
                    const uint32_t held = x[0][0][k][l][m];
                    x[0][0][k][l][m] = x[0][1][k][l][m];
                    x[0][1][k][l][m] = held;
                }
            }
        }

        /* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
        for (int j = 0; j < 2; ++j) {
#pragma unroll 2
            for (int k = 0; k < 2; ++k) {
#pragma unroll 2
                for (int l = 0; l < 2; ++l) {
#pragma unroll 2
                    for (int m = 0; m < 2; ++m) {
                        x[0][j][k][l][m] = x[0][j][k][l][m] ^ x[1][j][k][l][m];
                    }
                }
            }
        }

        /* "swap x_1jk0m with x_1jk1m" */
#pragma unroll 2
        for (int j = 0; j < 2; ++j) {
#pragma unroll 2
            for (int k = 0; k < 2; ++k) {
#pragma unroll 2
                for (int m = 0; m < 2; ++m) {
                    const uint32_t held = x[1][j][k][0][m];
                    x[1][j][k][0][m] = x[1][j][k][1][m];
                    x[1][j][k][1][m] = held;
                }
            }
        }

        /* "add x_0jklm into x_1jklm modulo 2^32" */
        /* "rotate x_0jklm upwards by 11 bits" */
#pragma unroll 2
        for (int j = 0; j < 2; ++j) {
#pragma unroll 2
            for (int k = 0; k < 2; ++k) {
#pragma unroll 2
                for (int l = 0; l < 2; ++l) {
#pragma unroll 2
                    for (int m = 0; m < 2; ++m) {
                        const uint32_t lo = x[0][j][k][l][m];
                        x[1][j][k][l][m] += lo;
                        x[0][j][k][l][m] = ROTATEUPWARDS11(lo);
                    }
                }
            }
        }

        /* "swap x_0j0lm with x_0j1lm" */
#pragma unroll 2
        for (int j = 0; j < 2; ++j) {
#pragma unroll 2
            for (int l = 0; l < 2; ++l) {
#pragma unroll 2
                for (int m = 0; m < 2; ++m) {
                    const uint32_t held = x[0][j][0][l][m];
                    x[0][j][0][l][m] = x[0][j][1][l][m];
                    x[0][j][1][l][m] = held;
                }
            }
        }

        /* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
        for (int j = 0; j < 2; ++j) {
#pragma unroll 2
            for (int k = 0; k < 2; ++k) {
#pragma unroll 2
                for (int l = 0; l < 2; ++l) {
#pragma unroll 2
                    for (int m = 0; m < 2; ++m) {
                        x[0][j][k][l][m] = x[0][j][k][l][m] ^ x[1][j][k][l][m];
                    }
                }
            }
        }

        /* "swap x_1jkl0 with x_1jkl1" */
#pragma unroll 2
        for (int j = 0; j < 2; ++j) {
#pragma unroll 2
            for (int k = 0; k < 2; ++k) {
#pragma unroll 2
                for (int l = 0; l < 2; ++l) {
                    const uint32_t held = x[1][j][k][l][0];
                    x[1][j][k][l][0] = x[1][j][k][l][1];
                    x[1][j][k][l][1] = held;
                }
            }
        }
    }
}

/*
 * XOR one input block into the state: the first eight 32-bit words of
 * `block` are xored into x[0][0][k][l][m], with m varying fastest, then l,
 * then k. Only block[0..7] is read, despite the declared array bound.
 */
__device__ __forceinline__
static void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
{
#pragma unroll 8
    for (int w = 0; w < 8; ++w)
    {
        /* bits 2,1,0 of the word index select k, l and m respectively */
        x[0][0][(w >> 2) & 1][(w >> 1) & 1][w & 1] ^= block[w];
    }
}

/*
 * Copy the sixteen state words x[0][j][k][l][m] into hash[0..15], with m
 * varying fastest, then l, k and j.
 */
__device__ __forceinline__
static void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
{
#pragma unroll 16
    for (int w = 0; w < 16; ++w)
    {
        /* bits 3..0 of the output index select j, k, l and m respectively */
        hash[w] = x[0][(w >> 3) & 1][(w >> 2) & 1][(w >> 1) & 1][w & 1];
    }
}

/*
 * Load the starting state into x.
 *
 * The active (#else) path copies the 32 words of c_IV_512 into x, with the
 * last index varying fastest. The disabled (#if 0) path is kept for
 * reference: it builds the state from the parameters and then runs
 * rrounds() ten times.
 */
__device__
void Init(uint32_t x[2][2][2][2][2])
{
#if 0
    /* "the first three state words x_00000, x_00001, x_00010" */
    /* "are set to the integers h/8, b, r respectively." */
    /* "the remaining state words are set to 0." */
#pragma unroll 32
    for (int w = 0; w < 32; ++w)
        x[(w >> 4) & 1][(w >> 3) & 1][(w >> 2) & 1][(w >> 1) & 1][w & 1] = 0;
    x[0][0][0][0][0] = 512/8;
    x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES;
    x[0][0][0][1][0] = CUBEHASH_ROUNDS;

    /* "the state is then transformed invertibly through 10r identical rounds */
    for (int pass = 0; pass < 10; ++pass)
        rrounds(x);
#else
#pragma unroll 32
    for (int w = 0; w < 32; ++w)
    {
        /* bits 4..0 of the table index select i, j, k, l and m respectively */
        x[(w >> 4) & 1][(w >> 3) & 1][(w >> 2) & 1][(w >> 1) & 1][w & 1] = c_IV_512[w];
    }
#endif
}

/*
 * Absorb one 32-byte block: xor the eight 32-bit words at `data` into the
 * state, then run rrounds() once.
 */
__device__ __forceinline__
void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
{
    /* block_tox() takes a non-const word pointer, hence the cast */
    uint32_t *words = (uint32_t*)data;

    /* "xor the block into the first b bytes of the state" */
    block_tox(words, x);

    /* "and then transform the state invertibly through r identical rounds" */
    rrounds(x);
}

/*
 * Finalize: flip the low bit of the last state word, run rrounds() ten
 * times, then write sixteen 32-bit state words (64 bytes) to `hashval`.
 */
__device__ __forceinline__
void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
{
    /* "the integer 1 is xored into the last state word x_11111" */
    x[1][1][1][1][1] = x[1][1][1][1][1] ^ 1;

    /* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
    for (int pass = 0; pass < 10; ++pass)
    {
        rrounds(x);
    }

    /* "output the first h/8 bytes of the state" */
    uint32_t *out = (uint32_t*)hashval;
    hash_fromx(out, x);
}


/***************************************************/
// GPU hash function
//
// One thread per hash. Each thread reads the 64 bytes at g_hash[8 * position]
// (two 32-byte blocks), absorbs them plus one padding block, and overwrites
// the same 64 bytes with the result.
//
//   threads       - number of hashes to process; threads whose global index
//                   is >= threads do nothing
//   startNounce   - base nonce; a hash's position is (nonce - startNounce)
//   g_hash        - hash buffer, 8 uint64_t words per position (read/write)
//   g_nonceVector - optional; when non-NULL, entry [thread] supplies the
//                   nonce, otherwise the nonce is startNounce + thread
__global__
void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

        int hashPosition = nounce - startNounce;
        // Widen before scaling: 8 * hashPosition computed in 32-bit int
        // overflows once hashPosition reaches 2^28.
        uint32_t *Hash = (uint32_t*)&g_hash[(int64_t)8 * hashPosition];

        uint32_t x[2][2][2][2][2];
        Init(x);

        // first half of the hash (32 bytes)
        Update32(x, (const BitSequence*)Hash);

        // second half of the hash (32 bytes)
        Update32(x, (const BitSequence*)(Hash+8));

        // padding block: 0x80 in the first word, remaining seven words zero
        uint32_t last[8];
        last[0] = 0x80;
#pragma unroll 7
        for (int i=1; i < 8; i++) last[i] = 0;
        Update32(x, (const BitSequence*)last);

        // finalize and write the 64-byte result back over the input
        Final(x, (BitSequence*)Hash);
    }
}


// Setup functions

// Initialization hook for this hash stage. The body is empty, so thr_id and
// threads are currently unused.
__host__
void x11_cubehash512_cpu_init(int thr_id, int threads)
{
}

// Launch x11_cubehash512_gpu_hash_64 over `threads` hashes, then call
// MyStreamSynchronize(NULL, order, thr_id).
//
//   d_nonceVector - forwarded to the kernel unchanged
//   d_hash        - forwarded to the kernel as a uint64_t pointer
__host__
void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
    const int blockSize = 256;

    // number of thread blocks needed to cover all threads (rounded up)
    const int blockCount = (threads + blockSize - 1) / blockSize;
    dim3 grid(blockCount);
    dim3 block(blockSize);

    // size of the dynamic shared memory area: none is used
    const size_t dynamicSharedBytes = 0;

    uint64_t *hashWords = (uint64_t*)d_hash;
    x11_cubehash512_gpu_hash_64<<<grid, block, dynamicSharedBytes>>>(threads, startNounce, hashWords, d_nonceVector);

    MyStreamSynchronize(NULL, order, thr_id);
}
Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`#include "cuda_helper.h"`
Implement x14 (cuda + cpu functions) Project was updated for VS2013 and CUDA SDK 6.5 add also a --cputest function to dump cpu hash results TODO: x15 is not fully functional, but first loop seems ok Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
v1.0 - Yo, I heard y'all like X11 10 years ago			`typedef unsigned char BitSequence;`

			`#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */`
			`#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */`

cubehash and luffa funnel shit (from klaus) No gain... but i like this define, more readable in luffa ;) 10 years ago			`#if __CUDA_ARCH__ < 350`
			`#define LROT(x,bits) ((x << bits) \| (x >> (32 - bits)))`
			`#else`
			`#define LROT(x, bits) __funnelshift_l(x, x, bits)`
			`#endif`

			`#define ROTATEUPWARDS7(a) LROT(a,7)`
			`#define ROTATEUPWARDS11(a) LROT(a,11)`

v1.0 - Yo, I heard y'all like X11 10 years ago			`#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }`

x10 funcs cleanup, we dont need host constant tables 10 years ago			`__device__ __constant__`
			`static const uint32_t c_IV_512[32] = {`
v1.0 - Yo, I heard y'all like X11 10 years ago			`0x2AEA2A61, 0x50F494D4, 0x2D538B8B,`
			`0x4167D83E, 0x3FEE2313, 0xC701CF8C,`
			`0xCC39968E, 0x50AC5695, 0x4D42C787,`
			`0xA647A8B3, 0x97CF0BEF, 0x825B4537,`
			`0xEEF864D2, 0xF22090C4, 0xD0E5CD33,`
			`0xA23911AE, 0xFCD398D9, 0x148FE485,`
			`0x1B017BEF, 0xB6444532, 0x6A536159,`
			`0x2FF5781C, 0x91FA7934, 0x0DBADEA9,`
			`0xD65C8A2B, 0xA5A70E75, 0xB1C62456,`
			`0xBC796576, 0x1921C8F7, 0xE7989AF1,`
			`0x7795D246, 0xD43E3B44`
			`};`

x11: echo and cubehash optimization echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes 10 years ago			`__device__ __forceinline__`
			`static void rrounds(uint32_t x[2][2][2][2][2])`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`int r;`
			`int j;`
			`int k;`
			`int l;`
			`int m;`

			`//#pragma unroll 16`
			`for (r = 0;r < CUBEHASH_ROUNDS;++r) {`

			`/* "add x_0jklm into x_1jklmn modulo 2^32" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[1][j][k][l][m] += x[0][j][k][l][m];`

			`/* "rotate x_0jklm upwards by 7 bits" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);`

			`/* "swap x_00klm with x_01klm" */`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`SWAP(x[0][0][k][l][m],x[0][1][k][l][m])`

			`/* "xor x_1jklm into x_0jklm" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[0][j][k][l][m] ^= x[1][j][k][l][m];`

			`/* "swap x_1jk0m with x_1jk1m" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`SWAP(x[1][j][k][0][m],x[1][j][k][1][m])`

			`/* "add x_0jklm into x_1jklm modulo 2^32" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[1][j][k][l][m] += x[0][j][k][l][m];`

			`/* "rotate x_0jklm upwards by 11 bits" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);`

			`/* "swap x_0j0lm with x_0j1lm" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`SWAP(x[0][j][0][l][m],x[0][j][1][l][m])`

			`/* "xor x_1jklm into x_0jklm" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[0][j][k][l][m] ^= x[1][j][k][l][m];`

			`/* "swap x_1jkl0 with x_1jkl1" */`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`SWAP(x[1][j][k][l][0],x[1][j][k][l][1])`

			`}`
			`}`

x11: echo and cubehash optimization echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes 10 years ago			`__device__ __forceinline__`
			`static void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`int k;`
			`int l;`
			`int m;`
			`uint32_t *in = block;`

			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[0][0][k][l][m] ^= *in++;`
			`}`

x11: echo and cubehash optimization echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes 10 years ago			`__device__ __forceinline__`
			`static void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`int j;`
			`int k;`
			`int l;`
			`int m;`
			`uint32_t *out = hash;`

			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`*out++ = x[0][j][k][l][m];`
			`}`

x11: echo and cubehash optimization echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes 10 years ago			`__device__`
			`void Init(uint32_t x[2][2][2][2][2])`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`int i,j,k,l,m;`
			`#if 0`
			`/* "the first three state words x_00000, x_00001, x_00010" */`
			`/* "are set to the integers h/8, b, r respectively." */`
			`/* "the remaining state words are set to 0." */`
			`#pragma unroll 2`
			`for (i = 0;i < 2;++i)`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[i][j][k][l][m] = 0;`
			`x[0][0][0][0][0] = 512/8;`
			`x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES;`
			`x[0][0][0][1][0] = CUBEHASH_ROUNDS;`

			`/* "the state is then transformed invertibly through 10r identical rounds */`
			`for (i = 0;i < 10;++i) rrounds(x);`
			`#else`
x10 funcs cleanup, we dont need host constant tables 10 years ago			`const uint32_t *iv = c_IV_512;`
v1.0 - Yo, I heard y'all like X11 10 years ago
			`#pragma unroll 2`
			`for (i = 0;i < 2;++i)`
			`#pragma unroll 2`
			`for (j = 0;j < 2;++j)`
			`#pragma unroll 2`
			`for (k = 0;k < 2;++k)`
			`#pragma unroll 2`
			`for (l = 0;l < 2;++l)`
			`#pragma unroll 2`
			`for (m = 0;m < 2;++m)`
			`x[i][j][k][l][m] = *iv++;`
			`#endif`
			`}`

x11: echo and cubehash optimization echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes 10 years ago			`__device__ __forceinline__`
			`void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`/* "xor the block into the first b bytes of the state" */`
			`/* "and then transform the state invertibly through r identical rounds" */`
			`block_tox((uint32_t*)data, x);`
			`rrounds(x);`
			`}`

x11: echo and cubehash optimization echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes 10 years ago			`__device__ __forceinline__`
			`void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`int i;`

			`/* "the integer 1 is xored into the last state word x_11111" */`
			`x[1][1][1][1][1] ^= 1;`

			`/* "the state is then transformed invertibly through 10r identical rounds" */`
			`#pragma unroll 10`
			`for (i = 0;i < 10;++i) rrounds(x);`

			`/* "output the first h/8 bytes of the state" */`
			`hash_fromx((uint32_t*)hashval, x);`
			`}`


			`/***************************************************/`
x11: echo and cubehash optimization echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes 10 years ago			`// GPU Hash Function`
			`__global__`
			`void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t g_hash, uint32_t g_nonceVector)`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`int thread = (blockDim.x * blockIdx.x + threadIdx.x);`
			`if (thread < threads)`
			`{`
			`uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);`

			`int hashPosition = nounce - startNounce;`
			`uint32_t Hash = (uint32_t)&g_hash[8 * hashPosition];`

			`uint32_t x[2][2][2][2][2];`
			`Init(x);`

sources: swith to UTF-8 10 years ago			`// erste Hälfte des Hashes (32 bytes)`
v1.0 - Yo, I heard y'all like X11 10 years ago			`Update32(x, (const BitSequence*)Hash);`

sources: swith to UTF-8 10 years ago			`// zweite Hälfte des Hashes (32 bytes)`
v1.0 - Yo, I heard y'all like X11 10 years ago			`Update32(x, (const BitSequence*)(Hash+8));`

			`// Padding Block`
			`uint32_t last[8];`
			`last[0] = 0x80;`
			`#pragma unroll 7`
			`for (int i=1; i < 8; i++) last[i] = 0;`
			`Update32(x, (const BitSequence*)last);`

			`Final(x, (BitSequence*)Hash);`
			`}`
			`}`


			`// Setup-Funktionen`
cubehash and luffa funnel shit (from klaus) No gain... but i like this define, more readable in luffa ;) 10 years ago			`__host__`
			`void x11_cubehash512_cpu_init(int thr_id, int threads)`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`}`

cubehash and luffa funnel shit (from klaus) No gain... but i like this define, more readable in luffa ;) 10 years ago			`__host__`
			`void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t d_nonceVector, uint32_t d_hash, int order)`
v1.0 - Yo, I heard y'all like X11 10 years ago			`{`
			`const int threadsperblock = 256;`

			`// berechne wie viele Thread Blocks wir brauchen`
			`dim3 grid((threads + threadsperblock-1)/threadsperblock);`
			`dim3 block(threadsperblock);`

sources: swith to UTF-8 10 years ago			`// Größe des dynamischen Shared Memory Bereichs`
v1.0 - Yo, I heard y'all like X11 10 years ago			`size_t shared_size = 0;`

			`x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);`
			`MyStreamSynchronize(NULL, order, thr_id);`
			`}`