// ccminer-gostd-lite/quark/cuda_quark_groestl512.cu

// Groestl version specialized for QuarkCoin, including bitslicing

#include <stdio.h>
#include <memory.h>

#include "cuda_helper.h"

#define TPB 256
#define THF 4

#if __CUDA_ARCH__ >= 300
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#endif

#include "quark/cuda_quark_groestl512_sm20.cu"

// Groestl-512 "quad" kernel: four consecutive threads cooperate on one hash.
//
// Each hash slot in g_hash holds 16 uint32_t words (64 bytes). The slot is
// read as the message and then overwritten in place with the result.
//
// The body is only compiled for __CUDA_ARCH__ >= 300; for older targets the
// kernel is an empty stub.
//
//   threads        number of hashes to process; the launch has to provide at
//                  least 4 * threads threads (blockDim.x is expected to be a
//                  multiple of 4 so that a quad never straddles two blocks)
//   startNounce    nonce that maps to hash slot 0; slot = nounce - startNounce
//   g_hash         in/out hash buffer, 16 words per slot. Results are written
//                  with uint4 stores, so the buffer must be 16-byte aligned.
//   g_nonceVector  optional nonce list indexed by hash number; when NULL the
//                  nonce is startNounce + hash number
__global__ __launch_bounds__(TPB, THF)
void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector)
{
#if __CUDA_ARCH__ >= 300
    // divide by 4 because four threads compute one hash together
    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
    if (thread < threads)
    {
        // GROESTL
        uint32_t message[8];
        uint32_t state[8];

        uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);

        // Unsigned slot index, widened before scaling by 16 words: the former
        // signed 32-bit "hashPosition << 4" overflowed for slots >= 2^27.
        const uint32_t hashPosition = nounce - startNounce;
        uint32_t *inpHash = &g_hash[(size_t)hashPosition << 4];

        // lane of this thread inside its quad (0..3)
        const uint16_t thr = threadIdx.x % THF;

        // Words are interleaved across the quad: lane `thr` owns words
        // thr, thr + 4, thr + 8 and thr + 12 of the 16 input words.
        #pragma unroll
        for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];

        #pragma unroll
        for(int k=4;k<8;k++) message[k] = 0;

        // Padding of the second half of the block: with the interleaved
        // layout, lane 0 / message[4] is word 16 (directly after the input)
        // and lane 3 / message[7] is word 31 (the last word of the block).
        // NOTE(review): 0x01000000 is presumably the Groestl block count (1).
        if (thr == 0) message[4] = 0x80;
        if (thr == 3) message[7] = 0x01000000;

        uint32_t msgBitsliced[8];
        to_bitslice_quad(message, msgBitsliced);

        groestl512_progressMessage_quad(state, msgBitsliced);

        uint32_t hash[16];
        from_bitslice_quad(state, hash);

        // Only the first thread of each quad writes the resulting hash.
        // uint4 = 4 x uint32_t = 16 bytes, so four stores cover the slot.
        // The vectors are assembled with make_uint4 rather than by casting
        // `hash` to uint4*, because a local uint32_t array is only guaranteed
        // 4-byte alignment.
        if (thr == 0) {
            uint4 *outpt = (uint4*) inpHash;
            #pragma unroll
            for (int k = 0; k < 4; k++)
                outpt[k] = make_uint4(hash[4*k], hash[4*k + 1], hash[4*k + 2], hash[4*k + 3]);
        }
    }
#endif
}

// Double Groestl-512 "quad" kernel: runs the Groestl message processing twice
// in a row, feeding the first result (re-padded) into the second pass. Four
// consecutive threads cooperate on one hash.
//
// Each hash slot in g_hash holds 16 uint32_t words (64 bytes). The slot is
// read as the message and then overwritten in place with the final result.
//
// The body is only compiled for __CUDA_ARCH__ >= 300; for older targets the
// kernel is an empty stub.
//
//   threads        number of hashes to process; the launch has to provide at
//                  least 4 * threads threads
//   startNounce    nonce that maps to hash slot 0; slot = nounce - startNounce
//   g_hash         in/out hash buffer, 16 words per slot
//   g_nonceVector  optional nonce list indexed by hash number; when NULL the
//                  nonce is startNounce + hash number
__global__ void __launch_bounds__(TPB, THF)
quark_doublegroestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
#if __CUDA_ARCH__ >= 300
    // divide by 4 because four threads compute one hash together
    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
    if (thread < threads)
    {
        // GROESTL
        uint32_t message[8];
        uint32_t state[8];

        uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);

        // Unsigned slot index, widened before scaling by 16 words: the former
        // signed 32-bit "hashPosition<<4" overflowed for slots >= 2^27.
        const uint32_t hashPosition = nounce - startNounce;
        uint32_t * inpHash = &g_hash[(size_t)hashPosition << 4];

        // lane of this thread inside its quad (0..3)
        const uint16_t thr = threadIdx.x % THF;

        // Words are interleaved across the quad: lane `thr` owns words
        // thr, thr + 4, thr + 8 and thr + 12 of the 16 input words.
        #pragma unroll
        for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];

        #pragma unroll
        for(int k=4;k<8;k++) message[k] = 0;

        // Padding of the second half of the block: lane 0 / message[4] is
        // word 16 (directly after the input), lane 3 / message[7] is word 31.
        if (thr == 0) message[4] = 0x80;
        if (thr == 3) message[7] = 0x01000000;

        uint32_t msgBitsliced[8];
        to_bitslice_quad(message, msgBitsliced);

        for (int round=0; round<2; round++)
        {
            groestl512_progressMessage_quad(state, msgBitsliced);

            if (round < 1)
            {
                // Chain the two rounds, including padding, directly in the
                // bitsliced domain. Selector 0x4341 moves state bytes 1 and 3
                // into result bytes 0 and 2 and zeroes result bytes 1 and 3
                // (taken from the zero low byte of 0x00800100).
                // Lane 3 bumps the selector to 0x6341, which puts 0x80 into
                // the top byte of word 0; lane 0 bumps it to 0x4351, which
                // puts 0x01 into byte 1 of word 7.
                msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((thr == 3)<<13));
                msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
                msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
                msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
                msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
                msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
                msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
                msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((thr == 0)<<4));
            }
        }

        uint32_t hash[16];
        from_bitslice_quad(state, hash);

        // Only the first thread of each quad writes the resulting hash,
        // in place over the input slot.
        if (thr == 0)
        {
            #pragma unroll
            for(int k=0;k<16;k++) inpHash[k] = hash[k];
        }
    }
#endif
}

// Host-side initialisation for the Groestl-512 stage of mining thread thr_id.
// Queries the compiled architecture for the thread's device and, when either
// the device or the compiled code is below compute 3.0, sets up the SM 2.x
// fallback implementation. The quad path needs no initialisation here.
__host__
void quark_groestl512_cpu_init(int thr_id, uint32_t threads)
{
    const int dev_id = device_map[thr_id];

    // must run before cuda_arch[] is consulted below
    cuda_get_arch(thr_id);

    const bool quadCapable = (device_sm[dev_id] >= 300) && (cuda_arch[dev_id] >= 300);
    if (!quadCapable) {
        quark_groestl512_sm20_init(thr_id, threads);
    }
}

// Host launcher for one Groestl-512 pass over `threads` 64-byte hashes stored
// in d_hash (hashed in place).
//
// Devices/binaries at compute 3.0 or newer use the register-optimised quad
// kernel (warp-shuffle based, per the original note); everything else is
// forwarded to the SM 2.x implementation. d_nonceVector may be NULL, in which
// case the kernel derives nonces from startNounce.
__host__
void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
    // The quad functions need 4 threads per hash, so the number of blocks
    // required for `threads` hashes is multiplied by THF.
    const uint32_t tpb = TPB;
    const uint32_t hashBlocks = (threads + tpb - 1) / tpb;
    const dim3 grid(THF * hashBlocks);
    const dim3 block(tpb);

    const int dev_id = device_map[thr_id];
    const bool useQuad = (device_sm[dev_id] >= 300) && (cuda_arch[dev_id] >= 300);

    if (!useQuad) {
        quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
    } else {
        quark_groestl512_gpu_hash_64_quad<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);
    }

    // Strategic sleep/sync call intended to lower CPU load
    MyStreamSynchronize(NULL, order, thr_id);
}

// Host launcher for the double Groestl-512 pass over `threads` 64-byte hashes
// stored in d_hash (hashed in place).
//
// Devices/binaries at compute 3.0 or newer use the quad kernel; everything
// else is forwarded to the SM 2.x implementation. d_nonceVector may be NULL,
// in which case the kernel derives nonces from startNounce.
__host__
void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
    // 4 threads cooperate on each hash, hence THF times as many blocks as a
    // one-thread-per-hash launch would need.
    const uint32_t tpb = TPB;
    const uint32_t hashBlocks = (threads + tpb - 1) / tpb;
    const dim3 grid(THF * hashBlocks);
    const dim3 block(tpb);

    const int dev_id = device_map[thr_id];
    const bool useQuad = (device_sm[dev_id] >= 300) && (cuda_arch[dev_id] >= 300);

    if (!useQuad) {
        quark_doublegroestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
    } else {
        quark_doublegroestl512_gpu_hash_64_quad<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);
    }

    MyStreamSynchronize(NULL, order, thr_id);
}
bump to revision V1.1 with Killer Groestl 11 years ago			`// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice`
bump to revision 0.7 11 years ago
			`#include <stdio.h>`
			`#include <memory.h>`

Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`#include "cuda_helper.h"`

Add fresh algo (based on djm34 code) Cleaned up and adapted to my changes (cputest added) Remove Makefile.in which should be in gitignore (Plz refresh it with ./config.sh to compile on linux) 10 years ago			`#define TPB 256`
			`#define THF 4`

groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`#if __CUDA_ARCH__ >= 300`
bump to revision V1.1 with Killer Groestl 11 years ago			`#include "groestl_functions_quad.cu"`
			`#include "bitslice_transformations_quad.cu"`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`#endif`

			`#include "quark/cuda_quark_groestl512_sm20.cu"`
bump to revision 0.7 11 years ago
Add fresh algo (based on djm34 code) Cleaned up and adapted to my changes (cputest added) Remove Makefile.in which should be in gitignore (Plz refresh it with ./config.sh to compile on linux) 10 years ago			`__global__ __launch_bounds__(TPB, THF)`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector)`
bump to revision V1.1 with Killer Groestl 11 years ago			`{`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`#if __CUDA_ARCH__ >= 300`
bump to revision V1.1 with Killer Groestl 11 years ago			`// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;`
bump to revision V1.1 with Killer Groestl 11 years ago			`if (thread < threads)`
bump to revision 0.7 11 years ago			`{`
bump to revision V1.1 with Killer Groestl 11 years ago			`// GROESTL`
			`uint32_t message[8];`
			`uint32_t state[8];`
bump to revision 0.7 11 years ago
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);`
bump to revision V1.1 with Killer Groestl 11 years ago			`int hashPosition = nounce - startNounce;`
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`uint32_t *inpHash = &g_hash[hashPosition << 4];`

			`const uint16_t thr = threadIdx.x % THF;`
bump to revision 0.7 11 years ago
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`#pragma unroll`
			`for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];`

			`#pragma unroll`
bump to revision V1.1 with Killer Groestl 11 years ago			`for(int k=4;k<8;k++) message[k] = 0;`
bump to revision 0.7 11 years ago
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`if (thr == 0) message[4] = 0x80;`
			`if (thr == 3) message[7] = 0x01000000;`
bump to revision V1.1 with Killer Groestl 11 years ago
			`uint32_t msgBitsliced[8];`
			`to_bitslice_quad(message, msgBitsliced);`

			`groestl512_progressMessage_quad(state, msgBitsliced);`

			`// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash`
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`uint32_t *outpHash = inpHash;`
bump to revision V1.1 with Killer Groestl 11 years ago			`uint32_t hash[16];`
			`from_bitslice_quad(state, hash);`

Add zr5 algo (for SM 3.5+) uint4 copy + keccak cleanup, groestl: small uint4 opt Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
			`// uint4 = 4x4 uint32_t = 16 bytes`
			`if (thr == 0) {`
			`uint4 phash = (uint4) hash;`
			`uint4 outpt = (uint4) outpHash; /* var kept for hash align */`
			`outpt[0] = phash[0];`
			`outpt[1] = phash[1];`
			`outpt[2] = phash[2];`
			`outpt[3] = phash[3];`
			`}`
			`/*`
			`if (thr == 0) {`
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`#pragma unroll`
bump to revision V1.1 with Killer Groestl 11 years ago			`for(int k=0;k<16;k++) outpHash[k] = hash[k];`
bump to revision 0.7 11 years ago			`}`
Add zr5 algo (for SM 3.5+) uint4 copy + keccak cleanup, groestl: small uint4 opt Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`*/`
bump to revision 0.7 11 years ago			`}`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`#endif`
bump to revision 0.7 11 years ago			`}`

Add fresh algo (based on djm34 code) Cleaned up and adapted to my changes (cputest added) Remove Makefile.in which should be in gitignore (Plz refresh it with ./config.sh to compile on linux) 10 years ago			`__global__ void __launch_bounds__(TPB, THF)`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`quark_doublegroestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, uint32_t g_hash, uint32_t g_nonceVector)`
bump to revision V1.1 with Killer Groestl 11 years ago			`{`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`#if __CUDA_ARCH__ >= 300`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;`
bump to revision 0.7 11 years ago			`if (thread < threads)`
			`{`
			`// GROESTL`
bump to revision V1.1 with Killer Groestl 11 years ago			`uint32_t message[8];`
			`uint32_t state[8];`
bump to revision 0.7 11 years ago
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);`
bump to revision 0.7 11 years ago
			`int hashPosition = nounce - startNounce;`
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`uint32_t * inpHash = &g_hash[hashPosition<<4];`
			`const uint16_t thr = threadIdx.x % THF;`

			`#pragma unroll`
			`for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];`
bump to revision 0.7 11 years ago
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`#pragma unroll`
bump to revision V1.1 with Killer Groestl 11 years ago			`for(int k=4;k<8;k++) message[k] = 0;`

groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`if (thr == 0) message[4] = 0x80;`
			`if (thr == 3) message[7] = 0x01000000;`
bump to revision V1.1 with Killer Groestl 11 years ago
			`uint32_t msgBitsliced[8];`
			`to_bitslice_quad(message, msgBitsliced);`

			`for (int round=0; round<2; round++)`
			`{`
			`groestl512_progressMessage_quad(state, msgBitsliced);`

			`if (round < 1)`
			`{`
			`// Verkettung zweier Runden inclusive Padding.`
			`msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13));`
			`msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);`
			`msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);`
			`msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);`
			`msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);`
			`msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);`
			`msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);`
			`msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4));`
			`}`
			`}`
bump to revision 0.7 11 years ago
bump to revision V1.1 with Killer Groestl 11 years ago			`// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash`
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`uint32_t *outpHash = inpHash;`
bump to revision V1.1 with Killer Groestl 11 years ago			`uint32_t hash[16];`
			`from_bitslice_quad(state, hash);`

groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`if (thr == 0)`
bump to revision V1.1 with Killer Groestl 11 years ago			`{`
groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... 10 years ago			`#pragma unroll`
bump to revision V1.1 with Killer Groestl 11 years ago			`for(int k=0;k<16;k++) outpHash[k] = hash[k];`
			`}`
bump to revision 0.7 11 years ago			`}`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`#endif`
bump to revision 0.7 11 years ago			`}`

Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`__host__`
			`void quark_groestl512_cpu_init(int thr_id, uint32_t threads)`
bump to revision 0.7 11 years ago			`{`
Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`int dev_id = device_map[thr_id];`
			`cuda_get_arch(thr_id);`
			`if (device_sm[dev_id] < 300 \|\| cuda_arch[dev_id] < 300)`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`quark_groestl512_sm20_init(thr_id, threads);`
bump to revision 0.7 11 years ago			`}`

Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`__host__`
			`void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t d_nonceVector, uint32_t d_hash, int order)`
bump to revision 0.7 11 years ago			`{`
Add fresh algo (based on djm34 code) Cleaned up and adapted to my changes (cputest added) Remove Makefile.in which should be in gitignore (Plz refresh it with ./config.sh to compile on linux) 10 years ago			`int threadsperblock = TPB;`
bump to revision V1.1 with Killer Groestl 11 years ago
			`// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle`
			`// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl`
Add fresh algo (based on djm34 code) Cleaned up and adapted to my changes (cputest added) Remove Makefile.in which should be in gitignore (Plz refresh it with ./config.sh to compile on linux) 10 years ago			`const int factor = THF;`
bump to revision 0.7 11 years ago
			`// berechne wie viele Thread Blocks wir brauchen`
bump to revision V1.1 with Killer Groestl 11 years ago			`dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));`
bump to revision 0.7 11 years ago			`dim3 block(threadsperblock);`

Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`int dev_id = device_map[thr_id];`
bump to revision 0.7 11 years ago
Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300)`
			`quark_groestl512_gpu_hash_64_quad<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`else`
			`quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);`
bump to revision 0.7 11 years ago
			`// Strategisches Sleep Kommando zur Senkung der CPU Last`
			`MyStreamSynchronize(NULL, order, thr_id);`
			`}`

Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`__host__`
			`void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t d_nonceVector, uint32_t d_hash, int order)`
bump to revision 0.7 11 years ago			`{`
Add fresh algo (based on djm34 code) Cleaned up and adapted to my changes (cputest added) Remove Makefile.in which should be in gitignore (Plz refresh it with ./config.sh to compile on linux) 10 years ago			`const int factor = THF;`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`int threadsperblock = TPB;`
bump to revision 0.7 11 years ago
bump to revision V1.1 with Killer Groestl 11 years ago			`dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));`
bump to revision 0.7 11 years ago			`dim3 block(threadsperblock);`

Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`int dev_id = device_map[thr_id];`
bump to revision 0.7 11 years ago
Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300)`
			`quark_doublegroestl512_gpu_hash_64_quad<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);`
groestl: use sp bitslice enhancement, prepare SM 2.x variant todo: simd512 SM 2.x variant (shfl op), and groestl/myriad functions 10 years ago			`else`
			`quark_doublegroestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);`
bump to revision 0.7 11 years ago
			`MyStreamSynchronize(NULL, order, thr_id);`
			`}`