based on klaus commits, will increase a bit speed of most algos
PS: main increase is due to the register count tuning in Makefile
and for skein512 on linux, its the ROTL64
but almost no changes on X11 : 2648MH/s vs 2630 before
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
extern int device_map[8];
// Take a look at: https://www.schneier.com/skein1.3.pdf
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
// Zum testen Hostcode...
/* Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll.
mal direkt in den Code eingesetzt werden
*/
__device__
uint64_t skein_rotl64(const uint64_t x, const int offset)
{
uint64_t res;
asm("{\n\t"
".reg .u32 tl,th,vl,vh;\n\t"
".reg .pred p;\n\t"
"mov.b64 {tl,th}, %1;\n\t"
"shf.l.wrap.b32 vl, tl, th, %2;\n\t"
"shf.l.wrap.b32 vh, th, tl, %2;\n\t"
"setp.lt.u32 p, %2, 32;\n\t"
"@!p mov.b64 %0, {vl,vh};\n\t"
"@p mov.b64 %0, {vh,vl};\n\t"
"}"
: "=l"(res) : "l"(x) , "r"(offset)
);
return res;
}
#if __CUDA_ARCH__ >= 350
#undef ROTL64
#define ROTL64 skein_rotl64
#endif
/*
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
@@ -288,18 +305,8 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t