int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);
b.w = __shfl((int)b.w, x1);
b.z = __shfl((int)b.z, x2);
b.y = __shfl((int)b.y, x3);
b.w = __shfl2((int)b.w, x1);
b.z = __shfl2((int)b.z, x2);
b.y = __shfl2((int)b.y, x3);
uint32_t tmp = b.y; b.y = b.w; b.w = tmp;
bx.w = __shfl((int)bx.w, x1);
bx.z = __shfl((int)bx.z, x2);
bx.y = __shfl((int)bx.y, x3);
bx.w = __shfl2((int)bx.w, x1);
bx.z = __shfl2((int)bx.z, x2);
bx.y = __shfl2((int)bx.y, x3);
tmp = bx.y; bx.y = bx.w; bx.w = tmp;
}
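The definition of __shfl2 is not shown in this patch. A minimal sketch of the kind of compatibility shim it presumably is, assuming it simply forwards the legacy (value, source lane) arguments to __shfl_sync with a full-warp mask on CUDA 9 and newer (the real wrapper in the tree may be a macro or take an extra width argument):

// Hedged sketch only: a CUDA 9 compatibility shim for the deprecated __shfl.
__device__ __forceinline__ int __shfl2(int var, int srcLane)
{
#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9
    // __shfl_sync requires an explicit participation mask; 0xFFFFFFFF = full warp.
    return __shfl_sync(0xFFFFFFFFu, var, srcLane);
#else
    return __shfl(var, srcLane);   // pre-CUDA-9 warp shuffle
#endif
}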
@@ -318,9 +327,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
/* Unclear if this optimization is needed: These are ordered based
* upon the dependencies needed in the later xors. Compiler should be
* able to figure this out, but might as well give it a hand. */
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
x.y = __shfl2((int)x.y, x3);
x.w = __shfl2((int)x.w, x1);
x.z = __shfl2((int)x.z, x2);
/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
* but the register targets are rewritten here to swap x[1] and x[3] so that
@@ -333,9 +342,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
}
b += x;
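XOR_ROTATE_ADD is defined outside the lines touched here. Judging from the Salsa20 quarter-round it implements and the (dst, src1, src2, amount) call sites above, it presumably amounts to the following sketch (the temporary _t is hypothetical):

// Hedged sketch: dst ^= rotl32(s1 + s2, amt), the Salsa20 mixing primitive.
#define XOR_ROTATE_ADD(dst, s1, s2, amt) { \
    uint32_t _t = (s1) + (s2); \
    (dst) ^= ((_t << (amt)) | (_t >> (32 - (amt)))); }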
@@ -352,18 +361,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
x.y = __shfl2((int)x.y, x3);
x.w = __shfl2((int)x.w, x1);
x.z = __shfl2((int)x.z, x2);
XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
}
// At the end of these iterations, the data is in primary order again.
@@ -407,9 +416,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
@@ -417,9 +426,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
x.y = __shfl2((int)x.y, x3);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x1);
}
b += x;
@@ -436,9 +445,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
@@ -446,9 +455,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
x.y = __shfl2((int)x.y, x3);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x1);
}
#undef CHACHA_PRIMITIVE
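CHACHA_PRIMITIVE is likewise defined elsewhere. Matched against the 16/12/8/7 rotation constants above, it presumably performs one ChaCha quarter-round step, sketched here under that assumption:

// Hedged sketch: pt += ps; rt = rotl32(rt ^ pt, amt), consistent with the
// (pt, rt, ps, amount) call sites in the mixing phases above.
#define CHACHA_PRIMITIVE(pt, rt, ps, amt) { \
    (pt) += (ps); \
    uint32_t _r = (rt) ^ (pt); \
    (rt) = ((_r << (amt)) | (_r >> (32 - (amt)))); }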
@@ -572,7 +581,7 @@ void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
} else load_key<ALGO>(d_odata, b, bx);
for (int i = begin; i < end; i++) {
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+2)&3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+3)&3);
b.w = __shfl((int)b.w, x1);
b.z = __shfl((int)b.z, x2);
b.y = __shfl((int)b.y, x3);
b.w = __shfl2((int)b.w, x1);
b.z = __shfl2((int)b.z, x2);
b.y = __shfl2((int)b.y, x3);
uint32_t tmp = b.y; b.y = b.w; b.w = tmp;
bx.w = __shfl((int)bx.w, x1);
bx.z = __shfl((int)bx.z, x2);
bx.y = __shfl((int)bx.y, x3);
bx.w = __shfl2((int)bx.w, x1);
bx.z = __shfl2((int)bx.z, x2);
bx.y = __shfl2((int)bx.y, x3);
tmp = bx.y; bx.y = bx.w; bx.w = tmp;
}
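For reference, the x2 and x3 expressions above compute shuffle source lanes inside each 4-thread group: threadIdx.x & 0x1c is the group's base lane and ((threadIdx.x & 3) + k) & 3 rotates by k positions within the group, so the default shuffle width of 32 with absolute lane numbers is exactly what these calls rely on. A small sketch of that arithmetic (lane_rotate is a hypothetical helper; x1, which is not computed in this hunk, presumably uses k = 1):

// Hedged sketch: source lane for a rotation by k (k in {1,2,3}) within a
// 4-lane group. For lanes 0..3 and k = 2 this yields 2,3,0,1.
__device__ __forceinline__ int lane_rotate(int k)
{
    int group = threadIdx.x & 0x1c;                 // base lane of this 4-thread group
    return group + (((threadIdx.x & 3) + k) & 3);   // rotate by k inside the group
}

With that helper, x2 == lane_rotate(2) and x3 == lane_rotate(3) for the loop above.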
@@ -327,9 +338,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
/* Unclear if this optimization is needed: These are ordered based
* upon the dependencies needed in the later xors. Compiler should be
* able to figure this out, but might as well give it a hand. */
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
x.y = __shfl2((int)x.y, x3);
x.w = __shfl2((int)x.w, x1);
x.z = __shfl2((int)x.z, x2);
/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
* but the register targets are rewritten here to swap x[1] and x[3] so that
@@ -342,9 +353,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
}
b += x;
@@ -362,18 +373,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
XOR_ROTATE_ADD(x.x, x.w, x.z, 18);
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
x.y = __shfl2((int)x.y, x3);
x.w = __shfl2((int)x.w, x1);
x.z = __shfl2((int)x.z, x2);
XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);
x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
}
// At the end of these iterations, the data is in primary order again.
@@ -424,9 +435,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
@@ -434,9 +445,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
x.y = __shfl2((int)x.y, x3);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x1);
}
b += x;
@@ -454,9 +465,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
@@ -464,9 +475,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)
x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
x.y = __shfl2((int)x.y, x3);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x1);
}
#undef CHACHA_PRIMITIVE
@@ -589,7 +600,7 @@ void titan_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
} else load_key<ALGO>(d_odata, b, bx);
for (int i = begin; i < end; i++) {
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
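The remaining change in these kernelB loops is the index broadcast itself: threadIdx.x & 0x1c is the first lane of the calling thread's 4-lane group, so all four threads read the same bx.x and derive the same scratchpad index j, with c_N_1 presumably holding N-1. A sketch of that pattern using the wrapper assumed earlier:

// Hedged sketch: broadcast bx.x from the group's leader lane, then mask it
// down to a valid scratchpad slot index shared by all four lanes.
int leader = threadIdx.x & 0x1c;              // lane 0 of this 4-thread group
int j = __shfl2((int)bx.x, leader) & c_N_1;   // identical j across the group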