You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2117 lines
57 KiB
2117 lines
57 KiB
/* |
|
* Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash |
|
* Adapted to be more compatible with older C++ compilers |
|
* |
|
* cuda_djezo solver was released by NiceHash (www.nicehash.com) under |
|
* GPL 3.0 license. If you don't have a copy, you can obtain one from |
|
* https://www.gnu.org/licenses/gpl-3.0.txt |
|
* |
|
* Based on CUDA solver by John Tromp released under MIT license. |
|
* Some helper functions taken out of OpenCL solver by Marc Bevand |
|
* released under MIT license. |
|
* |
|
* Copyright (c) 2016 John Tromp, Marc Bevand |
|
* Copyright (c) 2017 djeZo, Tanguy Pruvot (GPL v3) |
|
*/ |
|
|
|
#ifdef WIN32 |
|
#include <Windows.h> |
|
#endif |
|
|
|
#include <stdio.h> |
|
#include <vector> |
|
//#include <mutex> |
|
|
|
#include "equihash.h" |
|
#include "eqcuda.hpp" // eq_cuda_context |
|
|
|
#include "blake2/blake2.h" |
|
|
|
//#define WN 200 |
|
//#define WK 9 |
|
#ifndef MAX_GPUS |
|
#define MAX_GPUS 16 |
|
#endif |
|
|
|
// ---------------------------------------------------------------------------
// Compile-time Equihash parameters.
//
// WN and WK are not defined in this file (their #defines are commented out
// above) and must be supplied before this point.
//
// NOTE: every macro that mentions RB or SM only expands correctly in a scope
// where identifiers with exactly those names are visible (the template
// parameters of equi<> and the digit kernels, or the RB/SM function
// parameters of the packer helpers).
// ---------------------------------------------------------------------------

// number of digits (WK + 1) and bits per digit
#define NDIGITS (WK+1)
#define DIGITBITS (WN/(NDIGITS))

// number of indices in one proof
#define PROOFSIZE (1<<WK)

// hash generation: 2 * 2^DIGITBITS hashes, HASHESPERBLAKE hashes per 512-bit BLAKE2b output
#define BASE (1<<DIGITBITS)
#define NHASHES (2*BASE)
#define HASHESPERBLAKE (512/WN)
#define HASHOUT (HASHESPERBLAKE*WN/8)
#define NBLOCKS ((NHASHES + HASHESPERBLAKE - 1) / HASHESPERBLAKE)

// each digit is split into BUCKBITS bucket bits and RB "rest" bits
#define BUCKBITS (DIGITBITS - RB)
#define NBUCKETS (1 << BUCKBITS)
#define BUCKMASK (NBUCKETS - 1)

// slot index width / range; NSLOTS (= SM) is the number of slots kept per bucket
#define SLOTBITS (RB + 2)
#define SLOTRANGE (1 << SLOTBITS)
#define NSLOTS SM
#define SLOTMASK (SLOTRANGE - 1)

// number of distinct rest-bit values
#define NRESTS (1 << RB)
#define RESTMASK (NRESTS - 1)

// bit budget used by packer_cantor for a packed (slot0, slot1) pair
#define CANTORBITS (2 * SLOTBITS - 2)
#define CANTORMASK ((1 << CANTORBITS) - 1)
#define CANTORMAXSQRT (2 * NSLOTS)

// fixed slot counts used for the 4096-bucket tables (round0trees / treestiny in equi)
#define RB8_NSLOTS 640
#define RB8_NSLOTS_LD 624

// FD_THREADS: presumably the block size used to launch digit_first — TODO confirm at the launch site
#define FD_THREADS 128
|
|
|
#ifdef __INTELLISENSE__
// Editor-only stubs: this section is compiled only when __INTELLISENSE__ is
// defined and exists to reduce Visual Studio editor warnings.
#include <device_functions.h>
#include <device_launch_parameters.h>
#define __launch_bounds__(max_tpb, min_blocks)
#define __CUDA_ARCH__ 520
// (a second, identical declaration of __byte_perm was removed here)
uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
uint32_t atomicExch(uint32_t *x, uint32_t y);
uint32_t atomicAdd(uint32_t *x, uint32_t y);
void __syncthreads(void);
void __threadfence(void);
void __threadfence_block(void);
uint32_t __ldg(const uint32_t* address);
uint64_t __ldg(const uint64_t* address);
uint4 __ldca(const uint4 *ptr);
u32 __ldca(const u32 *ptr);
u32 umin(const u32, const u32);
u32 umax(const u32, const u32);
#endif
|
|
|
// One proof: PROOFSIZE (= 2^WK) 32-bit entries.
typedef u32 proof[PROOFSIZE];

// 32-byte slot: eight 32-bit words, aligned so it can be moved as two uint4.
struct __align__(32) slot
{
	u32 hash[8];
};

// 16-byte slot: four 32-bit words, aligned so it can be moved as one uint4.
struct __align__(16) slotsmall
{
	u32 hash[4];
};

// 8-byte slot: two 32-bit words, aligned so it can be moved as one uint2.
struct __align__(8) slottiny
{
	u32 hash[2];
};
|
|
|
/*
 * Device-side working set of the solver.
 *
 * RB = rest bits per digit, SM = slots per bucket; both feed the
 * NBUCKETS / NSLOTS macros used for the array extents below.
 * Member order and types define the memory layout and must not change.
 */
template <u32 RB, u32 SM>
struct equi
{
	// written by digit_first, read by digit_1 (4096 buckets, RB8_NSLOTS slots each)
	slot round0trees[4096][RB8_NSLOTS];

	// trees[0]: written by digit_1, read by digit_2
	slot trees[1][NBUCKETS][NSLOTS];

	// written by digit_2, read by digit_3
	struct {
		slotsmall treessmall[NSLOTS];
		slottiny treestiny[NSLOTS];
	} round2trees[NBUCKETS];

	// written by digit_3, read by digit_4
	struct {
		slotsmall treessmall[NSLOTS];
		slottiny treestiny[NSLOTS];
	} round3trees[NBUCKETS];

	// [3]: written by digit_4, read by digit_5; [2]: written by digit_5, read by digit_6.
	// Users of the remaining planes are outside the visible part of this file.
	slotsmall treessmall[4][NBUCKETS][NSLOTS];

	// not referenced in the visible part of this file
	slottiny treestiny[1][4096][RB8_NSLOTS_LD];

	// packed (bucket, slot0, slot1) of each digit_4 output slot
	u32 round4bidandsids[NBUCKETS][NSLOTS];

	// BLAKE2b state read by digit_first, viewable as 8 x u64 or 16 x u32
	union {
		u64 blake_h[8];
		u32 blake_h32[16];
	};

	// per-bucket fill counters (incremented with atomicAdd by the kernels)
	struct {
		u32 nslots8[4096];        // not referenced in the visible part of this file
		u32 nslots0[4096];        // fill counts of round0trees
		u32 nslots[9][NBUCKETS];  // nslots[r]: fill counts of the round-r output buckets
		scontainerreal srealcont; // declared elsewhere; purpose not visible here
	} edata;
};
|
|
|
// todo: use cuda_helper.h and/or cuda_vector.h

// Component-wise XOR of two uint2 values.
__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b)
{
	uint2 r;
	r.x = a.x ^ b.x;
	r.y = a.y ^ b.y;
	return r;
}
|
|
|
// Component-wise XOR of two uint4 values.
__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b)
{
	uint4 r;
	r.x = a.x ^ b.x;
	r.y = a.y ^ b.y;
	r.z = a.z ^ b.z;
	r.w = a.w ^ b.w;
	return r;
}
|
|
|
// Rotate right a 64-bit value held as uint2 (x = low word, y = high word).
// Used for ROR 63 (i.e. ROL 1); only 32 <= offset < 64 is supported.
//
// The generic fallback used to shift a 32-bit word by (64 - offset), which is
// a shift by 32 - undefined behaviour - for offset == 32. That boundary is now
// handled explicitly, so both code paths agree over the whole documented range.
__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset)
{
	uint2 result;
#if __CUDA_ARCH__ > 300
	{
		// funnel shift; .wrap uses only the low 5 bits of offset, hence the swapped operands
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
	}
#else
	const int sh = offset - 32; // remaining shift after the implicit 32-bit word swap
	if (sh == 0)
	{
		result.x = a.y;
		result.y = a.x;
	}
	else
	{
		result.y = ((a.x >> sh) | (a.y << (32 - sh)));
		result.x = ((a.y >> sh) | (a.x << (32 - sh)));
	}
#endif
	return result;
}
|
|
|
|
|
// Exchange the two 32-bit halves of a uint2 (a 32-bit rotation of the 64-bit value).
__device__ __forceinline__ uint2 SWAPUINT2(uint2 value)
{
	uint2 swapped;
	swapped.x = value.y;
	swapped.y = value.x;
	return swapped;
}
|
|
|
// Rotate right by 24 bits a 64-bit value held as uint2 (x = low word, y = high word),
// implemented as two byte permutations.
__device__ __forceinline__ uint2 ROR24(const uint2 a)
{
	const u32 lo = __byte_perm(a.y, a.x, 0x2107);
	const u32 hi = __byte_perm(a.y, a.x, 0x6543);
	return make_uint2(lo, hi);
}
|
|
|
// Rotate right by 16 bits a 64-bit value held as uint2 (x = low word, y = high word),
// implemented as two byte permutations.
__device__ __forceinline__ uint2 ROR16(const uint2 a)
{
	const u32 lo = __byte_perm(a.y, a.x, 0x1076);
	const u32 hi = __byte_perm(a.y, a.x, 0x5432);
	return make_uint2(lo, hi);
}
|
|
|
// Mixing step used by digit_first: updates a, b, c, d in place with the two
// message words x and y, using 64-bit rotations by 32, 24, 16 and 63 bits.
// The rotations are done on uint2 views of the u64 operands.
__device__ __forceinline__ void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y)
{
	uint2* const a2 = (uint2*)&a;
	uint2* const b2 = (uint2*)&b;
	uint2* const c2 = (uint2*)&c;
	uint2* const d2 = (uint2*)&d;

	// first half: message word x, rotations 32 and 24
	a += b + x;
	*d2 = SWAPUINT2(*d2 ^ *a2);
	c += d;
	*b2 = ROR24(*b2 ^ *c2);

	// second half: message word y, rotations 16 and 63
	a += b + y;
	*d2 = ROR16(*d2 ^ *a2);
	c += d;
	*b2 = ROR2(*b2 ^ *c2, 63U);
}
|
|
|
// untested..
//
// Packs (bucketid, s0, s1) into one u32 as three bit fields of SLOTBITS each
// for the two slots, bucketid on top.
// The RB / SM parameters look unused but are required: SLOTBITS, SLOTMASK and
// BUCKMASK are macros that expand to expressions naming RB.
struct packer_default
{
	// layout: [ bucketid | s0 | s1 ], s1 in the lowest SLOTBITS bits
	__device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM)
	{
		u32 packed = bucketid;
		packed = (packed << SLOTBITS) | s0;
		packed = (packed << SLOTBITS) | s1;
		return packed;
	}

	__device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM)
	{
		// BUCKMASK-ed to prevent illegal memory accesses in case of memory errors
		const u32 shifted = bid >> (2 * SLOTBITS);
		return shifted & BUCKMASK;
	}

	// returns the lowest SLOTBITS field; s1 is not used by this packer
	__device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM)
	{
		const u32 field = bid & SLOTMASK;
		return field;
	}

	// returns the second SLOTBITS field
	__device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM)
	{
		const u32 shifted = bid >> SLOTBITS;
		return shifted & SLOTMASK;
	}
};
|
|
|
|
|
// Packs (bucketid, s0, s1) into one u32, encoding the unordered slot pair with
// a triangular ("Cantor"-style) pairing in the low CANTORBITS bits.
// The RB / SM parameters are required by the CANTOR*, SLOTMASK and BUCKMASK
// macros, which expand to expressions naming RB and SM.
struct packer_cantor
{
	// order-independent pairing: max*(max+1)/2 + min
	__device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1)
	{
		const u32 hi = umax(s0, s1);
		const u32 lo = umin(s0, s1);
		const u32 triangle = hi * (hi + 1) / 2;
		return triangle + lo;
	}

	__device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM)
	{
		const u32 pairing = cantor(s0, s1);
		return (bucketid << CANTORBITS) | pairing;
	}

	__device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM)
	{
		const u32 shifted = bid >> CANTORBITS;
		return shifted & BUCKMASK;
	}

	// recovers the smaller slot, given the larger one (s1) as returned by get_slot1
	__device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM)
	{
		const u32 pairing = bid & CANTORMASK;
		return (pairing - cantor(0, s1)) & SLOTMASK;
	}

	// recovers the larger slot by inverting the triangular number
	__device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM)
	{
		const u32 sqr = 8 * (bid & CANTORMASK) + 1;
		// this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9
		u32 k = CANTORMAXSQRT;
		u32 q = sqr / k;
		while (q < k)
		{
			k = (k + q) / 2;
			q = sqr / k;
		}
		return ((k - 1) / 2) & SLOTMASK;
	}
};
|
|
|
// The eight 64-bit BLAKE2b initialisation vector words, kept in constant memory.
__device__ __constant__ const u64 blake_iv[8] = {
	0x6a09e667f3bcc908ULL,
	0xbb67ae8584caa73bULL,
	0x3c6ef372fe94f82bULL,
	0xa54ff53a5f1d36f1ULL,
	0x510e527fade682d1ULL,
	0x9b05688c2b3e6c1fULL,
	0x1f83d9abfb41bd6bULL,
	0x5be0cd19137e2179ULL,
};
|
|
|
// Plain-load replacement for the __ldca() cached-load intrinsic.
// NOTE(review): `defined(__ldca)` only tests for a *macro* of that name; if the
// toolkit provides __ldca as a function, the second condition is true and this
// fallback is selected regardless of CUDART_VERSION - confirm that is intended.
// The expansion is parenthesized so the macro is safe inside larger expressions.
#if CUDART_VERSION < 8000 || !defined(__ldca)
#define __ldca(ptr) (*(ptr))
#endif
|
|
|
/*
 * Round 0: hash generation.
 *
 * Each thread runs one 12-round BLAKE2b-style compression starting from the
 * state stored in eq->blake_h, with a single non-zero message word
 *     m = (global thread index << 32) | nonce
 * and splits the result into two entries, which are appended to
 * eq->round0trees in the bucket selected by a 12-bit field of each entry.
 * Entries that do not fit (bucket already holds RB8_NSLOTS) are dropped.
 *
 * Launch: 1D grid; the global thread index is used as the hash-block index and
 * there is no bounds check, so the grid must cover exactly the wanted range
 * (presumably NBLOCKS threads in blocks of FD_THREADS - TODO confirm).
 * blockDim.x must be at least 16 (16 threads stage the state in shared memory).
 * Shared memory: 64 bytes (static).
 */
template <u32 RB, u32 SM, typename PACKER>
__global__ void digit_first(equi<RB, SM>* eq, u32 nonce)
{
	__shared__ u64 hash_h[8];
	u32* const hash_h32 = (u32*)hash_h;
	const u32 block = blockIdx.x * blockDim.x + threadIdx.x;

	// stage the 16 words of the precomputed state into shared memory
	if (threadIdx.x < 16)
		hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]);

	__syncthreads();

	// the only non-zero message word of this compression
	const u64 m = ((u64)block << 32) | (u64)nonce;

	// working vector, viewable as 64-bit, 32-bit or 128-bit lanes
	union
	{
		u64 v[16];
		u32 v32[32];
		uint4 v128[8];
	};

	#pragma unroll
	for (int w = 0; w < 8; ++w)
	{
		v[w] = hash_h[w];
		v[w + 8] = blake_iv[w];
	}
	v[12] ^= (128 + 16);          // presumably the byte counter (128 + 16) - TODO confirm
	v[14] ^= 0xffffffffffffffff;  // presumably the final-block flag - TODO confirm

	// 12 rounds; every message word except m is zero, so only the position
	// of m changes from round to round (statement order is significant).

	// mix 1
	G2(v[0], v[4], v[8], v[12], 0, m);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 2
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], m, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 3
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, m);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 4
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, m);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 5
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, m); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 6
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], m, 0);

	// mix 7
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], m, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 8
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, m); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 9
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], m, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 10
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], m, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 11
	G2(v[0], v[4], v[8], v[12], 0, m);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], 0, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// mix 12
	G2(v[0], v[4], v[8], v[12], 0, 0);  G2(v[1], v[5], v[9], v[13], 0, 0);
	G2(v[2], v[6], v[10], v[14], 0, 0); G2(v[3], v[7], v[11], v[15], 0, 0);
	G2(v[0], v[5], v[10], v[15], m, 0); G2(v[1], v[6], v[11], v[12], 0, 0);
	G2(v[2], v[7], v[8], v[13], 0, 0);  G2(v[3], v[4], v[9], v[14], 0, 0);

	// feed-forward; only 6 full 64-bit words plus one 32-bit word are finalised
	#pragma unroll
	for (int w = 0; w < 6; ++w)
		v[w] ^= hash_h[w] ^ v[w + 8];
	v32[12] ^= hash_h32[12] ^ v32[28];

	u32 bucketid;
	u32 slotp;

	// ---- first entry: built from v32[0..6], tagged with index 2*block ----
	u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits
	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
	slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
	if (slotp < RB8_NSLOTS)
	{
		slot* s = &eq->round0trees[bucketid][slotp];
		*(uint4*)(&s->hash[0]) = make_uint4(
			__byte_perm(v32[0], v32[1], 0x1234),
			__byte_perm(v32[1], v32[2], 0x1234),
			__byte_perm(v32[2], v32[3], 0x1234),
			__byte_perm(v32[3], v32[4], 0x1234));
		*(uint4*)(&s->hash[4]) = make_uint4(
			__byte_perm(v32[4], v32[5], 0x1234),
			__byte_perm(v32[5], v32[6], 0x1234),
			0,
			block << 1);
	}

	// ---- second entry: built from v32[6..12], tagged with index 2*block + 1 ----
	bexor = __byte_perm(v32[6], 0, 0x0123);
	asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor));
	slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1);
	if (slotp < RB8_NSLOTS)
	{
		slot* s = &eq->round0trees[bucketid][slotp];
		*(uint4*)(&s->hash[0]) = make_uint4(
			__byte_perm(v32[6], v32[7], 0x2345),
			__byte_perm(v32[7], v32[8], 0x2345),
			__byte_perm(v32[8], v32[9], 0x2345),
			__byte_perm(v32[9], v32[10], 0x2345));
		*(uint4*)(&s->hash[4]) = make_uint4(
			__byte_perm(v32[10], v32[11], 0x2345),
			__byte_perm(v32[11], v32[12], 0x2345),
			0,
			(block << 1) + 1);
	}
}
|
|
|
/*
  Functions digit_1 to digit_8 all work on the same principle:
  Each thread does 2-3 slot loads (the loads are coalesced).
  The xorwork of the slots is loaded into shared memory and is also kept in registers (except in digit_1).
  At the same time, the restbits (8 or 9 bits) of the xorwork are used to find collisions.
  The restbits determine the position in ht.
  Pair creation follows. The xorwork of the first one (or two) pairs is written to global memory
  as soon as possible; the remaining pairs are saved in shared memory (one u32 per pair - two 16-bit indices).
  In most cases every thread has one (or two) pairs, so this trick offloads some of the memory writes from the last step.
  In the last step the xorwork of the remaining pairs is saved to memory.
*/
|
/*
 * digit_1 helper: emit the XOR of two colliding round-0 entries.
 *
 * head         XOR of the two lastword1 values (first two hash words)
 * tail0/tail1  the two lastword2 values; XORed only if an output slot is free
 * s0, s1       slot indices of the two parents inside input bucket `bucketid`
 *
 * The output bucket is the BUCKBITS-bit field of head.x starting at bit RB.
 * Output slot layout (eq->trees[0]):
 *   hash[0..3] = head.y, tail.x, tail.y, tail.z
 *   hash[4..7] = tail.w, head.x, packed(bucketid, s0, s1), 0
 * The parent reference is always packed with packer_default(8, RB8_NSLOTS).
 */
template <u32 RB, u32 SM>
__device__ __forceinline__ void digit_1_store_pair(equi<RB, SM>* eq, const u32 bucketid,
	const u32 s0, const u32 s1, const uint2 head, const uint4 &tail0, const uint4 &tail1)
{
	u32 xorbucketid;
	asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(head.x), "r"(RB), "r"(BUCKBITS));
	const u32 xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1);
	if (xorslot >= NSLOTS)
		return; // output bucket full: the pair is dropped

	const uint4 tail = tail0 ^ tail1;
	slot &xs = eq->trees[0][xorbucketid][xorslot];
	*(uint4*)(&xs.hash[0]) = make_uint4(head.y, tail.x, tail.y, tail.z);
	*(uint4*)(&xs.hash[4]) = make_uint4(tail.w, head.x,
		packer_default::set_bucketid_and_slots(bucketid, s0, s1, 8, RB8_NSLOTS), 0);
}

/*
 * Round 1: collide the entries of one round-0 bucket on 8 bits (bits 20..27 of
 * the first hash word) and write the XORed pairs to eq->trees[0].
 *
 * Launch: one block per round-0 bucket (blockIdx.x = bucket id, 4096 buckets);
 * THREADS is presumably equal to blockDim.x - TODO confirm at the launch site.
 * Each thread loads up to two slots (threadid and THREADS + threadid).
 * Requires MAXPAIRS >= 256 (ht_len doubles as the pair list) and THREADS >= 256.
 * Shared memory (static): ht, lastword1, lastword2, ht_len/pairs, two counters.
 *
 * Changes versus the previous version:
 *  - the barrier after the shared-counter reset is enabled, removing the race
 *    between zeroing ht_len and the first atomicAdd on it (previously
 *    commented out for speed, with a note that it could rarely crash);
 *  - pairs_len / next_pair are reset by independent tests, so they are
 *    initialised even when THREADS - 1 or THREADS - 33 is below 256;
 *  - pair emission is shared between both paths via digit_1_store_pair.
 */
template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
__global__ void digit_1(equi<RB, SM>* eq)
{
	__shared__ u16 ht[256][SSM - 1];
	__shared__ uint2 lastword1[RB8_NSLOTS];
	__shared__ uint4 lastword2[RB8_NSLOTS];
	__shared__ int ht_len[MAXPAIRS];
	__shared__ u32 pairs_len;
	__shared__ u32 next_pair;

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// reset hashtable len and the pair counters
	if (threadid < 256)
		ht_len[threadid] = 0;
	if (threadid == (THREADS - 1))
		pairs_len = 0;
	if (threadid == (THREADS - 33))
		next_pair = 0;

	u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS);

	u32 hr[2];
	int pos[2];
	pos[0] = pos[1] = SSM; // SSM marks "no slot loaded / not in table"

	uint2 ta[2];
	u32 si[2];

	// make the reset above visible before any thread touches ht_len
	__syncthreads();

	#pragma unroll
	for (u32 i = 0; i != 2; ++i)
	{
		si[i] = i * THREADS + threadid;
		if (si[i] >= bsize) break;

		const slot* pslot1 = eq->round0trees[bucketid] + si[i];

		// get xhash
		uint4 a1 = *(uint4*)(&pslot1->hash[0]);
		uint2 a2 = *(uint2*)(&pslot1->hash[4]);
		ta[i] = make_uint2(a1.x, a1.y);
		lastword1[si[i]] = ta[i];
		lastword2[si[i]] = make_uint4(a1.z, a1.w, a2.x, a2.y);

		// 8 collision bits select the hash-table row
		asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x));
		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
	}

	__syncthreads();

	// ht_len is no longer read from here on; its storage is reused as the pair list
	int* pairs = ht_len;

	#pragma unroll
	for (u32 i = 0; i != 2; ++i)
	{
		// skip: nothing loaded / row overflow (>= SSM), or first entry of its row (<= 0)
		if (pos[i] <= 0 || pos[i] >= SSM) continue;

		// the pair with the first row entry is written out immediately
		const u16 p = ht[hr[i]][0];
		digit_1_store_pair<RB, SM>(eq, bucketid, si[i], p,
			ta[i] ^ lastword1[p], lastword2[si[i]], lastword2[p]);

		// pairs with the other earlier row entries are queued (two 16-bit indices per int)
		for (int k = 1; k != pos[i]; ++k)
		{
			u32 pindex = atomicAdd(&pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			u16 prev = ht[hr[i]][k];
			pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// process pairs
	u32 plen = umin(pairs_len, MAXPAIRS);

	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
	{
		int pair = pairs[s];
		const u32 i = __byte_perm(pair, 0, 0x4510); // low 16 bits
		const u32 k = __byte_perm(pair, 0, 0x4532); // high 16 bits

		digit_1_store_pair<RB, SM>(eq, bucketid, i, k,
			lastword1[i] ^ lastword1[k], lastword2[i], lastword2[k]);
	}
}
|
|
|
|
|
/*
 * digit_2 helper: emit the XOR of two colliding round-1 entries.
 *
 * head         XOR of the two lastword1 values; its bits above (12 + RB)
 *              select the output bucket
 * tail0/tail1  the two 4-word remainders; XORed only if an output slot is free
 * s0, s1       slot indices of the two parents inside input bucket `bucketid`
 *
 * Output layout (eq->round2trees[bucket]):
 *   treessmall.hash[0..3] = head, tail.x, tail.y, tail.z
 *   treestiny.hash[0..1]  = tail.w, PACKER(bucketid, s0, s1)
 */
template <u32 RB, u32 SM, typename PACKER>
__device__ __forceinline__ void digit_2_store_pair(equi<RB, SM>* eq, const u32 bucketid,
	const u32 s0, const u32 s1, const u32 head, const uint4 &tail0, const uint4 &tail1)
{
	const u32 xorbucketid = head >> (12 + RB);
	const u32 xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1);
	if (xorslot >= NSLOTS)
		return; // output bucket full: the pair is dropped

	const uint4 tail = tail0 ^ tail1;
	slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot];
	*(uint4*)(&xs.hash[0]) = make_uint4(head, tail.x, tail.y, tail.z);
	slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot];
	*(uint2*)(&xst.hash[0]) = make_uint2(tail.w,
		PACKER::set_bucketid_and_slots(bucketid, s0, s1, RB, SM));
}

/*
 * Round 2: collide the entries of one eq->trees[0] bucket on the low RB bits
 * of hash word 5 and write the XORed pairs to eq->round2trees.
 *
 * Launch: one block per input bucket (blockIdx.x = bucket id); THREADS is
 * presumably equal to blockDim.x - TODO confirm at the launch site.
 * Each thread loads up to two slots (threadid and THREADS + threadid).
 * Requires THREADS >= NRESTS so that every ht_len entry is reset.
 * Shared memory (static): ht, lastword1, lastword2, ht_len, pairs, two counters.
 *
 * Changes versus the previous version:
 *  - the barrier after the shared-counter reset is enabled, removing the race
 *    between zeroing ht_len and the first atomicAdd on it (previously
 *    commented out for speed, with a note that it could rarely crash);
 *  - pairs_len / next_pair are reset by independent tests, so they are
 *    initialised even when THREADS - 1 or THREADS - 33 is below NRESTS;
 *  - pair emission is shared between both paths via digit_2_store_pair.
 */
template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
__global__ void digit_2(equi<RB, SM>* eq)
{
	__shared__ u16 ht[NRESTS][SSM - 1];
	__shared__ u32 lastword1[NSLOTS];
	__shared__ uint4 lastword2[NSLOTS];
	__shared__ int ht_len[NRESTS];
	__shared__ int pairs[MAXPAIRS];
	__shared__ u32 pairs_len;
	__shared__ u32 next_pair;

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// reset hashtable len and the pair counters
	if (threadid < NRESTS)
		ht_len[threadid] = 0;
	if (threadid == (THREADS - 1))
		pairs_len = 0;
	if (threadid == (THREADS - 33))
		next_pair = 0;

	slot* buck = eq->trees[0][bucketid];
	u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS);

	u32 hr[2];
	int pos[2];
	pos[0] = pos[1] = SSM; // SSM marks "no slot loaded / not in table"

	u32 ta[2];
	uint4 tt[2];
	u32 si[2];

	// make the reset above visible before any thread touches ht_len
	__syncthreads();

	#pragma unroll 2
	for (u32 i = 0; i < 2; i++)
	{
		si[i] = i * THREADS + threadid;
		if (si[i] >= bsize) break;

		// get slot
		const slot* pslot1 = buck + si[i];

		uint4 ttx = *(uint4*)(&pslot1->hash[0]);
		uint2 tty = *(uint2*)(&pslot1->hash[4]);
		lastword1[si[i]] = ta[i] = ttx.x;
		tt[i] = make_uint4(ttx.y, ttx.z, ttx.w, tty.x);
		lastword2[si[i]] = tt[i];

		// rest bits select the hash-table row
		hr[i] = tty.y & RESTMASK;
		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
	}

	__syncthreads();

	#pragma unroll 2
	for (u32 i = 0; i < 2; i++)
	{
		// skip: nothing loaded / row overflow (>= SSM), or first entry of its row (<= 0)
		if (pos[i] <= 0 || pos[i] >= SSM) continue;

		// the pair with the first row entry is written out immediately
		const u16 p = ht[hr[i]][0];
		digit_2_store_pair<RB, SM, PACKER>(eq, bucketid, si[i], p,
			ta[i] ^ lastword1[p], tt[i], lastword2[p]);

		// pairs with the other earlier row entries are queued (two 16-bit indices per int)
		for (int k = 1; k != pos[i]; ++k)
		{
			u32 pindex = atomicAdd(&pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			u16 prev = ht[hr[i]][k];
			pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// process pairs
	u32 plen = umin(pairs_len, MAXPAIRS);

	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
	{
		int pair = pairs[s];
		const u32 i = __byte_perm(pair, 0, 0x4510); // low 16 bits
		const u32 k = __byte_perm(pair, 0, 0x4532); // high 16 bits

		digit_2_store_pair<RB, SM, PACKER>(eq, bucketid, i, k,
			lastword1[i] ^ lastword1[k], lastword2[i], lastword2[k]);
	}
}
|
|
|
|
|
/*
 * digit_3 helper: emit the XOR of two colliding round-2 entries.
 *
 * last    XOR of the two treestiny words; a zero value means the pair is
 *         discarded (the original code skips pairs whose last word cancels)
 * w0, w1  the two treessmall 4-word values; XORed only if `last` is non-zero
 * s0, s1  slot indices of the two parents inside input bucket `bucketid`
 *
 * bexor = __byte_perm(x.x, x.y, 0x2107) with x = w0 ^ w1; the output bucket is
 * the BUCKBITS-bit field of bexor starting at bit RB.
 * Output layout (eq->round3trees[bucket]):
 *   treessmall.hash[0..3] = x.y, x.z, x.w, last
 *   treestiny.hash[0..1]  = bexor, PACKER(bucketid, s0, s1)
 */
template <u32 RB, u32 SM, typename PACKER>
__device__ __forceinline__ void digit_3_store_pair(equi<RB, SM>* eq, const u32 bucketid,
	const u32 s0, const u32 s1, const u32 last, const uint4 &w0, const uint4 &w1)
{
	if (last == 0)
		return;

	const uint4 x = w0 ^ w1;
	const u32 bexor = __byte_perm(x.x, x.y, 0x2107);
	u32 xorbucketid;
	asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS));
	const u32 xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1);
	if (xorslot >= NSLOTS)
		return; // output bucket full: the pair is dropped

	slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot];
	*(uint4*)(&xs.hash[0]) = make_uint4(x.y, x.z, x.w, last);
	slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot];
	*(uint2*)(&xst.hash[0]) = make_uint2(bexor,
		PACKER::set_bucketid_and_slots(bucketid, s0, s1, RB, SM));
}

/*
 * Round 3: collide the entries of one eq->round2trees bucket on RB bits
 * (starting at bit 12 of the first treessmall word) and write the XORed pairs
 * to eq->round3trees.
 *
 * Launch: one block per input bucket (blockIdx.x = bucket id); THREADS is
 * presumably equal to blockDim.x - TODO confirm at the launch site.
 * Each thread loads up to two slots (threadid and THREADS + threadid).
 * Requires THREADS >= NRESTS so that every ht_len entry is reset.
 * Shared memory (static): ht, lastword1, lastword2, ht_len, pairs, two counters.
 *
 * Changes versus the previous version:
 *  - the barrier after the shared-counter reset is enabled, removing the race
 *    between zeroing ht_len and the first atomicAdd on it (previously
 *    commented out for speed, with a note that it could rarely crash);
 *  - pairs_len / next_pair are reset by independent tests, so they are
 *    initialised even when THREADS - 1 or THREADS - 33 is below NRESTS;
 *  - pair emission is shared between both paths via digit_3_store_pair.
 */
template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
__global__ void digit_3(equi<RB, SM>* eq)
{
	__shared__ u16 ht[NRESTS][(SSM - 1)];
	__shared__ uint4 lastword1[NSLOTS];
	__shared__ u32 lastword2[NSLOTS];
	__shared__ int ht_len[NRESTS];
	__shared__ int pairs[MAXPAIRS];
	__shared__ u32 pairs_len;
	__shared__ u32 next_pair;

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// reset hashtable len and the pair counters
	if (threadid < NRESTS)
		ht_len[threadid] = 0;
	if (threadid == (THREADS - 1))
		pairs_len = 0;
	if (threadid == (THREADS - 33))
		next_pair = 0;

	u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS);

	u32 hr[2];
	int pos[2];
	pos[0] = pos[1] = SSM; // SSM marks "no slot loaded / not in table"

	u32 si[2];
	uint4 tt[2];
	u32 ta[2];

	// make the reset above visible before any thread touches ht_len
	__syncthreads();

	#pragma unroll 2
	for (u32 i = 0; i < 2; i++)
	{
		si[i] = i * THREADS + threadid;
		if (si[i] >= bsize) break;

		slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]];
		slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]];

		tt[i] = *(uint4*)(&xs.hash[0]);
		lastword1[si[i]] = tt[i];
		ta[i] = xst.hash[0];
		lastword2[si[i]] = ta[i];

		// RB collision bits select the hash-table row
		asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB));
		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
	}

	__syncthreads();

	#pragma unroll 2
	for (u32 i = 0; i < 2; i++)
	{
		// skip: nothing loaded / row overflow (>= SSM), or first entry of its row (<= 0)
		if (pos[i] <= 0 || pos[i] >= SSM) continue;

		// the pair with the first row entry is written out immediately
		const u16 p = ht[hr[i]][0];
		digit_3_store_pair<RB, SM, PACKER>(eq, bucketid, si[i], p,
			ta[i] ^ lastword2[p], tt[i], lastword1[p]);

		// pairs with the other earlier row entries are queued (two 16-bit indices per int)
		for (int k = 1; k != pos[i]; ++k)
		{
			u32 pindex = atomicAdd(&pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			u16 prev = ht[hr[i]][k];
			pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// process pairs
	u32 plen = umin(pairs_len, MAXPAIRS);

	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
	{
		int pair = pairs[s];
		const u32 i = __byte_perm(pair, 0, 0x4510); // low 16 bits
		const u32 k = __byte_perm(pair, 0, 0x4532); // high 16 bits

		digit_3_store_pair<RB, SM, PACKER>(eq, bucketid, i, k,
			lastword2[i] ^ lastword2[k], lastword1[i], lastword1[k]);
	}
}
|
|
|
|
|
/*
 * digit_4 helper: emit the XOR of two colliding round-3 entries.
 *
 * w0, w1  the two treessmall 4-word values
 * s0, s1  slot indices of the two parents inside input bucket `bucketid`
 *
 * With x = w0 ^ w1: the pair is discarded when x.w == 0; otherwise the output
 * bucket is the BUCKBITS-bit field of x.x starting at bit (4 + RB).
 * Output: eq->treessmall[3][bucket][slot].hash = x and
 *         eq->round4bidandsids[bucket][slot]   = PACKER(bucketid, s0, s1).
 */
template <u32 RB, u32 SM, typename PACKER>
__device__ __forceinline__ void digit_4_store_pair(equi<RB, SM>* eq, const u32 bucketid,
	const u32 s0, const u32 s1, const uint4 &w0, const uint4 &w1)
{
	const uint4 x = w0 ^ w1;
	if (x.w == 0)
		return;

	u32 xorbucketid;
	asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(x.x), "r"(4 + RB), "r"(BUCKBITS));
	const u32 xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1);
	if (xorslot >= NSLOTS)
		return; // output bucket full: the pair is dropped

	slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot];
	*(uint4*)(&xs.hash[0]) = x;
	eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, s0, s1, RB, SM);
}

/*
 * Round 4: collide the entries of one eq->round3trees bucket on the low RB
 * bits of the treestiny word and write the XORed pairs to eq->treessmall[3]
 * (parent references go to eq->round4bidandsids).
 *
 * Launch: one block per input bucket (blockIdx.x = bucket id); THREADS is
 * presumably equal to blockDim.x - TODO confirm at the launch site.
 * Each thread loads up to two slots (threadid and THREADS + threadid).
 * Requires THREADS >= NRESTS so that every ht_len entry is reset.
 * Shared memory (static): ht, lastword, ht_len, pairs, two counters.
 *
 * Changes versus the previous version:
 *  - the barrier after the shared-counter reset is enabled, removing the race
 *    between zeroing ht_len and the first atomicAdd on it (previously
 *    commented out for speed, with a note that it could rarely crash);
 *  - pairs_len / next_pair are reset by independent tests, so they are
 *    initialised even when THREADS - 1 or THREADS - 33 is below NRESTS;
 *  - pair emission is shared between both paths via digit_4_store_pair.
 */
template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
__global__ void digit_4(equi<RB, SM>* eq)
{
	__shared__ u16 ht[NRESTS][(SSM - 1)];
	__shared__ uint4 lastword[NSLOTS];
	__shared__ int ht_len[NRESTS];
	__shared__ int pairs[MAXPAIRS];
	__shared__ u32 pairs_len;
	__shared__ u32 next_pair;

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// reset hashtable len and the pair counters
	if (threadid < NRESTS)
		ht_len[threadid] = 0;
	if (threadid == (THREADS - 1))
		pairs_len = 0;
	if (threadid == (THREADS - 33))
		next_pair = 0;

	u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS);

	u32 hr[2];
	int pos[2];
	pos[0] = pos[1] = SSM; // SSM marks "no slot loaded / not in table"

	u32 si[2];
	uint4 tt[2];

	// make the reset above visible before any thread touches ht_len
	__syncthreads();

	#pragma unroll 2
	for (u32 i = 0; i < 2; i++)
	{
		si[i] = i * THREADS + threadid;
		if (si[i] >= bsize) break;

		slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]];
		slottiny &xst = eq->round3trees[bucketid].treestiny[si[i]];

		// get xhash
		tt[i] = *(uint4*)(&xs.hash[0]);
		lastword[si[i]] = tt[i];

		// rest bits select the hash-table row
		hr[i] = xst.hash[0] & RESTMASK;
		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
	}

	__syncthreads();

	#pragma unroll 2
	for (u32 i = 0; i < 2; i++)
	{
		// skip: nothing loaded / row overflow (>= SSM), or first entry of its row (<= 0)
		if (pos[i] <= 0 || pos[i] >= SSM) continue;

		// the pair with the first row entry is written out immediately
		const u16 p = ht[hr[i]][0];
		digit_4_store_pair<RB, SM, PACKER>(eq, bucketid, si[i], p, tt[i], lastword[p]);

		// pairs with the other earlier row entries are queued (two 16-bit indices per int)
		for (int k = 1; k != pos[i]; ++k)
		{
			u32 pindex = atomicAdd(&pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			u16 prev = ht[hr[i]][k];
			pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// process pairs
	u32 plen = umin(pairs_len, MAXPAIRS);

	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
	{
		int pair = pairs[s];
		const u32 i = __byte_perm(pair, 0, 0x4510); // low 16 bits
		const u32 k = __byte_perm(pair, 0, 0x4532); // high 16 bits

		digit_4_store_pair<RB, SM, PACKER>(eq, bucketid, i, k, lastword[i], lastword[k]);
	}
}
|
|
|
|
|
/*
 * digit_5 helper: emit the XOR of two colliding round-4 entries.
 *
 * w0, w1  the two treessmall[3] 4-word values
 * s0, s1  slot indices of the two parents inside input bucket `bucketid`
 *
 * With x = w0 ^ w1: the pair is discarded when x.w == 0; otherwise
 * bexor = __byte_perm(x.x, x.y, 0x1076) and the output bucket is the
 * BUCKBITS-bit field of bexor starting at bit RB.
 * Output: eq->treessmall[2][bucket][slot].hash =
 *         x.y, x.z, x.w, PACKER(bucketid, s0, s1).
 */
template <u32 RB, u32 SM, typename PACKER>
__device__ __forceinline__ void digit_5_store_pair(equi<RB, SM>* eq, const u32 bucketid,
	const u32 s0, const u32 s1, const uint4 &w0, const uint4 &w1)
{
	const uint4 x = w0 ^ w1;
	if (x.w == 0)
		return;

	const u32 bexor = __byte_perm(x.x, x.y, 0x1076);
	u32 xorbucketid;
	asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS));
	const u32 xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1);
	if (xorslot >= NSLOTS)
		return; // output bucket full: the pair is dropped

	slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot];
	*(uint4*)(&xs.hash[0]) = make_uint4(x.y, x.z, x.w,
		PACKER::set_bucketid_and_slots(bucketid, s0, s1, RB, SM));
}

/*
 * Round 5: collide the entries of one eq->treessmall[3] bucket on RB bits
 * (starting at bit 4 of the first word) and write the XORed pairs to
 * eq->treessmall[2].
 *
 * Launch: one block per input bucket (blockIdx.x = bucket id); THREADS is
 * presumably equal to blockDim.x - TODO confirm at the launch site.
 * Each thread loads up to two slots (threadid and THREADS + threadid).
 * Requires THREADS >= NRESTS so that every ht_len entry is reset.
 * Shared memory (static): ht, lastword, ht_len, pairs, two counters.
 *
 * Changes versus the previous version:
 *  - the barrier after the shared-counter reset is enabled, removing the race
 *    between zeroing ht_len and the first atomicAdd on it (previously
 *    commented out for speed, with a note that it could rarely crash);
 *  - pairs_len / next_pair are reset by independent tests, so they are
 *    initialised even when THREADS - 1 or THREADS - 33 is below NRESTS;
 *  - pair emission is shared between both paths via digit_5_store_pair.
 */
template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS, u32 THREADS>
__global__ void digit_5(equi<RB, SM>* eq)
{
	__shared__ u16 ht[NRESTS][(SSM - 1)];
	__shared__ uint4 lastword[NSLOTS];
	__shared__ int ht_len[NRESTS];
	__shared__ int pairs[MAXPAIRS];
	__shared__ u32 pairs_len;
	__shared__ u32 next_pair;

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// reset hashtable len and the pair counters
	if (threadid < NRESTS)
		ht_len[threadid] = 0;
	if (threadid == (THREADS - 1))
		pairs_len = 0;
	if (threadid == (THREADS - 33))
		next_pair = 0;

	slotsmall* buck = eq->treessmall[3][bucketid];
	u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS);

	u32 hr[2];
	int pos[2];
	pos[0] = pos[1] = SSM; // SSM marks "no slot loaded / not in table"

	u32 si[2];
	uint4 tt[2];

	// make the reset above visible before any thread touches ht_len
	__syncthreads();

	#pragma unroll 2
	for (u32 i = 0; i < 2; i++)
	{
		si[i] = i * THREADS + threadid;
		if (si[i] >= bsize) break;

		const slotsmall* pslot1 = buck + si[i];

		tt[i] = *(uint4*)(&pslot1->hash[0]);
		lastword[si[i]] = tt[i];

		// RB collision bits select the hash-table row
		asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB));
		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
		if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i];
	}

	__syncthreads();

	#pragma unroll 2
	for (u32 i = 0; i < 2; i++)
	{
		// skip: nothing loaded / row overflow (>= SSM), or first entry of its row (<= 0)
		if (pos[i] <= 0 || pos[i] >= SSM) continue;

		// the pair with the first row entry is written out immediately
		const u16 p = ht[hr[i]][0];
		digit_5_store_pair<RB, SM, PACKER>(eq, bucketid, si[i], p, tt[i], lastword[p]);

		// pairs with the other earlier row entries are queued (two 16-bit indices per int)
		for (int k = 1; k != pos[i]; ++k)
		{
			u32 pindex = atomicAdd(&pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			u16 prev = ht[hr[i]][k];
			pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// process pairs
	u32 plen = umin(pairs_len, MAXPAIRS);

	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
	{
		int pair = pairs[s];
		const u32 i = __byte_perm(pair, 0, 0x4510); // low 16 bits
		const u32 k = __byte_perm(pair, 0, 0x4532); // high 16 bits

		digit_5_store_pair<RB, SM, PACKER>(eq, bucketid, i, k, lastword[i], lastword[k]);
	}
}
|
|
|
|
|
/*
 * Round-6 collision kernel.
 *
 * Launch layout (see solve()): one block per input bucket (blockIdx.x),
 * NRESTS threads per block; each thread handles up to three slots
 * (slot index = i * NRESTS + threadIdx.x).
 *
 * Input : eq->treessmall[2][bucket], slot count eq->edata.nslots[5][bucket]
 * Output: eq->treessmall[0][...],    slot count eq->edata.nslots[6][...]
 *
 * Slots are grouped in a shared-memory table keyed by RB bits taken at bit 16
 * of hash word x. The first two predecessors of a slot are combined right
 * away; any further ones are queued as (slot, predecessor) pairs and drained
 * cooperatively at the end. Combinations whose third XOR word is zero are
 * dropped.
 */
template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS>
__global__ void digit_6(equi<RB, SM>* eq)
{
	__shared__ u16 ht[NRESTS][(SSM - 1)];
	__shared__ uint2 lastword1[NSLOTS];
	__shared__ u32 lastword2[NSLOTS];
	__shared__ int ht_len[MAXPAIRS];
	__shared__ u32 pairs_len;
	__shared__ u32 bsize_sh;
	__shared__ u32 next_pair;

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// every thread clears one table counter; two designated threads set up
	// the queue cursors and fetch the (clamped) bucket size
	ht_len[threadid] = 0;
	if (threadid == (NRESTS - 1))
	{
		pairs_len = 0;
		next_pair = 0;
	}
	else if (threadid == (NRESTS - 33))
		bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS);

	slotsmall* const src = eq->treessmall[2][bucketid];

	u32 rest[3];
	int rank[3] = { SSM, SSM, SSM }; // SSM marks "slot not loaded"
	u32 slotid[3];
	uint4 word[3];

	__syncthreads();

	const u32 bsize = bsize_sh;

	#pragma unroll 3
	for (u32 i = 0; i < 3; i++)
	{
		slotid[i] = i * NRESTS + threadid;
		if (slotid[i] >= bsize) break;

		word[i] = *(uint4*)(&src[slotid[i]].hash[0]);
		lastword1[slotid[i]] = *(uint2*)(&word[i].x);
		lastword2[slotid[i]] = word[i].z;
		asm("bfe.u32 %0, %1, 16, %2;" : "=r"(rest[i]) : "r"(word[i].x), "r"(RB));
		rank[i] = atomicAdd(&ht_len[rest[i]], 1);
		if (rank[i] < (SSM - 1)) ht[rest[i]][rank[i]] = slotid[i];
	}

	// the counters are not read again past the barrier, so their storage
	// doubles as the pair queue (saves shared memory)
	int* pairs = ht_len;
	__syncthreads();

	u32 xors[3];
	u32 bexor, xorbucketid, xorslot;

	#pragma unroll 3
	for (u32 i = 0; i < 3; i++)
	{
		const int npred = rank[i];
		if (npred >= SSM) continue;

		// predecessors 0 and 1 (when present) are combined in place
		const int ndirect = (npred < 2) ? npred : 2;
		for (int j = 0; j < ndirect; ++j)
		{
			const u16 p = ht[rest[i]][j];

			xors[2] = word[i].z ^ lastword2[p];
			if (xors[2] == 0) continue;

			*(uint2*)(&xors[0]) = *(uint2*)(&word[i].x) ^ lastword1[p];

			bexor = __byte_perm(xors[0], xors[1], 0x1076);
			xorbucketid = bexor >> (12 + RB);
			xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1);
			if (xorslot >= NSLOTS) continue;

			uint4 out;
			out.x = xors[1];
			out.y = xors[2];
			out.z = bexor;
			out.w = PACKER::set_bucketid_and_slots(bucketid, slotid[i], p, RB, SM);
			*(uint4*)(&eq->treessmall[0][xorbucketid][xorslot].hash[0]) = out;
		}

		// remaining predecessors go to the shared queue
		for (int k = 2; k < npred; ++k)
		{
			u32 pindex = atomicAdd(&pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			u16 prev = ht[rest[i]][k];
			pairs[pindex] = __byte_perm(slotid[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// drain the queue: threads claim entries through the next_pair cursor
	const u32 plen = umin(pairs_len, MAXPAIRS);
	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
	{
		const u32 pair = pairs[s];
		const u32 a = __byte_perm(pair, 0, 0x4510); // low 16 bits: slot
		const u32 b = __byte_perm(pair, 0, 0x4532); // high 16 bits: predecessor

		xors[2] = lastword2[a] ^ lastword2[b];
		if (xors[2] != 0)
		{
			*(uint2*)(&xors[0]) = lastword1[a] ^ lastword1[b];

			bexor = __byte_perm(xors[0], xors[1], 0x1076);
			xorbucketid = bexor >> (12 + RB);
			xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1);
			if (xorslot < NSLOTS)
			{
				uint4 out;
				out.x = xors[1];
				out.y = xors[2];
				out.z = bexor;
				out.w = PACKER::set_bucketid_and_slots(bucketid, a, b, RB, SM);
				*(uint4*)(&eq->treessmall[0][xorbucketid][xorslot].hash[0]) = out;
			}
		}
	}
}
|
|
|
|
|
/*
 * Round-7 collision kernel.
 *
 * Launch layout (see solve()): one block per input bucket (blockIdx.x),
 * NRESTS threads per block; each thread handles up to three slots
 * (slot index = i * NRESTS + threadIdx.x).
 *
 * Input : eq->treessmall[0][bucket], slot count eq->edata.nslots[6][bucket]
 * Output: eq->treessmall[1][...],    slot count eq->edata.nslots[7][...]
 *
 * Slots are grouped in a shared-memory table keyed by RB bits taken at bit 12
 * of hash word z. The first two predecessors of a slot are combined right
 * away; further ones are queued as pairs and drained at the end.
 * Combinations whose second XOR word is zero are dropped.
 */
template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS>
__global__ void digit_7(equi<RB, SM>* eq)
{
	__shared__ u16 ht[NRESTS][(SSM - 1)];
	__shared__ u32 lastword[NSLOTS][2];
	__shared__ int ht_len[NRESTS];
	__shared__ int pairs[MAXPAIRS];
	__shared__ u32 pairs_len;
	__shared__ u32 bsize_sh;
	__shared__ u32 next_pair;

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// every thread clears one table counter; two designated threads set up
	// the queue cursors and fetch the (clamped) bucket size
	ht_len[threadid] = 0;
	if (threadid == (NRESTS - 1))
	{
		pairs_len = 0;
		next_pair = 0;
	}
	else if (threadid == (NRESTS - 33))
		bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS);

	slotsmall* const src = eq->treessmall[0][bucketid];

	u32 rest[3];
	int rank[3] = { SSM, SSM, SSM }; // SSM marks "slot not loaded"
	u32 slotid[3];
	uint4 word[3];

	__syncthreads();

	const u32 bsize = bsize_sh;

	#pragma unroll 3
	for (u32 i = 0; i < 3; i++)
	{
		slotid[i] = i * NRESTS + threadid;
		if (slotid[i] >= bsize) break;

		// get xhash
		word[i] = *(uint4*)(&src[slotid[i]].hash[0]);
		*(uint2*)(&lastword[slotid[i]][0]) = *(uint2*)(&word[i].x);
		asm("bfe.u32 %0, %1, 12, %2;" : "=r"(rest[i]) : "r"(word[i].z), "r"(RB));
		rank[i] = atomicAdd(&ht_len[rest[i]], 1);
		if (rank[i] < (SSM - 1)) ht[rest[i]][rank[i]] = slotid[i];
	}

	__syncthreads();

	u32 xors[2];
	u32 xorbucketid, xorslot;

	#pragma unroll 3
	for (u32 i = 0; i < 3; i++)
	{
		const int npred = rank[i];
		if (npred >= SSM) continue;

		// predecessors 0 and 1 (when present) are combined in place
		const int ndirect = (npred < 2) ? npred : 2;
		for (int j = 0; j < ndirect; ++j)
		{
			const u16 p = ht[rest[i]][j];

			*(uint2*)(&xors[0]) = *(uint2*)(&word[i].x) ^ *(uint2*)(&lastword[p][0]);
			if (xors[1] == 0) continue;

			asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS));
			xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1);
			if (xorslot >= NSLOTS) continue;

			uint4 out;
			out.x = xors[0];
			out.y = xors[1];
			out.z = PACKER::set_bucketid_and_slots(bucketid, slotid[i], p, RB, SM);
			out.w = 0;
			*(uint4*)(&eq->treessmall[1][xorbucketid][xorslot].hash[0]) = out;
		}

		// remaining predecessors go to the shared queue
		for (int k = 2; k < npred; ++k)
		{
			u32 pindex = atomicAdd(&pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			u16 prev = ht[rest[i]][k];
			pairs[pindex] = __byte_perm(slotid[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// drain the queue: threads claim entries through the next_pair cursor
	const u32 plen = umin(pairs_len, MAXPAIRS);
	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
	{
		const u32 pair = pairs[s];
		const u32 a = __byte_perm(pair, 0, 0x4510); // low 16 bits: slot
		const u32 b = __byte_perm(pair, 0, 0x4532); // high 16 bits: predecessor

		*(uint2*)(&xors[0]) = *(uint2*)(&lastword[a][0]) ^ *(uint2*)(&lastword[b][0]);
		if (xors[1] != 0)
		{
			asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS));
			xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1);
			if (xorslot < NSLOTS)
			{
				uint4 out;
				out.x = xors[0];
				out.y = xors[1];
				out.z = PACKER::set_bucketid_and_slots(bucketid, a, b, RB, SM);
				out.w = 0;
				*(uint4*)(&eq->treessmall[1][xorbucketid][xorslot].hash[0]) = out;
			}
		}
	}
}
|
|
|
|
|
/*
 * Round-8 collision kernel.
 *
 * Launch layout (see solve()): one block per input bucket (blockIdx.x),
 * NRESTS threads per block; each thread handles up to three slots
 * (slot index = i * NRESTS + threadIdx.x).
 *
 * Input : eq->treessmall[1][bucket], slot count eq->edata.nslots[7][bucket]
 * Output: eq->treestiny[0][...],     slot count eq->edata.nslots8[...]
 *         (at most RB8_NSLOTS_LD slots per output bucket)
 *
 * Slots are grouped in a shared-memory table keyed by RB bits taken at bit 8
 * of hash word x. The first two predecessors of a slot are combined right
 * away; further ones are queued as pairs and drained at the end.
 * Combinations whose second XOR word is zero are dropped.
 */
template <u32 RB, u32 SM, int SSM, typename PACKER, u32 MAXPAIRS>
__global__ void digit_8(equi<RB, SM>* eq)
{
	__shared__ u16 ht[NRESTS][(SSM - 1)];
	__shared__ u32 lastword[NSLOTS][2];
	__shared__ int ht_len[NRESTS];
	__shared__ int pairs[MAXPAIRS];
	__shared__ u32 pairs_len;
	__shared__ u32 bsize_sh;
	__shared__ u32 next_pair;

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// every thread clears one table counter; two designated threads set up
	// the queue cursors and fetch the (clamped) bucket size
	ht_len[threadid] = 0;
	if (threadid == (NRESTS - 1))
	{
		next_pair = 0;
		pairs_len = 0;
	}
	else if (threadid == (NRESTS - 33))
		bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS);

	slotsmall* const src = eq->treessmall[1][bucketid];

	u32 rest[3];
	int rank[3] = { SSM, SSM, SSM }; // SSM marks "slot not loaded"
	u32 slotid[3];
	uint2 word[3];

	__syncthreads();

	const u32 bsize = bsize_sh;

	#pragma unroll 3
	for (u32 i = 0; i < 3; i++)
	{
		slotid[i] = i * NRESTS + threadid;
		if (slotid[i] >= bsize) break;

		// get xhash
		word[i] = *(uint2*)(&src[slotid[i]].hash[0]);
		*(uint2*)(&lastword[slotid[i]][0]) = *(uint2*)(&word[i].x);
		asm("bfe.u32 %0, %1, 8, %2;" : "=r"(rest[i]) : "r"(word[i].x), "r"(RB));
		rank[i] = atomicAdd(&ht_len[rest[i]], 1);
		if (rank[i] < (SSM - 1)) ht[rest[i]][rank[i]] = slotid[i];
	}

	__syncthreads();

	u32 xors[2];
	u32 bexor, xorbucketid, xorslot;

	#pragma unroll 3
	for (u32 i = 0; i < 3; i++)
	{
		const int npred = rank[i];
		if (npred >= SSM) continue;

		// predecessors 0 and 1 (when present) are combined in place
		const int ndirect = (npred < 2) ? npred : 2;
		for (int j = 0; j < ndirect; ++j)
		{
			const u16 p = ht[rest[i]][j];

			*(uint2*)(&xors[0]) = *(uint2*)(&word[i].x) ^ *(uint2*)(&lastword[p][0]);
			if (xors[1] == 0) continue;

			bexor = __byte_perm(xors[0], xors[1], 0x0765);
			xorbucketid = bexor >> (12 + 8);
			xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
			if (xorslot >= RB8_NSLOTS_LD) continue;

			uint2 out;
			out.x = xors[1];
			out.y = PACKER::set_bucketid_and_slots(bucketid, slotid[i], p, RB, SM);
			*(uint2*)(&eq->treestiny[0][xorbucketid][xorslot].hash[0]) = out;
		}

		// remaining predecessors go to the shared queue
		for (int k = 2; k < npred; ++k)
		{
			u32 pindex = atomicAdd(&pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			u16 prev = ht[rest[i]][k];
			pairs[pindex] = __byte_perm(slotid[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// drain the queue: threads claim entries through the next_pair cursor
	const u32 plen = umin(pairs_len, MAXPAIRS);
	for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1))
	{
		const u32 pair = pairs[s];
		const u32 a = __byte_perm(pair, 0, 0x4510); // low 16 bits: slot
		const u32 b = __byte_perm(pair, 0, 0x4532); // high 16 bits: predecessor

		*(uint2*)(&xors[0]) = *(uint2*)(&lastword[a][0]) ^ *(uint2*)(&lastword[b][0]);
		if (xors[1] != 0)
		{
			bexor = __byte_perm(xors[0], xors[1], 0x0765);
			xorbucketid = bexor >> (12 + 8);
			xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1);
			if (xorslot < RB8_NSLOTS_LD)
			{
				uint2 out;
				out.x = xors[1];
				out.y = PACKER::set_bucketid_and_slots(bucketid, a, b, RB, SM);
				*(uint2*)(&eq->treestiny[0][xorbucketid][xorslot].hash[0]) = out;
			}
		}
	}
}
|
|
|
/*
  The last round function is similar to the previous ones but has a different ending.
  We use warps to process final candidates. Each warp processes one candidate.
  First two bidandsids (u32 of stored bucketid and two slotids) are retrieved by
  lane 0 and lane 16, next four bidandsids by lane 0, 8, 16 and 24, ... until
  all lanes in warp have bidandsids from round 4. Next, each thread retrieves
  16 indices. While doing so, indices are put into comparison using atomicExch
  to determine if there are duplicates (tromp's method). At the end, if no
  duplicates are found, candidate solution is saved (all indices). Note that this
  dup check method is not exact so CPU dup checking is needed after.
*/
|
// Warp-level helpers used by digit_last_wdc. The mask-less __any/__shfl
// intrinsics are not available when compiling for sm_70 and newer, and
// implicit warp-synchronous execution is not guaranteed there, so the *_sync
// forms and an explicit __syncwarp() are used whenever the toolkit provides
// them (CUDA 9+). Older toolkits keep the legacy intrinsics.
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
#define EQ_WARP_ANY(pred)    __any_sync(0xffffffffu, (pred))
#define EQ_WARP_BCAST0(v)    __shfl_sync(0xffffffffu, (v), 0)
#define EQ_WARP_SYNC()       __syncwarp()
#else
#define EQ_WARP_ANY(pred)    __any(pred)
#define EQ_WARP_BCAST0(v)    __shfl((v), 0)
#define EQ_WARP_SYNC()
#endif

/*
 * Final round: find slots of eq->treestiny[0][bucket] whose first hash word
 * matches, then let each warp expand one matching pair into 512 indices and
 * store it as a candidate solution.
 *
 * Launch layout (see solve()): one block per bucket (blockIdx.x), 256 / FCT
 * threads per block; W is the stride used when warps pick their next pair.
 * All 32 lanes of a warp follow the same path through the candidate loop
 * (they share `par`, and every early exit is decided by a warp vote), which
 * is why the full-warp mask is used for the warp intrinsics.
 *
 * Shared memory (8192 bytes, regions deliberately overlap across phases):
 *   ht_len / pairs : start of the buffer (pairs reuses the counter storage)
 *   lastword       : byte offset 256 * 4
 *   ht             : byte offset 256 * 4 + RB8_NSLOTS_LD * 4
 *   pairs_len      : last 4 bytes
 *   levels / susp  : pairs[MAXPAIRS + (warp << DUPBITS)], used only after the
 *                    pair list is complete
 *
 * The duplicate check is approximate; solve() re-checks on the CPU.
 */
template <u32 RB, u32 SM, int SSM, u32 FCT, typename PACKER, u32 MAXPAIRS, u32 DUPBITS, u32 W>
__global__ void digit_last_wdc(equi<RB, SM>* eq)
{
	__shared__ u8 shared_data[8192];
	int* ht_len = (int*)(&shared_data[0]);
	int* pairs = ht_len;
	u32* lastword = (u32*)(&shared_data[256 * 4]);
	u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]);
	u32* pairs_len = (u32*)(&shared_data[8188]);

	const u32 threadid = threadIdx.x;
	const u32 bucketid = blockIdx.x;

	// reset hashtable len
	#pragma unroll
	for (u32 i = 0; i < FCT; i++)
		ht_len[(i * (256 / FCT)) + threadid] = 0;

	if (threadid == ((256 / FCT) - 1))
		*pairs_len = 0;

	slottiny* buck = eq->treestiny[0][bucketid];
	u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD);

	u32 si[3 * FCT];
	u32 hr[3 * FCT];
	int pos[3 * FCT];
	u32 lw[3 * FCT];

	// SSM marks "slot not loaded"
	#pragma unroll
	for (u32 i = 0; i < (3 * FCT); i++)
		pos[i] = SSM;

	__syncthreads();

	#pragma unroll
	for (u32 i = 0; i < (3 * FCT); i++)
	{
		si[i] = i * (256 / FCT) + threadid;
		if (si[i] >= bsize) break;

		const slottiny* pslot1 = buck + si[i];

		// get xhash
		uint2 tt = *(uint2*)(&pslot1->hash[0]);
		lw[i] = tt.x;
		lastword[si[i]] = lw[i];

		u32 a;
		asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i]));
		hr[i] = a;

		pos[i] = atomicAdd(&ht_len[hr[i]], 1);
		if (pos[i] < (SSM - 1))
			ht[hr[i] * (SSM - 1) + pos[i]] = si[i];
	}

	__syncthreads();

	// queue every (slot, predecessor) pair whose first hash word is identical
	#pragma unroll
	for (u32 i = 0; i < (3 * FCT); i++)
	{
		if (pos[i] >= SSM) continue;

		for (int k = 0; k != pos[i]; ++k)
		{
			u16 prev = ht[hr[i] * (SSM - 1) + k];
			if (lw[i] != lastword[prev]) continue;
			u32 pindex = atomicAdd(pairs_len, 1);
			if (pindex >= MAXPAIRS) break;
			pairs[pindex] = __byte_perm(si[i], prev, 0x1054);
		}
	}

	__syncthreads();

	// only the first MAXPAIRS queue entries were written above, so clamp to
	// the queue capacity rather than to a fixed literal
	u32 plen = umin(*pairs_len, MAXPAIRS);

#define CALC_LEVEL(a, b, c, d) { \
	u32 plvl = levels[b]; \
	u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \
	u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \
	u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \
	levels[b] = bucks[slot1]; \
	levels[c] = bucks[slot0]; \
}

#define CALC_LEVEL_SMALL(a, b, c, d) { \
	u32 plvl = levels[b]; \
	slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \
	u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \
	u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \
	levels[b] = bucks[slot1].hash[d]; \
	levels[c] = bucks[slot0].hash[d]; \
}

// warp vote: true when any lane's atomicExch finds its own tag already stored
#define CHECK_DUP(a) \
	EQ_WARP_ANY(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS))

	u32 lane = threadIdx.x & 0x1f;
	u32 par = threadIdx.x >> 5;

	// per-warp scratch area; susp aliases levels
	u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)];
	u32* susp = levels;

	while (par < plen)
	{
		int pair = pairs[par];
		par += W;

		// the scratch area is reused from the previous candidate
		EQ_WARP_SYNC();

		if (lane % 16 == 0)
		{
			u32 plvl;
			if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1];
			else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1];
			slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)];
			u32 slot1 = PACKER::get_slot1(plvl, RB, SM);
			u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM);
			levels[lane] = bucks[slot1].hash[2];
			levels[lane + 8] = bucks[slot0].hash[2];
		}
		// each stage reads entries written by other lanes in the previous stage
		EQ_WARP_SYNC();

		if (lane % 8 == 0)
			CALC_LEVEL_SMALL(0, lane, lane + 4, 3);
		EQ_WARP_SYNC();

		if (lane % 4 == 0)
			CALC_LEVEL_SMALL(2, lane, lane + 2, 3);
		EQ_WARP_SYNC();

		if (lane % 2 == 0)
			CALC_LEVEL(0, lane, lane + 1, 4);
		EQ_WARP_SYNC();

		u32 ind[16];

		u32 f1 = levels[lane];
		const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0];
		const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM);
		const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM);

		// the susp[] resets are interleaved with the dependent global loads
		susp[lane] = 0xffffffff;
		susp[32 + lane] = 0xffffffff;

		u32 f2 = buck_v4[slot1_v4].hash[1];
		const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0];
		const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM);
		const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM);

		susp[64 + lane] = 0xffffffff;
		susp[96 + lane] = 0xffffffff;

		u32 f0 = buck_v3_1[slot1_v3_1].hash[1];
		const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)];
		const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM);
		const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM);

		susp[128 + lane] = 0xffffffff;
		susp[160 + lane] = 0xffffffff;

		u32 f3 = buck_v2_1[slot1_v2_1].hash[6];
		const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)];
		const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS);
		const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS);

		susp[192 + lane] = 0xffffffff;
		susp[224 + lane] = 0xffffffff;

		// every lane's resets must be visible before any lane starts exchanging
		EQ_WARP_SYNC();

		ind[0] = buck_fin_1[slot1_fin_1].hash[7];
		if (CHECK_DUP(0)) continue;
		ind[1] = buck_fin_1[slot0_fin_1].hash[7];
		if (CHECK_DUP(1)) continue;

		u32 f4 = buck_v2_1[slot0_v2_1].hash[6];
		const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)];
		const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS);
		const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS);

		ind[2] = buck_fin_2[slot1_fin_2].hash[7];
		if (CHECK_DUP(2)) continue;
		ind[3] = buck_fin_2[slot0_fin_2].hash[7];
		if (CHECK_DUP(3)) continue;

		u32 f5 = buck_v3_1[slot0_v3_1].hash[1];
		const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)];
		const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM);
		const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM);

		u32 f6 = buck_v2_2[slot1_v2_2].hash[6];
		const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)];
		const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS);
		const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS);

		ind[4] = buck_fin_3[slot1_fin_3].hash[7];
		if (CHECK_DUP(4)) continue;
		ind[5] = buck_fin_3[slot0_fin_3].hash[7];
		if (CHECK_DUP(5)) continue;

		u32 f7 = buck_v2_2[slot0_v2_2].hash[6];
		const slot* buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)];
		const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS);
		const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS);

		ind[6] = buck_fin_4[slot1_fin_4].hash[7];
		if (CHECK_DUP(6)) continue;
		ind[7] = buck_fin_4[slot0_fin_4].hash[7];
		if (CHECK_DUP(7)) continue;

		u32 f8 = buck_v4[slot0_v4].hash[1];
		const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0];
		const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM);
		const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM);

		u32 f9 = buck_v3_2[slot1_v3_2].hash[1];
		const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)];
		const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM);
		const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM);

		u32 f10 = buck_v2_3[slot1_v2_3].hash[6];
		const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)];
		const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS);
		const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS);

		ind[8] = buck_fin_5[slot1_fin_5].hash[7];
		if (CHECK_DUP(8)) continue;
		ind[9] = buck_fin_5[slot0_fin_5].hash[7];
		if (CHECK_DUP(9)) continue;

		u32 f11 = buck_v2_3[slot0_v2_3].hash[6];
		const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)];
		const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS);
		const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS);

		ind[10] = buck_fin_6[slot1_fin_6].hash[7];
		if (CHECK_DUP(10)) continue;
		ind[11] = buck_fin_6[slot0_fin_6].hash[7];
		if (CHECK_DUP(11)) continue;

		u32 f12 = buck_v3_2[slot0_v3_2].hash[1];
		const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)];
		const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM);
		const u32 slot0_v2_4 = PACKER::get_slot0(f12, slot1_v2_4, RB, SM);

		u32 f13 = buck_v2_4[slot1_v2_4].hash[6];
		const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)];
		const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS);
		const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS);

		ind[12] = buck_fin_7[slot1_fin_7].hash[7];
		if (CHECK_DUP(12)) continue;
		ind[13] = buck_fin_7[slot0_fin_7].hash[7];
		if (CHECK_DUP(13)) continue;

		u32 f14 = buck_v2_4[slot0_v2_4].hash[6];
		const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)];
		const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS);
		const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS);

		ind[14] = buck_fin_8[slot1_fin_8].hash[7];
		if (CHECK_DUP(14)) continue;
		ind[15] = buck_fin_8[slot0_fin_8].hash[7];
		if (CHECK_DUP(15)) continue;

		// lane 0 reserves a solution slot and hands its index to the whole warp
		u32 soli;
		if (lane == 0) {
			soli = atomicAdd(&eq->edata.srealcont.nsols, 1);
		}
#if __CUDA_ARCH__ >= 300
		soli = EQ_WARP_BCAST0(soli);
#else
		// NOTE(review): this fallback does not hand lane 0's value to the
		// other lanes; kept as in the original for pre-sm_30 builds.
		__syncthreads();
#endif
		if (soli < MAXREALSOLS)
		{
			// each lane stores its 16 indices at offset lane * 16
			u32 base = lane << 4;
			*(uint4*)(&eq->edata.srealcont.sols[soli][base     ]) = *(uint4*)(&ind[ 0]);
			*(uint4*)(&eq->edata.srealcont.sols[soli][base +  4]) = *(uint4*)(&ind[ 4]);
			*(uint4*)(&eq->edata.srealcont.sols[soli][base +  8]) = *(uint4*)(&ind[ 8]);
			*(uint4*)(&eq->edata.srealcont.sols[soli][base + 12]) = *(uint4*)(&ind[12]);
		}
	}
}
|
|
|
// Per-device count of live solver contexts, indexed by CUDA device id.
// The constructor increments it and the primary context's destructor resets
// it to zero. Access is not serialized (the mutex below is disabled).
//std::mutex dev_init;
int dev_init_done[MAX_GPUS] = {};
|
|
|
// qsort() comparator ordering two uint32_t values ascending.
// Returns -1, 0 or +1; uses comparisons (not subtraction) so large values
// cannot wrap.
__host__
static int compu32(const void *pa, const void *pb)
{
	const uint32_t lhs = *static_cast<const uint32_t *>(pa);
	const uint32_t rhs = *static_cast<const uint32_t *>(pb);
	if (lhs < rhs)
		return -1;
	return (lhs == rhs) ? 0 : +1;
}
|
|
|
// Returns true when the 512-entry proof `prf` contains a repeated index.
// Works on a sorted copy, so the caller's array is left untouched.
__host__
static bool duped(uint32_t* prf)
{
	uint32_t sorted[512];
	memcpy(sorted, prf, sizeof(sorted));
	qsort(sorted, 512, sizeof(uint32_t), &compu32);

	// after sorting, any repeat shows up as two equal neighbours
	for (uint32_t i = 0; i + 1 < 512; ++i) {
		if (sorted[i + 1] <= sorted[i])
			return true;
	}
	return false;
}
|
|
|
// Orders two adjacent runs of `len` values each (a[0..len) and
// a[len..2*len)) so that the lexicographically smaller run comes first.
// Runs that are already ordered, or identical, are left unchanged.
__host__
static void sort_pair(uint32_t *a, uint32_t len)
{
	uint32_t *b = a + len;

	// locate the first position where the two runs differ
	uint32_t first = 0;
	while (first < len && a[first] == b[first])
		++first;

	if (first == len || a[first] < b[first])
		return;

	// the left run is larger: exchange everything from the first difference on
	// (the equal prefix needs no swapping)
	for (uint32_t i = first; i < len; ++i) {
		const uint32_t keep = a[i];
		a[i] = b[i];
		b[i] = keep;
	}
}
|
|
|
// Initializes `ctx` as a BLAKE2b state personalized for this solver and
// absorbs the block header followed by the nonce.
//   ctx       - state to initialize
//   header    - header bytes, headerLen of them
//   nce       - nonce bytes; skipped entirely when nonceLen is 0
__host__
static void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen)
{
	// personalization: "ZcashPoW" followed by WN and WK, each copied as a raw
	// 32-bit word in host byte order (the names suggest little-endian is expected)
	const uint32_t le_N = WN;
	const uint32_t le_K = WK;
	uchar personal[] = "ZcashPoW01230123";
	memcpy(&personal[8], &le_N, 4);
	memcpy(&personal[12], &le_K, 4);

	blake2b_param param;
	blake2b_param *P = &param;

	// sequential (non-tree) hashing, no key, HASHOUT-byte digest
	P->digest_length = HASHOUT;
	P->key_length = 0;
	P->fanout = 1;
	P->depth = 1;
	P->leaf_length = 0;
	P->node_offset = 0;
	P->node_depth = 0;
	P->inner_length = 0;
	memset(P->reserved, 0, sizeof(P->reserved));
	memset(P->salt, 0, sizeof(P->salt));
	memcpy(P->personal, (const uint8_t *)personal, 16);

	eq_blake2b_init_param(ctx, P);
	eq_blake2b_update(ctx, (const uchar *)header, headerLen);
	if (nonceLen != 0)
		eq_blake2b_update(ctx, (const uchar *)nce, nonceLen);
}
|
|
|
#ifdef WIN32
// CUDA driver API entry points. On Windows they are looked up in nvcuda.dll
// at run time by the eq_cuda_context constructor; each pointer stays null
// until that lookup has happened.
typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int);
dec_cuDeviceGet _cuDeviceGet = nullptr;

typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice);
dec_cuCtxCreate _cuCtxCreate = nullptr;

typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext);
dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr;

typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext);
dec_cuCtxDestroy _cuCtxDestroy = nullptr;
#endif
|
|
|
// Creates a solver context for host thread `thr_id` on CUDA device `dev_id`.
//
// The first context created for a device resets the device and uses its
// primary context (pctx stays null). Every later context for the same device
// creates and pushes its own driver-API context. Afterwards the device-side
// working area and the host-side solution buffer are allocated.
//
// Throws std::runtime_error when dev_id is outside [0, MAX_GPUS), when a
// driver entry point cannot be resolved (Windows), or when an allocation
// fails.
// NOTE(review): dev_init_done[] is read and updated without locking (the
// mutex is commented out); concurrent construction on one device is not
// protected here.
template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
__host__ eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::eq_cuda_context(int thr_id, int dev_id)
{
	// dev_init_done[] has MAX_GPUS entries; reject ids that would index past it
	if (dev_id < 0 || dev_id >= MAX_GPUS)
		throw std::runtime_error("CUDA: device id out of range");

	thread_id = thr_id;
	device_id = dev_id;
	solutions = nullptr;
	equi_mem_sz = sizeof(equi<RB, SM>);
	throughput = NBLOCKS;
	totalblocks = NBLOCKS/FD_THREADS;
	threadsperblock = FD_THREADS;
	threadsperblock_digits = THREADS;

	//dev_init.lock();
	if (!dev_init_done[device_id])
	{
		// only first thread shall init device
		checkCudaErrors(cudaSetDevice(device_id));
		checkCudaErrors(cudaDeviceReset());
		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));

		pctx = nullptr;
	}
	else
	{
		// create new context
		CUdevice dev;

#ifdef WIN32
		// resolve the driver API lazily, once per process
		if (_cuDeviceGet == nullptr)
		{
			HMODULE hmod = LoadLibraryA("nvcuda.dll");
			if (hmod == NULL)
				throw std::runtime_error("Failed to load nvcuda.dll");
			_cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet");
			if (_cuDeviceGet == nullptr)
				throw std::runtime_error("Failed to get cuDeviceGet address");
			_cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2");
			if (_cuCtxCreate == nullptr)
				throw std::runtime_error("Failed to get cuCtxCreate address");
			_cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2");
			if (_cuCtxPushCurrent == nullptr)
				throw std::runtime_error("Failed to get cuCtxPushCurrent address");
			_cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2");
			if (_cuCtxDestroy == nullptr)
				throw std::runtime_error("Failed to get cuCtxDestroy address");
		}

		checkCudaDriverErrors(_cuDeviceGet(&dev, device_id));
		checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
		checkCudaDriverErrors(_cuCtxPushCurrent(pctx));
#else
		checkCudaDriverErrors(cuDeviceGet(&dev, device_id));
		checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev));
		checkCudaDriverErrors(cuCtxPushCurrent(pctx));
#endif
	}
	++dev_init_done[device_id];
	//dev_init.unlock();

	if (cudaMalloc((void**)&device_eq, equi_mem_sz) != cudaSuccess)
		throw std::runtime_error("CUDA: failed to alloc memory");

	solutions = (scontainerreal*) malloc(sizeof(scontainerreal));
	if (!solutions) {
		// the destructor does not run when a constructor throws, so release
		// the device buffer here instead of leaking it
		cudaFree(device_eq);
		device_eq = NULL;
		throw std::runtime_error("EOM: failed to alloc solutions memory");
	}
}
|
|
|
// Runs one full solver pass for the given header and nonce.
//
//   tequihash_header / tequihash_header_len - header bytes fed to BLAKE2b
//   nonce / nonce_len                       - nonce bytes appended to the header
//   cancelf   - polled once, after round 3; a true result abandons the pass
//   solutionf - called for every candidate that survives the CPU duplicate check
//   hashdonef - called at the end only when the GPU reported no solutions
//
// All kernels are launched on the default stream; the blocking cudaMemcpy of
// the solution container is what waits for them to finish.
template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
__host__ void eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::solve(const char *tequihash_header,
	unsigned int tequihash_header_len,
	const char* nonce,
	unsigned int nonce_len,
	fn_cancel cancelf,
	fn_solution solutionf,
	fn_hashdone hashdonef)
{
	blake2b_state blake_ctx;

	int blocks = NBUCKETS;

	setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len);

	// todo: improve
	// djezo solver allows last 4 bytes of nonce to be iterated
	// this can be used to create internal loop - calc initial blake hash only once, then load 8*8 bytes on device (blake state h)
	// then just iterate nn++
	// less CPU load, 1 cudaMemcpy less -> faster
	//u32 nn = *(u32*)&nonce[28];
	u32 nn = 0;

	checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice));

	// clears all slot counters and the solution counter
	checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata)));

	digit_first<RB, SM, PACKER> <<<NBLOCKS / FD_THREADS, FD_THREADS >>>(device_eq, nn);

	digit_1<RB, SM, SSM, PACKER, 4 * NRESTS, 512> <<<4096, 512 >>>(device_eq);
	digit_2<RB, SM, SSM, PACKER, 4 * NRESTS, THREADS> <<<blocks, THREADS >>>(device_eq);
	digit_3<RB, SM, SSM, PACKER, 4 * NRESTS, THREADS> <<<blocks, THREADS >>>(device_eq);

	// a rejected launch (bad configuration, too much shared memory, ...) is
	// only reported through cudaGetLastError(); without this check a failed
	// pass would look like a pass that simply found no solutions
	checkCudaErrors(cudaGetLastError());

	if (cancelf(thread_id)) return;

	digit_4<RB, SM, SSM, PACKER, 4 * NRESTS, THREADS> <<<blocks, THREADS >>>(device_eq);
	digit_5<RB, SM, SSM, PACKER, 4 * NRESTS, THREADS> <<<blocks, THREADS >>>(device_eq);

	digit_6<RB, SM, SSM - 1, PACKER, 3 * NRESTS> <<<blocks, NRESTS >>>(device_eq);
	digit_7<RB, SM, SSM - 1, PACKER, 3 * NRESTS> <<<blocks, NRESTS >>>(device_eq);
	digit_8<RB, SM, SSM - 1, PACKER, 3 * NRESTS> <<<blocks, NRESTS >>>(device_eq);

	digit_last_wdc<RB, SM, SSM - 3, 2, PACKER, 64, 8, 4> <<<4096, 256 / 2 >>>(device_eq);

	checkCudaErrors(cudaGetLastError());

	checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost));

	//printf("T%d nsols: %u\n", thread_id, solutions->nsols);
	//if (solutions->nsols > 9)
	//	printf("missing sol, total: %u\n", solutions->nsols);

	// nsols counts every candidate the GPU tried to store; only the first
	// MAXREALSOLS were actually written
	for (u32 s = 0; (s < solutions->nsols) && (s < MAXREALSOLS); s++)
	{
		// remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions)
		if (duped(solutions->sols[s])) continue;

		// perform sort of pairs
		for (uint32_t level = 0; level < 9; level++)
			for (uint32_t i = 0; i < (1 << 9); i += (2 << level))
				sort_pair(&solutions->sols[s][i], 1 << level);

		std::vector<uint32_t> index_vector(PROOFSIZE);
		for (u32 i = 0; i < PROOFSIZE; i++) {
			index_vector[i] = solutions->sols[s][i];
		}

		solutionf(thread_id, index_vector, DIGITBITS, nullptr);
	}

	// ccminer: only use hashdonef if no solutions...
	if (!solutions->nsols)
		hashdonef(thread_id);
}
|
|
|
// Destructor: releases the host solution buffer and the device working area,
// then tears down the CUDA context this instance was using.
template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
__host__
eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::~eq_cuda_context()
{
	// free() accepts a null pointer, so no guard is needed
	free(solutions);

	if (device_eq != NULL) {
		cudaFree(device_eq);
		device_eq = NULL;
	}

	if (!pctx) {
		// this instance ran on the device's primary context: reset the device
		// and mark it as uninitialized again
		checkCudaErrors(cudaDeviceReset());
		dev_init_done[device_id] = 0;
		return;
	}

	// non primary thread, destroy context
#ifdef WIN32
	checkCudaDriverErrors(_cuCtxDestroy(pctx));
#else
	checkCudaDriverErrors(cuCtxDestroy(pctx));
#endif
}
|
|
|
|
|
// Explicit instantiations: one solver class per CONFIG_MODE_n macro that the
// build defines (each macro expands to the template argument list).
#if defined(CONFIG_MODE_1)
template class eq_cuda_context<CONFIG_MODE_1>;
#endif

#if defined(CONFIG_MODE_2)
template class eq_cuda_context<CONFIG_MODE_2>;
#endif

#if defined(CONFIG_MODE_3)
template class eq_cuda_context<CONFIG_MODE_3>;
#endif
|
|
|