From ffbc9d929fcad28fd300f0af9967e4d3ed7a54d6 Mon Sep 17 00:00:00 2001 From: samr7 Date: Sat, 30 Jul 2011 07:29:48 -0700 Subject: [PATCH] Optimize global memory access patterns in heap_invert. --- calc_addrs.cl | 161 +++++++++++++++++++++++++++++++++++++------------ oclvanitygen.c | 8 ++- 2 files changed, 130 insertions(+), 39 deletions(-) diff --git a/calc_addrs.cl b/calc_addrs.cl index 6a408b5..b970563 100644 --- a/calc_addrs.cl +++ b/calc_addrs.cl @@ -971,7 +971,7 @@ calc_addrs(__global uint *hashes_out, #define ACCESS_STRIDE (ACCESS_BUNDLE/BN_NWORDS) __kernel void -ec_add_grid(__global bn_word *points_out, __global bignum *z_heap, +ec_add_grid(__global bn_word *points_out, __global bn_word *z_heap, __global bignum *row_in, __global bignum *col_in) { bignum rx, ry; @@ -990,8 +990,15 @@ ec_add_grid(__global bn_word *points_out, __global bignum *z_heap, bn_mod_sub(&z, &x1, &rx); - z_heap[(2 * get_global_id(1) * get_global_size(0)) + - (get_global_size(0) - 1) + get_global_id(0)] = z; + cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0)); + start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (cell % ACCESS_STRIDE)); + +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (i = 0; i < BN_NWORDS; i++) + z_heap[start + (i*ACCESS_STRIDE)] = z.d[i]; bn_mod_sub(&b, &y1, &ry); bn_mod_add(&c, &x1, &rx); @@ -1006,7 +1013,6 @@ ec_add_grid(__global bn_word *points_out, __global bignum *z_heap, * various GPUs, by giving it a nice contiguous patch to write * per warp/wavefront. */ - cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0)); start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % (ACCESS_STRIDE/2))); @@ -1037,20 +1043,44 @@ ec_add_grid(__global bn_word *points_out, __global bignum *z_heap, } __kernel void -heap_invert(__global bignum *z_heap, int ncols) +heap_invert(__global bn_word *z_heap, int batch) { bignum a, b, c, z; - int i; + int i, j, off, lcell, hcell, start; + + off = get_global_size(0); + lcell = get_global_id(0); + hcell = (off * batch) + lcell; + for (i = 0; i < (batch-1); i++) { + start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (lcell % ACCESS_STRIDE)); +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + a.d[j] = z_heap[start + j*ACCESS_STRIDE]; - i = get_global_id(0); - z_heap += (2 * i * ncols); + lcell += off; + start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (lcell % ACCESS_STRIDE)); +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + b.d[j] = z_heap[start + j*ACCESS_STRIDE]; - /* Compute the product hierarchy in z_heap */ - for (i = ncols - 1; i > 0; i--) { - a = z_heap[(i*2) - 1]; - b = z_heap[(i*2)]; bn_mul_mont(&z, &a, &b); - z_heap[i-1] = z; + + start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (hcell % ACCESS_STRIDE)); +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + z_heap[start + j*ACCESS_STRIDE] = z.d[j]; + + lcell += off; + hcell += off; } /* Invert the root, fix up 1/ZR -> R/Z */ @@ -1063,21 +1093,68 @@ heap_invert(__global bignum *z_heap, int ncols) a.d[i] = mont_rr[i]; bn_mul_mont(&z, &z, &a); bn_mul_mont(&z, &z, &a); - z_heap[0] = z; - for (i = 1; i < ncols; i++) { - a = z_heap[i - 1]; - b = z_heap[(i*2) - 1]; - c = z_heap[i*2]; - bn_mul_mont(&z, &a, &c); - z_heap[(i*2) - 1] = z; - bn_mul_mont(&z, &a, &b); - z_heap[i*2] = z; +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + z_heap[start + j*ACCESS_STRIDE] = z.d[j]; + + lcell = (off * 2 * (batch - 2)) + get_global_id(0); + hcell = lcell + (off << 1); + for (i = 0; i < (batch-1); i++) { + start = (((hcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (hcell % ACCESS_STRIDE)); +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + z.d[j] = z_heap[start + j*ACCESS_STRIDE]; + + + start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (lcell % ACCESS_STRIDE)); +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + a.d[j] = z_heap[start + j*ACCESS_STRIDE]; + + lcell += off; + start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (lcell % ACCESS_STRIDE)); +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + b.d[j] = z_heap[start + j*ACCESS_STRIDE]; + + bn_mul_mont(&c, &a, &z); + +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + z_heap[start + j*ACCESS_STRIDE] = c.d[j]; + + bn_mul_mont(&c, &b, &z); + + lcell -= off; + start = (((lcell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (lcell % ACCESS_STRIDE)); +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (j = 0; j < BN_NWORDS; j++) + z_heap[start + j*ACCESS_STRIDE] = c.d[j]; + + lcell -= (off << 1); + hcell -= off; } } void -hash_ec_point(uint *hash_out, __global bn_word *xy, __global bignum *zip) +hash_ec_point(uint *hash_out, __global bn_word *xy, __global bn_word *zip) { uint hash1[16], hash2[16]; bignum c, zi, zzi; @@ -1091,13 +1168,20 @@ hash_ec_point(uint *hash_out, __global bn_word *xy, __global bignum *zip) * is big-endian, so swapping is unnecessary, but * inserting the format byte in front causes a headache. */ - zi = zip[0]; +#ifdef UNROLL_MAX +#pragma unroll UNROLL_MAX +#endif + for (i = 0; i < BN_NWORDS; i++) + zi.d[i] = zip[i*ACCESS_STRIDE]; + bn_mul_mont(&zzi, &zi, &zi); /* 1 / Z^2 */ + #ifdef UNROLL_MAX #pragma unroll UNROLL_MAX #endif for (i = 0; i < BN_NWORDS; i++) c.d[i] = xy[i*ACCESS_STRIDE]; + bn_mul_mont(&c, &c, &zzi); /* X / Z^2 */ bn_from_mont(&c, &c); @@ -1181,23 +1265,26 @@ hash_ec_point(uint *hash_out, __global bn_word *xy, __global bignum *zip) __kernel void hash_ec_point_get(__global uint *hashes_out, - __global bn_word *points_in, __global bignum *z_heap) + __global bn_word *points_in, __global bn_word *z_heap) { uint hash[5]; int i, p, cell, start; - p = get_global_size(0); - i = p * get_global_id(1); - z_heap += (i * 2); - hashes_out += 5 * (i + get_global_id(0)); - cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0)); + start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (cell % ACCESS_STRIDE)); + z_heap += start; + start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % (ACCESS_STRIDE/2))); points_in += start; /* Complete the coordinates and hash */ - hash_ec_point(hash, points_in, &z_heap[(p - 1) + get_global_id(0)]); + hash_ec_point(hash, points_in, z_heap); + + p = get_global_size(0); + i = p * get_global_id(1); + hashes_out += 5 * (i + get_global_id(0)); /* Output the hash in proper byte-order */ #ifdef UNROLL_MAX @@ -1239,23 +1326,23 @@ hash160_ucmp_g(uint *a, __global uint *bound) __kernel void hash_ec_point_search_prefix(__global uint *found, __global bn_word *points_in, - __global bignum *z_heap, + __global bn_word *z_heap, __global uint *target_table, int ntargets) { uint hash[5]; int i, high, low, p, cell, start; - p = get_global_size(0); - i = p * get_global_id(1); - z_heap += (i * 2); - cell = ((get_global_id(1) * get_global_size(0)) + get_global_id(0)); + start = (((cell / ACCESS_STRIDE) * ACCESS_BUNDLE) + + (cell % ACCESS_STRIDE)); + z_heap += start; + start = ((((2 * cell) / ACCESS_STRIDE) * ACCESS_BUNDLE) + (cell % (ACCESS_STRIDE/2))); points_in += start; /* Complete the coordinates and hash */ - hash_ec_point(hash, points_in, &z_heap[(p - 1) + get_global_id(0)]); + hash_ec_point(hash, points_in, z_heap); /* * Unconditionally byteswap the hash result, because: diff --git a/oclvanitygen.c b/oclvanitygen.c index 0635995..83c3afb 100644 --- a/oclvanitygen.c +++ b/oclvanitygen.c @@ -39,6 +39,8 @@ const int debug = 0; #define MAX_ARG 6 #define MAX_KERNEL 3 +#define round_up_pow2(x, a) (((x) + ((a)-1)) & ~((a)-1)) + /* OpenCL address searching mode */ struct _vg_ocl_context_s; typedef int (*vg_ocl_init_t)(struct _vg_ocl_context_s *); @@ -1112,8 +1114,10 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int worksize) * - The z_heap and point scratch spaces * - The row point array */ - if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1, 32 * 2 * round, 0) || - !vg_ocl_kernel_arg_alloc(vocp, -1, 2, 32 * 2 * round, 0) || + if (!vg_ocl_kernel_arg_alloc(vocp, -1, 1, + round_up_pow2(32 * 2 * round, 4096), 0) || + !vg_ocl_kernel_arg_alloc(vocp, -1, 2, + round_up_pow2(32 * 2 * round, 4096), 0) || !vg_ocl_kernel_arg_alloc(vocp, -1, 3, 32 * 2 * batchsize, 1)) goto enomem;