mirror of
https://github.com/GOSTSec/vanitygen
synced 2025-02-07 12:24:20 +00:00
Change the meaning of -w again. Now it's the number of address
calculations per thread in a work unit. The previous meaning (thread slots per multiprocessor) is now -t.
This commit is contained in:
parent
bbc59bf106
commit
dd2f5670e6
@ -426,17 +426,22 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
|
|||||||
#endif
|
#endif
|
||||||
break;
|
break;
|
||||||
case 0x1002: /* AMD/ATI */
|
case 0x1002: /* AMD/ATI */
|
||||||
|
if (vg_ocl_device_gettype(vocp->voc_ocldid) &
|
||||||
|
CL_DEVICE_TYPE_GPU) {
|
||||||
quirks |= VG_OCL_EXPENSIVE_BRANCHES;
|
quirks |= VG_OCL_EXPENSIVE_BRANCHES;
|
||||||
quirks |= VG_OCL_DEEP_VLIW;
|
quirks |= VG_OCL_DEEP_VLIW;
|
||||||
dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
|
dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
|
||||||
CL_DEVICE_EXTENSIONS);
|
CL_DEVICE_EXTENSIONS);
|
||||||
if (dvn && strstr(dvn, "cl_amd_media_ops"))
|
if (dvn && strstr(dvn, "cl_amd_media_ops"))
|
||||||
quirks |= VG_OCL_AMD_BFI_INT;
|
quirks |= VG_OCL_AMD_BFI_INT;
|
||||||
dvn = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
|
|
||||||
|
dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
|
||||||
|
CL_DEVICE_NAME);
|
||||||
if (!strcmp(dvn, "ATI RV710")) {
|
if (!strcmp(dvn, "ATI RV710")) {
|
||||||
quirks &= ~VG_OCL_OPTIMIZATIONS;
|
quirks &= ~VG_OCL_OPTIMIZATIONS;
|
||||||
quirks |= VG_OCL_NO_BINARIES;
|
quirks |= VG_OCL_NO_BINARIES;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
@ -1567,10 +1572,10 @@ out:
|
|||||||
|
|
||||||
void *
|
void *
|
||||||
vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode,
|
vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode,
|
||||||
int worksize, int nrows, int ncols, int invsize)
|
int worksize, int nthreads, int nrows, int ncols, int invsize)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
int round, full_worksize;
|
int round, full_threads, wsmult;
|
||||||
cl_ulong memsize, allocsize;
|
cl_ulong memsize, allocsize;
|
||||||
|
|
||||||
const BN_ULONG rekey_max = 100000000;
|
const BN_ULONG rekey_max = 100000000;
|
||||||
@ -1610,15 +1615,44 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode,
|
|||||||
* (create two, keep one running while we service the other or wait)
|
* (create two, keep one running while we service the other or wait)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if (!worksize) {
|
if (!nthreads) {
|
||||||
/* Pick a work size sufficient to saturate one compute unit */
|
/* Pick nthreads sufficient to saturate one compute unit */
|
||||||
worksize = vg_ocl_device_getsizet(vocp->voc_ocldid,
|
if (vg_ocl_device_gettype(vocp->voc_ocldid) &
|
||||||
|
CL_DEVICE_TYPE_CPU)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = vg_ocl_device_getsizet(vocp->voc_ocldid,
|
||||||
CL_DEVICE_MAX_WORK_GROUP_SIZE);
|
CL_DEVICE_MAX_WORK_GROUP_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
full_worksize = vg_ocl_device_getsizet(vocp->voc_ocldid,
|
full_threads = vg_ocl_device_getsizet(vocp->voc_ocldid,
|
||||||
CL_DEVICE_MAX_COMPUTE_UNITS);
|
CL_DEVICE_MAX_COMPUTE_UNITS);
|
||||||
full_worksize *= worksize;
|
full_threads *= nthreads;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The work size should be set to a point of diminishing
|
||||||
|
* returns for the batch size of the heap_invert kernel.
|
||||||
|
* Each value added to the batch trades one complete modular
|
||||||
|
* inversion for four multiply operations.
|
||||||
|
* Selection of a work size depends on the throughput ratio of
|
||||||
|
* the multiply and modular inversion operations.
|
||||||
|
*
|
||||||
|
* The measured value for the OpenSSL implementations on my CPU
|
||||||
|
* is 80:1. This causes heap_invert to break even with batches
|
||||||
|
* of 20, and receive 10% incremental returns at 200. The CPU
|
||||||
|
* work size is therefore set to 256.
|
||||||
|
*
|
||||||
|
* The ratio on most GPUs with the oclvanitygen implementations
|
||||||
|
* is closer to 500:1, and larger batches are required for
|
||||||
|
* good performance.
|
||||||
|
*/
|
||||||
|
if (!worksize) {
|
||||||
|
if (vg_ocl_device_gettype(vocp->voc_ocldid) &
|
||||||
|
CL_DEVICE_TYPE_GPU)
|
||||||
|
worksize = 2048;
|
||||||
|
else
|
||||||
|
worksize = 256;
|
||||||
|
}
|
||||||
|
|
||||||
if (!ncols) {
|
if (!ncols) {
|
||||||
memsize = vg_ocl_device_getulong(vocp->voc_ocldid,
|
memsize = vg_ocl_device_getulong(vocp->voc_ocldid,
|
||||||
@ -1626,23 +1660,27 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode,
|
|||||||
allocsize = vg_ocl_device_getulong(vocp->voc_ocldid,
|
allocsize = vg_ocl_device_getulong(vocp->voc_ocldid,
|
||||||
CL_DEVICE_MAX_MEM_ALLOC_SIZE);
|
CL_DEVICE_MAX_MEM_ALLOC_SIZE);
|
||||||
memsize /= 2;
|
memsize /= 2;
|
||||||
ncols = full_worksize;
|
ncols = full_threads;
|
||||||
nrows = 1;
|
nrows = 1;
|
||||||
/* Find row and column counts close to sqrt(full_worksize) */
|
/* Find row and column counts close to sqrt(full_threads) */
|
||||||
while ((ncols > nrows) && !(ncols & 1)) {
|
while ((ncols > nrows) && !(ncols & 1)) {
|
||||||
ncols >>= 1;
|
ncols /= 2;
|
||||||
nrows <<= 1;
|
nrows *= 2;
|
||||||
}
|
}
|
||||||
/* Increase row & column counts to saturate device memory */
|
|
||||||
if (!(vg_ocl_device_gettype(vocp->voc_ocldid) &
|
/*
|
||||||
CL_DEVICE_TYPE_CPU)) {
|
* Increase row & column counts to satisfy work size
|
||||||
while (((ncols * nrows * 2 * 128) < memsize) &&
|
* multiplier or fill available memory.
|
||||||
|
*/
|
||||||
|
wsmult = 1;
|
||||||
|
while ((!worksize || (wsmult < worksize)) &&
|
||||||
|
((ncols * nrows * 2 * 128) < memsize) &&
|
||||||
((ncols * nrows * 2 * 64) < allocsize)) {
|
((ncols * nrows * 2 * 64) < allocsize)) {
|
||||||
if (ncols > nrows)
|
if (ncols > nrows)
|
||||||
nrows *= 2;
|
nrows *= 2;
|
||||||
else
|
else
|
||||||
ncols *= 2;
|
ncols *= 2;
|
||||||
}
|
wsmult *= 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1651,7 +1689,7 @@ vg_opencl_loop(vg_context_t *vcp, cl_device_id did, int safe_mode,
|
|||||||
if (!invsize) {
|
if (!invsize) {
|
||||||
invsize = 1;
|
invsize = 1;
|
||||||
while (!(round % (invsize << 1)) &&
|
while (!(round % (invsize << 1)) &&
|
||||||
((round / invsize) > full_worksize))
|
((round / invsize) > full_threads))
|
||||||
invsize <<= 1;
|
invsize <<= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2191,7 +2229,8 @@ usage(const char *name)
|
|||||||
"-p <platform> Select OpenCL platform\n"
|
"-p <platform> Select OpenCL platform\n"
|
||||||
"-d <device> Select OpenCL device\n"
|
"-d <device> Select OpenCL device\n"
|
||||||
"-S Safe mode, disable OpenCL loop unrolling optimizations\n"
|
"-S Safe mode, disable OpenCL loop unrolling optimizations\n"
|
||||||
"-w <worksize> Set target thread count per multiprocessor\n"
|
"-w <worksize> Set work items per thread in a work unit\n"
|
||||||
|
"-t <threads> Set target thread count per multiprocessor\n"
|
||||||
"-g <x>x<y> Set grid size\n"
|
"-g <x>x<y> Set grid size\n"
|
||||||
"-b <invsize> Set modular inverse ops per thread\n"
|
"-b <invsize> Set modular inverse ops per thread\n"
|
||||||
"-f <file> File containing list of patterns, one per line\n"
|
"-f <file> File containing list of patterns, one per line\n"
|
||||||
@ -2215,6 +2254,7 @@ main(int argc, char **argv)
|
|||||||
char **patterns, *pend;
|
char **patterns, *pend;
|
||||||
int verbose = 1;
|
int verbose = 1;
|
||||||
int npatterns = 0;
|
int npatterns = 0;
|
||||||
|
int nthreads = 0;
|
||||||
int worksize = 0;
|
int worksize = 0;
|
||||||
int nrows = 0, ncols = 0;
|
int nrows = 0, ncols = 0;
|
||||||
int invsize = 0;
|
int invsize = 0;
|
||||||
@ -2225,7 +2265,7 @@ main(int argc, char **argv)
|
|||||||
const char *result_file = NULL;
|
const char *result_file = NULL;
|
||||||
|
|
||||||
while ((opt = getopt(argc, argv,
|
while ((opt = getopt(argc, argv,
|
||||||
"vqrikNTX:p:d:w:g:b:Sh?f:o:s:")) != -1) {
|
"vqrikNTX:p:d:w:t:g:b:Sh?f:o:s:")) != -1) {
|
||||||
switch (opt) {
|
switch (opt) {
|
||||||
case 'v':
|
case 'v':
|
||||||
verbose = 2;
|
verbose = 2;
|
||||||
@ -2267,6 +2307,13 @@ main(int argc, char **argv)
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 't':
|
||||||
|
nthreads = atoi(optarg);
|
||||||
|
if (nthreads == 0) {
|
||||||
|
printf("Invalid thread count '%s'\n", optarg);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
break;
|
||||||
case 'g':
|
case 'g':
|
||||||
nrows = 0;
|
nrows = 0;
|
||||||
ncols = strtol(optarg, &pend, 0);
|
ncols = strtol(optarg, &pend, 0);
|
||||||
@ -2407,6 +2454,6 @@ main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
vg_opencl_loop(vcp, did, safe_mode,
|
vg_opencl_loop(vcp, did, safe_mode,
|
||||||
worksize, nrows, ncols, invsize);
|
worksize, nthreads, nrows, ncols, invsize);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user