luffa/cube: fine tuning of maxregcount for the 750Ti

This allows 69 registers to be used (tested on Linux). A maxregcount of 69 or
72 makes the compiler use 64 registers, which is not enough on the 750Ti
for optimal performance...
This commit is contained in:
Tanguy Pruvot 2015-06-17 03:56:22 +02:00
parent 634bea21f5
commit 9f5744d4c0
3 changed files with 2 additions and 4 deletions

View File

@ -105,7 +105,7 @@ x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $<

View File

@ -455,7 +455,7 @@
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_luffa512.cu" />
<CudaCompile Include="x11\cuda_x11_luffa512_Cubehash.cu">
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>76</MaxRegCount>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_shavite512.cu">
<MaxRegCount>128</MaxRegCount>

View File

@ -731,8 +731,6 @@ static void finalization512(uint32_t *statebuffer, uint32_t *statechainv, uint32
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(256, 4)
#else
__launch_bounds__(256, 3)
#endif
void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
{