/*
 *  Copyright 2008-2012 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#pragma once

#include <cstddef>
#include <thrust/detail/config.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{

// XXX define our own device_properties_t to avoid errors when #including
//     this file in the absence of a CUDA installation
struct device_properties_t
{
  // mirror the type and spelling of cudaDeviceProp's members
  // keep these alphabetized
  int    major;
  int    maxGridSize[3];
  int    maxThreadsPerBlock;
  int    maxThreadsPerMultiProcessor;
  int    minor;
  int    multiProcessorCount;
  int    regsPerBlock;
  size_t sharedMemPerBlock;
  int    warpSize;
};

// XXX define our own function_attributes_t to avoid errors when #including
//     this file in the absence of a CUDA installation
struct function_attributes_t
{
  // mirror the type and spelling of cudaFuncAttributes' members
  // keep these alphabetized
  size_t constSizeBytes;
  size_t localSizeBytes;
  int    maxThreadsPerBlock;
  int    numRegs;
  size_t sharedSizeBytes;
};
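
// Illustrative sketch only (not part of Thrust's API): when a CUDA
// installation *is* available, these mirrored structs can be populated from
// the runtime API. The device ordinal 0 and the kernel name `my_kernel` below
// are hypothetical.
//
//   cudaDeviceProp p;
//   cudaGetDeviceProperties(&p, 0);
//   device_properties_t props = {
//     p.major,
//     {p.maxGridSize[0], p.maxGridSize[1], p.maxGridSize[2]},
//     p.maxThreadsPerBlock,
//     p.maxThreadsPerMultiProcessor,
//     p.minor,
//     p.multiProcessorCount,
//     p.regsPerBlock,
//     p.sharedMemPerBlock,
//     p.warpSize
//   };
//
//   cudaFuncAttributes a;
//   cudaFuncGetAttributes(&a, my_kernel);
//   function_attributes_t attrs = {
//     a.constSizeBytes, a.localSizeBytes, a.maxThreadsPerBlock,
//     a.numRegs, a.sharedSizeBytes
//   };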

/*! Computes a block size in number of threads for a CUDA kernel using an occupancy-promoting heuristic.
 *  \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest.
 *  \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest.
 *  \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can
 *          accommodate and which is intended to promote occupancy. The result agrees with the calculation performed by
 *          the "CUDA Occupancy Calculator".
 *  \note The __global__ function of interest is presumed to use 0 bytes of dynamically-allocated __shared__ memory.
 */
inline __host__ __device__
std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
                                                        const device_properties_t &properties);
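
// Illustrative use only (`attrs` and `props` are the hypothetical structs
// sketched above):
//
//   std::size_t block_size =
//     block_size_with_maximum_potential_occupancy(attrs, props);
//   // when CUDA is available, launch with e.g. <<<num_blocks, block_size>>>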

/*! Computes a block size in number of threads for a CUDA kernel using an occupancy-promoting heuristic.
 *  Use this version of the function when a CUDA block's dynamically-allocated __shared__ memory requirements
 *  vary with the size of the block.
 *  \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest.
 *  \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest.
 *  \param block_size_to_dynamic_smem_size A unary function which maps an integer CUDA block size to the number of bytes
 *                                         of dynamically-allocated __shared__ memory required by a CUDA block of that size.
 *  \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can
 *          accommodate and which is intended to promote occupancy. The result agrees with the calculation performed by
 *          the "CUDA Occupancy Calculator".
 */
template<typename UnaryFunction>
inline __host__ __device__
std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
                                                        const device_properties_t &properties,
                                                        UnaryFunction block_size_to_dynamic_smem_size);
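
// Illustrative only: a hypothetical functor usable as the UnaryFunction
// argument, for a kernel that needs one int of dynamic __shared__ memory per
// thread.
//
//   struct smem_per_thread
//   {
//     __host__ __device__
//     std::size_t operator()(std::size_t block_size) const
//     {
//       return block_size * sizeof(int);
//     }
//   };
//
//   std::size_t block_size =
//     block_size_with_maximum_potential_occupancy(attrs, props, smem_per_thread());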

/*! Returns the maximum amount of dynamic shared memory each block
 *  can utilize without reducing thread occupancy.
 *
 *  \param properties CUDA device properties
 *  \param attributes CUDA function attributes
 *  \param blocks_per_processor Number of blocks per streaming multiprocessor
 */
inline __host__ __device__
size_t proportional_smem_allocation(const device_properties_t &properties,
                                    const function_attributes_t &attributes,
                                    size_t blocks_per_processor);
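
// Illustrative pairing of the utilities above (not a prescribed recipe; `attrs`
// and `props` are the hypothetical values sketched earlier):
//
//   std::size_t block_size =
//     block_size_with_maximum_potential_occupancy(attrs, props);
//   std::size_t blocks_per_sm =
//     cuda_launch_config_detail::max_active_blocks_per_multiprocessor(props, attrs, block_size, 0);
//   std::size_t dynamic_smem_bytes =
//     proportional_smem_allocation(props, attrs, blocks_per_sm);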

/*! Returns the largest candidate block size, searching downward from the
 *  device/kernel maximum in steps of the warp size, whose total __shared__
 *  memory usage (static plus dynamic) fits within the per-block limit reported
 *  by the device properties, or 0 if no such block size exists.
 *
 *  \param properties CUDA device properties
 *  \param attributes CUDA function attributes
 *  \param blocksize_to_dynamic_smem_usage A unary function mapping a block size to its dynamic __shared__ memory usage in bytes
 */
template<typename UnaryFunction>
inline __host__ __device__
size_t max_blocksize_subject_to_smem_usage(const device_properties_t &properties,
                                           const function_attributes_t &attributes,
                                           UnaryFunction blocksize_to_dynamic_smem_usage);


namespace cuda_launch_config_detail
{

using std::size_t;

namespace util
{


template<typename T>
inline __host__ __device__
T min_(const T &lhs, const T &rhs)
{
  return rhs < lhs ? rhs : lhs;
}


template <typename T>
struct zero_function
{
  inline __host__ __device__
  T operator()(T)
  {
    return 0;
  }
};


// x/y rounding towards +infinity for integers, used to determine # of blocks/warps etc.
template<typename L, typename R>
inline __host__ __device__ L divide_ri(const L x, const R y)
{
  return (x + (y - 1)) / y;
}

// x/y rounding towards zero for integers, used to determine # of blocks/warps etc.
template<typename L, typename R>
inline __host__ __device__ L divide_rz(const L x, const R y)
{
  return x / y;
}

// round x towards infinity to the next multiple of y
template<typename L, typename R>
inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); }

// round x towards zero to the next multiple of y
template<typename L, typename R>
inline __host__ __device__ L round_z(const L x, const R y){ return y * divide_rz(x, y); }
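
// For example (values chosen only to illustrate the helpers above):
//   divide_ri(1000, 96) == 11 and round_i(1000, 96) == 1056, while
//   divide_rz(1000, 96) == 10 and round_z(1000, 96) == 960.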

} // end namespace util


// granularity of shared memory allocation
inline __host__ __device__
size_t smem_allocation_unit(const device_properties_t &properties)
{
  switch(properties.major)
  {
    case 1:  return 512;
    case 2:  return 128;
    case 3:  return 256;
    default: return 256; // unknown GPU; have to guess
  }
}


// granularity of register allocation
inline __host__ __device__
size_t reg_allocation_unit(const device_properties_t &properties, const size_t regsPerThread)
{
  switch(properties.major)
  {
    case 1:  return (properties.minor <= 1) ? 256 : 512;
    case 2:  switch(regsPerThread)
             {
               case 21:
               case 22:
               case 29:
               case 30:
               case 37:
               case 38:
               case 45:
               case 46:
                 return 128;
               default:
                 return 64;
             }
    case 3:  return 256;
    default: return 256; // unknown GPU; have to guess
  }
}


// granularity of warp allocation
inline __host__ __device__
size_t warp_allocation_multiple(const device_properties_t &properties)
{
  return (properties.major <= 1) ? 2 : 1;
}

// number of "sides" into which the multiprocessor is partitioned
inline __host__ __device__
size_t num_sides_per_multiprocessor(const device_properties_t &properties)
{
  switch(properties.major)
  {
    case 1:  return 1;
    case 2:  return 2;
    case 3:  return 4;
    default: return 4; // unknown GPU; have to guess
  }
}


inline __host__ __device__
size_t max_blocks_per_multiprocessor(const device_properties_t &properties)
{
  return (properties.major <= 2) ? 8 : 16;
}


inline __host__ __device__
size_t max_active_blocks_per_multiprocessor(const device_properties_t   &properties,
                                            const function_attributes_t &attributes,
                                            int CTA_SIZE,
                                            size_t dynamic_smem_bytes)
{
  // Determine the maximum number of CTAs that can be run simultaneously per SM
  // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet

  //////////////////////////////////////////
  // Limits due to threads/SM or blocks/SM
  //////////////////////////////////////////
  const size_t maxThreadsPerSM = properties.maxThreadsPerMultiProcessor;  // 768, 1024, 1536, etc.
  const size_t maxBlocksPerSM  = max_blocks_per_multiprocessor(properties);

  // Calc limits
  const size_t ctaLimitThreads = (CTA_SIZE <= properties.maxThreadsPerBlock) ? maxThreadsPerSM / CTA_SIZE : 0;
  const size_t ctaLimitBlocks  = maxBlocksPerSM;

  //////////////////////////////////////////
  // Limits due to shared memory/SM
  //////////////////////////////////////////
  const size_t smemAllocationUnit = smem_allocation_unit(properties);
  const size_t smemBytes  = attributes.sharedSizeBytes + dynamic_smem_bytes;
  const size_t smemPerCTA = util::round_i(smemBytes, smemAllocationUnit);

  // Calc limit
  const size_t ctaLimitSMem = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;

  //////////////////////////////////////////
  // Limits due to registers/SM
  //////////////////////////////////////////
  const size_t regAllocationUnit      = reg_allocation_unit(properties, attributes.numRegs);
  const size_t warpAllocationMultiple = warp_allocation_multiple(properties);
  const size_t numWarps = util::round_i(util::divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple);

  // Calc limit
  size_t ctaLimitRegs;
  if(properties.major <= 1)
  {
    // GPUs of compute capability 1.x allocate registers to CTAs
    // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
    const size_t regsPerCTA = util::round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit);
    ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM;
  }
  else
  {
    // GPUs of compute capability 2.x and higher allocate registers to warps
    // Number of regs per warp is regs per thread times warp size, rounded up to allocation unit
    const size_t regsPerWarp = util::round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
    const size_t numSides = num_sides_per_multiprocessor(properties);
    const size_t numRegsPerSide = properties.regsPerBlock / numSides;
    ctaLimitRegs = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM;
  }

  //////////////////////////////////////////
  // Overall limit is min() of limits due to above reasons
  //////////////////////////////////////////
  return util::min_(ctaLimitRegs, util::min_(ctaLimitSMem, util::min_(ctaLimitThreads, ctaLimitBlocks)));
}
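
// Illustrative worked example with hypothetical sm_20-class numbers
// (maxThreadsPerMultiProcessor = 1536, maxThreadsPerBlock = 1024,
// sharedMemPerBlock = 48K, regsPerBlock = 32768, warpSize = 32) and a kernel
// with numRegs = 20, sharedSizeBytes = 0, launched with CTA_SIZE = 256 and no
// dynamic __shared__ memory:
//   ctaLimitThreads = 1536/256 = 6, ctaLimitBlocks = 8, ctaLimitSMem = 8,
//   numWarps = 8, regsPerWarp = round_i(20*32, 64) = 640,
//   ctaLimitRegs = ((16384/640)*2)/8 = 6,
// so the function returns 6 active CTAs per multiprocessor
// (full occupancy: 6*256 = 1536 threads).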


} // end namespace cuda_launch_config_detail


template<typename UnaryFunction>
inline __host__ __device__
std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
                                                        const device_properties_t &properties,
                                                        UnaryFunction block_size_to_dynamic_smem_size)
{
  size_t max_occupancy     = properties.maxThreadsPerMultiProcessor;
  size_t largest_blocksize = cuda_launch_config_detail::util::min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
  size_t granularity       = properties.warpSize;
  size_t max_blocksize     = 0;
  size_t highest_occupancy = 0;

  for(size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity)
  {
    size_t occupancy = blocksize * cuda_launch_config_detail::max_active_blocks_per_multiprocessor(properties, attributes, blocksize, block_size_to_dynamic_smem_size(blocksize));

    if(occupancy > highest_occupancy)
    {
      max_blocksize = blocksize;
      highest_occupancy = occupancy;
    }

    // early out, can't do better
    if(highest_occupancy == max_occupancy)
      break;
  }

  return max_blocksize;
}


inline __host__ __device__
std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
                                                        const device_properties_t &properties)
{
  return block_size_with_maximum_potential_occupancy(attributes, properties, cuda_launch_config_detail::util::zero_function<std::size_t>());
}


inline __host__ __device__
size_t proportional_smem_allocation(const device_properties_t &properties,
                                    const function_attributes_t &attributes,
                                    size_t blocks_per_processor)
{
  size_t smem_per_processor    = properties.sharedMemPerBlock;
  size_t smem_allocation_unit  = cuda_launch_config_detail::smem_allocation_unit(properties);

  size_t total_smem_per_block  = cuda_launch_config_detail::util::round_z(smem_per_processor / blocks_per_processor, smem_allocation_unit);
  size_t static_smem_per_block = attributes.sharedSizeBytes;

  return total_smem_per_block - static_smem_per_block;
}


template<typename UnaryFunction>
inline __host__ __device__
size_t max_blocksize_subject_to_smem_usage(const device_properties_t &properties,
                                           const function_attributes_t &attributes,
                                           UnaryFunction blocksize_to_dynamic_smem_usage)
{
  size_t largest_blocksize = (thrust::min)(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
  size_t granularity = properties.warpSize;

  for(int blocksize = largest_blocksize; blocksize > 0; blocksize -= granularity)
  {
    size_t total_smem_usage = blocksize_to_dynamic_smem_usage(blocksize) + attributes.sharedSizeBytes;

    if(total_smem_usage <= properties.sharedMemPerBlock)
    {
      return blocksize;
    }
  }

  return 0;
}


} // end detail
} // end cuda
} // end system
} // end thrust