/*
 * Copyright 2008-2012 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


/*! \file reduce.inl
 *  \brief Inline file for reduce.h
 */

#include <thrust/detail/config.h>
#include <thrust/distance.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/minmax.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/system/detail/generic/select_system.h>

#include <thrust/system/cuda/detail/runtime_introspection.h>
#include <thrust/system/cuda/detail/extern_shared_ptr.h>
#include <thrust/system/cuda/detail/block/reduce.h>
#include <thrust/system/cuda/detail/detail/launch_closure.h>
#include <thrust/system/cuda/detail/detail/launch_calculator.h>
#include <thrust/system/cuda/detail/execution_policy.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{

namespace reduce_detail
{

/*
 * Reduce a vector of n elements using binary_op()
 *
 * The order of reduction is not defined, so binary_op() should
 * be a commutative (and associative) operator such as
 * (integer) addition.  Since floating point operations
 * do not completely satisfy these criteria, the result is
 * generally not the same as a consecutive reduction of
 * the elements.
 *
 * Uses the same pattern as reduce6() in the CUDA SDK
 *
 */
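/*
 * Illustrative note (an addition, not part of the original source): with the
 * grid-stride scheme below, two threads reducing {a, b, c, d} effectively
 * compute binary_op(binary_op(a, c), binary_op(b, d)) rather than the
 * left-to-right fold binary_op(binary_op(binary_op(a, b), c), d).  For float
 * addition the two groupings can round differently, e.g.
 *
 *   (1e8f + 1.0f) + (-1e8f) == 0.0f    // 1.0f is absorbed by 1e8f
 *   (1e8f + (-1e8f)) + 1.0f == 1.0f
 *
 * which is why the comment above warns that floating-point results generally
 * differ from a consecutive reduction.
 */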
template <typename InputIterator,
          typename Size,
          typename T,
          typename OutputIterator,
          typename BinaryFunction,
          typename Context>
struct unordered_reduce_closure
{
  InputIterator input;
  Size n;
  T init;
  OutputIterator output;
  BinaryFunction binary_op;
  unsigned int shared_array_size;

  typedef Context context_type;
  context_type context;

  unordered_reduce_closure(InputIterator input, Size n, T init, OutputIterator output, BinaryFunction binary_op, unsigned int shared_array_size, Context context = Context())
    : input(input), n(n), init(init), output(output), binary_op(binary_op), shared_array_size(shared_array_size), context(context) {}

  __device__ __thrust_forceinline__
  void operator()(void)
  {
    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;
    extern_shared_ptr<OutputType> shared_array;

    Size grid_size = context.block_dimension() * context.grid_dimension();

    Size i = context.linear_index();

    input += i;

    // compute reduction with all blockDim.x threads
    OutputType sum = thrust::raw_reference_cast(*input);

    i += grid_size;
    input += grid_size;

    while (i < n)
    {
      OutputType val = thrust::raw_reference_cast(*input);

      sum = binary_op(sum, val);

      i += grid_size;
      input += grid_size;
    }

    // write first shared_array_size values into shared memory
    if (context.thread_index() < shared_array_size)
      shared_array[context.thread_index()] = sum;

    // accumulate remaining values (if any) to shared memory in stages
    if (context.block_dimension() > shared_array_size)
    {
      unsigned int lb = shared_array_size;
      unsigned int ub = shared_array_size + lb;

      while (lb < context.block_dimension())
      {
        context.barrier();

        if (lb <= context.thread_index() && context.thread_index() < ub)
        {
          OutputType tmp = shared_array[context.thread_index() - lb];
          shared_array[context.thread_index() - lb] = binary_op(tmp, sum);
        }

        lb += shared_array_size;
        ub += shared_array_size;
      }
    }

    context.barrier();

    block::reduce_n(context, shared_array, thrust::min<unsigned int>(context.block_dimension(), shared_array_size), binary_op);

    if (context.thread_index() == 0)
    {
      OutputType tmp = shared_array[0];

      if (context.grid_dimension() == 1)
        tmp = binary_op(init, tmp);

      output += context.block_index();
      *output = tmp;
    }
  }
};
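/*
 * For orientation, a minimal standalone sketch of the same two-phase pattern
 * (grid-stride accumulation, then a shared-memory tree reduction) as a plain
 * CUDA kernel.  This is a hypothetical illustration, not part of Thrust, and
 * it assumes blockDim.x is a power of two:
 *
 *   __global__ void unordered_sum(const float *in, float *out, unsigned int n)
 *   {
 *     extern __shared__ float smem[];
 *
 *     unsigned int i    = blockIdx.x * blockDim.x + threadIdx.x;
 *     unsigned int step = blockDim.x * gridDim.x;
 *
 *     // phase 1: each thread folds a strided slice of the input
 *     float sum = 0.0f;
 *     for (; i < n; i += step)
 *       sum += in[i];
 *
 *     smem[threadIdx.x] = sum;
 *     __syncthreads();
 *
 *     // phase 2: tree reduction within the block
 *     for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
 *     {
 *       if (threadIdx.x < s)
 *         smem[threadIdx.x] += smem[threadIdx.x + s];
 *       __syncthreads();
 *     }
 *
 *     // one partial result per block; a second launch combines them,
 *     // just as the second-level reduction below does
 *     if (threadIdx.x == 0)
 *       out[blockIdx.x] = smem[0];
 *   }
 */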

__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN

template<typename DerivedPolicy,
         typename InputIterator,
         typename OutputType,
         typename BinaryFunction>
OutputType reduce(execution_policy<DerivedPolicy> &exec,
                  InputIterator first,
                  InputIterator last,
                  OutputType init,
                  BinaryFunction binary_op)
{
  // we're attempting to launch a kernel, assert we're compiling with nvcc
  // ========================================================================
  // X Note to the user: If you've found this line due to a compiler error, X
  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
  // ========================================================================
  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );

  typedef typename thrust::iterator_difference<InputIterator>::type difference_type;

  difference_type n = thrust::distance(first,last);

  if (n == 0)
    return init;

  typedef thrust::detail::temporary_array<OutputType, DerivedPolicy> OutputArray;
  typedef typename OutputArray::iterator OutputIterator;

  typedef detail::blocked_thread_array Context;
  typedef unordered_reduce_closure<InputIterator,difference_type,OutputType,OutputIterator,BinaryFunction,Context> Closure;

  function_attributes_t attributes = detail::closure_attributes<Closure>();

  // TODO choose this in a more principled manner
  size_t threshold = thrust::max<size_t>(2 * attributes.maxThreadsPerBlock, 1024);

  device_properties_t properties = device_properties();

  // launch configuration
  size_t num_blocks;
  size_t block_size;
  size_t array_size;
  size_t smem_bytes;

  // first level reduction
  if (static_cast<size_t>(n) < threshold)
  {
    num_blocks = 1;
    block_size = thrust::min(static_cast<size_t>(n), static_cast<size_t>(attributes.maxThreadsPerBlock));
    array_size = thrust::min(block_size, (properties.sharedMemPerBlock - attributes.sharedSizeBytes) / sizeof(OutputType));
    smem_bytes = sizeof(OutputType) * array_size;
  }
  else
  {
    detail::launch_calculator<Closure> calculator;

    thrust::tuple<size_t,size_t,size_t> config = calculator.with_variable_block_size_available_smem();

    num_blocks = thrust::min(thrust::get<0>(config), static_cast<size_t>(n) / thrust::get<1>(config));
    block_size = thrust::get<1>(config);
    array_size = thrust::min(block_size, thrust::get<2>(config) / sizeof(OutputType));
    smem_bytes = sizeof(OutputType) * array_size;
  }

  // TODO assert(n <= num_blocks * block_size);
  // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory"

  OutputArray output(exec, num_blocks);

  Closure closure(first, n, init, output.begin(), binary_op, array_size);

  //std::cout << "Launching " << num_blocks << " blocks of kernel with " << block_size << " threads and " << smem_bytes << " shared memory per block " << std::endl;

  detail::launch_closure(closure, num_blocks, block_size, smem_bytes);

  // second level reduction
  if (num_blocks > 1)
  {
    typedef detail::blocked_thread_array Context;
    typedef unordered_reduce_closure<OutputIterator,difference_type,OutputType,OutputIterator,BinaryFunction,Context> Closure;

    function_attributes_t attributes = detail::closure_attributes<Closure>();

    num_blocks = 1;
    block_size = thrust::min(output.size(), static_cast<size_t>(attributes.maxThreadsPerBlock));
    array_size = thrust::min(block_size, (properties.sharedMemPerBlock - attributes.sharedSizeBytes) / sizeof(OutputType));
    smem_bytes = sizeof(OutputType) * array_size;

    // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory"

    Closure closure(output.begin(), output.size(), init, output.begin(), binary_op, array_size);

    //std::cout << "Launching " << num_blocks << " blocks of kernel with " << block_size << " threads and " << smem_bytes << " shared memory per block " << std::endl;

    detail::launch_closure(closure, num_blocks, block_size, smem_bytes);
  }

  return output[0];
} // end reduce
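/*
 * A worked example of the first-level configuration above (hypothetical
 * numbers, for illustration only): suppose the launch calculator returns
 * (num_blocks, block_size, available_smem) = (120, 256, 4096) for a
 * reduction of n = 1,000,000 floats.  Then
 *
 *   num_blocks = min(120, 1000000 / 256) = 120
 *   block_size = 256
 *   array_size = min(256, 4096 / sizeof(float)) = min(256, 1024) = 256
 *   smem_bytes = 4 * 256 = 1024
 *
 * so the first launch writes 120 block-level partial results into the
 * temporary array, and the second-level launch folds those 120 values
 * (together with init) into output[0].
 */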

} // end reduce_detail

__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END

template<typename DerivedPolicy,
         typename InputIterator,
         typename OutputType,
         typename BinaryFunction>
OutputType reduce(execution_policy<DerivedPolicy> &exec,
                  InputIterator first,
                  InputIterator last,
                  OutputType init,
                  BinaryFunction binary_op)
{
  return reduce_detail::reduce(exec, first, last, init, binary_op);
} // end reduce()

} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust
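/*
 * Example usage of the entry point implemented above (a sketch; for device
 * iterators compiled with nvcc, thrust::reduce dispatches to this backend):
 *
 *   #include <thrust/device_vector.h>
 *   #include <thrust/reduce.h>
 *   #include <thrust/functional.h>
 *
 *   int main(void)
 *   {
 *     thrust::device_vector<int> v(1000, 1);
 *     int sum = thrust::reduce(v.begin(), v.end(), 0, thrust::plus<int>());
 *     // sum == 1000
 *     return 0;
 *   }
 */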