ccminer-gostd-lite/compat/thrust/system/cuda/detail/detail/set_operation.inl

/*
 *  Copyright 2008-2012 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#include <thrust/detail/config.h>
#include <thrust/system/cuda/detail/detail/set_operation.h>
#include <thrust/system/cuda/detail/detail/balanced_path.h>
#include <thrust/system/cuda/detail/block/inclusive_scan.h>
#include <thrust/system/cuda/detail/block/exclusive_scan.h>
#include <thrust/system/cuda/detail/block/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/scan.h>
#include <thrust/pair.h>
#include <thrust/detail/util/blocking.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/cstdint.h>
#include <thrust/detail/minmax.h>


namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace detail
{
namespace set_operation_detail
{


using thrust::system::cuda::detail::detail::statically_blocked_thread_array;
using thrust::detail::uint16_t;
using thrust::detail::uint32_t;


// Compile-time switch deciding whether values of type T are staged through
// __shared__ memory. The threshold was determined empirically on sm_20:
// kernels fail to launch when larger value_types are placed in smem.
template<typename T>
  struct stage_through_smem
{
  // true iff T occupies no more storage than six 32-bit words
  static const bool value = !(sizeof(T) > sizeof(uint32_t[6]));
};


// Serially copies the elements of [first, first + max_input_size) whose
// corresponding bit in mask is set to the range beginning at result,
// preserving their relative order. Bit i of mask selects the i-th element.
//
// max_input_size must be <= 32, the number of bits in mask.
//
// Returns an iterator to the end of the output range.
template<typename Size, typename InputIterator, typename OutputIterator>
inline __device__
  OutputIterator serial_bounded_copy_if(Size max_input_size,
                                        InputIterator first,
                                        uint32_t mask,
                                        OutputIterator result)
{
  for(Size i = 0; i < max_input_size; ++i, ++first)
  {
    // test the bit with an unsigned 32-bit one: shifting the int literal 1
    // into the sign bit (i == 31) is not well defined for a signed type
    if((uint32_t(1) << i) & mask)
    {
      *result = *first;
      ++result;
    }
  }

  return result;
}


// Functor mapping a partition index i to the pair of offsets (one into each
// input range) returned by balanced_path for the diagonal i * partition_size,
// clamped to the combined size of both inputs.
template<typename Size, typename InputIterator1, typename InputIterator2, typename Compare>
  struct find_partition_offsets_functor
{
  Size partition_size;
  InputIterator1 first1;
  InputIterator2 first2;
  Size n1, n2;
  Compare comp;

  // Stores the beginning of each input range together with its length.
  find_partition_offsets_functor(Size partition_size_,
                                 InputIterator1 first1_, InputIterator1 last1_,
                                 InputIterator2 first2_, InputIterator2 last2_,
                                 Compare comp_)
    : partition_size(partition_size_),
      first1(first1_), first2(first2_),
      n1(last1_ - first1_), n2(last2_ - first2_),
      comp(comp_)
  {}

  inline __host__ __device__
  thrust::pair<Size,Size> operator()(Size i) const
  {
    // never search past the end of the combined input
    Size clamped_diag = thrust::min(n1 + n2, i * partition_size);

    // XXX the result of balanced_path depends critically on the ll suffix of
    //     the literal below; the reason is not understood
    return balanced_path(first1, n1, first2, n2, clamped_diag, 4ll, comp);
  }
};


// Writes, for each partition index in [0, num_partitions), the pair of input
// offsets computed by find_partition_offsets_functor to the range beginning
// at result. Returns the end of the output range.
template<typename Size, typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare>
  OutputIterator find_partition_offsets(thrust::cuda::execution_policy<DerivedPolicy> &exec,
                                        Size num_partitions,
                                        Size partition_size,
                                        InputIterator1 first1, InputIterator1 last1,
                                        InputIterator2 first2, InputIterator2 last2,
                                        OutputIterator result,
                                        Compare comp)
{
  typedef find_partition_offsets_functor<Size,InputIterator1,InputIterator2,Compare> offsets_functor;
  typedef thrust::counting_iterator<Size>                                            index_iterator;

  // the partition indices 0, 1, ..., num_partitions - 1
  index_iterator indices_begin(0);
  index_iterator indices_end(num_partitions);

  offsets_functor compute_offsets(partition_size, first1, last1, first2, last2, comp);

  return thrust::transform(exec, indices_begin, indices_end, result, compute_offsets);
}


namespace block
{


// Block-wide shift: each thread receives the value x supplied by the thread
// whose index is one greater than its own; the highest-indexed thread
// receives boundary instead.
//
// The exchange goes through a __shared__ buffer holding only block_size / 2
// elements, so it is performed in two passes (lower half, then upper half).
//
// NOTE(review): the indexing below appears to assume that block_size is even
// and that ctx.thread_index() ranges over [0, block_size) -- confirm.
// Every thread of the block must call this function, since it contains
// unconditional barriers.
template<unsigned int block_size, typename T>
inline __device__
T right_neighbor(statically_blocked_thread_array<block_size> &ctx, const T &x, const T &boundary)
{
  // stage this shift in two halves to conserve smem
  const unsigned int storage_size = block_size / 2;
  __shared__ uninitialized_array<T,storage_size> shared;

  // threads that never read from the buffer keep their own value
  T result = x;

  unsigned int tid = ctx.thread_index();

  // pass 1: threads 1..storage_size publish x for their left neighbor
  if(0 < tid && tid <= storage_size)
  {
    shared[tid - 1] = x;
  }

  ctx.barrier();

  // threads 0..storage_size-1 pick up the value of thread tid + 1
  if(tid < storage_size)
  {
    result = shared[tid];
  }

  // all reads must complete before the buffer is reused for pass 2
  ctx.barrier();

  // pass 2: rebase the index onto the upper half; tid is unsigned, so for
  // lower-half threads this wraps to a large value that fails every test below
  tid -= storage_size;
  if(0 < tid && tid <= storage_size)
  {
    shared[tid - 1] = x;
  }
  else if(tid == 0)
  {
    // the first upper-half thread already published x in pass 1; it fills the
    // last slot with boundary for the last thread to read
    shared[storage_size-1] = boundary;
  }

  ctx.barrier();

  if(tid < storage_size)
  {
    result = shared[tid];
  }

  // keep the buffer intact until every thread has read from it
  ctx.barrier();

  return result;
}


// Counts, cooperatively across the block, the number of elements the set
// operation would produce for the inputs [first1, first1 + n1) and
// [first2, first2 + n2). Each thread counts the output of its own slice of
// the input; the per-thread counts are then combined with an inclusive scan
// and the last scanned value is returned.
template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename Compare, typename SetOperation>
inline __device__
  unsigned int bounded_count_set_operation_n(statically_blocked_thread_array<block_size> &ctx,
                                             InputIterator1 first1, uint16_t n1,
                                             InputIterator2 first2, uint16_t n2,
                                             Compare comp,
                                             SetOperation set_op)
{
  typedef thrust::pair<uint16_t,uint16_t> offset_pair;

  unsigned int tid = ctx.thread_index();

  // locate where this thread's slice of the input begins; it ends where the
  // next thread's slice begins (or at the end of the input for the last thread)
  uint16_t my_diag = thrust::min<uint16_t>(n1 + n2, tid * work_per_thread);
  offset_pair my_begin = balanced_path(first1, n1, first2, n2, my_diag, 2, comp);
  offset_pair my_end   = block::right_neighbor<block_size>(ctx, my_begin, thrust::make_pair(n1,n2));

  __shared__ uint16_t s_counts[block_size];

  // work_per_thread + 1 to accommodate a "starred" partition returned from balanced_path above
  s_counts[tid] =
    set_op.count(work_per_thread + 1,
                 first1 + my_begin.first,  first1 + my_end.first,
                 first2 + my_begin.second, first2 + my_end.second,
                 comp);

  ctx.barrier();

  // after the inclusive scan the final element holds the block-wide total
  thrust::system::cuda::detail::block::inplace_inclusive_scan(ctx, s_counts);
  return s_counts[ctx.block_dimension() - 1];
}


// Returns the number of bits set in x.
inline __device__ int pop_count(unsigned int x)
{
// guard use of __popc from other compilers
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
  return __popc(x);
#else
  // portable fallback: clear the lowest set bit until none remain
  int num_set_bits = 0;
  for(; x != 0; x &= x - 1)
  {
    ++num_set_bits;
  }
  return num_set_bits;
#endif
}


// Performs the set operation cooperatively across the block on the inputs
// [first1, first1 + n1) and [first2, first2 + n2), writing the output to the
// range beginning at result. Each thread applies set_op to its own slice of
// the input, producing a sparse local result plus a bit mask of the valid
// entries; an exclusive scan of the per-thread counts then gives each thread
// the position at which to write its compacted output.
//
// Returns the end of the block's output range.
template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
inline __device__
  OutputIterator bounded_set_operation_n(statically_blocked_thread_array<block_size> &ctx,
                                         InputIterator1 first1, uint16_t n1,
                                         InputIterator2 first2, uint16_t n2,
                                         OutputIterator result,
                                         Compare comp,
                                         SetOperation set_op)
{
  typedef thrust::pair<uint16_t,uint16_t>                          offset_pair;
  typedef typename thrust::iterator_value<InputIterator1>::type    value_type;

  unsigned int tid = ctx.thread_index();

  // locate where this thread's slice of the input begins; it ends where the
  // next thread's slice begins (or at the end of the input for the last thread)
  uint16_t my_diag = thrust::min<uint16_t>(n1 + n2, tid * work_per_thread);
  offset_pair my_begin = balanced_path(first1, n1, first2, n2, my_diag, 2, comp);
  offset_pair my_end   = block::right_neighbor<block_size>(ctx, my_begin, thrust::make_pair(n1,n2));

  // +1 to accommodate a "starred" partition returned from balanced_path above
  uninitialized_array<value_type, work_per_thread + 1> local_output;
  uint32_t valid_mask =
    set_op(work_per_thread + 1,
           first1 + my_begin.first,  first1 + my_end.first,
           first2 + my_begin.second, first2 + my_end.second,
           local_output.begin(),
           comp);

  // one bit of the mask per element this thread contributes
  __shared__ uint16_t s_offsets[block_size];
  s_offsets[tid] = pop_count(valid_mask);

  ctx.barrier();

  // scan to turn per-thread counts into output indices
  uint16_t block_total = thrust::system::cuda::detail::block::inplace_exclusive_scan(ctx, s_offsets, 0u);

  // compact this thread's sparse output into its place in the result
  serial_bounded_copy_if(work_per_thread + 1, local_output.begin(), valid_mask, result + s_offsets[tid]);

  ctx.barrier();

  return result + block_total;
}


// Counts, cooperatively across the block, the number of elements the set
// operation would produce for the inputs [first1, last1) and [first2, last2).
//
// The input is consumed in subpartitions of bounded size; each one is counted
// by block::bounded_count_set_operation_n and the counts are accumulated.
// When stage_through_smem allows it for InputIterator2's value_type, each
// subpartition is first copied into __shared__ memory and counted from there.
//
// Returns the total output size.
template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename Compare, typename SetOperation>
inline __device__
  typename thrust::iterator_difference<InputIterator1>::type
    count_set_operation(statically_blocked_thread_array<block_size> &ctx,
                        InputIterator1 first1, InputIterator1 last1,
                        InputIterator2 first2, InputIterator2 last2,
                        Compare comp,
                        SetOperation set_op)
{
  typedef typename thrust::iterator_difference<InputIterator1>::type difference;
  typedef typename thrust::iterator_value<InputIterator2>::type      value_type;

  // upper bound on the diagonal of one subpartition
  // -1 to accommodate "starred" partitions
  const uint16_t max_subpartition_size = block_size * work_per_thread - 1;

  difference result = 0;

  thrust::pair<difference,difference> remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2);

  // iterate until the input is consumed
  while(remaining_input_size.first + remaining_input_size.second > 0)
  {
    // find the end of this subpartition's input
    difference diag = thrust::min<difference>(remaining_input_size.first + remaining_input_size.second, max_subpartition_size);
    thrust::pair<uint16_t,uint16_t> subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp);

    if(stage_through_smem<value_type>::value)
    {
      // load the input into __shared__ storage: first the elements taken from
      // the first range, then, immediately after them, those from the second
      __shared__ uninitialized_array<value_type, block_size * work_per_thread> s_input;

      value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first,  s_input.begin());
      thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1);

      result += block::bounded_count_set_operation_n<block_size,work_per_thread>(ctx,
                                                                                 s_input.begin(), subpartition_size.first,
                                                                                 s_input_end1,    subpartition_size.second,
                                                                                 comp,
                                                                                 set_op);
    }
    else
    {
      // operate directly on the original input
      result += block::bounded_count_set_operation_n<block_size,work_per_thread>(ctx,
                                                                                 first1, subpartition_size.first,
                                                                                 first2, subpartition_size.second,
                                                                                 comp,
                                                                                 set_op);
    }

    // advance input
    first1 += subpartition_size.first;
    first2 += subpartition_size.second;

    // decrement remaining size
    remaining_input_size.first  -= subpartition_size.first;
    remaining_input_size.second -= subpartition_size.second;
  }

  return result;
}


// Performs the set operation cooperatively across the block on the inputs
// [first1, last1) and [first2, last2), writing the output to the range
// beginning at result.
//
// The input is consumed in subpartitions of bounded size; each one is handled
// by block::bounded_set_operation_n, which also advances the output position.
// When stage_through_smem allows it for InputIterator2's value_type, each
// subpartition is first copied into __shared__ memory and processed from there.
//
// Returns the end of the output range.
template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
inline __device__
OutputIterator set_operation(statically_blocked_thread_array<block_size> &ctx,
                             InputIterator1 first1, InputIterator1 last1,
                             InputIterator2 first2, InputIterator2 last2,
                             OutputIterator result,
                             Compare comp,
                             SetOperation set_op)
{
  typedef typename thrust::iterator_difference<InputIterator1>::type difference;
  typedef typename thrust::iterator_value<InputIterator2>::type      value_type;

  // upper bound on the diagonal of one subpartition
  // -1 to accommodate "starred" partitions
  const uint16_t max_subpartition_size = block_size * work_per_thread - 1;

  thrust::pair<difference,difference> remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2);

  // iterate until the input is consumed
  while(remaining_input_size.first + remaining_input_size.second > 0)
  {
    // find the end of this subpartition's input
    difference diag = thrust::min<difference>(remaining_input_size.first + remaining_input_size.second, max_subpartition_size);
    thrust::pair<uint16_t,uint16_t> subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp);

    if(stage_through_smem<value_type>::value)
    {
      // load the input into __shared__ storage: first the elements taken from
      // the first range, then, immediately after them, those from the second
      __shared__ uninitialized_array<value_type, block_size * work_per_thread> s_input;

      value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first,  s_input.begin());
      thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1);

      result = block::bounded_set_operation_n<block_size,work_per_thread>(ctx,
                                                                          s_input.begin(), subpartition_size.first,
                                                                          s_input_end1,    subpartition_size.second,
                                                                          result,
                                                                          comp,
                                                                          set_op);
    }
    else
    {
      // operate directly on the original input
      result = block::bounded_set_operation_n<block_size,work_per_thread>(ctx,
                                                                          first1, subpartition_size.first,
                                                                          first2, subpartition_size.second,
                                                                          result,
                                                                          comp,
                                                                          set_op);
    }

    // advance input
    first1 += subpartition_size.first;
    first2 += subpartition_size.second;

    // decrement remaining size
    remaining_input_size.first  -= subpartition_size.first;
    remaining_input_size.second -= subpartition_size.second;
  }

  return result;
}


} // end namespace block


// For every partition assigned to the calling block, counts the size of the
// set operation's output for that partition and stores it in
// result[partition index].
//
// A block starts at the partition given by its block index and advances by
// the grid dimension until num_partitions is reached. The bounds of partition
// p are read from input_partition_offsets[p] and input_partition_offsets[p + 1].
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
  inline __device__ void count_set_operation(statically_blocked_thread_array<threads_per_block> &ctx,
                                             InputIterator1                                      input_partition_offsets,
                                             Size                                                num_partitions,
                                             InputIterator2                                      first1,
                                             InputIterator3                                      first2,
                                             OutputIterator                                      result,
                                             Compare                                             comp,
                                             SetOperation                                        set_op)
{
  typedef typename thrust::iterator_difference<InputIterator2>::type difference;

  // consume partitions
  Size p = ctx.block_index();
  while(p < num_partitions)
  {
    // look up where this partition begins and ends in each input
    thrust::pair<difference,difference> offsets_begin = input_partition_offsets[p];
    thrust::pair<difference,difference> offsets_end   = input_partition_offsets[p + 1];

    // the whole block cooperates to count the partition's output
    difference partition_count = block::count_set_operation<threads_per_block,work_per_thread>(ctx,
                                                                                               first1 + offsets_begin.first,  first1 + offsets_end.first,
                                                                                               first2 + offsets_begin.second, first2 + offsets_end.second,
                                                                                               comp,
                                                                                               set_op);

    // a single thread records the count
    if(ctx.thread_index() == 0)
    {
      result[p] = partition_count;
    }

    p += ctx.grid_dimension();
  }
}


// Nullary device functor bundling the arguments of count_set_operation.
// Invoking it creates the block's thread-array context and forwards the
// stored arguments to count_set_operation.
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
  struct count_set_operation_closure
{
  typedef statically_blocked_thread_array<threads_per_block> context_type;

  InputIterator1 input_partition_offsets;
  Size           num_partitions;
  InputIterator2 first1;
  InputIterator3 first2;
  OutputIterator result;
  Compare        comp;
  SetOperation   set_op;

  // Captures every argument by value.
  count_set_operation_closure(InputIterator1 input_partition_offsets_,
                              Size           num_partitions_,
                              InputIterator2 first1_,
                              InputIterator3 first2_,
                              OutputIterator result_,
                              Compare        comp_,
                              SetOperation   set_op_)
    : input_partition_offsets(input_partition_offsets_),
      num_partitions(num_partitions_),
      first1(first1_),
      first2(first2_),
      result(result_),
      comp(comp_),
      set_op(set_op_)
  {}

  inline __device__ void operator()() const
  {
    context_type block_ctx;
    count_set_operation<threads_per_block,work_per_thread>(block_ctx,
                                                           input_partition_offsets,
                                                           num_partitions,
                                                           first1,
                                                           first2,
                                                           result,
                                                           comp,
                                                           set_op);
  }
};


// Factory for count_set_operation_closure: the caller names only
// threads_per_block and work_per_thread, the remaining template arguments are
// deduced from the function arguments.
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
  count_set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,OutputIterator,Compare,SetOperation>
    make_count_set_operation_closure(InputIterator1 input_partition_offsets,
                                     Size           num_partitions,
                                     InputIterator2 first1,
                                     InputIterator3 first2,
                                     OutputIterator result,
                                     Compare        comp,
                                     SetOperation   set_op)
{
  count_set_operation_closure<threads_per_block,
                              work_per_thread,
                              InputIterator1,
                              Size,
                              InputIterator2,
                              InputIterator3,
                              OutputIterator,
                              Compare,
                              SetOperation> closure(input_partition_offsets,
                                                    num_partitions,
                                                    first1,
                                                    first2,
                                                    result,
                                                    comp,
                                                    set_op);
  return closure;
}


// For every partition assigned to the calling block, performs the set
// operation on that partition's input and writes the output starting at
// result + output_partition_offsets[partition index].
//
// A block starts at the partition given by its block index and advances by
// the grid dimension until num_partitions is reached. The bounds of partition
// p are read from input_partition_offsets[p] and input_partition_offsets[p + 1].
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
inline __device__
  void set_operation(statically_blocked_thread_array<threads_per_block> &ctx,
                     InputIterator1                                      input_partition_offsets,
                     Size                                                num_partitions,
                     InputIterator2                                      first1,
                     InputIterator3                                      first2,
                     InputIterator4                                      output_partition_offsets,
                     OutputIterator                                      result,
                     Compare                                             comp,
                     SetOperation                                        set_op)
{
  typedef typename thrust::iterator_difference<InputIterator2>::type difference;

  // consume partitions
  Size p = ctx.block_index();
  while(p < num_partitions)
  {
    // look up where this partition begins and ends in each input
    thrust::pair<difference,difference> offsets_begin = input_partition_offsets[p];
    thrust::pair<difference,difference> offsets_end   = input_partition_offsets[p + 1];

    // the whole block cooperates to produce the partition's output
    block::set_operation<threads_per_block,work_per_thread>(ctx,
                                                            first1 + offsets_begin.first,  first1 + offsets_end.first,
                                                            first2 + offsets_begin.second, first2 + offsets_end.second,
                                                            result + output_partition_offsets[p],
                                                            comp,
                                                            set_op);

    p += ctx.grid_dimension();
  }
}


// Nullary device functor bundling the arguments of the partition-wise
// set_operation. Invoking it creates the block's thread-array context and
// forwards the stored arguments to set_operation.
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
  struct set_operation_closure
{
  typedef statically_blocked_thread_array<threads_per_block> context_type;

  InputIterator1 input_partition_offsets;
  Size           num_partitions;
  InputIterator2 first1;
  InputIterator3 first2;
  InputIterator4 output_partition_offsets;
  OutputIterator result;
  Compare        comp;
  SetOperation   set_op;

  // Captures every argument by value.
  set_operation_closure(InputIterator1 input_partition_offsets_,
                        Size           num_partitions_,
                        InputIterator2 first1_,
                        InputIterator3 first2_,
                        InputIterator4 output_partition_offsets_,
                        OutputIterator result_,
                        Compare        comp_,
                        SetOperation   set_op_)
    : input_partition_offsets(input_partition_offsets_),
      num_partitions(num_partitions_),
      first1(first1_),
      first2(first2_),
      output_partition_offsets(output_partition_offsets_),
      result(result_),
      comp(comp_),
      set_op(set_op_)
  {}

  inline __device__ void operator()() const
  {
    context_type block_ctx;
    set_operation<threads_per_block,work_per_thread>(block_ctx,
                                                     input_partition_offsets,
                                                     num_partitions,
                                                     first1,
                                                     first2,
                                                     output_partition_offsets,
                                                     result,
                                                     comp,
                                                     set_op);
  }
};


// Factory for set_operation_closure: the caller names only threads_per_block
// and work_per_thread, the remaining template arguments are deduced from the
// function arguments.
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
  set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,InputIterator4,OutputIterator,Compare,SetOperation>
    make_set_operation_closure(InputIterator1 input_partition_offsets,
                               Size           num_partitions,
                               InputIterator2 first1,
                               InputIterator3 first2,
                               InputIterator4 output_partition_offsets,
                               OutputIterator result,
                               Compare        comp,
                               SetOperation   set_op)
{
  set_operation_closure<threads_per_block,
                        work_per_thread,
                        InputIterator1,
                        Size,
                        InputIterator2,
                        InputIterator3,
                        InputIterator4,
                        OutputIterator,
                        Compare,
                        SetOperation> closure(input_partition_offsets,
                                              num_partitions,
                                              first1,
                                              first2,
                                              output_partition_offsets,
                                              result,
                                              comp,
                                              set_op);
  return closure;
}


} // end namespace set_operation_detail


// Host-side driver for the generic set operation on [first1, last1) and
// [first2, last2).
//
// Steps:
//   1. split the combined input into partitions and compute, for each one,
//      the offsets at which it begins in both inputs,
//   2. launch a kernel counting the output size of every partition,
//   3. exclusive-scan the counts into output offsets,
//   4. launch a kernel performing the set operation per partition, writing
//      at the scanned offsets.
//
// Returns the end of the output range; result is returned unchanged when
// both inputs are empty.
template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
  OutputIterator set_operation(thrust::cuda::execution_policy<DerivedPolicy> &exec,
                               InputIterator1 first1, InputIterator1 last1,
                               InputIterator2 first2, InputIterator2 last2,
                               OutputIterator result,
                               Compare comp,
                               SetOperation set_op)
{
  using thrust::system::cuda::detail::device_properties;
  using thrust::system::cuda::detail::detail::launch_closure;
  namespace impl = thrust::system::cuda::detail::detail::set_operation_detail;

  typedef typename thrust::iterator_difference<InputIterator1>::type difference;

  const difference n1 = last1 - first1;
  const difference n2 = last2 - first2;

  // nothing to do when both inputs are empty
  if(n1 == 0 && n2 == 0)
  {
    return result;
  }

  // launch configuration: each thread handles work_per_thread input elements
  const thrust::detail::uint16_t work_per_thread   = 15;
  const thrust::detail::uint16_t threads_per_block = 128;
  const thrust::detail::uint16_t work_per_block    = threads_per_block * work_per_thread;

  // -1 because balanced_path adds a single element to the end of a "starred" partition, increasing its size by one
  const thrust::detail::uint16_t maximum_partition_size = work_per_block - 1;
  const difference num_partitions = thrust::detail::util::divide_ri(n1 + n2, maximum_partition_size);

  // step 1: input partition offsets
  // one extra entry so that the end of the last partition can be looked up like any other
  thrust::detail::temporary_array<thrust::pair<difference,difference>, DerivedPolicy> in_offsets(0, exec, num_partitions + 1);
  impl::find_partition_offsets<difference>(exec, in_offsets.size(), maximum_partition_size, first1, last1, first2, last2, in_offsets.begin(), comp);

  // never launch more blocks than the device's maximum grid size
  const difference num_blocks = thrust::min<difference>(device_properties().maxGridSize[0], num_partitions);

  // step 2: per-partition output counts
  // one extra entry so that the scan below leaves the total output size in the last slot
  thrust::detail::temporary_array<difference, DerivedPolicy> out_offsets(0, exec, num_partitions + 1);
  launch_closure(impl::make_count_set_operation_closure<threads_per_block,work_per_thread>(in_offsets.begin(), num_partitions, first1, first2, out_offsets.begin(), comp, set_op),
                 num_blocks,
                 threads_per_block);

  // step 3: turn the output partition counts into offsets to output partitions
  thrust::exclusive_scan(exec, out_offsets.begin(), out_offsets.end(), out_offsets.begin());

  // step 4: run the set op kernel
  launch_closure(impl::make_set_operation_closure<threads_per_block,work_per_thread>(in_offsets.begin(), num_partitions, first1, first2, out_offsets.begin(), result, comp, set_op),
                 num_blocks,
                 threads_per_block);

  // the last scanned offset is the total number of elements written
  return result + out_offsets[num_partitions];
}


} // end namespace detail
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust