// ccminer-gostd-lite/compat/thrust/system/cuda/detail/block/merging_sort.h

/*
 *  Copyright 2008-2012 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */


/*! \file merging_sort.h
 *  \brief Block version of merge sort
 */

#pragma once

#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/type_traits.h>
#include <thrust/system/detail/generic/scalar/binary_search.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace block
{


/*! Compare-and-exchange of the adjacent key/value pairs stored at
 *  positions \p i and \p i+1.
 *
 *  Nothing is touched unless \p pred is true and position \p i+1 lies
 *  below \p end.  When the pair is processed, both keys are loaded and,
 *  if the key at \p i+1 orders before the key at \p i according to
 *  \p comp, the two keys and their two values trade places.
 *
 *  \param keys_first   Beginning of the key sequence.
 *  \param values_first Beginning of the value sequence, indexed like the keys.
 *  \param i            Index of the left element of the pair.
 *  \param end          Exclusive upper bound on the indices that may be accessed.
 *  \param pred         Whether the caller wants this pair processed at all.
 *  \param comp         Ordering predicate, invoked as comp(right_key, left_key).
 */
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
__device__ void conditional_swap(RandomAccessIterator1 keys_first,
                                 RandomAccessIterator2 values_first,
                                 const unsigned int i,
                                 const unsigned int end,
                                 bool pred,
                                 Compare comp)
{
  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
  typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;

  // index of the right-hand neighbour
  const unsigned int j = i + 1;

  // bail out when the pair is not selected or its right element is out of range
  if(!pred || !(j < end))
  {
    return;
  }

  KeyType left_key  = keys_first[i];
  KeyType right_key = keys_first[j];

  // already in order: the right key does not sort before the left key
  if(!comp(right_key, left_key))
  {
    return;
  }

  // NOTE: the exchange is spelled out by hand; ideally it would
  //       dispatch to swap via ADL instead
  ValueType left_value;
  left_value = values_first[i];
  ValueType right_value;
  right_value = values_first[j];

  keys_first[i]   = right_key;
  keys_first[j]   = left_key;
  values_first[i] = right_value;
  values_first[j] = left_value;
}


/*! Odd-even transposition passes over the elements below \p end.
 *
 *  Runs \p size / 2 rounds.  Each round issues one conditional_swap in
 *  which only odd values of \p i are selected, then one in which only
 *  even values of \p i are selected; context.barrier() is called after
 *  each of the two phases.  The barrier calls sit outside any
 *  index-dependent branch, so every caller executes the same number of
 *  them for a given \p size.
 *
 *  \param context      Object providing barrier() (presumably a
 *                      block-wide synchronization -- confirm against
 *                      the Context type in use).
 *  \param keys_first   Beginning of the key sequence.
 *  \param values_first Beginning of the value sequence.
 *  \param i            Index handled by the caller; it is paired with \p i+1.
 *  \param end          Exclusive upper bound forwarded to conditional_swap.
 *  \param size         Determines the number of rounds (\p size / 2).
 *  \param comp         Ordering predicate forwarded to conditional_swap.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename Compare>
__device__ void transposition_sort(Context context,
                                   RandomAccessIterator1 keys_first,
                                   RandomAccessIterator2 values_first,
                                   const unsigned int i,
                                   const unsigned int end,
                                   const unsigned int size,
                                   Compare comp)
{
  // parity of this index decides in which phase it owns the pair (i, i+1)
  const bool odd_index = (i & 0x1) != 0;

  unsigned int rounds_left = size / 2;
  while(rounds_left > 0)
  {
    // odd phase: pairs whose left index is odd
    conditional_swap(keys_first, values_first, i, end, odd_index, comp);
    context.barrier();

    // even phase: pairs whose left index is even
    conditional_swap(keys_first, values_first, i, end, !odd_index, comp);
    context.barrier();

    --rounds_left;
  }
}

/*! Repeatedly merges neighbouring sorted runs, doubling the run length
 *  \p h on every pass, for as long as \p h < \p n.
 *
 *  On each pass the element at index \p i computes its destination
 *  inside the merged run as
 *    (its offset inside its own run) + (a count taken from the sibling run),
 *  where the count is obtained with lower_bound_n when the element sits
 *  in the left run and with upper_bound_n when it sits in the right run.
 *  (Presumably these helpers follow the usual lower/upper bound
 *  semantics, which would make equal keys from the left run land before
 *  equal keys from the right run -- confirm against binary_search.h.)
 *
 *  Key and value are first read into locals, then context.barrier() is
 *  called, then they are written to the destination, then
 *  context.barrier() is called again.  Both barrier calls are outside
 *  the bounds guard, so callers whose \p i is out of range still reach
 *  them.
 *
 *  \param context      Object providing barrier().
 *  \param keys_first   Beginning of the key sequence.
 *  \param values_first Beginning of the value sequence.
 *  \param i            Index of the element handled by the caller.
 *  \param n            Total number of elements.
 *  \param begin        Start of the sorted run currently containing \p i.
 *  \param end          Exclusive end of that run.
 *  \param h            Current run length; the masking below assumes a power of two.
 *  \param cmp          Ordering predicate passed to the bound searches.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename StrictWeakOrdering>
__device__ void merge(Context context,
                      RandomAccessIterator1 keys_first, 
                      RandomAccessIterator2 values_first,
                      const unsigned int i,
                      const unsigned int n,
                      unsigned int begin,
                      unsigned int end,
                      unsigned int h,
                      StrictWeakOrdering cmp)
{
  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
  typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;

  // INVARIANT: at the top of every pass, element i lives inside a
  //            sequence [begin,end) of length h that is already sorted
  while(h < n)
  {
    h *= 2;

    // extent of the run that results from merging this pair
    const unsigned int merged_begin = i & ~(h - 1);
    const unsigned int merged_end   = min(n, merged_begin + h);

    // only indices inside the merged run may touch the sequences
    const bool in_range = i < merged_end;

    KeyType   key;
    ValueType value;

    // start from the offset of i inside its own run
    unsigned int rank = i - begin;

    if(in_range)
    {
      key = keys_first[i];

      const bool in_left_run = (begin == merged_begin);
      if(in_left_run)
      {
        // sibling is the right run [end, merged_end)
        RandomAccessIterator1 right_first = keys_first + end;
        RandomAccessIterator1 pos =
          thrust::system::detail::generic::scalar::lower_bound_n(right_first, merged_end - end, key, cmp);
        rank += (pos - right_first);
      }
      else
      {
        // sibling is the left run [merged_begin, begin)
        RandomAccessIterator1 left_first = keys_first + merged_begin;
        RandomAccessIterator1 pos =
          thrust::system::detail::generic::scalar::upper_bound_n(left_first, begin - merged_begin, key, cmp);
        rank += (pos - left_first);
      }

      value = values_first[i];
    }

    // reads above are separated from the writes below
    context.barrier();

    if(in_range)
    {
      const unsigned int destination = merged_begin + rank;
      keys_first[destination]   = key;
      values_first[destination] = value;
    }

    // writes above are separated from the next pass
    context.barrier();

    // the merged run becomes the current run for the next pass
    begin = merged_begin;
    end   = merged_end;
  }
}


/*! Block-wise merge sort of \p n key/value pairs.
 *  It exposes the same external interface as odd_even_sort.
 *
 *  The element index is taken from context.thread_index().  Runs of 32
 *  elements are sorted first with transposition_sort; merge then
 *  combines those runs until a single run covers the whole input.
 *
 *  \param context      Object providing thread_index() and barrier().
 *  \param keys_first   Beginning of the key sequence.
 *  \param values_first Beginning of the value sequence.
 *  \param n            Number of elements to sort.
 *  \param comp         Ordering predicate used by both phases.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename StrictWeakOrdering>
__device__ void merging_sort(Context context,
                             RandomAccessIterator1 keys_first,
                             RandomAccessIterator2 values_first,
                             const unsigned int n,
                             StrictWeakOrdering comp)
{
  const unsigned int i = context.thread_index();

  // Initial run length.  The masking used here and in merge assumes a
  // power of 2; 32 was found empirically to deliver the best results,
  // which is not surprising since that is the warp width.
  const unsigned int run_length = 32;

  // run [run_begin, run_end) that contains index i, clipped to n
  const unsigned int run_begin = i & ~(run_length - 1);
  const unsigned int run_end   = min(n, run_begin + run_length);

  // Phase 1: sort each run with odd-even transposition sort
  transposition_sort(context, keys_first, values_first, i, run_end, run_length, comp);

  // Phase 2: apply the merge tree to produce the final sorted result
  merge(context, keys_first, values_first, i, n, run_begin, run_end, run_length, comp);
} // end merging_sort()


} // end namespace block
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust
commit initial version 0.1 11 years ago			`/*`
			`* Copyright 2008-2012 NVIDIA Corporation`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`


			`/*! \file merging_sort.h`
			`* \brief Block version of merge sort`
			`*/`

			`#pragma once`

			`#include <thrust/iterator/iterator_traits.h>`
			`#include <thrust/detail/type_traits.h>`
			`#include <thrust/system/detail/generic/scalar/binary_search.h>`

			`namespace thrust`
			`{`
			`namespace system`
			`{`
			`namespace cuda`
			`{`
			`namespace detail`
			`{`
			`namespace block`
			`{`


			`template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>`
			`__device__ void conditional_swap(RandomAccessIterator1 keys_first,`
			`RandomAccessIterator2 values_first,`
			`const unsigned int i,`
			`const unsigned int end,`
			`bool pred,`
			`Compare comp)`
			`{`
			`typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;`
			`typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;`

			`if(pred && i+1<end)`
			`{`
			`KeyType xi = keys_first[i];`
			`KeyType xj = keys_first[i+1];`

			`// swap if xj sorts before xi`
			`if(comp(xj, xi))`
			`{`
			`// XXX this implementation should really dispatch swap via ADL`
			`ValueType yi;`
			`yi = values_first[i];`
			`ValueType yj;`
			`yj = values_first[i+1];`

			`keys_first[i] = xj;`
			`keys_first[i+1] = xi;`
			`values_first[i] = yj;`
			`values_first[i+1] = yi;`
			`}`
			`}`
			`}`


			`template<typename Context,`
			`typename RandomAccessIterator1,`
			`typename RandomAccessIterator2,`
			`typename Compare>`
			`__device__ void transposition_sort(Context context,`
			`RandomAccessIterator1 keys_first,`
			`RandomAccessIterator2 values_first,`
			`const unsigned int i,`
			`const unsigned int end,`
			`const unsigned int size,`
			`Compare comp)`
			`{`
			`const bool is_odd = i&0x1;`

			`for(unsigned int round=size/2; round>0; --round)`
			`{`
			`// ODDS`
			`conditional_swap(keys_first, values_first, i, end, is_odd, comp);`
			`context.barrier();`

			`// EVENS`
			`conditional_swap(keys_first, values_first, i, end, !is_odd, comp);`
			`context.barrier();`
			`}`
			`}`

			`template<typename Context,`
			`typename RandomAccessIterator1,`
			`typename RandomAccessIterator2,`
			`typename StrictWeakOrdering>`
			`__device__ void merge(Context context,`
			`RandomAccessIterator1 keys_first,`
			`RandomAccessIterator2 values_first,`
			`const unsigned int i,`
			`const unsigned int n,`
			`unsigned int begin,`
			`unsigned int end,`
			`unsigned int h,`
			`StrictWeakOrdering cmp)`
			`{`
			`// INVARIANT: Every element i resides within a sequence [begin,end)`
			`// of length h which is already sorted`
			`while( h<n )`
			`{`
			`h *= 2;`

			`unsigned int new_begin = i&(~(h-1));`
			`unsigned int new_end = min(n,new_begin+h);`

			`typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;`
			`typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;`

			`KeyType key;`
			`ValueType value;`

			`unsigned int rank = i - begin;`

			`// prevent out-of-bounds access`
			`if(i < new_end)`
			`{`
			`key = keys_first[i];`

			`if(begin==new_begin) // in the left side of merging pair`
			`{`
			`RandomAccessIterator1 result = thrust::system::detail::generic::scalar::lower_bound_n(keys_first+end, new_end-end, key, cmp);`
			`rank += (result - (keys_first+end));`
			`}`
			`else // in the right side of merging pair`
			`{`
			`RandomAccessIterator1 result = thrust::system::detail::generic::scalar::upper_bound_n(keys_first+new_begin, begin-new_begin, key, cmp);`
			`rank += (result - (keys_first+new_begin));`
			`}`

			`value = values_first[i];`
			`}`

			`context.barrier();`

			`if(i < new_end)`
			`{`
			`keys_first[new_begin+rank] = key;`
			`values_first[new_begin+rank] = value;`
			`}`

			`context.barrier();`

			`begin = new_begin;`
			`end = new_end;`
			`}`
			`}`


			`/*! Block-wise implementation of merge sort.`
			`* It provides the same external interface as odd_even_sort.`
			`*/`
			`template<typename Context,`
			`typename RandomAccessIterator1,`
			`typename RandomAccessIterator2,`
			`typename StrictWeakOrdering>`
			`__device__ void merging_sort(Context context,`
			`RandomAccessIterator1 keys_first,`
			`RandomAccessIterator2 values_first,`
			`const unsigned int n,`
			`StrictWeakOrdering comp)`
			`{`
			`// Phase 1: Sort subsequences of length 32 using odd-even`
			`// transposition sort. The code below assumes that h is a`
			`// power of 2. Empirically, 32 delivers best results,`
			`// which is not surprising since that's the warp width.`
			`unsigned int i = context.thread_index();`
			`unsigned int h = 32;`
			`unsigned int begin=i&(~(h-1)), end=min(n,begin+h);`

			`transposition_sort(context, keys_first, values_first, i, end, h, comp);`

			`// Phase 2: Apply merge tree to produce final sorted results`
			`merge(context, keys_first, values_first, i, n, begin, end, h, comp);`
			`} // end merging_sort()`


			`} // end namespace block`
			`} // end namespace detail`
			`} // end namespace cuda`
			`} // end namespace system`
			`} // end namespace thrust`