/*
 *  Copyright 2008-2012 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */


/*! \file merging_sort.h
 *  \brief Block version of merge sort
 */

#pragma once

// NOTE(review): the three include targets were lost from this file. The last
// two are the headers the code below visibly needs (thrust::iterator_traits and
// thrust::system::detail::generic::scalar::{lower,upper}_bound_n); the first is
// a best guess -- confirm against the upstream header.
#include <thrust/detail/config.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/system/detail/generic/scalar/binary_search.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace block
{


/*! Compare-and-exchange of the adjacent pair (i, i+1).
 *
 *  When \p pred is true and i+1 lies inside [0, end), the keys at i and i+1
 *  are compared with \p comp and, if the right key sorts strictly before the
 *  left one, both the keys and their associated values are exchanged.
 *  Otherwise nothing is read or written.
 *
 *  \param keys_first   Beginning of the key sequence.
 *  \param values_first Beginning of the value sequence (parallel to the keys).
 *  \param i            Index of the left element of the pair.
 *  \param end          One past the last index that may be touched.
 *  \param pred         Whether the calling thread takes part in this phase.
 *  \param comp         Ordering predicate on keys.
 */
template<typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename Compare>
__device__ void conditional_swap(RandomAccessIterator1 keys_first,
                                 RandomAccessIterator2 values_first,
                                 const unsigned int i,
                                 const unsigned int end,
                                 bool pred,
                                 Compare comp)
{
  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
  typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;

  // NOTE(review): everything after "i+1 <" was missing from this function and
  // has been reconstructed from its signature and the typedefs above -- verify.
  if(pred && i+1 < end)
  {
    KeyType xi = keys_first[i];
    KeyType xj = keys_first[i+1];

    // exchange only when the right neighbour sorts strictly before the left
    // one, so equal keys keep their relative order
    if(comp(xj, xi))
    {
      ValueType yi = values_first[i];
      ValueType yj = values_first[i+1];

      keys_first[i]     = xj;
      keys_first[i+1]   = xi;
      values_first[i]   = yj;
      values_first[i+1] = yi;
    }
  }
}


/*! Odd-even transposition sort of the subsequence the caller belongs to.
 *
 *  Runs size/2 rounds; each round performs one phase in which threads with odd
 *  \p i compare-and-exchange with their right neighbour, then one phase in
 *  which threads with even \p i do, with context.barrier() after every phase.
 *
 *  The barriers are executed unconditionally, so every thread that shares
 *  \p context must call this function with the same \p size.
 *
 *  \param context      Object providing barrier().
 *  \param keys_first   Beginning of the key sequence.
 *  \param values_first Beginning of the value sequence.
 *  \param i            Index of the element owned by the calling thread.
 *  \param end          One past the last index of the caller's subsequence.
 *  \param size         Nominal subsequence length; determines the round count.
 *  \param comp         Ordering predicate on keys.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename Compare>
__device__ void transposition_sort(Context context,
                                   RandomAccessIterator1 keys_first,
                                   RandomAccessIterator2 values_first,
                                   const unsigned int i,
                                   const unsigned int end,
                                   const unsigned int size,
                                   Compare comp)
{
  const bool is_odd = i&0x1;

  for(unsigned int round=size/2; round>0; --round)
  {
    // ODDS
    conditional_swap(keys_first, values_first, i, end, is_odd, comp);
    context.barrier();

    // EVENS
    conditional_swap(keys_first, values_first, i, end, !is_odd, comp);
    context.barrier();
  }
}


/*! Merge tree: repeatedly merges adjacent sorted runs of length h into runs of
 *  length 2h until a single run covers [0, n).
 *
 *  Each thread owns element \p i. In every pass it computes the final position
 *  of its element inside the merged run as
 *      (offset inside its own run) + (number of elements of the sibling run
 *                                     that must precede it),
 *  where the second term comes from a binary search of the sibling run.
 *  Elements of the left run use lower_bound_n and elements of the right run
 *  use upper_bound_n; assuming these follow the usual lower/upper bound
 *  semantics, equal keys from the left run are placed first and no two
 *  elements receive the same rank.
 *
 *  All reads happen before the first barrier and all writes after it, so the
 *  scatter can be done in place. The barriers are executed unconditionally
 *  inside a loop controlled only by \p h and \p n, so every thread sharing
 *  \p context must call this function with the same \p h and \p n.
 *
 *  \param context      Object providing barrier().
 *  \param keys_first   Beginning of the key sequence.
 *  \param values_first Beginning of the value sequence.
 *  \param i            Index of the element owned by the calling thread.
 *  \param n            Total number of elements.
 *  \param begin        Start of the sorted run currently containing \p i.
 *  \param end          End (exclusive) of that run.
 *  \param h            Current run length; the run-start masks assume a power
 *                      of 2.
 *  \param cmp          Strict weak ordering on keys.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename StrictWeakOrdering>
__device__ void merge(Context context,
                      RandomAccessIterator1 keys_first,
                      RandomAccessIterator2 values_first,
                      const unsigned int i,
                      const unsigned int n,
                      unsigned int begin,
                      unsigned int end,
                      unsigned int h,
                      StrictWeakOrdering cmp)
{
  // INVARIANT: Every element i resides within a sequence [begin,end)
  //            of length h which is already sorted
  //
  // NOTE(review): the loop head (condition, doubling of h and the definitions
  // of new_begin/new_end) was missing and has been reconstructed from how
  // those names are used below and from how merging_sort() derives begin/end.
  while( h < n )
  {
    h *= 2;

    // bounds of the merged run that will contain element i after this pass
    unsigned int new_begin = i&(~(h-1)),
                 new_end   = min(n,new_begin+h);

    typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
    typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;

    KeyType key;
    ValueType value;

    // offset of element i inside its own (already sorted) run
    unsigned int rank = i - begin;

    // prevent out-of-bounds access
    if(i < new_end)
    {
      key = keys_first[i];

      if(begin==new_begin)  // in the left side of merging pair
      {
        // count the elements of the right run [end,new_end) that precede key
        RandomAccessIterator1 result = thrust::system::detail::generic::scalar::lower_bound_n(keys_first+end, new_end-end, key, cmp);
        rank += (result - (keys_first+end));
      }
      else                  // in the right side of merging pair
      {
        // count the elements of the left run [new_begin,begin) that do not
        // follow key
        RandomAccessIterator1 result = thrust::system::detail::generic::scalar::upper_bound_n(keys_first+new_begin, begin-new_begin, key, cmp);
        rank += (result - (keys_first+new_begin));
      }

      value = values_first[i];
    }

    // every thread has finished reading before anyone overwrites the input
    context.barrier();

    if(i < new_end)
    {
      keys_first[new_begin+rank]   = key;
      values_first[new_begin+rank] = value;
    }

    // make the merged run visible before the next pass searches it
    context.barrier();

    begin = new_begin;
    end   = new_end;
  }
}


/*! Block-wise implementation of merge sort.
 *  It provides the same external interface as odd_even_sort.
 *
 *  The calling thread owns the element at index context.thread_index(); only
 *  indices that correspond to a thread index are ever moved. Every thread
 *  sharing \p context must call this function, because both phases execute
 *  barriers unconditionally.
 *
 *  \param context      Object providing thread_index() and barrier().
 *  \param keys_first   Beginning of the key sequence.
 *  \param values_first Beginning of the value sequence, permuted together with
 *                      the keys.
 *  \param n            Number of elements to sort.
 *  \param comp         Strict weak ordering on keys.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename StrictWeakOrdering>
__device__ void merging_sort(Context context,
                             RandomAccessIterator1 keys_first,
                             RandomAccessIterator2 values_first,
                             const unsigned int n,
                             StrictWeakOrdering comp)
{
  // Phase 1: Sort subsequences of length 32 using odd-even
  //          transposition sort.  The code below assumes that h is a
  //          power of 2.  Empirically, 32 delivers best results,
  //          which is not surprising since that's the warp width.
  unsigned int i = context.thread_index();
  unsigned int h = 32;
  unsigned int begin=i&(~(h-1)),  end=min(n,begin+h);

  transposition_sort(context, keys_first, values_first, i, end, h, comp);

  // Phase 2: Apply merge tree to produce final sorted results
  merge(context, keys_first, values_first, i, n, begin, end, h, comp);
} // end merging_sort()


} // end namespace block
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust