/* * Copyright 2008-2012 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include namespace thrust { namespace system { namespace detail { namespace internal { namespace scalar { namespace detail { template struct RadixEncoder : public thrust::identity {}; template <> struct RadixEncoder : public thrust::unary_function { unsigned char operator()(char x) const { if(std::numeric_limits::is_signed) return x ^ static_cast(1) << (8 * sizeof(unsigned char) - 1); else return x; } }; template <> struct RadixEncoder : public thrust::unary_function { unsigned char operator()(signed char x) const { return x ^ static_cast(1) << (8 * sizeof(unsigned char) - 1); } }; template <> struct RadixEncoder : public thrust::unary_function { unsigned short operator()(short x) const { return x ^ static_cast(1) << (8 * sizeof(unsigned short) - 1); } }; template <> struct RadixEncoder : public thrust::unary_function { unsigned long operator()(long x) const { return x ^ static_cast(1) << (8 * sizeof(unsigned int) - 1); } }; template <> struct RadixEncoder : public thrust::unary_function { unsigned long operator()(long x) const { return x ^ static_cast(1) << (8 * sizeof(unsigned long) - 1); } }; template <> struct RadixEncoder : public thrust::unary_function { unsigned long long operator()(long long x) const { return x ^ static_cast(1) << (8 * sizeof(unsigned long long) - 1); } }; // ideally we'd use uint32 here and uint64 below template <> struct RadixEncoder : public thrust::unary_function { thrust::detail::uint32_t operator()(float x) const { union { float f; thrust::detail::uint32_t i; } u; u.f = x; thrust::detail::uint32_t mask = -static_cast(u.i >> 31) | (static_cast(1) << 31); return u.i ^ mask; } }; template <> struct RadixEncoder : public thrust::unary_function { thrust::detail::uint64_t operator()(double x) const { union { double f; thrust::detail::uint64_t i; } u; u.f = x; thrust::detail::uint64_t mask = -static_cast(u.i >> 63) | (static_cast(1) << 63); return u.i ^ mask; } }; template void radix_sort(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) { typedef typename thrust::iterator_value::type KeyType; typedef RadixEncoder Encoder; typedef typename Encoder::result_type EncodedType; static const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits; static const unsigned int HistogramSize = 1 << RadixBits; static const EncodedType BitMask = static_cast((1 << RadixBits) - 1); Encoder encode; // storage for histograms size_t histograms[NumHistograms][HistogramSize] = {{0}}; // see which passes can be eliminated bool skip_shuffle[NumHistograms] = {false}; // false if most recent data is stored in (keys1,vals1) bool flip = false; // compute histograms for (size_t i = 0; i < N; i++) { const EncodedType x = encode(keys1[i]); for (unsigned int j = 0; j < NumHistograms; j++) { const EncodedType BitShift = RadixBits * j; histograms[j][(x >> BitShift) & BitMask]++; } } // scan histograms for (unsigned int i = 0; i < NumHistograms; i++) { size_t sum = 0; for (unsigned int j = 0; j < HistogramSize; j++) { size_t bin = histograms[i][j]; if (bin == N) skip_shuffle[i] = true; histograms[i][j] = sum; sum = sum + bin; } } // shuffle keys and (optionally) values for (unsigned int i = 0; i < NumHistograms; i++) { const EncodedType BitShift = static_cast(RadixBits * i); if (!skip_shuffle[i]) { if (flip) { for (size_t j = 0; j < N; j++) { const EncodedType x = encode(keys2[j]); size_t position = histograms[i][(x >> BitShift) & BitMask]++; RandomAccessIterator1 temp_keys1 = keys1; temp_keys1 += position; RandomAccessIterator2 temp_keys2 = keys2; temp_keys2 += j; // keys1[position] = keys2[j] *temp_keys1 = *temp_keys2; if (HasValues) { RandomAccessIterator3 temp_vals1 = vals1; temp_vals1 += position; RandomAccessIterator4 temp_vals2 = vals2; temp_vals2 += j; // vals1[position] = vals2[j] *temp_vals1 = *temp_vals2; } } } else { for (size_t j = 0; j < N; j++) { const EncodedType x = encode(keys1[j]); size_t position = histograms[i][(x >> BitShift) & BitMask]++; RandomAccessIterator1 temp_keys1 = keys1; temp_keys1 += j; RandomAccessIterator2 temp_keys2 = keys2; temp_keys2 += position; // keys2[position] = keys1[j]; *temp_keys2 = *temp_keys1; if (HasValues) { RandomAccessIterator3 temp_vals1 = vals1; temp_vals1 += j; RandomAccessIterator4 temp_vals2 = vals2; temp_vals2 += position; // vals2[position] = vals1[j] *temp_vals2 = *temp_vals1; } } } flip = (flip) ? false : true; } } // ensure final values are in (keys1,vals1) if (flip) { thrust::copy(keys2, keys2 + N, keys1); if (HasValues) thrust::copy(vals2, vals2 + N, vals1); } } // Select best radix sort parameters based on sizeof(T) and input size // These particular values were determined through empirical testing on a Core i7 950 CPU template struct radix_sort_dispatcher { }; template <> struct radix_sort_dispatcher<1> { template void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) { detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); } template void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) { detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); } }; template <> struct radix_sort_dispatcher<2> { template void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) { if (N < (1 << 16)) detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); else detail::radix_sort<16,false>(keys1, keys2, static_cast(0), static_cast(0), N); } template void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) { if (N < (1 << 15)) detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); else detail::radix_sort<16,true>(keys1, keys2, vals1, vals2, N); } }; template <> struct radix_sort_dispatcher<4> { template void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) { if (N < (1 << 22)) detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); else detail::radix_sort<4,false>(keys1, keys2, static_cast(0), static_cast(0), N); } template void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) { if (N < (1 << 22)) detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); else detail::radix_sort<3,true>(keys1, keys2, vals1, vals2, N); } }; template <> struct radix_sort_dispatcher<8> { template void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) { if (N < (1 << 21)) detail::radix_sort<8,false>(keys1, keys2, static_cast(0), static_cast(0), N); else detail::radix_sort<4,false>(keys1, keys2, static_cast(0), static_cast(0), N); } template void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) { if (N < (1 << 21)) detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N); else detail::radix_sort<3,true>(keys1, keys2, vals1, vals2, N); } }; template void radix_sort(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N) { typedef typename thrust::iterator_value::type KeyType; radix_sort_dispatcher()(keys1, keys2, N); } template void radix_sort(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N) { typedef typename thrust::iterator_value::type KeyType; radix_sort_dispatcher()(keys1, keys2, vals1, vals2, N); } } // namespace detail ////////////// // Key Sort // ////////////// template void stable_radix_sort(RandomAccessIterator first, RandomAccessIterator last) { typedef typename thrust::iterator_system::type ExecutionPolicy; typedef typename thrust::iterator_value::type KeyType; size_t N = last - first; // XXX assumes ExecutionPolicy is default constructible // XXX consider how to get stateful systems into this function ExecutionPolicy exec; thrust::detail::temporary_array temp(exec, N); detail::radix_sort(first, temp.begin(), N); } //////////////////// // Key-Value Sort // //////////////////// template void stable_radix_sort_by_key(RandomAccessIterator1 first1, RandomAccessIterator1 last1, RandomAccessIterator2 first2) { // XXX the type of exec should be // typedef decltype(select_system(first1,last1,first2)) system; typedef typename thrust::iterator_system::type ExecutionPolicy; typedef typename thrust::iterator_value::type KeyType; typedef typename thrust::iterator_value::type ValueType; size_t N = last1 - first1; // XXX assumes ExecutionPolicy is default constructible // XXX consider how to get stateful systems into this function ExecutionPolicy exec; thrust::detail::temporary_array temp1(exec, N); thrust::detail::temporary_array temp2(exec, N); detail::radix_sort(first1, temp1.begin(), first2, temp2.begin(), N); } } // end namespace scalar } // end namespace internal } // end namespace detail } // end namespace system } // end namespace thrust