/*
 *  Copyright 2008-2012 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#include <thrust/detail/config.h>
#include <thrust/system/cuda/detail/merge.h>
#include <thrust/pair.h>
#include <thrust/tuple.h>
#include <thrust/extrema.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/function.h>
#include <thrust/detail/util/blocking.h>
#include <thrust/system/cuda/detail/execution_policy.h>
#include <thrust/system/cuda/detail/runtime_introspection.h>
#include <thrust/system/cuda/detail/detail/launch_closure.h>
#include <thrust/system/cuda/detail/detail/uninitialized.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace merge_detail
{


// given a diagonal `diag` of the merge path through the two sorted input
// ranges, binary search for the split (i, diag - i) such that merging i
// elements of the first range with diag - i elements of the second produces
// the first diag elements of the output
template<typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename Size,
         typename Compare>
__device__ __thrust_forceinline__
thrust::pair<Size,Size> partition_search(RandomAccessIterator1 first1,
                                         RandomAccessIterator2 first2,
                                         Size diag,
                                         Size lower_bound1,
                                         Size upper_bound1,
                                         Size lower_bound2,
                                         Size upper_bound2,
                                         Compare comp)
{
  Size begin = thrust::max<Size>(lower_bound1, diag - upper_bound2);
  Size end   = thrust::min<Size>(diag - lower_bound2, upper_bound1);

  while(begin < end)
  {
    Size mid = (begin + end) / 2;
    Size index1 = mid;
    Size index2 = diag - mid - 1;

    if(comp(first2[index2], first1[index1]))
    {
      end = mid;
    }
    else
    {
      begin = mid + 1;
    }
  }

  return thrust::make_pair(begin, diag - begin);
} // end partition_search()
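// A worked example of the search above (illustrative values, not from the
// original file): merging
//
//   first1 = {0, 2, 4, 6}   (bounds 0..4)
//   first2 = {1, 3, 5, 7}   (bounds 0..4)
//
// with comp = operator< and diag = 4, the search starts at begin = 0, end = 4:
//
//   mid = 2: comp(first2[1] = 3, first1[2] = 4) is true  -> end   = 2
//   mid = 1: comp(first2[2] = 5, first1[1] = 2) is false -> begin = 2
//
// and returns the split (2, 2): merging the prefixes {0, 2} and {1, 3} yields
// {0, 1, 2, 3}, exactly the first diag = 4 output elements. Because `begin`
// advances when the elements compare equivalent (comp returns false), ties
// are resolved in favor of the first range, which is what keeps the overall
// merge stable.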
template<typename Context,
         typename RandomAccessIterator1,
         typename Size,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename Compare>
__device__ __thrust_forceinline__
void merge_n(Context &ctx,
             RandomAccessIterator1 first1,
             Size n1,
             RandomAccessIterator2 first2,
             Size n2,
             RandomAccessIterator3 result,
             Compare comp_,
             unsigned int work_per_thread)
{
  const unsigned int block_size = ctx.block_dimension();
  thrust::detail::device_function<Compare,bool> comp(comp_);
  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
  typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;

  Size result_size = n1 + n2;

  // this is just oversubscription_rate * block_size * work_per_thread
  // but it makes no sense to send oversubscription_rate as an extra parameter
  Size work_per_block = thrust::detail::util::divide_ri(result_size, Size(ctx.grid_dimension()));

  using thrust::system::cuda::detail::detail::uninitialized;

  __shared__ uninitialized<thrust::pair<Size,Size> > s_block_input_begin;

  Size result_block_offset = ctx.block_index() * work_per_block;

  // find where this block's input begins in both input sequences
  if(ctx.thread_index() == 0)
  {
    s_block_input_begin = (ctx.block_index() == 0) ?
      thrust::pair<Size,Size>(0,0) :
      partition_search(first1, first2,
                       result_block_offset,
                       Size(0), n1,
                       Size(0), n2,
                       comp);
  }

  ctx.barrier();

  // iterate to consume this block's input
  Size work_per_iteration = block_size * work_per_thread;
  thrust::pair<Size,Size> block_input_end = s_block_input_begin;
  block_input_end.first  += work_per_iteration;
  block_input_end.second += work_per_iteration;
  Size result_block_offset_last = result_block_offset + thrust::min<Size>(work_per_block, result_size - result_block_offset);

  for(;
      result_block_offset < result_block_offset_last;
      result_block_offset    += work_per_iteration,
      block_input_end.first  += work_per_iteration,
      block_input_end.second += work_per_iteration)
  {
    // find where this thread's input begins in both input sequences for this iteration
    thrust::pair<Size,Size> thread_input_begin =
      partition_search(first1, first2,
                       Size(result_block_offset + ctx.thread_index() * work_per_thread),
                       s_block_input_begin.get().first,  thrust::min<Size>(block_input_end.first,  n1),
                       s_block_input_begin.get().second, thrust::min<Size>(block_input_end.second, n2),
                       comp);

    ctx.barrier();

    // XXX the performance impact of not keeping x1 & x2
    //     in registers is about 10% for int32
    uninitialized<value_type1> x1;
    uninitialized<value_type2> x2;

    // XXX this is just a serial merge -- try to simplify or abstract this loop
    Size i = result_block_offset + ctx.thread_index() * work_per_thread;
    Size last_i = i + thrust::min<Size>(work_per_thread, result_size - thread_input_begin.first - thread_input_begin.second);

    for(; i < last_i; ++i)
    {
      // optionally load x1 & x2
      bool output_x2 = true;
      if(thread_input_begin.second < n2)
      {
        x2 = first2[thread_input_begin.second];
      }
      else
      {
        output_x2 = false;
      }

      if(thread_input_begin.first < n1)
      {
        x1 = first1[thread_input_begin.first];

        if(output_x2)
        {
          output_x2 = comp(x2.get(), x1.get());
        }
      }

      result[i] = output_x2 ? x2.get() : x1.get();

      if(output_x2)
      {
        ++thread_input_begin.second;
      }
      else
      {
        ++thread_input_begin.first;
      }
    } // end for

    // the block's last thread has conveniently located the
    // beginning of the next iteration's input
    if(ctx.thread_index() == block_size - 1)
    {
      s_block_input_begin = thread_input_begin;
    }

    ctx.barrier();
  } // end for
} // end merge_n()
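// A worked sketch of the decomposition implemented by merge_n above
// (illustrative numbers, not from the original file): with block_size = 128
// and work_per_thread = 5, each trip of the outer loop consumes
// work_per_iteration = 640 output elements per block. For
// result_size = 1,000,000 and a grid of 450 blocks (e.g., the
// oversubscription_factor of 30 times the 15 multiprocessors of a GTX 480),
// work_per_block = divide_ri(1000000, 450) = 2223, so each block makes
// ceil(2223 / 640) = 4 trips. On each trip every thread re-runs
// partition_search, bounded by the block's current input window, to locate
// its private 5-element slice of the output, then merges that slice serially.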
template<typename RandomAccessIterator1,
         typename Size,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename Compare>
struct merge_n_closure
{
  typedef thrust::system::cuda::detail::detail::blocked_thread_array context_type;

  RandomAccessIterator1 first1;
  Size n1;
  RandomAccessIterator2 first2;
  Size n2;
  RandomAccessIterator3 result;
  Compare comp;
  Size work_per_thread;

  merge_n_closure(RandomAccessIterator1 first1,
                  Size n1,
                  RandomAccessIterator2 first2,
                  Size n2,
                  RandomAccessIterator3 result,
                  Compare comp,
                  Size work_per_thread)
    : first1(first1), n1(n1),
      first2(first2), n2(n2),
      result(result),
      comp(comp),
      work_per_thread(work_per_thread)
  {}

  __device__ __forceinline__
  void operator()()
  {
    context_type ctx;
    merge_n(ctx, first1, n1, first2, n2, result, comp, work_per_thread);
  }
}; // end merge_n_closure


// returns (work_per_thread, threads_per_block, oversubscription_factor)
template<typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename Compare>
thrust::tuple<unsigned int,unsigned int,unsigned int>
  tunables(RandomAccessIterator1, RandomAccessIterator1,
           RandomAccessIterator2, RandomAccessIterator2,
           RandomAccessIterator3,
           Compare comp)
{
  // determined by empirical testing on GTX 480
  // ~4500 Mkeys/s on GTX 480
  const unsigned int work_per_thread         = 5;
  const unsigned int threads_per_block       = 128;
  const unsigned int oversubscription_factor = 30;

  return thrust::make_tuple(work_per_thread, threads_per_block, oversubscription_factor);
} // end tunables()


} // end merge_detail


template<typename DerivedPolicy,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename RandomAccessIterator3,
         typename Compare>
RandomAccessIterator3 merge(execution_policy<DerivedPolicy> &exec,
                            RandomAccessIterator1 first1,
                            RandomAccessIterator1 last1,
                            RandomAccessIterator2 first2,
                            RandomAccessIterator2 last2,
                            RandomAccessIterator3 result,
                            Compare comp)
{
  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type Size;

  Size n1 = last1 - first1;
  Size n2 = last2 - first2;
  Size n = n1 + n2;

  // empty result
  if(n <= 0) return result;

  unsigned int work_per_thread = 0, threads_per_block = 0, oversubscription_factor = 0;
  thrust::tie(work_per_thread, threads_per_block, oversubscription_factor)
    = merge_detail::tunables(first1, last1, first2, last2, result, comp);

  const unsigned int work_per_block = work_per_thread * threads_per_block;

  const unsigned int num_processors = device_properties().multiProcessorCount;
  const unsigned int num_blocks = thrust::min<unsigned int>(oversubscription_factor * num_processors,
                                                            thrust::detail::util::divide_ri(n, work_per_block));

  typedef merge_detail::merge_n_closure<RandomAccessIterator1,Size,RandomAccessIterator2,RandomAccessIterator3,Compare> closure_type;
  closure_type closure(first1, n1, first2, n2, result, comp, work_per_thread);

  detail::launch_closure(closure, num_blocks, threads_per_block);

  return result + n1 + n2;
} // end merge()


} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust
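// Example of reaching the dispatch above through the public API (a minimal
// sketch, not part of the original file):
//
//   #include <thrust/merge.h>
//   #include <thrust/functional.h>
//   #include <thrust/device_vector.h>
//
//   thrust::device_vector<int> a = ...;                    // sorted input
//   thrust::device_vector<int> b = ...;                    // sorted input
//   thrust::device_vector<int> out(a.size() + b.size());
//
//   // dispatches to the merge() above when the iterators belong to the
//   // CUDA system
//   thrust::merge(a.begin(), a.end(),
//                 b.begin(), b.end(),
//                 out.begin(),
//                 thrust::less<int>());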