286 lines
9.4 KiB
C++
286 lines
9.4 KiB
C++
/*
|
|
* Copyright 2008-2012 NVIDIA Corporation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include <thrust/detail/config.h>
|
|
#include <thrust/system/cuda/detail/merge.h>
|
|
#include <thrust/pair.h>
|
|
#include <thrust/tuple.h>
|
|
#include <thrust/detail/minmax.h>
|
|
#include <thrust/detail/function.h>
|
|
#include <thrust/system/cuda/detail/detail/uninitialized.h>
|
|
#include <thrust/system/cuda/detail/detail/launch_closure.h>
|
|
#include <thrust/detail/util/blocking.h>
|
|
|
|
namespace thrust
|
|
{
|
|
namespace system
|
|
{
|
|
namespace cuda
|
|
{
|
|
namespace detail
|
|
{
|
|
namespace merge_detail
|
|
{
|
|
|
|
|
|
template<typename RandomAccessIterator1,
|
|
typename RandomAccessIterator2,
|
|
typename Size,
|
|
typename Compare>
|
|
__device__ __thrust_forceinline__
|
|
thrust::pair<Size,Size>
|
|
partition_search(RandomAccessIterator1 first1,
|
|
RandomAccessIterator2 first2,
|
|
Size diag,
|
|
Size lower_bound1,
|
|
Size upper_bound1,
|
|
Size lower_bound2,
|
|
Size upper_bound2,
|
|
Compare comp)
|
|
{
|
|
Size begin = thrust::max<Size>(lower_bound1, diag - upper_bound2);
|
|
Size end = thrust::min<Size>(diag - lower_bound2, upper_bound1);
|
|
|
|
while(begin < end)
|
|
{
|
|
Size mid = (begin + end) / 2;
|
|
Size index1 = mid;
|
|
Size index2 = diag - mid - 1;
|
|
|
|
if(comp(first2[index2], first1[index1]))
|
|
{
|
|
end = mid;
|
|
}
|
|
else
|
|
{
|
|
begin = mid + 1;
|
|
}
|
|
}
|
|
|
|
return thrust::make_pair(begin, diag - begin);
|
|
}
|
|
|
|
|
|
template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
|
|
__device__ __thrust_forceinline__
|
|
void merge_n(Context &ctx,
|
|
RandomAccessIterator1 first1,
|
|
Size n1,
|
|
RandomAccessIterator2 first2,
|
|
Size n2,
|
|
RandomAccessIterator3 result,
|
|
Compare comp_,
|
|
unsigned int work_per_thread)
|
|
{
|
|
const unsigned int block_size = ctx.block_dimension();
|
|
thrust::detail::device_function<Compare,bool> comp(comp_);
|
|
typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
|
|
typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
|
|
|
|
Size result_size = n1 + n2;
|
|
|
|
// this is just oversubscription_rate * block_size * work_per_thread
|
|
// but it makes no sense to send oversubscription_rate as an extra parameter
|
|
Size work_per_block = thrust::detail::util::divide_ri(result_size, ctx.grid_dimension());
|
|
|
|
using thrust::system::cuda::detail::detail::uninitialized;
|
|
__shared__ uninitialized<thrust::pair<Size,Size> > s_block_input_begin;
|
|
|
|
Size result_block_offset = ctx.block_index() * work_per_block;
|
|
|
|
// find where this block's input begins in both input sequences
|
|
if(ctx.thread_index() == 0)
|
|
{
|
|
s_block_input_begin = (ctx.block_index() == 0) ?
|
|
thrust::pair<Size,Size>(0,0) :
|
|
partition_search(first1, first2,
|
|
result_block_offset,
|
|
Size(0), n1,
|
|
Size(0), n2,
|
|
comp);
|
|
}
|
|
|
|
ctx.barrier();
|
|
|
|
// iterate to consume this block's input
|
|
Size work_per_iteration = block_size * work_per_thread;
|
|
thrust::pair<Size,Size> block_input_end = s_block_input_begin;
|
|
block_input_end.first += work_per_iteration;
|
|
block_input_end.second += work_per_iteration;
|
|
Size result_block_offset_last = result_block_offset + thrust::min<Size>(work_per_block, result_size - result_block_offset);
|
|
|
|
for(;
|
|
result_block_offset < result_block_offset_last;
|
|
result_block_offset += work_per_iteration,
|
|
block_input_end.first += work_per_iteration,
|
|
block_input_end.second += work_per_iteration
|
|
)
|
|
{
|
|
// find where this thread's input begins in both input sequences for this iteration
|
|
thrust::pair<Size,Size> thread_input_begin =
|
|
partition_search(first1, first2,
|
|
Size(result_block_offset + ctx.thread_index() * work_per_thread),
|
|
s_block_input_begin.get().first, thrust::min<Size>(block_input_end.first , n1),
|
|
s_block_input_begin.get().second, thrust::min<Size>(block_input_end.second, n2),
|
|
comp);
|
|
|
|
ctx.barrier();
|
|
|
|
// XXX the performance impact of not keeping x1 & x2
|
|
// in registers is about 10% for int32
|
|
uninitialized<value_type1> x1;
|
|
uninitialized<value_type2> x2;
|
|
|
|
// XXX this is just a serial merge -- try to simplify or abstract this loop
|
|
Size i = result_block_offset + ctx.thread_index() * work_per_thread;
|
|
Size last_i = i + thrust::min<Size>(work_per_thread, result_size - thread_input_begin.first - thread_input_begin.second);
|
|
for(;
|
|
i < last_i;
|
|
++i)
|
|
{
|
|
// optionally load x1 & x2
|
|
bool output_x2 = true;
|
|
if(thread_input_begin.second < n2)
|
|
{
|
|
x2 = first2[thread_input_begin.second];
|
|
}
|
|
else
|
|
{
|
|
output_x2 = false;
|
|
}
|
|
|
|
if(thread_input_begin.first < n1)
|
|
{
|
|
x1 = first1[thread_input_begin.first];
|
|
|
|
if(output_x2)
|
|
{
|
|
output_x2 = comp(x2.get(), x1.get());
|
|
}
|
|
}
|
|
|
|
result[i] = output_x2 ? x2.get() : x1.get();
|
|
|
|
if(output_x2)
|
|
{
|
|
++thread_input_begin.second;
|
|
}
|
|
else
|
|
{
|
|
++thread_input_begin.first;
|
|
}
|
|
} // end for
|
|
|
|
// the block's last thread has conveniently located the
|
|
// beginning of the next iteration's input
|
|
if(ctx.thread_index() == block_size-1)
|
|
{
|
|
s_block_input_begin = thread_input_begin;
|
|
}
|
|
ctx.barrier();
|
|
} // end for
|
|
} // end merge_n
|
|
|
|
|
|
template<typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
|
|
struct merge_n_closure
|
|
{
|
|
typedef thrust::system::cuda::detail::detail::blocked_thread_array context_type;
|
|
|
|
RandomAccessIterator1 first1;
|
|
Size n1;
|
|
RandomAccessIterator2 first2;
|
|
Size n2;
|
|
RandomAccessIterator3 result;
|
|
Compare comp;
|
|
Size work_per_thread;
|
|
|
|
merge_n_closure(RandomAccessIterator1 first1, Size n1, RandomAccessIterator2 first2, Size n2, RandomAccessIterator3 result, Compare comp, Size work_per_thread)
|
|
: first1(first1), n1(n1), first2(first2), n2(n2), result(result), comp(comp), work_per_thread(work_per_thread)
|
|
{}
|
|
|
|
__device__ __forceinline__
|
|
void operator()()
|
|
{
|
|
context_type ctx;
|
|
merge_n(ctx, first1, n1, first2, n2, result, comp, work_per_thread);
|
|
}
|
|
};
|
|
|
|
|
|
// returns (work_per_thread, threads_per_block, oversubscription_factor)
|
|
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
|
|
thrust::tuple<unsigned int,unsigned int,unsigned int>
|
|
tunables(RandomAccessIterator1, RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator2, RandomAccessIterator3, Compare comp)
|
|
{
|
|
// determined by empirical testing on GTX 480
|
|
// ~4500 Mkeys/s on GTX 480
|
|
const unsigned int work_per_thread = 5;
|
|
const unsigned int threads_per_block = 128;
|
|
const unsigned int oversubscription_factor = 30;
|
|
|
|
return thrust::make_tuple(work_per_thread, threads_per_block, oversubscription_factor);
|
|
}
|
|
|
|
|
|
} // end merge_detail
|
|
|
|
|
|
template<typename DerivedPolicy,
|
|
typename RandomAccessIterator1,
|
|
typename RandomAccessIterator2,
|
|
typename RandomAccessIterator3,
|
|
typename Compare>
|
|
RandomAccessIterator3 merge(execution_policy<DerivedPolicy> &exec,
|
|
RandomAccessIterator1 first1,
|
|
RandomAccessIterator1 last1,
|
|
RandomAccessIterator2 first2,
|
|
RandomAccessIterator2 last2,
|
|
RandomAccessIterator3 result,
|
|
Compare comp)
|
|
{
|
|
typedef typename thrust::iterator_difference<RandomAccessIterator1>::type Size;
|
|
Size n1 = last1 - first1;
|
|
Size n2 = last2 - first2;
|
|
typename thrust::iterator_difference<RandomAccessIterator1>::type n = n1 + n2;
|
|
|
|
// empty result
|
|
if(n <= 0) return result;
|
|
|
|
unsigned int work_per_thread = 0, threads_per_block = 0, oversubscription_factor = 0;
|
|
thrust::tie(work_per_thread,threads_per_block,oversubscription_factor)
|
|
= merge_detail::tunables(first1, last1, first2, last2, result, comp);
|
|
|
|
const unsigned int work_per_block = work_per_thread * threads_per_block;
|
|
|
|
const unsigned int num_processors = device_properties().multiProcessorCount;
|
|
const unsigned int num_blocks = thrust::min<int>(oversubscription_factor * num_processors, thrust::detail::util::divide_ri(n, work_per_block));
|
|
|
|
typedef merge_detail::merge_n_closure<RandomAccessIterator1,Size,RandomAccessIterator2,RandomAccessIterator3,Compare> closure_type;
|
|
closure_type closure(first1, n1, first2, n2, result, comp, work_per_thread);
|
|
|
|
detail::launch_closure(closure, num_blocks, threads_per_block);
|
|
|
|
return result + n1 + n2;
|
|
} // end merge()
|
|
|
|
|
|
} // end namespace detail
|
|
} // end namespace cuda
|
|
} // end namespace system
|
|
} // end namespace thrust
|
|
|