286 lines
9.4 KiB
Plaintext
Raw Normal View History

2014-03-18 22:17:40 +01:00
/*
* Copyright 2008-2012 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thrust/detail/config.h>
#include <thrust/system/cuda/detail/merge.h>
#include <thrust/pair.h>
#include <thrust/tuple.h>
#include <thrust/detail/minmax.h>
#include <thrust/detail/function.h>
#include <thrust/system/cuda/detail/detail/uninitialized.h>
#include <thrust/system/cuda/detail/detail/launch_closure.h>
#include <thrust/detail/util/blocking.h>
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace merge_detail
{
template<typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename Size,
typename Compare>
__device__ __thrust_forceinline__
thrust::pair<Size,Size>
partition_search(RandomAccessIterator1 first1,
RandomAccessIterator2 first2,
Size diag,
Size lower_bound1,
Size upper_bound1,
Size lower_bound2,
Size upper_bound2,
Compare comp)
{
Size begin = thrust::max<Size>(lower_bound1, diag - upper_bound2);
Size end = thrust::min<Size>(diag - lower_bound2, upper_bound1);
while(begin < end)
{
Size mid = (begin + end) / 2;
Size index1 = mid;
Size index2 = diag - mid - 1;
if(comp(first2[index2], first1[index1]))
{
end = mid;
}
else
{
begin = mid + 1;
}
}
return thrust::make_pair(begin, diag - begin);
}
template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
__device__ __thrust_forceinline__
void merge_n(Context &ctx,
RandomAccessIterator1 first1,
Size n1,
RandomAccessIterator2 first2,
Size n2,
RandomAccessIterator3 result,
Compare comp_,
unsigned int work_per_thread)
{
const unsigned int block_size = ctx.block_dimension();
thrust::detail::device_function<Compare,bool> comp(comp_);
typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type1;
typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type2;
Size result_size = n1 + n2;
// this is just oversubscription_rate * block_size * work_per_thread
// but it makes no sense to send oversubscription_rate as an extra parameter
Size work_per_block = thrust::detail::util::divide_ri(result_size, ctx.grid_dimension());
using thrust::system::cuda::detail::detail::uninitialized;
__shared__ uninitialized<thrust::pair<Size,Size> > s_block_input_begin;
Size result_block_offset = ctx.block_index() * work_per_block;
// find where this block's input begins in both input sequences
if(ctx.thread_index() == 0)
{
s_block_input_begin = (ctx.block_index() == 0) ?
thrust::pair<Size,Size>(0,0) :
partition_search(first1, first2,
result_block_offset,
Size(0), n1,
Size(0), n2,
comp);
}
ctx.barrier();
// iterate to consume this block's input
Size work_per_iteration = block_size * work_per_thread;
thrust::pair<Size,Size> block_input_end = s_block_input_begin;
block_input_end.first += work_per_iteration;
block_input_end.second += work_per_iteration;
Size result_block_offset_last = result_block_offset + thrust::min<Size>(work_per_block, result_size - result_block_offset);
for(;
result_block_offset < result_block_offset_last;
result_block_offset += work_per_iteration,
block_input_end.first += work_per_iteration,
block_input_end.second += work_per_iteration
)
{
// find where this thread's input begins in both input sequences for this iteration
thrust::pair<Size,Size> thread_input_begin =
partition_search(first1, first2,
Size(result_block_offset + ctx.thread_index() * work_per_thread),
s_block_input_begin.get().first, thrust::min<Size>(block_input_end.first , n1),
s_block_input_begin.get().second, thrust::min<Size>(block_input_end.second, n2),
comp);
ctx.barrier();
// XXX the performance impact of not keeping x1 & x2
// in registers is about 10% for int32
uninitialized<value_type1> x1;
uninitialized<value_type2> x2;
// XXX this is just a serial merge -- try to simplify or abstract this loop
Size i = result_block_offset + ctx.thread_index() * work_per_thread;
Size last_i = i + thrust::min<Size>(work_per_thread, result_size - thread_input_begin.first - thread_input_begin.second);
for(;
i < last_i;
++i)
{
// optionally load x1 & x2
bool output_x2 = true;
if(thread_input_begin.second < n2)
{
x2 = first2[thread_input_begin.second];
}
else
{
output_x2 = false;
}
if(thread_input_begin.first < n1)
{
x1 = first1[thread_input_begin.first];
if(output_x2)
{
output_x2 = comp(x2.get(), x1.get());
}
}
result[i] = output_x2 ? x2.get() : x1.get();
if(output_x2)
{
++thread_input_begin.second;
}
else
{
++thread_input_begin.first;
}
} // end for
// the block's last thread has conveniently located the
// beginning of the next iteration's input
if(ctx.thread_index() == block_size-1)
{
s_block_input_begin = thread_input_begin;
}
ctx.barrier();
} // end for
} // end merge_n
template<typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
struct merge_n_closure
{
typedef thrust::system::cuda::detail::detail::blocked_thread_array context_type;
RandomAccessIterator1 first1;
Size n1;
RandomAccessIterator2 first2;
Size n2;
RandomAccessIterator3 result;
Compare comp;
Size work_per_thread;
merge_n_closure(RandomAccessIterator1 first1, Size n1, RandomAccessIterator2 first2, Size n2, RandomAccessIterator3 result, Compare comp, Size work_per_thread)
: first1(first1), n1(n1), first2(first2), n2(n2), result(result), comp(comp), work_per_thread(work_per_thread)
{}
__device__ __forceinline__
void operator()()
{
context_type ctx;
merge_n(ctx, first1, n1, first2, n2, result, comp, work_per_thread);
}
};
// returns (work_per_thread, threads_per_block, oversubscription_factor)
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
thrust::tuple<unsigned int,unsigned int,unsigned int>
tunables(RandomAccessIterator1, RandomAccessIterator1, RandomAccessIterator2, RandomAccessIterator2, RandomAccessIterator3, Compare comp)
{
// determined by empirical testing on GTX 480
// ~4500 Mkeys/s on GTX 480
const unsigned int work_per_thread = 5;
const unsigned int threads_per_block = 128;
const unsigned int oversubscription_factor = 30;
return thrust::make_tuple(work_per_thread, threads_per_block, oversubscription_factor);
}
} // end merge_detail
template<typename DerivedPolicy,
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename Compare>
RandomAccessIterator3 merge(execution_policy<DerivedPolicy> &exec,
RandomAccessIterator1 first1,
RandomAccessIterator1 last1,
RandomAccessIterator2 first2,
RandomAccessIterator2 last2,
RandomAccessIterator3 result,
Compare comp)
{
typedef typename thrust::iterator_difference<RandomAccessIterator1>::type Size;
Size n1 = last1 - first1;
Size n2 = last2 - first2;
typename thrust::iterator_difference<RandomAccessIterator1>::type n = n1 + n2;
// empty result
if(n <= 0) return result;
unsigned int work_per_thread = 0, threads_per_block = 0, oversubscription_factor = 0;
thrust::tie(work_per_thread,threads_per_block,oversubscription_factor)
= merge_detail::tunables(first1, last1, first2, last2, result, comp);
const unsigned int work_per_block = work_per_thread * threads_per_block;
const unsigned int num_processors = device_properties().multiProcessorCount;
const unsigned int num_blocks = thrust::min<int>(oversubscription_factor * num_processors, thrust::detail::util::divide_ri(n, work_per_block));
typedef merge_detail::merge_n_closure<RandomAccessIterator1,Size,RandomAccessIterator2,RandomAccessIterator3,Compare> closure_type;
closure_type closure(first1, n1, first2, n2, result, comp, work_per_thread);
detail::launch_closure(closure, num_blocks, threads_per_block);
return result + n1 + n2;
} // end merge()
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust