2014-03-18 22:17:40 +01:00

640 lines
27 KiB
C++

/*
* Copyright 2008-2012 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thrust/detail/config.h>
#include <thrust/system/cuda/detail/detail/set_operation.h>
#include <thrust/system/cuda/detail/detail/balanced_path.h>
#include <thrust/system/cuda/detail/block/inclusive_scan.h>
#include <thrust/system/cuda/detail/block/exclusive_scan.h>
#include <thrust/system/cuda/detail/block/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/scan.h>
#include <thrust/pair.h>
#include <thrust/detail/util/blocking.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/cstdint.h>
#include <thrust/detail/minmax.h>
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace detail
{
namespace set_operation_detail
{
using thrust::system::cuda::detail::detail::statically_blocked_thread_array;
using thrust::detail::uint16_t;
using thrust::detail::uint32_t;
// empirically determined on sm_20
// value_types larger than this will fail to launch if placed in smem
template<typename T>
struct stage_through_smem
{
static const bool value = sizeof(T) <= 6 * sizeof(uint32_t);
};
// max_input_size <= 32
template<typename Size, typename InputIterator, typename OutputIterator>
inline __device__
OutputIterator serial_bounded_copy_if(Size max_input_size,
InputIterator first,
uint32_t mask,
OutputIterator result)
{
for(Size i = 0; i < max_input_size; ++i, ++first)
{
if((1<<i) & mask)
{
*result = *first;
++result;
}
}
return result;
}
template<typename Size, typename InputIterator1, typename InputIterator2, typename Compare>
struct find_partition_offsets_functor
{
Size partition_size;
InputIterator1 first1;
InputIterator2 first2;
Size n1, n2;
Compare comp;
find_partition_offsets_functor(Size partition_size,
InputIterator1 first1, InputIterator1 last1,
InputIterator2 first2, InputIterator2 last2,
Compare comp)
: partition_size(partition_size),
first1(first1), first2(first2),
n1(last1 - first1), n2(last2 - first2),
comp(comp)
{}
inline __host__ __device__
thrust::pair<Size,Size> operator()(Size i) const
{
Size diag = thrust::min(n1 + n2, i * partition_size);
// XXX the correctness of balanced_path depends critically on the ll suffix below
// why???
return balanced_path(first1, n1, first2, n2, diag, 4ll, comp);
}
};
template<typename Size, typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare>
OutputIterator find_partition_offsets(thrust::cuda::execution_policy<DerivedPolicy> &exec,
Size num_partitions,
Size partition_size,
InputIterator1 first1, InputIterator1 last1,
InputIterator2 first2, InputIterator2 last2,
OutputIterator result,
Compare comp)
{
find_partition_offsets_functor<Size,InputIterator1,InputIterator2,Compare> f(partition_size, first1, last1, first2, last2, comp);
return thrust::transform(exec,
thrust::counting_iterator<Size>(0),
thrust::counting_iterator<Size>(num_partitions),
result,
f);
}
namespace block
{
template<unsigned int block_size, typename T>
inline __device__
T right_neighbor(statically_blocked_thread_array<block_size> &ctx, const T &x, const T &boundary)
{
// stage this shift to conserve smem
const unsigned int storage_size = block_size / 2;
__shared__ uninitialized_array<T,storage_size> shared;
T result = x;
unsigned int tid = ctx.thread_index();
if(0 < tid && tid <= storage_size)
{
shared[tid - 1] = x;
}
ctx.barrier();
if(tid < storage_size)
{
result = shared[tid];
}
ctx.barrier();
tid -= storage_size;
if(0 < tid && tid <= storage_size)
{
shared[tid - 1] = x;
}
else if(tid == 0)
{
shared[storage_size-1] = boundary;
}
ctx.barrier();
if(tid < storage_size)
{
result = shared[tid];
}
ctx.barrier();
return result;
}
template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename Compare, typename SetOperation>
inline __device__
unsigned int bounded_count_set_operation_n(statically_blocked_thread_array<block_size> &ctx,
InputIterator1 first1, uint16_t n1,
InputIterator2 first2, uint16_t n2,
Compare comp,
SetOperation set_op)
{
unsigned int thread_idx = ctx.thread_index();
// find partition offsets
uint16_t diag = thrust::min<uint16_t>(n1 + n2, thread_idx * work_per_thread);
thrust::pair<uint16_t,uint16_t> thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp);
thrust::pair<uint16_t,uint16_t> thread_input_end = block::right_neighbor<block_size>(ctx, thread_input_begin, thrust::make_pair(n1,n2));
__shared__ uint16_t s_thread_output_size[block_size];
// work_per_thread + 1 to accomodate a "starred" partition returned from balanced_path above
s_thread_output_size[thread_idx] =
set_op.count(work_per_thread + 1,
first1 + thread_input_begin.first, first1 + thread_input_end.first,
first2 + thread_input_begin.second, first2 + thread_input_end.second,
comp);
ctx.barrier();
// reduce per-thread counts
thrust::system::cuda::detail::block::inplace_inclusive_scan(ctx, s_thread_output_size);
return s_thread_output_size[ctx.block_dimension() - 1];
}
inline __device__ int pop_count(unsigned int x)
{
// guard use of __popc from other compilers
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
return __popc(x);
#else
return x;
#endif
}
template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
inline __device__
OutputIterator bounded_set_operation_n(statically_blocked_thread_array<block_size> &ctx,
InputIterator1 first1, uint16_t n1,
InputIterator2 first2, uint16_t n2,
OutputIterator result,
Compare comp,
SetOperation set_op)
{
unsigned int thread_idx = ctx.thread_index();
// find partition offsets
uint16_t diag = thrust::min<uint16_t>(n1 + n2, thread_idx * work_per_thread);
thrust::pair<uint16_t,uint16_t> thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp);
thrust::pair<uint16_t,uint16_t> thread_input_end = block::right_neighbor<block_size>(ctx, thread_input_begin, thrust::make_pair(n1,n2));
typedef typename thrust::iterator_value<InputIterator1>::type value_type;
// +1 to accomodate a "starred" partition returned from balanced_path above
uninitialized_array<value_type, work_per_thread + 1> sparse_result;
uint32_t active_mask =
set_op(work_per_thread + 1,
first1 + thread_input_begin.first, first1 + thread_input_end.first,
first2 + thread_input_begin.second, first2 + thread_input_end.second,
sparse_result.begin(),
comp);
__shared__ uint16_t s_thread_output_size[block_size];
s_thread_output_size[thread_idx] = pop_count(active_mask);
ctx.barrier();
// scan to turn per-thread counts into output indices
uint16_t block_output_size = thrust::system::cuda::detail::block::inplace_exclusive_scan(ctx, s_thread_output_size, 0u);
serial_bounded_copy_if(work_per_thread + 1, sparse_result.begin(), active_mask, result + s_thread_output_size[thread_idx]);
ctx.barrier();
return result + block_output_size;
}
template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename Compare, typename SetOperation>
inline __device__
typename thrust::iterator_difference<InputIterator1>::type
count_set_operation(statically_blocked_thread_array<block_size> &ctx,
InputIterator1 first1, InputIterator1 last1,
InputIterator2 first2, InputIterator2 last2,
Compare comp,
SetOperation set_op)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference;
difference result = 0;
thrust::pair<difference,difference> remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2);
// iterate until the input is consumed
while(remaining_input_size.first + remaining_input_size.second > 0)
{
// find the end of this subpartition's input
// -1 to accomodate "starred" partitions
uint16_t max_subpartition_size = block_size * work_per_thread - 1;
difference diag = thrust::min<difference>(remaining_input_size.first + remaining_input_size.second, max_subpartition_size);
thrust::pair<uint16_t,uint16_t> subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp);
typedef typename thrust::iterator_value<InputIterator2>::type value_type;
if(stage_through_smem<value_type>::value)
{
// load the input into __shared__ storage
__shared__ uninitialized_array<value_type, block_size * work_per_thread> s_input;
value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first, s_input.begin());
value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1);
result += block::bounded_count_set_operation_n<block_size,work_per_thread>(ctx,
s_input.begin(), subpartition_size.first,
s_input_end1, subpartition_size.second,
comp,
set_op);
}
else
{
result += block::bounded_count_set_operation_n<block_size,work_per_thread>(ctx,
first1, subpartition_size.first,
first2, subpartition_size.second,
comp,
set_op);
}
// advance input
first1 += subpartition_size.first;
first2 += subpartition_size.second;
// decrement remaining size
remaining_input_size.first -= subpartition_size.first;
remaining_input_size.second -= subpartition_size.second;
}
return result;
}
template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
inline __device__
OutputIterator set_operation(statically_blocked_thread_array<block_size> &ctx,
InputIterator1 first1, InputIterator1 last1,
InputIterator2 first2, InputIterator2 last2,
OutputIterator result,
Compare comp,
SetOperation set_op)
{
typedef typename thrust::iterator_difference<InputIterator1>::type difference;
thrust::pair<difference,difference> remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2);
// iterate until the input is consumed
while(remaining_input_size.first + remaining_input_size.second > 0)
{
// find the end of this subpartition's input
// -1 to accomodate "starred" partitions
uint16_t max_subpartition_size = block_size * work_per_thread - 1;
difference diag = thrust::min<difference>(remaining_input_size.first + remaining_input_size.second, max_subpartition_size);
thrust::pair<uint16_t,uint16_t> subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp);
typedef typename thrust::iterator_value<InputIterator2>::type value_type;
if(stage_through_smem<value_type>::value)
{
// load the input into __shared__ storage
__shared__ uninitialized_array<value_type, block_size * work_per_thread> s_input;
value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first, s_input.begin());
value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1);
result = block::bounded_set_operation_n<block_size,work_per_thread>(ctx,
s_input.begin(), subpartition_size.first,
s_input_end1, subpartition_size.second,
result,
comp,
set_op);
}
else
{
result = block::bounded_set_operation_n<block_size,work_per_thread>(ctx,
first1, subpartition_size.first,
first2, subpartition_size.second,
result,
comp,
set_op);
}
// advance input
first1 += subpartition_size.first;
first2 += subpartition_size.second;
// decrement remaining size
remaining_input_size.first -= subpartition_size.first;
remaining_input_size.second -= subpartition_size.second;
}
return result;
}
} // end namespace block
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
inline __device__ void count_set_operation(statically_blocked_thread_array<threads_per_block> &ctx,
InputIterator1 input_partition_offsets,
Size num_partitions,
InputIterator2 first1,
InputIterator3 first2,
OutputIterator result,
Compare comp,
SetOperation set_op)
{
// consume partitions
for(Size partition_idx = ctx.block_index();
partition_idx < num_partitions;
partition_idx += ctx.grid_dimension())
{
typedef typename thrust::iterator_difference<InputIterator2>::type difference;
// find the partition
thrust::pair<difference,difference> block_input_begin = input_partition_offsets[partition_idx];
thrust::pair<difference,difference> block_input_end = input_partition_offsets[partition_idx + 1];
// count the size of the set operation
difference count = block::count_set_operation<threads_per_block,work_per_thread>(ctx,
first1 + block_input_begin.first, first1 + block_input_end.first,
first2 + block_input_begin.second, first2 + block_input_end.second,
comp,
set_op);
if(ctx.thread_index() == 0)
{
result[partition_idx] = count;
}
}
}
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
struct count_set_operation_closure
{
typedef statically_blocked_thread_array<threads_per_block> context_type;
InputIterator1 input_partition_offsets;
Size num_partitions;
InputIterator2 first1;
InputIterator3 first2;
OutputIterator result;
Compare comp;
SetOperation set_op;
count_set_operation_closure(InputIterator1 input_partition_offsets,
Size num_partitions,
InputIterator2 first1,
InputIterator3 first2,
OutputIterator result,
Compare comp,
SetOperation set_op)
: input_partition_offsets(input_partition_offsets),
num_partitions(num_partitions),
first1(first1),
first2(first2),
result(result),
comp(comp),
set_op(set_op)
{}
inline __device__ void operator()() const
{
context_type ctx;
count_set_operation<threads_per_block,work_per_thread>(ctx, input_partition_offsets, num_partitions, first1, first2, result, comp, set_op);
}
};
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
count_set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,OutputIterator,Compare,SetOperation>
make_count_set_operation_closure(InputIterator1 input_partition_offsets,
Size num_partitions,
InputIterator2 first1,
InputIterator3 first2,
OutputIterator result,
Compare comp,
SetOperation set_op)
{
typedef count_set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,OutputIterator,Compare,SetOperation> result_type;
return result_type(input_partition_offsets,num_partitions,first1,first2,result,comp,set_op);
}
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
inline __device__
void set_operation(statically_blocked_thread_array<threads_per_block> &ctx,
InputIterator1 input_partition_offsets,
Size num_partitions,
InputIterator2 first1,
InputIterator3 first2,
InputIterator4 output_partition_offsets,
OutputIterator result,
Compare comp,
SetOperation set_op)
{
// consume partitions
for(Size partition_idx = ctx.block_index();
partition_idx < num_partitions;
partition_idx += ctx.grid_dimension())
{
typedef typename thrust::iterator_difference<InputIterator2>::type difference;
// find the partition
thrust::pair<difference,difference> block_input_begin = input_partition_offsets[partition_idx];
thrust::pair<difference,difference> block_input_end = input_partition_offsets[partition_idx + 1];
// do the set operation across the partition
block::set_operation<threads_per_block,work_per_thread>(ctx,
first1 + block_input_begin.first, first1 + block_input_end.first,
first2 + block_input_begin.second, first2 + block_input_end.second,
result + output_partition_offsets[partition_idx],
comp,
set_op);
}
}
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
struct set_operation_closure
{
typedef statically_blocked_thread_array<threads_per_block> context_type;
InputIterator1 input_partition_offsets;
Size num_partitions;
InputIterator2 first1;
InputIterator3 first2;
InputIterator4 output_partition_offsets;
OutputIterator result;
Compare comp;
SetOperation set_op;
set_operation_closure(InputIterator1 input_partition_offsets,
Size num_partitions,
InputIterator2 first1,
InputIterator3 first2,
InputIterator4 output_partition_offsets,
OutputIterator result,
Compare comp,
SetOperation set_op)
: input_partition_offsets(input_partition_offsets),
num_partitions(num_partitions),
first1(first1),
first2(first2),
output_partition_offsets(output_partition_offsets),
result(result),
comp(comp),
set_op(set_op)
{}
inline __device__ void operator()() const
{
context_type ctx;
set_operation<threads_per_block,work_per_thread>(ctx, input_partition_offsets, num_partitions, first1, first2, output_partition_offsets, result, comp, set_op);
}
};
template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,InputIterator4,OutputIterator,Compare,SetOperation>
make_set_operation_closure(InputIterator1 input_partition_offsets,
Size num_partitions,
InputIterator2 first1,
InputIterator3 first2,
InputIterator4 output_partition_offsets,
OutputIterator result,
Compare comp,
SetOperation set_op)
{
typedef set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,InputIterator4,OutputIterator,Compare,SetOperation> result_type;
return result_type(input_partition_offsets,num_partitions,first1,first2,output_partition_offsets,result,comp,set_op);
}
} // end namespace set_operation_detail
template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
OutputIterator set_operation(thrust::cuda::execution_policy<DerivedPolicy> &exec,
InputIterator1 first1, InputIterator1 last1,
InputIterator2 first2, InputIterator2 last2,
OutputIterator result,
Compare comp,
SetOperation set_op)
{
using thrust::system::cuda::detail::device_properties;
using thrust::system::cuda::detail::detail::launch_closure;
namespace d = thrust::system::cuda::detail::detail::set_operation_detail;
typedef typename thrust::iterator_difference<InputIterator1>::type difference;
const difference n1 = last1 - first1;
const difference n2 = last2 - first2;
// handle empty input
if(n1 == 0 && n2 == 0)
{
return result;
}
const thrust::detail::uint16_t work_per_thread = 15;
const thrust::detail::uint16_t threads_per_block = 128;
const thrust::detail::uint16_t work_per_block = threads_per_block * work_per_thread;
// -1 because balanced_path adds a single element to the end of a "starred" partition, increasing its size by one
const thrust::detail::uint16_t maximum_partition_size = work_per_block - 1;
const difference num_partitions = thrust::detail::util::divide_ri(n1 + n2, maximum_partition_size);
// find input partition offsets
// +1 to handle the end of the input elegantly
thrust::detail::temporary_array<thrust::pair<difference,difference>, DerivedPolicy> input_partition_offsets(0, exec, num_partitions + 1);
d::find_partition_offsets<difference>(exec, input_partition_offsets.size(), maximum_partition_size, first1, last1, first2, last2, input_partition_offsets.begin(), comp);
const difference num_blocks = thrust::min<difference>(device_properties().maxGridSize[0], num_partitions);
// find output partition offsets
// +1 to store the total size of the total
thrust::detail::temporary_array<difference, DerivedPolicy> output_partition_offsets(0, exec, num_partitions + 1);
launch_closure(d::make_count_set_operation_closure<threads_per_block,work_per_thread>(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), comp, set_op),
num_blocks,
threads_per_block);
// turn the output partition counts into offsets to output partitions
thrust::exclusive_scan(exec, output_partition_offsets.begin(), output_partition_offsets.end(), output_partition_offsets.begin());
// run the set op kernel
launch_closure(d::make_set_operation_closure<threads_per_block,work_per_thread>(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), result, comp, set_op),
num_blocks,
threads_per_block);
return result + output_partition_offsets[num_partitions];
}
} // end namespace detail
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust