/*
 *  Copyright 2008-2012 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

// NOTE(review): this copy of the file had every '<...>' span stripped by an
// extraction/unescaping bug: #include targets, template parameter lists,
// explicit template arguments, and some code between a '<' and the next '>'
// were missing.  Everything between angle brackets below has been
// reconstructed to be consistent with the surviving call sites — confirm
// the exact spellings (especially the #include paths and the
// statically_blocked_thread_array parameterization) against the upstream
// Thrust sources before shipping.

#include <thrust/detail/config.h>
#include <thrust/detail/cstdint.h>
#include <thrust/detail/minmax.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/util/blocking.h>
#include <thrust/functional.h>
#include <thrust/pair.h>
#include <thrust/scan.h>
#include <thrust/transform.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/system/cuda/detail/runtime_introspection.h>
#include <thrust/system/cuda/detail/detail/balanced_path.h>
#include <thrust/system/cuda/detail/detail/launch_closure.h>
#include <thrust/system/cuda/detail/detail/uninitialized.h>
#include <thrust/system/cuda/detail/block/inclusive_scan.h>
#include <thrust/system/cuda/detail/block/exclusive_scan.h>
#include <thrust/system/cuda/detail/block/copy.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace detail
{
namespace set_operation_detail
{


using thrust::system::cuda::detail::detail::statically_blocked_thread_array;
using thrust::detail::uint16_t;
using thrust::detail::uint32_t;


// empirically determined on sm_20
// value_types larger than this will fail to launch if placed in smem
template<typename T>
  struct stage_through_smem
{
  static const bool value = sizeof(T) <= 6 * sizeof(uint32_t);
};


// Serially copies each element of [first, first + max_input_size) whose
// corresponding bit in mask is set to result.  Returns the end of the output
// range.  max_input_size <= 32 (mask is a 32-bit bitmap, one bit per element).
template<typename Size, typename InputIterator, typename OutputIterator>
inline __device__
  OutputIterator serial_bounded_copy_if(Size max_input_size,
                                        InputIterator first,
                                        uint32_t mask,
                                        OutputIterator result)
{
  for(Size i = 0; i < max_input_size; ++i, ++first)
  {
    // bit i of mask selects element i for output
    if((1 << i) & mask)
    {
      *result = *first;
      ++result;
    }
  }

  return result;
}


// Function object mapping a partition index i to the pair of offsets
// (into the first and second input ranges) at which partition i begins,
// as chosen by balanced_path.
template<typename Size, typename InputIterator1, typename InputIterator2, typename Compare>
  struct find_partition_offsets_functor
{
  Size partition_size;
  InputIterator1 first1;
  InputIterator2 first2;
  Size n1, n2;
  Compare comp;

  find_partition_offsets_functor(Size partition_size,
                                 InputIterator1 first1, InputIterator1 last1,
                                 InputIterator2 first2, InputIterator2 last2,
                                 Compare comp)
    : partition_size(partition_size),
      first1(first1), first2(first2),
      n1(last1 - first1), n2(last2 - first2),
      comp(comp)
  {}

  inline __host__ __device__
  thrust::pair<Size,Size> operator()(Size i) const
  {
    // clamp the diagonal to the end of the combined input
    Size diag = thrust::min(n1 + n2, i * partition_size);

    // XXX the correctness of balanced_path depends critically on the ll suffix below
    //     why???
    return balanced_path(first1, n1, first2, n2, diag, 4ll, comp);
  }
};


// Computes, in parallel, the starting offsets of num_partitions partitions of
// the combined input [first1,last1) x [first2,last2), writing one
// pair<Size,Size> per partition to result.
template<typename Size,
         typename DerivedPolicy,
         typename InputIterator1,
         typename InputIterator2,
         typename OutputIterator,
         typename Compare>
  OutputIterator find_partition_offsets(thrust::cuda::execution_policy<DerivedPolicy> &exec,
                                        Size num_partitions,
                                        Size partition_size,
                                        InputIterator1 first1, InputIterator1 last1,
                                        InputIterator2 first2, InputIterator2 last2,
                                        OutputIterator result,
                                        Compare comp)
{
  find_partition_offsets_functor<Size,InputIterator1,InputIterator2,Compare> f(partition_size, first1, last1, first2, last2, comp);

  return thrust::transform(exec,
                           thrust::counting_iterator<Size>(0),
                           thrust::counting_iterator<Size>(num_partitions),
                           result,
                           f);
}


namespace block
{


// Returns, to each thread, the value of x held by its right neighbor
// (thread_index() + 1); the last thread of the block receives boundary.
// The shift is staged through shared memory in two rounds of block_size/2
// values each, to halve the smem footprint.  All threads must participate.
template<unsigned int block_size, typename T>
inline __device__
  T right_neighbor(statically_blocked_thread_array<block_size> &ctx, const T &x, const T &boundary)
{
  // stage this shift to conserve smem
  const unsigned int storage_size = block_size / 2;
  __shared__ uninitialized_array<T,storage_size> shared;

  T result = x;

  unsigned int tid = ctx.thread_index();

  // round one: threads [1, storage_size] publish for threads [0, storage_size)
  if(0 < tid && tid <= storage_size)
  {
    shared[tid - 1] = x;
  }

  ctx.barrier();

  if(tid < storage_size)
  {
    result = shared[tid];
  }

  ctx.barrier();

  // round two: the upper half of the block, re-indexed from zero
  // (tid wraps for the lower half, so those threads skip both branches)
  tid -= storage_size;
  if(0 < tid && tid <= storage_size)
  {
    shared[tid - 1] = x;
  }
  else if(tid == 0)
  {
    // the last thread of the block has no right neighbor
    shared[storage_size - 1] = boundary;
  }

  ctx.barrier();

  if(tid < storage_size)
  {
    result = shared[tid];
  }

  ctx.barrier();

  return result;
}


// Counts the size of the result of set_op applied to the bounded input
// ([first1, first1+n1), [first2, first2+n2)), cooperatively across the block.
// n1 + n2 must not exceed block_size * work_per_thread - 1.
// Returns the total count (valid in every thread).
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename InputIterator2,
         typename Compare,
         typename SetOperation>
inline __device__
  unsigned int bounded_count_set_operation_n(statically_blocked_thread_array<block_size> &ctx,
                                             InputIterator1 first1, uint16_t n1,
                                             InputIterator2 first2, uint16_t n2,
                                             Compare comp,
                                             SetOperation set_op)
{
  unsigned int thread_idx = ctx.thread_index();

  // find partition offsets
  uint16_t diag = thrust::min<unsigned int>(n1 + n2, thread_idx * work_per_thread);
  thrust::pair<uint16_t,uint16_t> thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp);
  thrust::pair<uint16_t,uint16_t> thread_input_end   = block::right_neighbor(ctx, thread_input_begin, thrust::make_pair(n1,n2));

  __shared__ uint16_t s_thread_output_size[block_size];

  // work_per_thread + 1 to accomodate a "starred" partition returned from
  // balanced_path above
  s_thread_output_size[thread_idx] =
    set_op.count(work_per_thread + 1,
                 first1 + thread_input_begin.first,  first1 + thread_input_end.first,
                 first2 + thread_input_begin.second, first2 + thread_input_end.second,
                 comp);

  ctx.barrier();

  // reduce per-thread counts
  thrust::system::cuda::detail::block::inplace_inclusive_scan(ctx, s_thread_output_size);
  return s_thread_output_size[ctx.block_dimension() - 1];
}


inline __device__ int pop_count(unsigned int x)
{
// guard use of __popc from other compilers
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
  return __popc(x);
#else
  return x;
#endif
}


// Applies set_op to the bounded input
// ([first1, first1+n1), [first2, first2+n2)), cooperatively across the block,
// writing the result to result.  n1 + n2 must not exceed
// block_size * work_per_thread - 1.  Returns the end of the output.
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename InputIterator2,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
inline __device__
  OutputIterator bounded_set_operation_n(statically_blocked_thread_array<block_size> &ctx,
                                         InputIterator1 first1, uint16_t n1,
                                         InputIterator2 first2, uint16_t n2,
                                         OutputIterator result,
                                         Compare comp,
                                         SetOperation set_op)
{
  unsigned int thread_idx = ctx.thread_index();

  // find partition offsets
  uint16_t diag = thrust::min<unsigned int>(n1 + n2, thread_idx * work_per_thread);
  thrust::pair<uint16_t,uint16_t> thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp);
  thrust::pair<uint16_t,uint16_t> thread_input_end   = block::right_neighbor(ctx, thread_input_begin, thrust::make_pair(n1,n2));

  typedef typename thrust::iterator_value<InputIterator1>::type value_type;

  // +1 to accomodate a "starred" partition returned from balanced_path above
  uninitialized_array<value_type, work_per_thread + 1> sparse_result;
  // set_op writes its survivors densely-tagged: active_mask bit i set means
  // sparse_result[i] is part of the output
  uint32_t active_mask =
    set_op(work_per_thread + 1,
           first1 + thread_input_begin.first,  first1 + thread_input_end.first,
           first2 + thread_input_begin.second, first2 + thread_input_end.second,
           sparse_result.begin(),
           comp);

  __shared__ uint16_t s_thread_output_size[block_size];
  s_thread_output_size[thread_idx] = pop_count(active_mask);

  ctx.barrier();

  // scan to turn per-thread counts into output indices
  uint16_t block_output_size = thrust::system::cuda::detail::block::inplace_exclusive_scan(ctx, s_thread_output_size, 0u);

  serial_bounded_copy_if(work_per_thread + 1, sparse_result.begin(), active_mask, result + s_thread_output_size[thread_idx]);

  ctx.barrier();

  return result + block_output_size;
}


// Counts the size of set_op's result over an arbitrarily large input by
// consuming it one bounded subpartition at a time.  Returns the total count.
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename InputIterator2,
         typename Compare,
         typename SetOperation>
inline __device__
  typename thrust::iterator_difference<InputIterator1>::type
    count_set_operation(statically_blocked_thread_array<block_size> &ctx,
                        InputIterator1 first1, InputIterator1 last1,
                        InputIterator2 first2, InputIterator2 last2,
                        Compare comp,
                        SetOperation set_op)
{
  typedef typename thrust::iterator_difference<InputIterator1>::type difference;

  difference result = 0;

  thrust::pair<difference,difference> remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2);

  // iterate until the input is consumed
  while(remaining_input_size.first + remaining_input_size.second > 0)
  {
    // find the end of this subpartition's input
    // -1 to accomodate "starred" partitions
    uint16_t max_subpartition_size = block_size * work_per_thread - 1;
    difference diag = thrust::min<difference>(remaining_input_size.first + remaining_input_size.second, max_subpartition_size);
    thrust::pair<difference,difference> subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp);

    typedef typename thrust::iterator_value<InputIterator1>::type value_type;
    if(stage_through_smem<value_type>::value)
    {
      // load the input into __shared__ storage
      __shared__ uninitialized_array<value_type, block_size * work_per_thread> s_input;

      value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first,  s_input.begin());
      value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1);

      result += block::bounded_count_set_operation_n<block_size,work_per_thread>(ctx,
                                                                                 s_input.begin(), subpartition_size.first,
                                                                                 s_input_end1,    subpartition_size.second,
                                                                                 comp,
                                                                                 set_op);
    }
    else
    {
      result += block::bounded_count_set_operation_n<block_size,work_per_thread>(ctx,
                                                                                 first1, subpartition_size.first,
                                                                                 first2, subpartition_size.second,
                                                                                 comp,
                                                                                 set_op);
    }

    // advance input
    first1 += subpartition_size.first;
    first2 += subpartition_size.second;

    // decrement remaining size
    remaining_input_size.first  -= subpartition_size.first;
    remaining_input_size.second -= subpartition_size.second;
  }

  return result;
}


// Applies set_op over an arbitrarily large input by consuming it one bounded
// subpartition at a time, writing output to result.  Returns the end of the
// output.  Structure mirrors count_set_operation above.
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename InputIterator2,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
inline __device__
  OutputIterator set_operation(statically_blocked_thread_array<block_size> &ctx,
                               InputIterator1 first1, InputIterator1 last1,
                               InputIterator2 first2, InputIterator2 last2,
                               OutputIterator result,
                               Compare comp,
                               SetOperation set_op)
{
  typedef typename thrust::iterator_difference<InputIterator1>::type difference;

  thrust::pair<difference,difference> remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2);

  // iterate until the input is consumed
  while(remaining_input_size.first + remaining_input_size.second > 0)
  {
    // find the end of this subpartition's input
    // -1 to accomodate "starred" partitions
    uint16_t max_subpartition_size = block_size * work_per_thread - 1;
    difference diag = thrust::min<difference>(remaining_input_size.first + remaining_input_size.second, max_subpartition_size);
    thrust::pair<difference,difference> subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp);

    typedef typename thrust::iterator_value<InputIterator1>::type value_type;
    if(stage_through_smem<value_type>::value)
    {
      // load the input into __shared__ storage
      __shared__ uninitialized_array<value_type, block_size * work_per_thread> s_input;

      value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first,  s_input.begin());
      value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1);

      result = block::bounded_set_operation_n<block_size,work_per_thread>(ctx,
                                                                          s_input.begin(), subpartition_size.first,
                                                                          s_input_end1,    subpartition_size.second,
                                                                          result,
                                                                          comp,
                                                                          set_op);
    }
    else
    {
      result = block::bounded_set_operation_n<block_size,work_per_thread>(ctx,
                                                                          first1, subpartition_size.first,
                                                                          first2, subpartition_size.second,
                                                                          result,
                                                                          comp,
                                                                          set_op);
    }

    // advance input
    first1 += subpartition_size.first;
    first2 += subpartition_size.second;

    // decrement remaining size
    remaining_input_size.first  -= subpartition_size.first;
    remaining_input_size.second -= subpartition_size.second;
  }

  return result;
}


} // end namespace block


// Grid-wide pass 1: each block consumes partitions round-robin and writes the
// size of the set operation over each partition to result[partition_idx].
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename Size,
         typename InputIterator2,
         typename InputIterator3,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
inline __device__
  void count_set_operation(statically_blocked_thread_array<block_size> &ctx,
                           InputIterator1 input_partition_offsets,
                           Size num_partitions,
                           InputIterator2 first1,
                           InputIterator3 first2,
                           OutputIterator result,
                           Compare comp,
                           SetOperation set_op)
{
  // consume partitions
  for(Size partition_idx = ctx.block_index();
      partition_idx < num_partitions;
      partition_idx += ctx.grid_dimension())
  {
    typedef typename thrust::iterator_difference<InputIterator2>::type difference;

    // find the partition
    thrust::pair<difference,difference> block_input_begin = input_partition_offsets[partition_idx];
    thrust::pair<difference,difference> block_input_end   = input_partition_offsets[partition_idx + 1];

    // count the size of the set operation
    difference count = block::count_set_operation<block_size,work_per_thread>(ctx,
                                                                              first1 + block_input_begin.first,
                                                                              first1 + block_input_end.first,
                                                                              first2 + block_input_begin.second,
                                                                              first2 + block_input_end.second,
                                                                              comp,
                                                                              set_op);

    if(ctx.thread_index() == 0)
    {
      result[partition_idx] = count;
    }
  }
}


// Launchable closure wrapping the grid-wide count_set_operation above.
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename Size,
         typename InputIterator2,
         typename InputIterator3,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
  struct count_set_operation_closure
{
  typedef statically_blocked_thread_array<block_size> context_type;

  InputIterator1 input_partition_offsets;
  Size           num_partitions;
  InputIterator2 first1;
  InputIterator3 first2;
  OutputIterator result;
  Compare        comp;
  SetOperation   set_op;

  count_set_operation_closure(InputIterator1 input_partition_offsets,
                              Size num_partitions,
                              InputIterator2 first1,
                              InputIterator3 first2,
                              OutputIterator result,
                              Compare comp,
                              SetOperation set_op)
    : input_partition_offsets(input_partition_offsets),
      num_partitions(num_partitions),
      first1(first1),
      first2(first2),
      result(result),
      comp(comp),
      set_op(set_op)
  {}

  inline __device__ void operator()() const
  {
    context_type ctx;
    count_set_operation<block_size,work_per_thread>(ctx, input_partition_offsets, num_partitions, first1, first2, result, comp, set_op);
  }
};


// Convenience factory deducing count_set_operation_closure's iterator types.
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename Size,
         typename InputIterator2,
         typename InputIterator3,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
  count_set_operation_closure<block_size,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,OutputIterator,Compare,SetOperation>
    make_count_set_operation_closure(InputIterator1 input_partition_offsets,
                                     Size num_partitions,
                                     InputIterator2 first1,
                                     InputIterator3 first2,
                                     OutputIterator result,
                                     Compare comp,
                                     SetOperation set_op)
{
  typedef count_set_operation_closure<block_size,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,OutputIterator,Compare,SetOperation> result_type;
  return result_type(input_partition_offsets,num_partitions,first1,first2,result,comp,set_op);
}


// Grid-wide pass 2: each block consumes partitions round-robin and writes the
// set operation's output for each partition at the precomputed offset
// output_partition_offsets[partition_idx].
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename Size,
         typename InputIterator2,
         typename InputIterator3,
         typename InputIterator4,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
inline __device__
  void set_operation(statically_blocked_thread_array<block_size> &ctx,
                     InputIterator1 input_partition_offsets,
                     Size num_partitions,
                     InputIterator2 first1,
                     InputIterator3 first2,
                     InputIterator4 output_partition_offsets,
                     OutputIterator result,
                     Compare comp,
                     SetOperation set_op)
{
  // consume partitions
  for(Size partition_idx = ctx.block_index();
      partition_idx < num_partitions;
      partition_idx += ctx.grid_dimension())
  {
    typedef typename thrust::iterator_difference<InputIterator2>::type difference;

    // find the partition
    thrust::pair<difference,difference> block_input_begin = input_partition_offsets[partition_idx];
    thrust::pair<difference,difference> block_input_end   = input_partition_offsets[partition_idx + 1];

    // do the set operation across the partition
    block::set_operation<block_size,work_per_thread>(ctx,
                                                     first1 + block_input_begin.first,  first1 + block_input_end.first,
                                                     first2 + block_input_begin.second, first2 + block_input_end.second,
                                                     result + output_partition_offsets[partition_idx],
                                                     comp,
                                                     set_op);
  }
}


// Launchable closure wrapping the grid-wide set_operation above.
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename Size,
         typename InputIterator2,
         typename InputIterator3,
         typename InputIterator4,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
  struct set_operation_closure
{
  typedef statically_blocked_thread_array<block_size> context_type;

  InputIterator1 input_partition_offsets;
  Size           num_partitions;
  InputIterator2 first1;
  InputIterator3 first2;
  InputIterator4 output_partition_offsets;
  OutputIterator result;
  Compare        comp;
  SetOperation   set_op;

  set_operation_closure(InputIterator1 input_partition_offsets,
                        Size num_partitions,
                        InputIterator2 first1,
                        InputIterator3 first2,
                        InputIterator4 output_partition_offsets,
                        OutputIterator result,
                        Compare comp,
                        SetOperation set_op)
    : input_partition_offsets(input_partition_offsets),
      num_partitions(num_partitions),
      first1(first1),
      first2(first2),
      output_partition_offsets(output_partition_offsets),
      result(result),
      comp(comp),
      set_op(set_op)
  {}

  inline __device__ void operator()() const
  {
    context_type ctx;
    set_operation<block_size,work_per_thread>(ctx, input_partition_offsets, num_partitions, first1, first2, output_partition_offsets, result, comp, set_op);
  }
};


// Convenience factory deducing set_operation_closure's iterator types.
template<unsigned int block_size,
         unsigned int work_per_thread,
         typename InputIterator1,
         typename Size,
         typename InputIterator2,
         typename InputIterator3,
         typename InputIterator4,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
  set_operation_closure<block_size,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,InputIterator4,OutputIterator,Compare,SetOperation>
    make_set_operation_closure(InputIterator1 input_partition_offsets,
                               Size num_partitions,
                               InputIterator2 first1,
                               InputIterator3 first2,
                               InputIterator4 output_partition_offsets,
                               OutputIterator result,
                               Compare comp,
                               SetOperation set_op)
{
  typedef set_operation_closure<block_size,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,InputIterator4,OutputIterator,Compare,SetOperation> result_type;
  return result_type(input_partition_offsets,num_partitions,first1,first2,output_partition_offsets,result,comp,set_op);
}


} // end namespace set_operation_detail


// Host entry point: applies set_op (e.g. set_intersection's counting/output
// functors) to the sorted ranges [first1,last1) and [first2,last2), writing
// the output to result.  Strategy:
//   1. partition the combined input with balanced_path,
//   2. launch a counting kernel to size each partition's output,
//   3. exclusive_scan the counts into output offsets,
//   4. launch the output kernel to produce the result.
// Returns the end of the output range.
template<typename DerivedPolicy,
         typename InputIterator1,
         typename InputIterator2,
         typename OutputIterator,
         typename Compare,
         typename SetOperation>
  OutputIterator set_operation(thrust::cuda::execution_policy<DerivedPolicy> &exec,
                               InputIterator1 first1, InputIterator1 last1,
                               InputIterator2 first2, InputIterator2 last2,
                               OutputIterator result,
                               Compare comp,
                               SetOperation set_op)
{
  using thrust::system::cuda::detail::device_properties;
  using thrust::system::cuda::detail::detail::launch_closure;
  namespace d = thrust::system::cuda::detail::detail::set_operation_detail;

  typedef typename thrust::iterator_difference<InputIterator1>::type difference;

  const difference n1 = last1 - first1;
  const difference n2 = last2 - first2;

  // handle empty input
  if(n1 == 0 && n2 == 0)
  {
    return result;
  }

  const thrust::detail::uint16_t work_per_thread   = 15;
  const thrust::detail::uint16_t threads_per_block = 128;
  const thrust::detail::uint16_t work_per_block    = threads_per_block * work_per_thread;

  // -1 because balanced_path adds a single element to the end of a "starred"
  // partition, increasing its size by one
  const thrust::detail::uint16_t maximum_partition_size = work_per_block - 1;
  const difference num_partitions = thrust::detail::util::divide_ri(n1 + n2, maximum_partition_size);

  // find input partition offsets
  // +1 to handle the end of the input elegantly
  thrust::detail::temporary_array<thrust::pair<difference,difference>, DerivedPolicy> input_partition_offsets(0, exec, num_partitions + 1);
  d::find_partition_offsets<difference>(exec, input_partition_offsets.size(), maximum_partition_size, first1, last1, first2, last2, input_partition_offsets.begin(), comp);

  const difference num_blocks = thrust::min<difference>(device_properties().maxGridSize[0], num_partitions);

  // find output partition offsets
  // +1 to store the total size of the total
  thrust::detail::temporary_array<difference, DerivedPolicy> output_partition_offsets(0, exec, num_partitions + 1);
  launch_closure(d::make_count_set_operation_closure<threads_per_block,work_per_thread>(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), comp, set_op),
                 num_blocks,
                 threads_per_block);

  // turn the output partition counts into offsets to output partitions
  thrust::exclusive_scan(exec, output_partition_offsets.begin(), output_partition_offsets.end(), output_partition_offsets.begin());

  // run the set op kernel
  launch_closure(d::make_set_operation_closure<threads_per_block,work_per_thread>(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), result, comp, set_op),
                 num_blocks,
                 threads_per_block);

  return result + output_partition_offsets[num_partitions];
}


} // end namespace detail
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust