/* * Copyright 2008-2012 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace thrust { namespace system { namespace tbb { namespace detail { namespace reduce_by_key_detail { template inline L divide_ri(const L x, const R y) { return (x + (y - 1)) / y; } template struct partial_sum_type : thrust::detail::eval_if< thrust::detail::has_result_type::value, thrust::detail::result_type, thrust::detail::eval_if< thrust::detail::is_output_iterator::value, thrust::iterator_value, thrust::iterator_value > > {}; template struct partial_sum_type : thrust::detail::eval_if< thrust::detail::has_result_type::value, thrust::detail::result_type, thrust::iterator_value > {}; template thrust::pair< InputIterator1, thrust::pair< typename InputIterator1::value_type, typename partial_sum_type::type > > reduce_last_segment_backward(InputIterator1 keys_first, InputIterator1 keys_last, InputIterator2 values_first, BinaryPredicate binary_pred, BinaryFunction binary_op) { typename thrust::iterator_difference::type n = keys_last - keys_first; // reverse the ranges and consume from the end thrust::reverse_iterator keys_first_r(keys_last); thrust::reverse_iterator keys_last_r(keys_first); thrust::reverse_iterator values_first_r(values_first + n); typename InputIterator1::value_type result_key = *keys_first_r; typename partial_sum_type::type result_value = *values_first_r; // consume the entirety of the first key's sequence for(++keys_first_r, ++values_first_r; (keys_first_r != keys_last_r) && binary_pred(*keys_first_r, result_key); ++keys_first_r, ++values_first_r) { result_value = binary_op(result_value, *values_first_r); } return thrust::make_pair(keys_first_r.base(), thrust::make_pair(result_key, result_value)); } template thrust::tuple< OutputIterator1, OutputIterator2, typename InputIterator1::value_type, typename partial_sum_type::type > reduce_by_key_with_carry(InputIterator1 keys_first, InputIterator1 keys_last, InputIterator2 values_first, OutputIterator1 keys_output, OutputIterator2 values_output, BinaryPredicate binary_pred, BinaryFunction binary_op) { // first, consume the last sequence to produce the carry // XXX is there an elegant way to pose this such that we don't need to default construct carry? thrust::pair< typename InputIterator1::value_type, typename partial_sum_type::type > carry; thrust::tie(keys_last, carry) = reduce_last_segment_backward(keys_first, keys_last, values_first, binary_pred, binary_op); // finish with sequential reduce_by_key thrust::cpp::tag seq; thrust::tie(keys_output, values_output) = thrust::reduce_by_key(seq, keys_first, keys_last, values_first, keys_output, values_output, binary_pred, binary_op); return thrust::make_tuple(keys_output, values_output, carry.first, carry.second); } template bool interval_has_carry(size_t interval_idx, size_t interval_size, size_t num_intervals, Iterator tail_flags) { // to discover whether the interval has a carry, look at the tail_flag corresponding to its last element // the final interval never has a carry by definition return (interval_idx + 1 < num_intervals) ? !tail_flags[(interval_idx + 1) * interval_size - 1] : false; } template struct serial_reduce_by_key_body { typedef typename thrust::iterator_difference::type size_type; Iterator1 keys_first; Iterator2 values_first; Iterator3 result_offset; Iterator4 keys_result; Iterator5 values_result; Iterator6 carry_result; size_type n; size_type interval_size; size_type num_intervals; BinaryPredicate binary_pred; BinaryFunction binary_op; serial_reduce_by_key_body(Iterator1 keys_first, Iterator2 values_first, Iterator3 result_offset, Iterator4 keys_result, Iterator5 values_result, Iterator6 carry_result, size_type n, size_type interval_size, size_type num_intervals, BinaryPredicate binary_pred, BinaryFunction binary_op) : keys_first(keys_first), values_first(values_first), result_offset(result_offset), keys_result(keys_result), values_result(values_result), carry_result(carry_result), n(n), interval_size(interval_size), num_intervals(num_intervals), binary_pred(binary_pred), binary_op(binary_op) {} void operator()(const ::tbb::blocked_range &r) const { assert(r.size() == 1); const size_type interval_idx = r.begin(); const size_type offset_to_first = interval_size * interval_idx; const size_type offset_to_last = thrust::min(n, offset_to_first + interval_size); Iterator1 my_keys_first = keys_first + offset_to_first; Iterator1 my_keys_last = keys_first + offset_to_last; Iterator2 my_values_first = values_first + offset_to_first; Iterator3 my_result_offset = result_offset + interval_idx; Iterator4 my_keys_result = keys_result + *my_result_offset; Iterator5 my_values_result = values_result + *my_result_offset; Iterator6 my_carry_result = carry_result + interval_idx; // consume the rest of the interval with reduce_by_key typedef typename thrust::iterator_value::type key_type; typedef typename partial_sum_type::type value_type; // XXX is there a way to pose this so that we don't require default construction of carry? thrust::pair carry; thrust::tie(my_keys_result, my_values_result, carry.first, carry.second) = reduce_by_key_with_carry(my_keys_first, my_keys_last, my_values_first, my_keys_result, my_values_result, binary_pred, binary_op); // store to carry only when we actually have a carry // store to my_keys_result & my_values_result otherwise // create tail_flags so we can check for a carry thrust::detail::tail_flags flags = thrust::detail::make_tail_flags(keys_first, keys_first + n, binary_pred); if(interval_has_carry(interval_idx, interval_size, num_intervals, flags.begin())) { // we can ignore the carry's key // XXX because the carry result is uninitialized, we should copy construct *my_carry_result = carry.second; } else { *my_keys_result = carry.first; *my_values_result = carry.second; } } }; template serial_reduce_by_key_body make_serial_reduce_by_key_body(Iterator1 keys_first, Iterator2 values_first, Iterator3 result_offset, Iterator4 keys_result, Iterator5 values_result, Iterator6 carry_result, typename thrust::iterator_difference::type n, size_t interval_size, size_t num_intervals, BinaryPredicate binary_pred, BinaryFunction binary_op) { return serial_reduce_by_key_body(keys_first, values_first, result_offset, keys_result, values_result, carry_result, n, interval_size, num_intervals, binary_pred, binary_op); } } // end reduce_by_key_detail template thrust::pair reduce_by_key(thrust::tbb::execution_policy &exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 keys_result, Iterator4 values_result, BinaryPredicate binary_pred, BinaryFunction binary_op) { typedef typename thrust::iterator_difference::type difference_type; difference_type n = keys_last - keys_first; if(n == 0) return thrust::make_pair(keys_result, values_result); // XXX this value is a tuning opportunity const difference_type parallelism_threshold = 10000; if(n < parallelism_threshold) { // don't bother parallelizing for small n thrust::cpp::tag seq; return thrust::reduce_by_key(seq, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op); } // count the number of processors const unsigned int p = thrust::max(1u, ::tbb::tbb_thread::hardware_concurrency()); // generate O(P) intervals of sequential work // XXX oversubscribing is a tuning opportunity const unsigned int subscription_rate = 1; difference_type interval_size = thrust::min(parallelism_threshold, thrust::max(n, n / (subscription_rate * p))); difference_type num_intervals = reduce_by_key_detail::divide_ri(n, interval_size); // decompose the input into intervals of size N / num_intervals // add one extra element to this vector to store the size of the entire result thrust::detail::temporary_array interval_output_offsets(0, exec, num_intervals + 1); // first count the number of tail flags in each interval thrust::detail::tail_flags tail_flags = thrust::detail::make_tail_flags(keys_first, keys_last, binary_pred); thrust::system::tbb::detail::reduce_intervals(exec, tail_flags.begin(), tail_flags.end(), interval_size, interval_output_offsets.begin() + 1, thrust::plus()); interval_output_offsets[0] = 0; // scan the counts to get each body's output offset thrust::cpp::tag seq; thrust::inclusive_scan(seq, interval_output_offsets.begin() + 1, interval_output_offsets.end(), interval_output_offsets.begin() + 1); // do a reduce_by_key serially in each thread // the final interval never has a carry by definition, so don't reserve space for it typedef typename reduce_by_key_detail::partial_sum_type::type carry_type; thrust::detail::temporary_array carries(0, exec, num_intervals - 1); // force grainsize == 1 with simple_partioner() ::tbb::parallel_for(::tbb::blocked_range(0, num_intervals, 1), reduce_by_key_detail::make_serial_reduce_by_key_body(keys_first, values_first, interval_output_offsets.begin(), keys_result, values_result, carries.begin(), n, interval_size, num_intervals, binary_pred, binary_op), ::tbb::simple_partitioner()); difference_type size_of_result = interval_output_offsets[num_intervals]; // sequentially accumulate the carries // note that the last interval does not have a carry // XXX find a way to express this loop via a sequential algorithm, perhaps reduce_by_key for(typename thrust::detail::temporary_array::size_type i = 0; i < carries.size(); ++i) { // if our interval has a carry, then we need to sum the carry to the next interval's output offset // if it does not have a carry, then we need to ignore carry_value[i] if(reduce_by_key_detail::interval_has_carry(i, interval_size, num_intervals, tail_flags.begin())) { difference_type output_idx = interval_output_offsets[i+1]; values_result[output_idx] = binary_op(values_result[output_idx], carries[i]); } } return thrust::make_pair(keys_result + size_of_result, values_result + size_of_result); } } // end detail } // end tbb } // end system } // end thrust