/* * Copyright 2008-2012 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include namespace thrust { namespace system { namespace detail { namespace generic { namespace detail { template struct segmented_scan_functor { AssociativeOperator binary_op; typedef typename thrust::tuple result_type; __host__ __device__ segmented_scan_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {} __host__ __device__ result_type operator()(result_type a, result_type b) { return result_type(thrust::get<1>(b) ? thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)), thrust::get<1>(a) | thrust::get<1>(b)); } }; } // end namespace detail template OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator result) { typedef typename thrust::iterator_traits::value_type InputType1; return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to()); } template OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator result, BinaryPredicate binary_pred) { typedef typename thrust::iterator_traits::value_type OutputType; return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus()); } template OutputIterator inclusive_scan_by_key(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator result, BinaryPredicate binary_pred, AssociativeOperator binary_op) { typedef typename thrust::iterator_traits::value_type OutputType; typedef unsigned int HeadFlagType; const size_t n = last1 - first1; if(n != 0) { // compute head flags thrust::detail::temporary_array flags(exec, n); flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred)); // scan key-flag tuples, // For additional details refer to Section 2 of the following paper // S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs" // NVIDIA Technical Report NVR-2008-003, December 2008 // http://mgarland.org/files/papers/nvr-2008-003.pdf thrust::inclusive_scan (exec, thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())), thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())) + n, thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())), detail::segmented_scan_functor(binary_op)); } return result + n; } template OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator result) { typedef typename thrust::iterator_traits::value_type OutputType; return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, OutputType(0)); } template OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator result, T init) { typedef typename thrust::iterator_traits::value_type InputType1; return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to()); } template OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator result, T init, BinaryPredicate binary_pred) { typedef typename thrust::iterator_traits::value_type OutputType; return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus()); } template OutputIterator exclusive_scan_by_key(thrust::execution_policy &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator result, T init, BinaryPredicate binary_pred, AssociativeOperator binary_op) { typedef typename thrust::iterator_traits::value_type OutputType; typedef unsigned int HeadFlagType; const size_t n = last1 - first1; if(n != 0) { InputIterator2 last2 = first2 + n; // compute head flags thrust::detail::temporary_array flags(exec, n); flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred)); // shift input one to the right and initialize segments with init thrust::detail::temporary_array temp(exec, n); thrust::replace_copy_if(exec, first2, last2 - 1, flags.begin() + 1, temp.begin() + 1, thrust::negate(), init); temp[0] = init; // scan key-flag tuples, // For additional details refer to Section 2 of the following paper // S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs" // NVIDIA Technical Report NVR-2008-003, December 2008 // http://mgarland.org/files/papers/nvr-2008-003.pdf thrust::inclusive_scan(exec, thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())), thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())) + n, thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())), detail::segmented_scan_functor(binary_op)); } return result + n; } } // end namespace generic } // end namespace detail } // end namespace system } // end namespace thrust