240 lines
9.4 KiB
C++
240 lines
9.4 KiB
C++
/*
|
|
* Copyright 2008-2012 NVIDIA Corporation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
|
|
#include <thrust/detail/config.h>
|
|
#include <thrust/system/detail/generic/scan_by_key.h>
|
|
#include <thrust/functional.h>
|
|
#include <thrust/transform.h>
|
|
#include <thrust/replace.h>
|
|
#include <thrust/iterator/zip_iterator.h>
|
|
#include <thrust/iterator/iterator_traits.h>
|
|
#include <thrust/detail/temporary_array.h>
|
|
#include <thrust/detail/internal_functional.h>
|
|
#include <thrust/scan.h>
|
|
|
|
namespace thrust
|
|
{
|
|
namespace system
|
|
{
|
|
namespace detail
|
|
{
|
|
namespace generic
|
|
{
|
|
namespace detail
|
|
{
|
|
|
|
template <typename OutputType, typename HeadFlagType, typename AssociativeOperator>
|
|
struct segmented_scan_functor
|
|
{
|
|
AssociativeOperator binary_op;
|
|
|
|
typedef typename thrust::tuple<OutputType, HeadFlagType> result_type;
|
|
|
|
__host__ __device__
|
|
segmented_scan_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {}
|
|
|
|
__host__ __device__
|
|
result_type operator()(result_type a, result_type b)
|
|
{
|
|
return result_type(thrust::get<1>(b) ? thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)),
|
|
thrust::get<1>(a) | thrust::get<1>(b));
|
|
}
|
|
};
|
|
|
|
} // end namespace detail
|
|
|
|
|
|
template<typename DerivedPolicy,
|
|
typename InputIterator1,
|
|
typename InputIterator2,
|
|
typename OutputIterator>
|
|
OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
|
|
InputIterator1 first1,
|
|
InputIterator1 last1,
|
|
InputIterator2 first2,
|
|
OutputIterator result)
|
|
{
|
|
typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
|
|
return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to<InputType1>());
|
|
}
|
|
|
|
|
|
template<typename DerivedPolicy,
|
|
typename InputIterator1,
|
|
typename InputIterator2,
|
|
typename OutputIterator,
|
|
typename BinaryPredicate>
|
|
OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
|
|
InputIterator1 first1,
|
|
InputIterator1 last1,
|
|
InputIterator2 first2,
|
|
OutputIterator result,
|
|
BinaryPredicate binary_pred)
|
|
{
|
|
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
|
|
return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus<OutputType>());
|
|
}
|
|
|
|
|
|
template<typename DerivedPolicy,
|
|
typename InputIterator1,
|
|
typename InputIterator2,
|
|
typename OutputIterator,
|
|
typename BinaryPredicate,
|
|
typename AssociativeOperator>
|
|
OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
|
|
InputIterator1 first1,
|
|
InputIterator1 last1,
|
|
InputIterator2 first2,
|
|
OutputIterator result,
|
|
BinaryPredicate binary_pred,
|
|
AssociativeOperator binary_op)
|
|
{
|
|
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
|
|
typedef unsigned int HeadFlagType;
|
|
|
|
const size_t n = last1 - first1;
|
|
|
|
if(n != 0)
|
|
{
|
|
// compute head flags
|
|
thrust::detail::temporary_array<HeadFlagType,DerivedPolicy> flags(exec, n);
|
|
flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred));
|
|
|
|
// scan key-flag tuples,
|
|
// For additional details refer to Section 2 of the following paper
|
|
// S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
|
|
// NVIDIA Technical Report NVR-2008-003, December 2008
|
|
// http://mgarland.org/files/papers/nvr-2008-003.pdf
|
|
thrust::inclusive_scan
|
|
(exec,
|
|
thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())),
|
|
thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())) + n,
|
|
thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())),
|
|
detail::segmented_scan_functor<OutputType, HeadFlagType, AssociativeOperator>(binary_op));
|
|
}
|
|
|
|
return result + n;
|
|
}
|
|
|
|
|
|
template<typename DerivedPolicy,
|
|
typename InputIterator1,
|
|
typename InputIterator2,
|
|
typename OutputIterator>
|
|
OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
|
|
InputIterator1 first1,
|
|
InputIterator1 last1,
|
|
InputIterator2 first2,
|
|
OutputIterator result)
|
|
{
|
|
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
|
|
return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, OutputType(0));
|
|
}
|
|
|
|
|
|
template<typename DerivedPolicy,
|
|
typename InputIterator1,
|
|
typename InputIterator2,
|
|
typename OutputIterator,
|
|
typename T>
|
|
OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
|
|
InputIterator1 first1,
|
|
InputIterator1 last1,
|
|
InputIterator2 first2,
|
|
OutputIterator result,
|
|
T init)
|
|
{
|
|
typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
|
|
return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to<InputType1>());
|
|
}
|
|
|
|
|
|
template<typename DerivedPolicy,
|
|
typename InputIterator1,
|
|
typename InputIterator2,
|
|
typename OutputIterator,
|
|
typename T,
|
|
typename BinaryPredicate>
|
|
OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
|
|
InputIterator1 first1,
|
|
InputIterator1 last1,
|
|
InputIterator2 first2,
|
|
OutputIterator result,
|
|
T init,
|
|
BinaryPredicate binary_pred)
|
|
{
|
|
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
|
|
return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus<OutputType>());
|
|
}
|
|
|
|
|
|
template<typename DerivedPolicy,
|
|
typename InputIterator1,
|
|
typename InputIterator2,
|
|
typename OutputIterator,
|
|
typename T,
|
|
typename BinaryPredicate,
|
|
typename AssociativeOperator>
|
|
OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
|
|
InputIterator1 first1,
|
|
InputIterator1 last1,
|
|
InputIterator2 first2,
|
|
OutputIterator result,
|
|
T init,
|
|
BinaryPredicate binary_pred,
|
|
AssociativeOperator binary_op)
|
|
{
|
|
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
|
|
typedef unsigned int HeadFlagType;
|
|
|
|
const size_t n = last1 - first1;
|
|
|
|
if(n != 0)
|
|
{
|
|
InputIterator2 last2 = first2 + n;
|
|
|
|
// compute head flags
|
|
thrust::detail::temporary_array<HeadFlagType,DerivedPolicy> flags(exec, n);
|
|
flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred));
|
|
|
|
// shift input one to the right and initialize segments with init
|
|
thrust::detail::temporary_array<OutputType,DerivedPolicy> temp(exec, n);
|
|
thrust::replace_copy_if(exec, first2, last2 - 1, flags.begin() + 1, temp.begin() + 1, thrust::negate<HeadFlagType>(), init);
|
|
temp[0] = init;
|
|
|
|
// scan key-flag tuples,
|
|
// For additional details refer to Section 2 of the following paper
|
|
// S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
|
|
// NVIDIA Technical Report NVR-2008-003, December 2008
|
|
// http://mgarland.org/files/papers/nvr-2008-003.pdf
|
|
thrust::inclusive_scan(exec,
|
|
thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())),
|
|
thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())) + n,
|
|
thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())),
|
|
detail::segmented_scan_functor<OutputType, HeadFlagType, AssociativeOperator>(binary_op));
|
|
}
|
|
|
|
return result + n;
|
|
}
|
|
|
|
} // end namespace generic
|
|
} // end namespace detail
|
|
} // end namespace system
|
|
} // end namespace thrust
|
|
|