GOSTcoin support for ccminer CUDA miner project, compatible with most nvidia cards
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

240 lines
9.4 KiB

/*
* Copyright 2008-2012 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <thrust/detail/config.h>
#include <thrust/system/detail/generic/scan_by_key.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/replace.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/internal_functional.h>
#include <thrust/scan.h>
namespace thrust
{
namespace system
{
namespace detail
{
namespace generic
{
namespace detail
{
template <typename OutputType, typename HeadFlagType, typename AssociativeOperator>
struct segmented_scan_functor
{
AssociativeOperator binary_op;
typedef typename thrust::tuple<OutputType, HeadFlagType> result_type;
__host__ __device__
segmented_scan_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {}
__host__ __device__
result_type operator()(result_type a, result_type b)
{
return result_type(thrust::get<1>(b) ? thrust::get<0>(b) : binary_op(thrust::get<0>(a), thrust::get<0>(b)),
thrust::get<1>(a) | thrust::get<1>(b));
}
};
} // end namespace detail
template<typename DerivedPolicy,
typename InputIterator1,
typename InputIterator2,
typename OutputIterator>
OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator result)
{
typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to<InputType1>());
}
template<typename DerivedPolicy,
typename InputIterator1,
typename InputIterator2,
typename OutputIterator,
typename BinaryPredicate>
OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator result,
BinaryPredicate binary_pred)
{
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus<OutputType>());
}
template<typename DerivedPolicy,
typename InputIterator1,
typename InputIterator2,
typename OutputIterator,
typename BinaryPredicate,
typename AssociativeOperator>
OutputIterator inclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator result,
BinaryPredicate binary_pred,
AssociativeOperator binary_op)
{
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
typedef unsigned int HeadFlagType;
const size_t n = last1 - first1;
if(n != 0)
{
// compute head flags
thrust::detail::temporary_array<HeadFlagType,DerivedPolicy> flags(exec, n);
flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred));
// scan key-flag tuples,
// For additional details refer to Section 2 of the following paper
// S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
// NVIDIA Technical Report NVR-2008-003, December 2008
// http://mgarland.org/files/papers/nvr-2008-003.pdf
thrust::inclusive_scan
(exec,
thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())),
thrust::make_zip_iterator(thrust::make_tuple(first2, flags.begin())) + n,
thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())),
detail::segmented_scan_functor<OutputType, HeadFlagType, AssociativeOperator>(binary_op));
}
return result + n;
}
template<typename DerivedPolicy,
typename InputIterator1,
typename InputIterator2,
typename OutputIterator>
OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator result)
{
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, OutputType(0));
}
template<typename DerivedPolicy,
typename InputIterator1,
typename InputIterator2,
typename OutputIterator,
typename T>
OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator result,
T init)
{
typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to<InputType1>());
}
template<typename DerivedPolicy,
typename InputIterator1,
typename InputIterator2,
typename OutputIterator,
typename T,
typename BinaryPredicate>
OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator result,
T init,
BinaryPredicate binary_pred)
{
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus<OutputType>());
}
template<typename DerivedPolicy,
typename InputIterator1,
typename InputIterator2,
typename OutputIterator,
typename T,
typename BinaryPredicate,
typename AssociativeOperator>
OutputIterator exclusive_scan_by_key(thrust::execution_policy<DerivedPolicy> &exec,
InputIterator1 first1,
InputIterator1 last1,
InputIterator2 first2,
OutputIterator result,
T init,
BinaryPredicate binary_pred,
AssociativeOperator binary_op)
{
typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
typedef unsigned int HeadFlagType;
const size_t n = last1 - first1;
if(n != 0)
{
InputIterator2 last2 = first2 + n;
// compute head flags
thrust::detail::temporary_array<HeadFlagType,DerivedPolicy> flags(exec, n);
flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred));
// shift input one to the right and initialize segments with init
thrust::detail::temporary_array<OutputType,DerivedPolicy> temp(exec, n);
thrust::replace_copy_if(exec, first2, last2 - 1, flags.begin() + 1, temp.begin() + 1, thrust::negate<HeadFlagType>(), init);
temp[0] = init;
// scan key-flag tuples,
// For additional details refer to Section 2 of the following paper
// S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
// NVIDIA Technical Report NVR-2008-003, December 2008
// http://mgarland.org/files/papers/nvr-2008-003.pdf
thrust::inclusive_scan(exec,
thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())),
thrust::make_zip_iterator(thrust::make_tuple(temp.begin(), flags.begin())) + n,
thrust::make_zip_iterator(thrust::make_tuple(result, flags.begin())),
detail::segmented_scan_functor<OutputType, HeadFlagType, AssociativeOperator>(binary_op));
}
return result + n;
}
} // end namespace generic
} // end namespace detail
} // end namespace system
} // end namespace thrust