754 lines
28 KiB
C++
754 lines
28 KiB
C++
/*
|
|
* Copyright 2008-2012 NVIDIA Corporation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
|
|
#include <thrust/detail/config.h>
|
|
|
|
#include <thrust/iterator/iterator_traits.h>
|
|
#include <thrust/system/detail/generic/select_system.h>
|
|
#include <thrust/detail/temporary_array.h>
|
|
|
|
#include <thrust/detail/type_traits.h>
|
|
#include <thrust/detail/type_traits/function_traits.h>
|
|
#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
|
|
|
|
#include <thrust/system/cuda/detail/reduce_intervals.h>
|
|
#include <thrust/system/cuda/detail/synchronize.h>
|
|
#include <thrust/system/cuda/detail/default_decomposition.h>
|
|
#include <thrust/system/cuda/detail/detail/launch_closure.h>
|
|
#include <thrust/system/cuda/detail/detail/uninitialized.h>
|
|
#include <thrust/detail/raw_pointer_cast.h>
|
|
|
|
__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
|
|
|
|
|
|
namespace thrust
|
|
{
|
|
namespace detail
|
|
{
|
|
|
|
// forward declaration of temporary_array
|
|
template<typename,typename> class temporary_array;
|
|
|
|
} // end detail
|
|
|
|
namespace system
|
|
{
|
|
namespace cuda
|
|
{
|
|
namespace detail
|
|
{
|
|
namespace detail
|
|
{
|
|
namespace fast_scan
|
|
{
|
|
namespace fast_scan_detail
|
|
{
|
|
|
|
|
|
// TODO tune this
|
|
template <typename ValueType>
|
|
struct inclusive_scan_block_size
|
|
{
|
|
private:
|
|
static const unsigned int max_memory = 16384 - 256 - 2 * sizeof(ValueType);
|
|
static const unsigned int max_block_size = max_memory / sizeof(ValueType);
|
|
static const unsigned int default_block_size = 7 * 32;
|
|
static const unsigned int block_size = (max_block_size < default_block_size) ? max_block_size : default_block_size;
|
|
|
|
public:
|
|
static const unsigned int pass1 = block_size;
|
|
static const unsigned int pass2 = block_size;
|
|
static const unsigned int pass3 = block_size;
|
|
};
|
|
|
|
// TODO tune this
|
|
template <typename ValueType>
|
|
struct exclusive_scan_block_size
|
|
{
|
|
private:
|
|
static const unsigned int max_memory = 16384 - 256 - 2 * sizeof(ValueType);
|
|
static const unsigned int max_block_size = max_memory / sizeof(ValueType);
|
|
static const unsigned int default_block_size = 5 * 32;
|
|
static const unsigned int block_size = (max_block_size < default_block_size) ? max_block_size : default_block_size;
|
|
|
|
public:
|
|
static const unsigned int pass1 = block_size;
|
|
static const unsigned int pass2 = block_size;
|
|
static const unsigned int pass3 = block_size;
|
|
};
|
|
|
|
|
|
template <unsigned int CTA_SIZE,
|
|
typename Context,
|
|
typename SharedArray,
|
|
typename BinaryFunction>
|
|
__device__ __thrust_forceinline__
|
|
void scan_block(Context context, SharedArray array, BinaryFunction binary_op)
|
|
{
|
|
typedef typename thrust::iterator_value<SharedArray>::type T;
|
|
|
|
T val = array[context.thread_index()];
|
|
|
|
if (CTA_SIZE > 1) { if(context.thread_index() >= 1) { T tmp = array[context.thread_index() - 1]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 2) { if(context.thread_index() >= 2) { T tmp = array[context.thread_index() - 2]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 4) { if(context.thread_index() >= 4) { T tmp = array[context.thread_index() - 4]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 8) { if(context.thread_index() >= 8) { T tmp = array[context.thread_index() - 8]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 16) { if(context.thread_index() >= 16) { T tmp = array[context.thread_index() - 16]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 32) { if(context.thread_index() >= 32) { T tmp = array[context.thread_index() - 32]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 64) { if(context.thread_index() >= 64) { T tmp = array[context.thread_index() - 64]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 128) { if(context.thread_index() >= 128) { T tmp = array[context.thread_index() - 128]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 256) { if(context.thread_index() >= 256) { T tmp = array[context.thread_index() - 256]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 512) { if(context.thread_index() >= 512) { T tmp = array[context.thread_index() - 512]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 1024) { if(context.thread_index() >= 1024) { T tmp = array[context.thread_index() - 1024]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
}
|
|
|
|
template <unsigned int CTA_SIZE,
|
|
typename Context,
|
|
typename SharedArray,
|
|
typename BinaryFunction>
|
|
__device__ __thrust_forceinline__
|
|
void scan_block_n(Context context, SharedArray array, const unsigned int n, BinaryFunction binary_op)
|
|
{
|
|
typedef typename thrust::iterator_value<SharedArray>::type T;
|
|
|
|
T val = array[context.thread_index()];
|
|
|
|
if (CTA_SIZE > 1) { if(context.thread_index() < n && context.thread_index() >= 1) { T tmp = array[context.thread_index() - 1]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 2) { if(context.thread_index() < n && context.thread_index() >= 2) { T tmp = array[context.thread_index() - 2]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 4) { if(context.thread_index() < n && context.thread_index() >= 4) { T tmp = array[context.thread_index() - 4]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 8) { if(context.thread_index() < n && context.thread_index() >= 8) { T tmp = array[context.thread_index() - 8]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 16) { if(context.thread_index() < n && context.thread_index() >= 16) { T tmp = array[context.thread_index() - 16]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 32) { if(context.thread_index() < n && context.thread_index() >= 32) { T tmp = array[context.thread_index() - 32]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 64) { if(context.thread_index() < n && context.thread_index() >= 64) { T tmp = array[context.thread_index() - 64]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 128) { if(context.thread_index() < n && context.thread_index() >= 128) { T tmp = array[context.thread_index() - 128]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 256) { if(context.thread_index() < n && context.thread_index() >= 256) { T tmp = array[context.thread_index() - 256]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 512) { if(context.thread_index() < n && context.thread_index() >= 512) { T tmp = array[context.thread_index() - 512]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
if (CTA_SIZE > 1024) { if(context.thread_index() < n && context.thread_index() >= 1024) { T tmp = array[context.thread_index() - 1024]; val = binary_op(tmp, val); } context.barrier(); array[context.thread_index()] = val; context.barrier(); }
|
|
}
|
|
|
|
template <unsigned int CTA_SIZE,
|
|
unsigned int K,
|
|
bool FullBlock,
|
|
typename Context,
|
|
typename InputIterator,
|
|
typename ValueType>
|
|
__device__ __thrust_forceinline__
|
|
void load_block(Context context,
|
|
const unsigned int n,
|
|
InputIterator input,
|
|
ValueType (&sdata)[K][CTA_SIZE + 1])
|
|
{
|
|
for(unsigned int k = 0; k < K; k++)
|
|
{
|
|
const unsigned int offset = k*CTA_SIZE + context.thread_index();
|
|
|
|
if (FullBlock || offset < n)
|
|
{
|
|
InputIterator temp = input + offset;
|
|
sdata[offset % K][offset / K] = *temp;
|
|
}
|
|
}
|
|
|
|
context.barrier();
|
|
}
|
|
|
|
template <unsigned int CTA_SIZE,
|
|
unsigned int K,
|
|
bool Inclusive,
|
|
bool FullBlock,
|
|
typename Context,
|
|
typename OutputIterator,
|
|
typename ValueType>
|
|
__device__ __thrust_forceinline__
|
|
void store_block(Context context,
|
|
const unsigned int n,
|
|
OutputIterator output,
|
|
ValueType (&sdata)[K][CTA_SIZE + 1],
|
|
ValueType& carry)
|
|
{
|
|
if (Inclusive)
|
|
{
|
|
for(unsigned int k = 0; k < K; k++)
|
|
{
|
|
const unsigned int offset = k*CTA_SIZE + context.thread_index();
|
|
|
|
if (FullBlock || offset < n)
|
|
{
|
|
OutputIterator temp = output + offset;
|
|
*temp = sdata[offset % K][offset / K];
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for(unsigned int k = 0; k < K; k++)
|
|
{
|
|
const unsigned int offset = k*CTA_SIZE + context.thread_index();
|
|
|
|
if (FullBlock || offset < n)
|
|
{
|
|
OutputIterator temp = output + offset;
|
|
*temp = (offset == 0) ? carry : sdata[(offset - 1) % K][(offset - 1) / K];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <unsigned int CTA_SIZE,
|
|
unsigned int K,
|
|
bool FullBlock,
|
|
typename Context,
|
|
typename InputIterator,
|
|
typename BinaryFunction,
|
|
typename ValueType>
|
|
__device__ __thrust_forceinline__
|
|
void upsweep_body(Context context,
|
|
const unsigned int n,
|
|
const bool carry_in,
|
|
InputIterator input,
|
|
BinaryFunction binary_op,
|
|
ValueType (&sdata)[K][CTA_SIZE + 1],
|
|
ValueType& carry)
|
|
{
|
|
// read data
|
|
load_block<CTA_SIZE,K,FullBlock>(context, n, input, sdata);
|
|
|
|
// copy into local array
|
|
ValueType ldata[K];
|
|
for (unsigned int k = 0; k < K; k++)
|
|
ldata[k] = sdata[k][context.thread_index()];
|
|
|
|
// carry in
|
|
if (context.thread_index() == 0 && carry_in)
|
|
{
|
|
// XXX WAR sm_10 issue
|
|
ValueType tmp = carry;
|
|
ldata[0] = binary_op(tmp, ldata[0]);
|
|
}
|
|
|
|
// scan local values
|
|
for(unsigned int k = 1; k < K; k++)
|
|
{
|
|
const unsigned int offset = K * context.thread_index() + k;
|
|
|
|
if (FullBlock || offset < n)
|
|
ldata[k] = binary_op(ldata[k-1],ldata[k]);
|
|
}
|
|
|
|
sdata[K - 1][context.thread_index()] = ldata[K - 1];
|
|
|
|
context.barrier();
|
|
|
|
// second level scan
|
|
if (FullBlock && sizeof(ValueType) > 1) // TODO investigate why this WAR is necessary
|
|
scan_block<CTA_SIZE>(context, sdata[K - 1], binary_op);
|
|
else
|
|
scan_block_n<CTA_SIZE>(context, sdata[K - 1], n / K, binary_op);
|
|
|
|
// store carry out
|
|
if (FullBlock)
|
|
{
|
|
if (context.thread_index() == CTA_SIZE - 1)
|
|
carry = sdata[K - 1][context.thread_index()];
|
|
}
|
|
else
|
|
{
|
|
if (context.thread_index() == (n - 1) / K)
|
|
{
|
|
ValueType sum;
|
|
|
|
for (unsigned int k = 0; k < K; k++)
|
|
if ((n - 1) % K == k)
|
|
sum = ldata[k];
|
|
|
|
if (context.thread_index() > 0)
|
|
{
|
|
// WAR sm_10 issue
|
|
ValueType tmp = sdata[K - 1][context.thread_index() - 1];
|
|
sum = binary_op(tmp, sum);
|
|
}
|
|
|
|
carry = sum;
|
|
}
|
|
}
|
|
|
|
context.barrier();
|
|
}
|
|
|
|
template <unsigned int CTA_SIZE,
|
|
unsigned int K,
|
|
bool Inclusive,
|
|
bool FullBlock,
|
|
typename Context,
|
|
typename InputIterator,
|
|
typename OutputIterator,
|
|
typename BinaryFunction,
|
|
typename ValueType>
|
|
__device__ __thrust_forceinline__
|
|
void scan_body(Context context,
|
|
const unsigned int n,
|
|
const bool carry_in,
|
|
InputIterator input,
|
|
OutputIterator output,
|
|
BinaryFunction binary_op,
|
|
ValueType (&sdata)[K][CTA_SIZE + 1],
|
|
ValueType& carry)
|
|
{
|
|
// read data
|
|
load_block<CTA_SIZE,K,FullBlock>(context, n, input, sdata);
|
|
|
|
// copy into local array
|
|
ValueType ldata[K];
|
|
for (unsigned int k = 0; k < K; k++)
|
|
ldata[k] = sdata[k][context.thread_index()];
|
|
|
|
// carry in
|
|
if (context.thread_index() == 0 && carry_in)
|
|
{
|
|
// XXX WAR sm_10 issue
|
|
ValueType tmp = carry;
|
|
ldata[0] = binary_op(tmp, ldata[0]);
|
|
}
|
|
|
|
// scan local values
|
|
for(unsigned int k = 1; k < K; k++)
|
|
{
|
|
const unsigned int offset = K * context.thread_index() + k;
|
|
|
|
if (FullBlock || offset < n)
|
|
ldata[k] = binary_op(ldata[k-1],ldata[k]);
|
|
}
|
|
|
|
sdata[K - 1][context.thread_index()] = ldata[K - 1];
|
|
|
|
context.barrier();
|
|
|
|
// second level scan
|
|
if (FullBlock)
|
|
scan_block<CTA_SIZE>(context, sdata[K - 1], binary_op);
|
|
else
|
|
scan_block_n<CTA_SIZE>(context, sdata[K - 1], n / K, binary_op);
|
|
|
|
// update local values
|
|
if (context.thread_index() > 0)
|
|
{
|
|
ValueType left = sdata[K - 1][context.thread_index() - 1];
|
|
|
|
for(unsigned int k = 0; k < K; k++)
|
|
{
|
|
const unsigned int offset = K * context.thread_index() + k;
|
|
|
|
if (FullBlock || offset < n)
|
|
ldata[k] = binary_op(left, ldata[k]);
|
|
}
|
|
}
|
|
|
|
for (unsigned int k = 0; k < K; k++)
|
|
sdata[k][context.thread_index()] = ldata[k];
|
|
|
|
context.barrier();
|
|
|
|
// write data
|
|
store_block<CTA_SIZE, K, Inclusive, FullBlock>(context, n, output, sdata, carry);
|
|
|
|
// store carry out
|
|
if (context.thread_index() == 0)
|
|
{
|
|
if (FullBlock)
|
|
carry = sdata[K - 1][CTA_SIZE - 1];
|
|
else
|
|
carry = sdata[(n - 1) % K][(n - 1) / K]; // note: this must come after the local update
|
|
}
|
|
|
|
context.barrier();
|
|
}
|
|
|
|
template <typename InputIterator,
|
|
typename ValueType,
|
|
typename BinaryFunction,
|
|
typename Decomposition,
|
|
typename Context>
|
|
struct upsweep_intervals_closure
|
|
{
|
|
InputIterator input;
|
|
ValueType * block_results; // TODO change this to ValueIterator
|
|
BinaryFunction binary_op;
|
|
Decomposition decomp;
|
|
Context context;
|
|
|
|
typedef Context context_type;
|
|
|
|
upsweep_intervals_closure(InputIterator input,
|
|
ValueType * block_results,
|
|
BinaryFunction binary_op,
|
|
Decomposition decomp,
|
|
Context context = Context())
|
|
: input(input), block_results(block_results), binary_op(binary_op), decomp(decomp), context(context) {}
|
|
|
|
__device__ __thrust_forceinline__
|
|
void operator()(void)
|
|
{
|
|
typedef typename Decomposition::index_type IndexType;
|
|
|
|
const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value;
|
|
|
|
#if __CUDA_ARCH__ >= 200
|
|
const unsigned int SMEM = (48 * 1024);
|
|
#else
|
|
const unsigned int SMEM = (16 * 1024) - 256;
|
|
#endif
|
|
const unsigned int MAX_K = ((SMEM - 1 * sizeof(ValueType)) / (sizeof(ValueType) * (CTA_SIZE + 1)));
|
|
const unsigned int K = (MAX_K < 6) ? MAX_K : 6;
|
|
|
|
__shared__ uninitialized<ValueType[K][CTA_SIZE + 1]> sdata; // padded to avoid bank conflicts
|
|
|
|
__shared__ uninitialized<ValueType> carry; // storage for carry out
|
|
if(context.thread_index() == 0) carry.construct();
|
|
|
|
context.barrier();
|
|
|
|
thrust::system::detail::internal::index_range<IndexType> interval = decomp[context.block_index()];
|
|
|
|
IndexType base = interval.begin();
|
|
|
|
input += base;
|
|
|
|
const unsigned int unit_size = K * CTA_SIZE;
|
|
|
|
bool carry_in = false;
|
|
|
|
// process full units
|
|
while (base + unit_size <= interval.end())
|
|
{
|
|
const unsigned int n = unit_size;
|
|
upsweep_body<CTA_SIZE,K,true>(context, n, carry_in, input, binary_op, sdata.get(), carry.get());
|
|
base += unit_size;
|
|
input += unit_size;
|
|
carry_in = true;
|
|
}
|
|
|
|
// process partially full unit at end of input (if necessary)
|
|
if (base < interval.end())
|
|
{
|
|
const unsigned int n = interval.end() - base;
|
|
upsweep_body<CTA_SIZE,K,false>(context, n, carry_in, input, binary_op, sdata.get(), carry.get());
|
|
}
|
|
|
|
// write interval sum
|
|
if (context.thread_index() == 0)
|
|
block_results[context.block_index()] = carry;
|
|
}
|
|
};
|
|
|
|
|
|
template <bool Inclusive,
|
|
typename InputIterator,
|
|
typename OutputIterator,
|
|
typename ValueType,
|
|
typename BinaryFunction,
|
|
typename Decomposition,
|
|
typename Context>
|
|
struct downsweep_intervals_closure
|
|
{
|
|
InputIterator input;
|
|
OutputIterator output;
|
|
ValueType * block_results;
|
|
BinaryFunction binary_op;
|
|
Decomposition decomp;
|
|
Context context;
|
|
|
|
typedef Context context_type;
|
|
|
|
downsweep_intervals_closure(InputIterator input,
|
|
OutputIterator output,
|
|
ValueType * block_results,
|
|
BinaryFunction binary_op,
|
|
Decomposition decomp,
|
|
Context context = Context())
|
|
: input(input), output(output), block_results(block_results), binary_op(binary_op), decomp(decomp), context(context) {}
|
|
|
|
__device__ __thrust_forceinline__
|
|
void operator()(void)
|
|
{
|
|
typedef typename Decomposition::index_type IndexType;
|
|
|
|
const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value;
|
|
|
|
#if __CUDA_ARCH__ >= 200
|
|
const unsigned int SMEM = (48 * 1024);
|
|
#else
|
|
const unsigned int SMEM = (16 * 1024) - 256;
|
|
#endif
|
|
const unsigned int MAX_K = ((SMEM - 1 * sizeof(ValueType))/ (sizeof(ValueType) * (CTA_SIZE + 1)));
|
|
const unsigned int K = (MAX_K < 6) ? MAX_K : 6;
|
|
|
|
__shared__ uninitialized<ValueType[K][CTA_SIZE + 1]> sdata; // padded to avoid bank conflicts
|
|
|
|
__shared__ uninitialized<ValueType> carry; // storage for carry in and carry out
|
|
if(context.thread_index() == 0) carry.construct();
|
|
|
|
context.barrier();
|
|
|
|
thrust::system::detail::internal::index_range<IndexType> interval = decomp[context.block_index()];
|
|
|
|
IndexType base = interval.begin();
|
|
|
|
input += base;
|
|
output += base;
|
|
|
|
const unsigned int unit_size = K * CTA_SIZE;
|
|
|
|
bool carry_in = (Inclusive && context.block_index() == 0) ? false : true;
|
|
|
|
if (carry_in)
|
|
{
|
|
if (context.thread_index() == 0)
|
|
carry = block_results[context.block_index()];
|
|
context.barrier();
|
|
}
|
|
|
|
// process full units
|
|
while (base + unit_size <= interval.end())
|
|
{
|
|
const unsigned int n = unit_size;
|
|
scan_body<CTA_SIZE,K,Inclusive,true>(context, n, carry_in, input, output, binary_op, sdata.get(), carry.get());
|
|
base += K * CTA_SIZE;
|
|
input += K * CTA_SIZE;
|
|
output += K * CTA_SIZE;
|
|
carry_in = true;
|
|
}
|
|
|
|
// process partially full unit at end of input (if necessary)
|
|
if (base < interval.end())
|
|
{
|
|
const unsigned int n = interval.end() - base;
|
|
scan_body<CTA_SIZE,K,Inclusive,false>(context, n, carry_in, input, output, binary_op, sdata.get(), carry.get());
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
} // end namespace fast_scan_detail
|
|
|
|
|
|
template <typename DerivedPolicy,
|
|
typename InputIterator,
|
|
typename OutputIterator,
|
|
typename BinaryFunction>
|
|
OutputIterator inclusive_scan(execution_policy<DerivedPolicy> &exec,
|
|
InputIterator first,
|
|
InputIterator last,
|
|
OutputIterator output,
|
|
BinaryFunction binary_op)
|
|
{
|
|
using namespace fast_scan_detail;
|
|
|
|
// the pseudocode for deducing the type of the temporary used below:
|
|
//
|
|
// if BinaryFunction is AdaptableBinaryFunction
|
|
// TemporaryType = AdaptableBinaryFunction::result_type
|
|
// else if OutputIterator is a "pure" output iterator
|
|
// TemporaryType = InputIterator::value_type
|
|
// else
|
|
// TemporaryType = OutputIterator::value_type
|
|
//
|
|
// XXX upon c++0x, TemporaryType needs to be:
|
|
// result_of<BinaryFunction>::type
|
|
|
|
typedef typename thrust::detail::eval_if<
|
|
thrust::detail::has_result_type<BinaryFunction>::value,
|
|
thrust::detail::result_type<BinaryFunction>,
|
|
thrust::detail::eval_if<
|
|
thrust::detail::is_output_iterator<OutputIterator>::value,
|
|
thrust::iterator_value<InputIterator>,
|
|
thrust::iterator_value<OutputIterator>
|
|
>
|
|
>::type ValueType;
|
|
|
|
typedef unsigned int IndexType;
|
|
typedef thrust::system::detail::internal::uniform_decomposition<IndexType> Decomposition;
|
|
typedef thrust::detail::temporary_array<ValueType,DerivedPolicy> ValueArray;
|
|
|
|
if (first == last)
|
|
return output;
|
|
|
|
Decomposition decomp = thrust::system::cuda::detail::default_decomposition<IndexType>(last - first);
|
|
|
|
ValueArray block_results(exec, decomp.size());
|
|
|
|
// compute sum over each interval
|
|
if (thrust::detail::is_commutative<BinaryFunction>::value)
|
|
{
|
|
// use reduce_intervals for commutative operators
|
|
thrust::system::cuda::detail::reduce_intervals(exec, first, block_results.begin(), binary_op, decomp);
|
|
}
|
|
else
|
|
{
|
|
const static unsigned int ThreadsPerBlock = inclusive_scan_block_size<ValueType>::pass1;
|
|
typedef detail::statically_blocked_thread_array<ThreadsPerBlock> Context;
|
|
|
|
typedef upsweep_intervals_closure<InputIterator,ValueType,BinaryFunction,Decomposition,Context> Closure;
|
|
Closure closure(first,
|
|
thrust::raw_pointer_cast(&block_results[0]),
|
|
binary_op,
|
|
decomp);
|
|
detail::launch_closure(closure, decomp.size(), ThreadsPerBlock);
|
|
}
|
|
|
|
// second level inclusive scan of per-block results
|
|
{
|
|
const static unsigned int ThreadsPerBlock = inclusive_scan_block_size<ValueType>::pass2;
|
|
typedef detail::statically_blocked_thread_array<ThreadsPerBlock> Context;
|
|
|
|
typedef downsweep_intervals_closure<true,ValueType*,ValueType*,ValueType,BinaryFunction,Decomposition,Context> Closure;
|
|
Closure closure(thrust::raw_pointer_cast(&block_results[0]),
|
|
thrust::raw_pointer_cast(&block_results[0]),
|
|
thrust::raw_pointer_cast(&block_results[0]), // not used
|
|
binary_op,
|
|
Decomposition(decomp.size(), 1, 1));
|
|
detail::launch_closure(closure, 1, ThreadsPerBlock);
|
|
}
|
|
|
|
// update intervals with result of second level scan
|
|
{
|
|
const static unsigned int ThreadsPerBlock = inclusive_scan_block_size<ValueType>::pass3;
|
|
typedef detail::statically_blocked_thread_array<ThreadsPerBlock> Context;
|
|
|
|
typedef downsweep_intervals_closure<true,InputIterator,OutputIterator,ValueType,BinaryFunction,Decomposition,Context> Closure;
|
|
Closure closure(first,
|
|
output,
|
|
thrust::raw_pointer_cast(&block_results[0]) - 1, // shift block results
|
|
binary_op,
|
|
decomp);
|
|
detail::launch_closure(closure, decomp.size(), ThreadsPerBlock);
|
|
}
|
|
|
|
return output + (last - first);
|
|
}
|
|
|
|
|
|
template <typename DerivedPolicy,
|
|
typename InputIterator,
|
|
typename OutputIterator,
|
|
typename T,
|
|
typename BinaryFunction>
|
|
OutputIterator exclusive_scan(execution_policy<DerivedPolicy> &exec,
|
|
InputIterator first,
|
|
InputIterator last,
|
|
OutputIterator output,
|
|
const T init,
|
|
BinaryFunction binary_op)
|
|
{
|
|
using namespace fast_scan_detail;
|
|
|
|
// the pseudocode for deducing the type of the temporary used below:
|
|
//
|
|
// if BinaryFunction is AdaptableBinaryFunction
|
|
// TemporaryType = AdaptableBinaryFunction::result_type
|
|
// else if OutputIterator is a "pure" output iterator
|
|
// TemporaryType = InputIterator::value_type
|
|
// else
|
|
// TemporaryType = OutputIterator::value_type
|
|
//
|
|
// XXX upon c++0x, TemporaryType needs to be:
|
|
// result_of<BinaryFunction>::type
|
|
|
|
typedef typename thrust::detail::eval_if<
|
|
thrust::detail::has_result_type<BinaryFunction>::value,
|
|
thrust::detail::result_type<BinaryFunction>,
|
|
thrust::detail::eval_if<
|
|
thrust::detail::is_output_iterator<OutputIterator>::value,
|
|
thrust::iterator_value<InputIterator>,
|
|
thrust::iterator_value<OutputIterator>
|
|
>
|
|
>::type ValueType;
|
|
|
|
typedef unsigned int IndexType;
|
|
typedef thrust::system::detail::internal::uniform_decomposition<IndexType> Decomposition;
|
|
typedef thrust::detail::temporary_array<ValueType,DerivedPolicy> ValueArray;
|
|
|
|
if (first == last)
|
|
return output;
|
|
|
|
Decomposition decomp = thrust::system::cuda::detail::default_decomposition<IndexType>(last - first);
|
|
|
|
ValueArray block_results(exec, decomp.size() + 1);
|
|
|
|
// compute sum over each interval
|
|
if (thrust::detail::is_commutative<BinaryFunction>::value)
|
|
{
|
|
// use reduce_intervals for commutative operators
|
|
thrust::system::cuda::detail::reduce_intervals(exec, first, block_results.begin() + 1, binary_op, decomp);
|
|
}
|
|
else
|
|
{
|
|
const static unsigned int ThreadsPerBlock = exclusive_scan_block_size<ValueType>::pass1;
|
|
typedef detail::statically_blocked_thread_array<ThreadsPerBlock> Context;
|
|
|
|
typedef upsweep_intervals_closure<InputIterator,ValueType,BinaryFunction,Decomposition,Context> Closure;
|
|
Closure closure(first,
|
|
thrust::raw_pointer_cast(&block_results[0]) + 1,
|
|
binary_op,
|
|
decomp);
|
|
detail::launch_closure(closure, decomp.size(), ThreadsPerBlock);
|
|
}
|
|
|
|
// place init before per-block results
|
|
block_results[0] = init;
|
|
|
|
// second level inclusive scan of per-block results
|
|
{
|
|
const static unsigned int ThreadsPerBlock = exclusive_scan_block_size<ValueType>::pass2;
|
|
typedef detail::statically_blocked_thread_array<ThreadsPerBlock> Context;
|
|
|
|
typedef downsweep_intervals_closure<true,ValueType*,ValueType*,ValueType,BinaryFunction,Decomposition,Context> Closure;
|
|
Closure closure(thrust::raw_pointer_cast(&block_results[0]),
|
|
thrust::raw_pointer_cast(&block_results[0]),
|
|
thrust::raw_pointer_cast(&block_results[0]), // not used
|
|
binary_op,
|
|
Decomposition(decomp.size() + 1, 1, 1));
|
|
detail::launch_closure(closure, 1, ThreadsPerBlock);
|
|
}
|
|
|
|
// update intervals with result of second level scan
|
|
{
|
|
const static unsigned int ThreadsPerBlock = exclusive_scan_block_size<ValueType>::pass3;
|
|
typedef detail::statically_blocked_thread_array<ThreadsPerBlock> Context;
|
|
|
|
typedef downsweep_intervals_closure<false,InputIterator,OutputIterator,ValueType,BinaryFunction,Decomposition,Context> Closure;
|
|
Closure closure(first,
|
|
output,
|
|
thrust::raw_pointer_cast(&block_results[0]), // shift block results
|
|
binary_op,
|
|
decomp);
|
|
detail::launch_closure(closure, decomp.size(), ThreadsPerBlock);
|
|
}
|
|
|
|
return output + (last - first);
|
|
}
|
|
|
|
|
|
} // end namespace fast_scan
|
|
} // end namespace detail
|
|
} // end namespace detail
|
|
} // end namespace cuda
|
|
} // end namespace system
|
|
} // end namespace thrust
|
|
|
|
__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
|
|
|