You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
223 lines
6.8 KiB
223 lines
6.8 KiB
/* |
|
* Copyright 2008-2012 NVIDIA Corporation |
|
* |
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
|
* you may not use this file except in compliance with the License. |
|
* You may obtain a copy of the License at |
|
* |
|
* http://www.apache.org/licenses/LICENSE-2.0 |
|
* |
|
* Unless required by applicable law or agreed to in writing, software |
|
* distributed under the License is distributed on an "AS IS" BASIS, |
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
* See the License for the specific language governing permissions and |
|
* limitations under the License. |
|
*/ |
|
|
|
/*! \file copy.h |
|
* \brief CUDA implementation of device-to-device copy, |
|
* based on Gregory Diamos' memcpy code. |
|
*/ |
|
|
|
#pragma once |
|
|
|
#include <thrust/detail/config.h> |
|
|
|
#include <thrust/pair.h> |
|
|
|
#include <thrust/detail/type_traits.h> |
|
#include <thrust/detail/dispatch/is_trivial_copy.h> |
|
#include <thrust/detail/raw_reference_cast.h> |
|
|
|
namespace thrust |
|
{ |
|
namespace system |
|
{ |
|
namespace cuda |
|
{ |
|
namespace detail |
|
{ |
|
namespace block |
|
{ |
|
|
|
namespace trivial_copy_detail |
|
{ |
|
|
|
|
|
template<typename Size> |
|
inline __device__ thrust::pair<Size,Size> quotient_and_remainder(Size n, Size d) |
|
{ |
|
Size quotient = n / d; |
|
Size remainder = n - d * quotient; |
|
return thrust::make_pair(quotient,remainder); |
|
} // end quotient_and_remainder() |
|
|
|
|
|
// assumes the addresses dst & src are aligned to T boundaries |
|
template<typename Context, |
|
typename T> |
|
__device__ __thrust_forceinline__ |
|
void aligned_copy(Context context, T *dst, const T *src, unsigned int num_elements) |
|
{ |
|
for(unsigned int i = context.thread_index(); |
|
i < num_elements; |
|
i += context.block_dimension()) |
|
{ |
|
dst[i] = src[i]; |
|
} |
|
} // end aligned_copy() |
|
|
|
|
|
} // end namespace trivial_copy_detail |
|
|
|
|
|
template <typename Context> |
|
__device__ __thrust_forceinline__ |
|
void trivial_copy(Context context, void* destination_, const void* source_, size_t num_bytes) |
|
{ |
|
// reinterpret at bytes |
|
char* destination = reinterpret_cast<char*>(destination_); |
|
const char* source = reinterpret_cast<const char*>(source_); |
|
|
|
// TODO replace this with uint64 |
|
#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC |
|
typedef long long int2; |
|
typedef long long uint2; |
|
#endif // THRUST_DEVICE_COMPILER_NVCC |
|
|
|
// check alignment |
|
// XXX can we do this in three steps? |
|
// 1. copy until alignment is met |
|
// 2. go hog wild |
|
// 3. get the remainder |
|
if(reinterpret_cast<size_t>(destination) % sizeof(uint2) != 0 || reinterpret_cast<size_t>(source) % sizeof(uint2) != 0) |
|
{ |
|
for(unsigned int i = context.thread_index(); i < num_bytes; i += context.block_dimension()) |
|
{ |
|
destination[i] = source[i]; |
|
} |
|
} |
|
else |
|
{ |
|
// it's aligned; do a wide copy |
|
|
|
// this pair stores the number of int2s in the aligned portion of the arrays |
|
// and the number of bytes in the remainder |
|
const thrust::pair<size_t,size_t> num_wide_elements_and_remainder_bytes = trivial_copy_detail::quotient_and_remainder(num_bytes, sizeof(int2)); |
|
|
|
// copy int2 elements |
|
trivial_copy_detail::aligned_copy(context, |
|
reinterpret_cast<int2*>(destination), |
|
reinterpret_cast<const int2*>(source), |
|
num_wide_elements_and_remainder_bytes.first); |
|
|
|
// XXX we could copy int elements here |
|
|
|
// copy remainder byte by byte |
|
|
|
// to find the beginning of the remainder arrays, we need to point at the beginning, and then skip the number of bytes in the aligned portion |
|
// this is sizeof(int2) times the number of int2s comprising the aligned portion |
|
const char *remainder_first = reinterpret_cast<const char*>(source + sizeof(int2) * num_wide_elements_and_remainder_bytes.first); |
|
char *remainder_result = reinterpret_cast<char*>(destination + sizeof(int2) * num_wide_elements_and_remainder_bytes.first); |
|
|
|
trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second); |
|
} |
|
} // end trivial_copy() |
|
|
|
|
|
namespace detail |
|
{ |
|
namespace dispatch |
|
{ |
|
|
|
template<typename Context, |
|
typename RandomAccessIterator1, |
|
typename RandomAccessIterator2> |
|
__thrust_forceinline__ __device__ |
|
RandomAccessIterator2 copy(Context context, |
|
RandomAccessIterator1 first, |
|
RandomAccessIterator1 last, |
|
RandomAccessIterator2 result, |
|
thrust::detail::true_type is_trivial_copy) |
|
{ |
|
typedef typename thrust::iterator_value<RandomAccessIterator1>::type T; |
|
|
|
const T *src = &thrust::raw_reference_cast(*first); |
|
T *dst = &thrust::raw_reference_cast(*result); |
|
|
|
size_t n = (last - first); |
|
thrust::system::cuda::detail::block::trivial_copy(context, dst, src, n * sizeof(T)); |
|
return result + n; |
|
} // end copy() |
|
|
|
template<typename Context, |
|
typename RandomAccessIterator1, |
|
typename RandomAccessIterator2> |
|
__thrust_forceinline__ __device__ |
|
RandomAccessIterator2 copy(Context context, |
|
RandomAccessIterator1 first, |
|
RandomAccessIterator1 last, |
|
RandomAccessIterator2 result, |
|
thrust::detail::false_type is_trivial_copy) |
|
{ |
|
RandomAccessIterator2 end_of_output = result + (last - first); |
|
|
|
// advance iterators |
|
first += context.thread_index(); |
|
result += context.thread_index(); |
|
|
|
for(; |
|
first < last; |
|
first += context.block_dimension(), |
|
result += context.block_dimension()) |
|
{ |
|
*result = *first; |
|
} // end for |
|
|
|
return end_of_output; |
|
} // end copy() |
|
|
|
} // end namespace dispatch |
|
} // end namespace detail |
|
|
|
template<typename Context, |
|
typename RandomAccessIterator1, |
|
typename RandomAccessIterator2> |
|
__thrust_forceinline__ __device__ |
|
RandomAccessIterator2 copy(Context context, |
|
RandomAccessIterator1 first, |
|
RandomAccessIterator1 last, |
|
RandomAccessIterator2 result) |
|
{ |
|
return detail::dispatch::copy(context, first, last, result, |
|
#if __CUDA_ARCH__ < 200 |
|
// does not work reliably on pre-Fermi due to "Warning: ... assuming global memory space" issues |
|
thrust::detail::false_type() |
|
#else |
|
typename thrust::detail::dispatch::is_trivial_copy<RandomAccessIterator1,RandomAccessIterator2>::type() |
|
#endif |
|
); |
|
} // end copy() |
|
|
|
|
|
template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2> |
|
inline __device__ |
|
RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result) |
|
{ |
|
for(Size i = ctx.thread_index(); i < n; i += ctx.block_dimension()) |
|
{ |
|
result[i] = first[i]; |
|
} |
|
|
|
ctx.barrier(); |
|
|
|
return result + n; |
|
} |
|
|
|
|
|
} // end namespace block |
|
} // end namespace detail |
|
} // end namespace cuda |
|
} // end namespace system |
|
} // end namespace thrust |
|
|
|
|