GOSTCoin CUDA miner project, compatible with most NVIDIA cards, containing only the gostd algorithm
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

224 lines
6.8 KiB

/*
* Copyright 2008-2012 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*! \file copy.h
* \brief CUDA implementation of device-to-device copy,
* based on Gregory Diamos' memcpy code.
*/
#pragma once
#include <thrust/detail/config.h>
#include <thrust/pair.h>
#include <thrust/detail/type_traits.h>
#include <thrust/detail/dispatch/is_trivial_copy.h>
#include <thrust/detail/raw_reference_cast.h>
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace block
{
namespace trivial_copy_detail
{
/*! Divides \p n by \p d and reports both results of the division.
 *
 *  \param n The dividend.
 *  \param d The divisor. No check for zero is performed here.
 *  \return A pair whose \c first member is <tt>n / d</tt> and whose \c second
 *          member is <tt>n - d * (n / d)</tt>.
 */
template<typename Size>
inline __device__ thrust::pair<Size,Size> quotient_and_remainder(Size n, Size d)
{
  const Size whole_parts = n / d;

  // derive the remainder from the quotient rather than issuing a second division
  const Size left_over = n - d * whole_parts;

  return thrust::make_pair(whole_parts, left_over);
} // end quotient_and_remainder()
/*! Copies \p num_elements objects of type \p T from \p src to \p dst.
 *
 *  Each calling thread starts at index <tt>context.thread_index()</tt> and
 *  advances by <tt>context.block_dimension()</tt> until it passes the end of
 *  the range. No barrier is issued by this function.
 *
 *  Assumes the addresses \p dst & \p src are aligned to \p T boundaries.
 *
 *  \param context      Provides \c thread_index() and \c block_dimension().
 *  \param dst          Beginning of the output range.
 *  \param src          Beginning of the input range.
 *  \param num_elements Number of \p T objects to copy. Taken as \c size_t so
 *                      that counts computed from a \c size_t byte count are
 *                      not truncated on the way in.
 */
template<typename Context,
         typename T>
__device__ __thrust_forceinline__
void aligned_copy(Context context, T *dst, const T *src, size_t num_elements)
{
  // the index is as wide as the count; a narrower unsigned index would wrap
  // around before reaching a count that does not fit in it
  for(size_t i = context.thread_index();
      i < num_elements;
      i += context.block_dimension())
  {
    dst[i] = src[i];
  }
} // end aligned_copy()
} // end namespace trivial_copy_detail
/*! Copies \p num_bytes raw bytes from \p source_ to \p destination_.
 *
 *  When both addresses are multiples of <tt>sizeof(uint2)</tt>, the bulk of
 *  the range is copied as \c int2 objects and only the trailing
 *  <tt>num_bytes % sizeof(int2)</tt> bytes are copied one at a time.
 *  Otherwise the whole range is copied byte by byte.
 *
 *  Each calling thread starts at <tt>context.thread_index()</tt> and advances
 *  by <tt>context.block_dimension()</tt>. No barrier is issued by this function.
 *
 *  \param context      Provides \c thread_index() and \c block_dimension().
 *  \param destination_ Beginning of the output bytes.
 *  \param source_      Beginning of the input bytes.
 *  \param num_bytes    Number of bytes to copy.
 */
template <typename Context>
__device__ __thrust_forceinline__
void trivial_copy(Context context, void* destination_, const void* source_, size_t num_bytes)
{
  // reinterpret as bytes
  char* destination  = static_cast<char*>(destination_);
  const char* source = static_cast<const char*>(source_);

  // TODO replace this with uint64
#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC
  typedef long long int2;
  typedef long long uint2;
#endif // THRUST_DEVICE_COMPILER_NVCC

  // check alignment
  // XXX can we do this in three steps?
  //     1. copy until alignment is met
  //     2. go hog wild
  //     3. get the remainder
  if(reinterpret_cast<size_t>(destination) % sizeof(uint2) != 0 || reinterpret_cast<size_t>(source) % sizeof(uint2) != 0)
  {
    // at least one side is misaligned; fall back to a byte-wise copy
    // the index must be as wide as num_bytes: a 32-bit index wraps around
    // before reaching a count of 4 GiB or more and the loop would never end
    for(size_t i = context.thread_index(); i < num_bytes; i += context.block_dimension())
    {
      destination[i] = source[i];
    }
  }
  else
  {
    // it's aligned; do a wide copy

    // this pair stores the number of int2s in the aligned portion of the arrays
    // and the number of bytes in the remainder
    const thrust::pair<size_t,size_t> num_wide_elements_and_remainder_bytes = trivial_copy_detail::quotient_and_remainder(num_bytes, sizeof(int2));

    // copy int2 elements
    trivial_copy_detail::aligned_copy(context,
                                      reinterpret_cast<int2*>(destination),
                                      reinterpret_cast<const int2*>(source),
                                      num_wide_elements_and_remainder_bytes.first);

    // XXX we could copy int elements here

    // copy remainder byte by byte

    // to find the beginning of the remainder arrays, we need to point at the beginning, and then skip the number of bytes in the aligned portion
    // this is sizeof(int2) times the number of int2s comprising the aligned portion
    const size_t num_wide_bytes = sizeof(int2) * num_wide_elements_and_remainder_bytes.first;
    const char *remainder_first  = source + num_wide_bytes;
    char       *remainder_result = destination + num_wide_bytes;

    trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second);
  }
} // end trivial_copy()
namespace detail
{
namespace dispatch
{
/*! Overload selected when the last argument is \c thrust::detail::true_type.
 *
 *  Obtains raw pointers to the first input and first output element and
 *  forwards the range, measured in bytes, to \c block::trivial_copy.
 *
 *  \param context         Forwarded unchanged to \c trivial_copy.
 *  \param first           Beginning of the input range.
 *  \param last            End of the input range.
 *  \param result          Beginning of the output range.
 *  \param is_trivial_copy Tag used only for overload selection.
 *  \return <tt>result + (last - first)</tt>.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
__thrust_forceinline__ __device__
RandomAccessIterator2 copy(Context context,
                           RandomAccessIterator1 first,
                           RandomAccessIterator1 last,
                           RandomAccessIterator2 result,
                           thrust::detail::true_type is_trivial_copy)
{
  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;

  // strip any reference wrapper to get at plain addresses
  const value_type *raw_input  = &thrust::raw_reference_cast(*first);
  value_type       *raw_output = &thrust::raw_reference_cast(*result);

  size_t count = (last - first);

  // the byte-level routine wants a size in bytes, not in elements
  thrust::system::cuda::detail::block::trivial_copy(context, raw_output, raw_input, count * sizeof(value_type));

  return result + count;
} // end copy()
/*! Overload selected when the last argument is \c thrust::detail::false_type.
 *
 *  Copies element by element through the iterators. Each calling thread
 *  offsets both iterators by <tt>context.thread_index()</tt> and then advances
 *  them by <tt>context.block_dimension()</tt> after every assignment, until
 *  the input iterator is no longer less than \p last.
 *
 *  \param context         Provides \c thread_index() and \c block_dimension().
 *  \param first           Beginning of the input range.
 *  \param last            End of the input range.
 *  \param result          Beginning of the output range.
 *  \param is_trivial_copy Tag used only for overload selection.
 *  \return <tt>result + (last - first)</tt>, computed before the iterators are advanced.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
__thrust_forceinline__ __device__
RandomAccessIterator2 copy(Context context,
                           RandomAccessIterator1 first,
                           RandomAccessIterator1 last,
                           RandomAccessIterator2 result,
                           thrust::detail::false_type is_trivial_copy)
{
  // remember where the output ends while first & result still mark the beginnings
  RandomAccessIterator2 output_last = result + (last - first);

  // move this thread to its starting element
  first  += context.thread_index();
  result += context.thread_index();

  while(first < last)
  {
    *result = *first;

    // hop to this thread's next element
    first  += context.block_dimension();
    result += context.block_dimension();
  } // end while

  return output_last;
} // end copy()
} // end namespace dispatch
} // end namespace detail
/*! Copies the range <tt>[first, last)</tt> to the range beginning at \p result,
 *  choosing between the two \c detail::dispatch::copy overloads via a tag type.
 *
 *  \param context Forwarded unchanged to the selected overload.
 *  \param first   Beginning of the input range.
 *  \param last    End of the input range.
 *  \param result  Beginning of the output range.
 *  \return The value returned by the selected \c detail::dispatch::copy overload.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
__thrust_forceinline__ __device__
RandomAccessIterator2 copy(Context context,
                           RandomAccessIterator1 first,
                           RandomAccessIterator1 last,
                           RandomAccessIterator2 result)
{
#if __CUDA_ARCH__ < 200
  // does not work reliably on pre-Fermi due to "Warning: ... assuming global memory space" issues
  typedef thrust::detail::false_type use_trivial_copy;
#else
  typedef typename thrust::detail::dispatch::is_trivial_copy<RandomAccessIterator1,RandomAccessIterator2>::type use_trivial_copy;
#endif

  return detail::dispatch::copy(context, first, last, result, use_trivial_copy());
} // end copy()
/*! Copies \p n elements from the range beginning at \p first to the range
 *  beginning at \p result, then calls <tt>ctx.barrier()</tt>.
 *
 *  Each calling thread starts at index <tt>ctx.thread_index()</tt> and advances
 *  by <tt>ctx.block_dimension()</tt>.
 *
 *  \param ctx    Provides \c thread_index(), \c block_dimension() and \c barrier().
 *  \param first  Beginning of the input range.
 *  \param n      Number of elements to copy.
 *  \param result Beginning of the output range.
 *  \return <tt>result + n</tt>.
 */
template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
inline __device__
RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
{
  Size idx = ctx.thread_index();

  while(idx < n)
  {
    result[idx] = first[idx];
    idx += ctx.block_dimension();
  }

  // every caller reaches the barrier, whether or not it copied anything
  ctx.barrier();

  return result + n;
} // end copy_n()
} // end namespace block
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust