// ccminer-gostd-lite/compat/thrust/system/cuda/detail/block/copy.h

/*
 *  Copyright 2008-2012 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

/*! \file copy.h
 *  \brief CUDA implementation of device-to-device copy,
 *         based on Gregory Diamos' memcpy code.
 */

#pragma once

#include <thrust/detail/config.h>

#include <thrust/pair.h>

#include <thrust/detail/type_traits.h>
#include <thrust/detail/dispatch/is_trivial_copy.h>
#include <thrust/detail/raw_reference_cast.h>

namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace block
{

namespace trivial_copy_detail
{


// Returns the pair (n / d, n - d * (n / d)): the quotient of n by d in .first
// and the matching remainder in .second. Only one division is performed; the
// remainder is derived from the quotient with a multiply and a subtract.
template<typename Size>
  inline __device__ thrust::pair<Size,Size> quotient_and_remainder(Size n, Size d)
{
  const Size whole_part = n / d;
  const Size left_over  = n - d * whole_part;

  return thrust::pair<Size,Size>(whole_part, left_over);
} // end quotient_and_remainder()


// Copies num_elements objects of type T from src to dst.
// assumes the addresses dst & src are aligned to T boundaries
//
// The calling thread handles elements context.thread_index(),
// context.thread_index() + context.block_dimension(), and so on; it does not
// issue a barrier.
//
// The element count and the loop index are size_t so that counts computed
// from a size_t byte total are neither truncated at the call nor subject to
// index wrap-around inside the loop.
template<typename Context,
         typename T>
__device__ __thrust_forceinline__
void aligned_copy(Context context, T *dst, const T *src, size_t num_elements)
{
  for(size_t i = context.thread_index();
      i < num_elements;
      i += context.block_dimension())
  {
    dst[i] = src[i];
  }
} // end aligned_copy()


} // end namespace trivial_copy_detail


// Copies num_bytes bytes from source_ to destination_.
//
// The calling thread starts at index context.thread_index() and advances by
// context.block_dimension() in every loop below. No barrier is issued here.
//
// When both addresses are multiples of sizeof(int2), the leading
// num_bytes / sizeof(int2) elements are copied as int2 and the trailing
// num_bytes % sizeof(int2) bytes are copied one char at a time. Otherwise the
// whole range is copied one char at a time.
template <typename Context>
__device__ __thrust_forceinline__
void trivial_copy(Context context, void* destination_, const void* source_, size_t num_bytes)
{
  // reinterpret as bytes
  char* destination  = reinterpret_cast<char*>(destination_);
  const char* source = reinterpret_cast<const char*>(source_);
 
  // TODO replace this with uint64
#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC
  // device compilers other than NVCC get a stand-in for the wide element type
  // NOTE(review): under NVCC, int2 is presumably CUDA's built-in vector type -- confirm
  typedef long long  int2;
#endif // THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC

  // one type drives both the alignment test and the wide copy
  const size_t wide_size = sizeof(int2);

  // check alignment
  // XXX can we do this in three steps?
  //     1. copy until alignment is met
  //     2. go hog wild
  //     3. get the remainder
  const bool both_aligned = reinterpret_cast<size_t>(destination) % wide_size == 0
                         && reinterpret_cast<size_t>(source)      % wide_size == 0;

  if(!both_aligned)
  {
    // the index has the same type as num_bytes: a narrower index would wrap
    // around before reaching a bound above its own maximum, so the loop would
    // never terminate
    for(size_t i = context.thread_index(); i < num_bytes; i += context.block_dimension())
    {
      destination[i] = source[i];
    }
  }
  else
  {
    // it's aligned; do a wide copy

    // this pair stores the number of int2s in the aligned portion of the arrays
    // and the number of bytes in the remainder
    const thrust::pair<size_t,size_t> num_wide_elements_and_remainder_bytes = trivial_copy_detail::quotient_and_remainder(num_bytes, wide_size);

    // copy int2 elements
    // NOTE: the count is handed to aligned_copy as-is and is subject to the
    // type of that function's count parameter
    trivial_copy_detail::aligned_copy(context,
                                      reinterpret_cast<int2*>(destination),
                                      reinterpret_cast<const int2*>(source),
                                      num_wide_elements_and_remainder_bytes.first);

    // XXX we could copy int elements here

    // copy remainder byte by byte

    // to find the beginning of the remainder arrays, we need to point at the beginning, and then skip the number of bytes in the aligned portion
    // this is sizeof(int2) times the number of int2s comprising the aligned portion
    const size_t num_wide_bytes = wide_size * num_wide_elements_and_remainder_bytes.first;

    const char *remainder_first  = source      + num_wide_bytes;
          char *remainder_result = destination + num_wide_bytes;

    trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second);
  }
} // end trivial_copy()


namespace detail
{
namespace dispatch
{

// Overload selected when the trailing tag argument is true_type.
//
// Takes the raw addresses of *first and *result, forwards them to
// block::trivial_copy together with the size of the range in bytes, and
// returns the iterator one past the last output position.
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
  __thrust_forceinline__ __device__
  RandomAccessIterator2 copy(Context context,
                             RandomAccessIterator1 first,
                             RandomAccessIterator1 last,
                             RandomAccessIterator2 result,
                             thrust::detail::true_type is_trivial_copy)
{
  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;

  // strip the iterators down to plain pointers
  const value_type *raw_input  = &thrust::raw_reference_cast(*first);
        value_type *raw_output = &thrust::raw_reference_cast(*result);

  // number of elements in [first, last)
  size_t count = (last - first);

  thrust::system::cuda::detail::block::trivial_copy(context,
                                                    raw_output,
                                                    raw_input,
                                                    count * sizeof(value_type));

  return result + count;
} // end copy()

// Overload selected when the trailing tag argument is false_type.
//
// Assigns through the iterators element by element: the calling thread begins
// context.thread_index() positions into both ranges and steps by
// context.block_dimension() until the input position reaches last.
// Returns result advanced by the length of [first, last).
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
  __thrust_forceinline__ __device__
  RandomAccessIterator2 copy(Context context, 
                             RandomAccessIterator1 first,
                             RandomAccessIterator1 last,
                             RandomAccessIterator2 result,
                             thrust::detail::false_type is_trivial_copy)
{
  // remember where the output ends before the iterators are moved
  RandomAccessIterator2 output_end = result + (last - first);

  // move to this thread's first element
  first  += context.thread_index();
  result += context.thread_index();

  while(first < last)
  {
    *result = *first;

    first  += context.block_dimension();
    result += context.block_dimension();
  } // end while

  return output_end;
} // end copy()

} // end namespace dispatch
} // end namespace detail

// Entry point: copies [first, last) to result by forwarding to one of the
// detail::dispatch::copy overloads, chosen by a tag type, and returns what
// that overload returns.
//
// When __CUDA_ARCH__ < 200 the tag is always false_type; otherwise it is the
// type computed by thrust::detail::dispatch::is_trivial_copy for the two
// iterator types.
template<typename Context, 
         typename RandomAccessIterator1,
         typename RandomAccessIterator2>
  __thrust_forceinline__ __device__
  RandomAccessIterator2 copy(Context context,
                             RandomAccessIterator1 first,
                             RandomAccessIterator1 last,
                             RandomAccessIterator2 result)
{
#if __CUDA_ARCH__ < 200
  // does not work reliably on pre-Fermi due to "Warning: ... assuming global memory space" issues
  typedef thrust::detail::false_type dispatch_tag;
#else
  typedef typename thrust::detail::dispatch::is_trivial_copy<RandomAccessIterator1,RandomAccessIterator2>::type dispatch_tag;
#endif

  return detail::dispatch::copy(context, first, last, result, dispatch_tag());
} // end copy()


// Copies the n elements first[0..n) to result[0..n) by indexed assignment.
// The calling thread handles indices ctx.thread_index(),
// ctx.thread_index() + ctx.block_dimension(), and so on.
// ctx.barrier() is called once after the loop, before returning.
// Returns result + n.
template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
inline __device__
RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
{
  Size position = ctx.thread_index();

  while(position < n)
  {
    result[position] = first[position];
    position += ctx.block_dimension();
  }

  // every caller reaches this point, whether or not it copied anything
  ctx.barrier();

  return result + n;
}


} // end namespace block
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust
commit initial version 0.1 11 years ago			`/*`
			`* Copyright 2008-2012 NVIDIA Corporation`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

			`/*! \file copy.h`
			`* \brief CUDA implementation of device-to-device copy,`
			`* based on Gregory Diamos' memcpy code.`
			`*/`

			`#pragma once`

			`#include <thrust/detail/config.h>`

			`#include <thrust/pair.h>`

			`#include <thrust/detail/type_traits.h>`
			`#include <thrust/detail/dispatch/is_trivial_copy.h>`
			`#include <thrust/detail/raw_reference_cast.h>`

			`namespace thrust`
			`{`
			`namespace system`
			`{`
			`namespace cuda`
			`{`
			`namespace detail`
			`{`
			`namespace block`
			`{`

			`namespace trivial_copy_detail`
			`{`


			`template<typename Size>`
			`inline __device__ thrust::pair<Size,Size> quotient_and_remainder(Size n, Size d)`
			`{`
			`Size quotient = n / d;`
			`Size remainder = n - d * quotient;`
			`return thrust::make_pair(quotient,remainder);`
			`} // end quotient_and_remainder()`


			`// assumes the addresses dst & src are aligned to T boundaries`
			`template<typename Context,`
			`typename T>`
			`__device__ __thrust_forceinline__`
			`void aligned_copy(Context context, T dst, const T src, unsigned int num_elements)`
			`{`
			`for(unsigned int i = context.thread_index();`
			`i < num_elements;`
			`i += context.block_dimension())`
			`{`
			`dst[i] = src[i];`
			`}`
			`} // end aligned_copy()`


			`} // end namespace trivial_copy_detail`


			`template <typename Context>`
			`__device__ __thrust_forceinline__`
			`void trivial_copy(Context context, void* destination_, const void* source_, size_t num_bytes)`
			`{`
			`// reinterpret at bytes`
			`char* destination = reinterpret_cast<char*>(destination_);`
			`const char* source = reinterpret_cast<const char*>(source_);`

			`// TODO replace this with uint64`
			`#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC`
			`typedef long long int2;`
			`typedef long long uint2;`
			`#endif // THRUST_DEVICE_COMPILER_NVCC`

			`// check alignment`
			`// XXX can we do this in three steps?`
			`// 1. copy until alignment is met`
			`// 2. go hog wild`
			`// 3. get the remainder`
			`if(reinterpret_cast<size_t>(destination) % sizeof(uint2) != 0 \|\| reinterpret_cast<size_t>(source) % sizeof(uint2) != 0)`
			`{`
			`for(unsigned int i = context.thread_index(); i < num_bytes; i += context.block_dimension())`
			`{`
			`destination[i] = source[i];`
			`}`
			`}`
			`else`
			`{`
			`// it's aligned; do a wide copy`

			`// this pair stores the number of int2s in the aligned portion of the arrays`
			`// and the number of bytes in the remainder`
			`const thrust::pair<size_t,size_t> num_wide_elements_and_remainder_bytes = trivial_copy_detail::quotient_and_remainder(num_bytes, sizeof(int2));`

			`// copy int2 elements`
			`trivial_copy_detail::aligned_copy(context,`
			`reinterpret_cast<int2*>(destination),`
			`reinterpret_cast<const int2*>(source),`
			`num_wide_elements_and_remainder_bytes.first);`

			`// XXX we could copy int elements here`

			`// copy remainder byte by byte`

			`// to find the beginning of the remainder arrays, we need to point at the beginning, and then skip the number of bytes in the aligned portion`
			`// this is sizeof(int2) times the number of int2s comprising the aligned portion`
			`const char remainder_first = reinterpret_cast<const char>(source + sizeof(int2) * num_wide_elements_and_remainder_bytes.first);`
			`char remainder_result = reinterpret_cast<char>(destination + sizeof(int2) * num_wide_elements_and_remainder_bytes.first);`

			`trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second);`
			`}`
			`} // end trivial_copy()`


			`namespace detail`
			`{`
			`namespace dispatch`
			`{`

			`template<typename Context,`
			`typename RandomAccessIterator1,`
			`typename RandomAccessIterator2>`
			`__thrust_forceinline__ __device__`
			`RandomAccessIterator2 copy(Context context,`
			`RandomAccessIterator1 first,`
			`RandomAccessIterator1 last,`
			`RandomAccessIterator2 result,`
			`thrust::detail::true_type is_trivial_copy)`
			`{`
			`typedef typename thrust::iterator_value<RandomAccessIterator1>::type T;`

			`const T src = &thrust::raw_reference_cast(first);`
			`T dst = &thrust::raw_reference_cast(result);`

			`size_t n = (last - first);`
			`thrust::system::cuda::detail::block::trivial_copy(context, dst, src, n * sizeof(T));`
			`return result + n;`
			`} // end copy()`

			`template<typename Context,`
			`typename RandomAccessIterator1,`
			`typename RandomAccessIterator2>`
			`__thrust_forceinline__ __device__`
			`RandomAccessIterator2 copy(Context context,`
			`RandomAccessIterator1 first,`
			`RandomAccessIterator1 last,`
			`RandomAccessIterator2 result,`
			`thrust::detail::false_type is_trivial_copy)`
			`{`
			`RandomAccessIterator2 end_of_output = result + (last - first);`

			`// advance iterators`
			`first += context.thread_index();`
			`result += context.thread_index();`

			`for(;`
			`first < last;`
			`first += context.block_dimension(),`
			`result += context.block_dimension())`
			`{`
			`result = first;`
			`} // end for`

			`return end_of_output;`
			`} // end copy()`

			`} // end namespace dispatch`
			`} // end namespace detail`

			`template<typename Context,`
			`typename RandomAccessIterator1,`
			`typename RandomAccessIterator2>`
			`__thrust_forceinline__ __device__`
			`RandomAccessIterator2 copy(Context context,`
			`RandomAccessIterator1 first,`
			`RandomAccessIterator1 last,`
			`RandomAccessIterator2 result)`
			`{`
			`return detail::dispatch::copy(context, first, last, result,`
			`#if __CUDA_ARCH__ < 200`
			`// does not work reliably on pre-Fermi due to "Warning: ... assuming global memory space" issues`
			`thrust::detail::false_type()`
			`#else`
			`typename thrust::detail::dispatch::is_trivial_copy<RandomAccessIterator1,RandomAccessIterator2>::type()`
			`#endif`
			`);`
			`} // end copy()`


			`template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>`
			`inline __device__`
			`RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)`
			`{`
			`for(Size i = ctx.thread_index(); i < n; i += ctx.block_dimension())`
			`{`
			`result[i] = first[i];`
			`}`

			`ctx.barrier();`

			`return result + n;`
			`}`


			`} // end namespace block`
			`} // end namespace detail`
			`} // end namespace cuda`
			`} // end namespace system`
			`} // end namespace thrust`