/* * Copyright 2008-2012 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /*! \file fill.inl * \brief Inline file for fill.h. */ #include #include #include #include #include #include #include #include #include #include namespace thrust { namespace system { namespace cuda { namespace detail { namespace detail { template WidePtr widen_raw_ptr(T *ptr) { typedef thrust::detail::pointer_traits WideTraits; typedef typename WideTraits::element_type WideT; // carefully widen the pointer to avoid warnings about conversions between differently aligned types on ARM WideT *wide_raw_ptr = static_cast(static_cast(ptr)); return WideTraits::pointer_to(*wide_raw_ptr); } template Pointer wide_fill_n(execution_policy &exec, Pointer first, Size n, const T &value) { typedef typename thrust::iterator_value::type OutputType; size_t ALIGNMENT_BOUNDARY = 128; // begin copying blocks at this byte boundary WideType wide_exemplar; OutputType narrow_exemplars[sizeof(WideType) / sizeof(OutputType)]; for (size_t i = 0; i < sizeof(WideType) / sizeof(OutputType); i++) narrow_exemplars[i] = static_cast(value); // cast through char * to avoid type punning warnings for (size_t i = 0; i < sizeof(WideType); i++) reinterpret_cast(&wide_exemplar)[i] = reinterpret_cast(narrow_exemplars)[i]; OutputType *first_raw = thrust::raw_pointer_cast(first); OutputType *last_raw = first_raw + n; OutputType *block_first_raw = (thrust::min)(first_raw + n, thrust::detail::util::align_up(first_raw, ALIGNMENT_BOUNDARY)); OutputType *block_last_raw = (thrust::max)(block_first_raw, thrust::detail::util::align_down(last_raw, sizeof(WideType))); // rebind Pointer to WideType typedef typename thrust::detail::rebind_pointer::type WidePtr; // point to the widened range // XXX since we've got an execution policy, we probably don't even need to deal with rebinding pointers WidePtr block_first_wide = widen_raw_ptr(block_first_raw); WidePtr block_last_wide = widen_raw_ptr(block_last_raw); thrust::generate(exec, first, Pointer(block_first_raw), thrust::detail::fill_functor(value)); thrust::generate(exec, block_first_wide, block_last_wide, thrust::detail::fill_functor(wide_exemplar)); thrust::generate(exec, Pointer(block_last_raw), first + n, thrust::detail::fill_functor(value)); return first + n; } template OutputIterator fill_n(execution_policy &exec, OutputIterator first, Size n, const T &value, thrust::detail::false_type) { thrust::detail::fill_functor func(value); return thrust::generate_n(exec, first, n, func); } template OutputIterator fill_n(execution_policy &exec, OutputIterator first, Size n, const T &value, thrust::detail::true_type) { typedef typename thrust::iterator_traits::value_type OutputType; if ( thrust::detail::util::is_aligned(thrust::raw_pointer_cast(&*first)) ) { if (compute_capability() < 20) { // 32-bit writes are faster on G80 and GT200 typedef unsigned int WideType; wide_fill_n(exec, &*first, n, value); } else { // 64-bit writes are faster on Fermi typedef unsigned long long WideType; wide_fill_n(exec, &*first, n, value); } return first + n; } else { return fill_n(exec, first, n, value, thrust::detail::false_type()); } } } // end detail template OutputIterator fill_n(execution_policy &exec, OutputIterator first, Size n, const T &value) { typedef typename thrust::iterator_traits::value_type OutputType; // we're compiling with nvcc, launch a kernel const bool use_wide_fill = thrust::detail::is_trivial_iterator::value && thrust::detail::has_trivial_assign::value && (sizeof(OutputType) == 1 || sizeof(OutputType) == 2 || sizeof(OutputType) == 4); // XXX WAR usused variable warning (void)use_wide_fill; return detail::fill_n(exec, first, n, value, thrust::detail::integral_constant()); } template void fill(execution_policy &exec, ForwardIterator first, ForwardIterator last, const T &value) { thrust::system::cuda::detail::fill_n(exec, first, thrust::distance(first,last), value); } // end fill() } // end namespace detail } // end namespace cuda } // end namespace system } // end namespace thrust