/* * Copyright 2008-2012 NVIDIA Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /*! \file for_each.inl * \brief Inline file for for_each.h. */ #include #include #include #include #include #include #include #include #include #include #include namespace thrust { namespace system { namespace cuda { namespace detail { namespace for_each_n_detail { template struct for_each_n_closure { typedef void result_type; typedef Context context_type; RandomAccessIterator first; Size n; thrust::detail::device_function f; Context context; for_each_n_closure(RandomAccessIterator first, Size n, UnaryFunction f, Context context = Context()) : first(first), n(n), f(f), context(context) {} __device__ __thrust_forceinline__ result_type operator()(void) { const Size grid_size = context.block_dimension() * context.grid_dimension(); Size i = context.linear_index(); // advance iterator first += i; while(i < n) { f(*first); i += grid_size; first += grid_size; } } }; // end for_each_n_closure template thrust::tuple configure_launch(Size n) { // calculate launch configuration detail::launch_calculator calculator; thrust::tuple config = calculator.with_variable_block_size(); size_t max_blocks = thrust::get<0>(config); size_t block_size = thrust::get<1>(config); size_t num_blocks = thrust::min(max_blocks, thrust::detail::util::divide_ri(n, block_size)); return thrust::make_tuple(num_blocks, block_size); } template bool use_big_closure(Size n, unsigned int little_grid_size) { // use the big closure when n will not fit within an unsigned int // or if incrementing an unsigned int by little_grid_size would overflow // the counter Size threshold = std::numeric_limits::max(); bool result = (sizeof(Size) > sizeof(unsigned int)) && (n > threshold); if(!result) { // check if we'd overflow the little closure's counter unsigned int little_n = static_cast(n); if((little_n - 1u) + little_grid_size < little_n) { result = true; } } return result; } } // end for_each_n_detail template RandomAccessIterator for_each_n(execution_policy &, RandomAccessIterator first, Size n, UnaryFunction f) { // we're attempting to launch a kernel, assert we're compiling with nvcc // ======================================================================== // X Note to the user: If you've found this line due to a compiler error, X // X you need to compile your code using nvcc, rather than g++ or cl.exe X // ======================================================================== THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation::value) ); if(n <= 0) return first; // empty range // create two candidate closures to implement the for_each // choose between them based on the whether we can fit n into a smaller integer // and whether or not we'll overflow the closure's counter typedef detail::blocked_thread_array Context; typedef for_each_n_detail::for_each_n_closure BigClosure; typedef for_each_n_detail::for_each_n_closure LittleClosure; BigClosure big_closure(first, n, f); LittleClosure little_closure(first, static_cast(n), f); thrust::tuple little_config = for_each_n_detail::configure_launch(n); unsigned int little_grid_size = thrust::get<0>(little_config) * thrust::get<1>(little_config); if(for_each_n_detail::use_big_closure(n, little_grid_size)) { // launch the big closure thrust::tuple big_config = for_each_n_detail::configure_launch(n); detail::launch_closure(big_closure, thrust::get<0>(big_config), thrust::get<1>(big_config)); } else { // launch the little closure detail::launch_closure(little_closure, thrust::get<0>(little_config), thrust::get<1>(little_config)); } return first + n; } template InputIterator for_each(execution_policy &exec, InputIterator first, InputIterator last, UnaryFunction f) { return cuda::detail::for_each_n(exec, first, thrust::distance(first,last), f); } // end for_each() } // end namespace detail } // end namespace cuda } // end namespace system } // end namespace thrust