/*
 *  Copyright 2008-2012 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

// NOTE(review): the seven header names below were missing from the text under
// review (only bare `#include` tokens survived). They are reconstructed from
// the names this file uses and must be confirmed against the repository.
#include <thrust/detail/config.h>

#include <thrust/tuple.h>
#include <thrust/iterator/detail/host_system_tag.h>
#include <thrust/system/cuda/detail/tag.h>
#include <thrust/system/cuda/detail/runtime_introspection.h>
#include <thrust/system/cuda/detail/detail/launch_calculator.h>
#include <thrust/system/cuda/detail/synchronize.h>

// NOTE(review): every template parameter list / template argument list in this
// file was also missing from the text under review. They are restored here from
// how each name is used; confirm against the repository.

namespace thrust
{

namespace detail
{

// XXX WAR circular inclusion problems with this forward declaration
template<typename, typename> class temporary_array;

} // end detail

namespace system
{
namespace cuda
{
namespace detail
{
namespace detail
{

#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
// Kernel entry point which receives the closure as a by-value kernel argument
// and invokes it. Launch bounds are taken from the closure's context_type.
template<typename Closure>
__global__ __launch_bounds__(Closure::context_type::ThreadsPerBlock::value, Closure::context_type::BlocksPerMultiprocessor::value)
void launch_closure_by_value(Closure f)
{
  f();
}

// Kernel entry point which receives a pointer to the closure, copies the
// pointee into a local object, and invokes the copy.
template<typename Closure>
__global__ __launch_bounds__(Closure::context_type::ThreadsPerBlock::value, Closure::context_type::BlocksPerMultiprocessor::value)
void launch_closure_by_pointer(const Closure *f)
{
  // copy to registers
  Closure f_reg = *f;
  f_reg();
}
#else
// Non-nvcc builds: empty stand-ins so that taking the address of the launch
// functions below still compiles.
template<typename Closure>
void launch_closure_by_value(Closure) {}

template<typename Closure>
void launch_closure_by_pointer(const Closure *) {}

#endif // THRUST_DEVICE_COMPILER_NVCC

// Primary template: launches the closure by passing it as a kernel argument.
//
// NOTE(review): the second template parameter and its default were lost from
// the text under review. A boolean selector is required by the specialization
// below; the 256-byte threshold is a reconstruction -- confirm.
template<typename Closure,
         bool launch_by_value = sizeof(Closure) <= 256>
  struct closure_launcher_base
{
  typedef void (*launch_function_t)(Closure);

  // Returns the address of the kernel instantiation used by launch().
  static launch_function_t get_launch_function(void)
  {
    return launch_closure_by_value<Closure>;
  }

  // Launches f on num_blocks blocks of block_size threads with smem_size bytes
  // of dynamic shared memory. Does nothing when num_blocks is not positive or
  // when not compiling with nvcc.
  template<typename Size1, typename Size2, typename Size3>
  static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size)
  {
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
    if(num_blocks > 0)
    {
      launch_closure_by_value<<<(unsigned int) num_blocks, (unsigned int) block_size, (unsigned int) smem_size>>>(f);
      synchronize_if_enabled("launch_closure_by_value");
    }
#endif // THRUST_DEVICE_COMPILER_NVCC
  }
}; // end closure_launcher_base


// Specialization: stages the closure in a temporary_array and launches the
// kernel with a pointer to that staged copy.
template<typename Closure>
  struct closure_launcher_base<Closure,false>
{
  typedef void (*launch_function_t)(const Closure *);

  // Returns the address of the kernel instantiation used by launch().
  static launch_function_t get_launch_function(void)
  {
    return launch_closure_by_pointer<Closure>;
  }

  // Same contract as the primary template's launch(); the closure is first
  // copied into closure_storage, which lives until the end of the if-block.
  template<typename Size1, typename Size2, typename Size3>
  static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size)
  {
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
    if(num_blocks > 0)
    {
      // use temporary storage for the closure
      // XXX use of cuda::tag is too specific here
      thrust::cuda::tag cuda_tag;
      thrust::host_system_tag host_tag;
      thrust::detail::temporary_array<Closure,thrust::cuda::tag> closure_storage(cuda_tag, host_tag, &f, &f + 1);

      // launch
      detail::launch_closure_by_pointer<<<(unsigned int) num_blocks, (unsigned int) block_size, (unsigned int) smem_size>>>((&closure_storage[0]).get());
      synchronize_if_enabled("launch_closure_by_pointer");
    }
#endif // THRUST_DEVICE_COMPILER_NVCC
  }
};


// Front end which selects the by-value or by-pointer base via the base's
// defaulted selector and exposes device/function queries for the chosen kernel.
template<typename Closure>
  struct closure_launcher
    : public closure_launcher_base<Closure>
{
  typedef closure_launcher_base<Closure> super_t;

  // Returns the properties of the current device.
  static inline const device_properties_t& device_properties(void)
  {
    // The call must be qualified: an unqualified device_properties() here names
    // this member function itself and would recurse without end.
    // NOTE(review): assumes the namespace-scope device_properties() returns a
    // reference to storage that outlives this call; if it returns by value,
    // this member should return by value as well -- confirm.
    return thrust::system::cuda::detail::device_properties();
  }

  // Returns the attributes of the kernel that launch() would use.
  static inline function_attributes_t function_attributes(void)
  {
    return thrust::system::cuda::detail::function_attributes(super_t::get_launch_function());
  }

  // Forwards to the selected base's launch().
  template<typename Size1, typename Size2, typename Size3>
  static void launch(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size)
  {
    super_t::launch(f,num_blocks,block_size,smem_size);
  }
};

// Launches f on num_blocks blocks, taking the block size from element 1 of
// launch_calculator's with_variable_block_size() result.
template<typename Closure, typename Size>
  void launch_closure(Closure f, Size num_blocks)
{
  launch_calculator<Closure> calculator;
  launch_closure(f, num_blocks, thrust::get<1>(calculator.with_variable_block_size()));
} // end launch_closure()

// Launches f with the given grid and block size and no dynamic shared memory.
template<typename Closure, typename Size1, typename Size2>
  void launch_closure(Closure f, Size1 num_blocks, Size2 block_size)
{
  launch_closure(f, num_blocks, block_size, 0u);
} // end launch_closure()

// Launches f with the given grid size, block size and dynamic shared memory
// size (in bytes).
template<typename Closure, typename Size1, typename Size2, typename Size3>
  void launch_closure(Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size)
{
  closure_launcher<Closure>::launch(f, num_blocks, block_size, smem_size);
} // end launch_closure()


// Returns the function attributes of the kernel used to launch Closure on the
// current device, caching the result per device for the first few devices.
template<typename Closure>
function_attributes_t closure_attributes(void)
{
  typedef closure_launcher<Closure> Launcher;

  // cache the result of function_attributes(), because it is slow
  // only cache the first few devices
  static const int max_num_devices                                  = 16;

  static bool attributes_exist[max_num_devices]                     = {0};
  static function_attributes_t function_attributes[max_num_devices] = {};

  // XXX device_id ought to be an argument to this function
  int device_id = current_device();

  // ids outside the cached range (including any negative value) bypass the
  // cache so that the static arrays are never indexed out of bounds
  if(device_id < 0 || device_id >= max_num_devices)
  {
    return thrust::system::cuda::detail::function_attributes(Launcher::get_launch_function());
  }

  if(!attributes_exist[device_id])
  {
    function_attributes[device_id] = thrust::system::cuda::detail::function_attributes(Launcher::get_launch_function());

    // disallow the compiler to move the write to attributes_exist[device_id]
    // before the initialization of function_attributes[device_id]
    __thrust_compiler_fence();

    attributes_exist[device_id] = true;
  }

  return function_attributes[device_id];
}

} // end namespace detail
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust