GOSTCoin CUDA miner project, compatible with most NVIDIA cards, containing only the gostd algorithm.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

199 lines
6.0 KiB

/*
* Copyright 2008-2012 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*! \file merging_sort.h
* \brief Block version of merge sort
*/
#pragma once
#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/type_traits.h>
#include <thrust/system/detail/generic/scalar/binary_search.h>
namespace thrust
{
namespace system
{
namespace cuda
{
namespace detail
{
namespace block
{
/*! Compare-exchange step on the adjacent key/value pairs at positions
 *  \p i and \p i+1.
 *
 *  The pairs are exchanged only when all of the following hold:
 *    - \p pred is true,
 *    - position \p i+1 lies before \p end,
 *    - \p comp reports that the key at \p i+1 sorts before the key at \p i.
 *
 *  \param keys_first   Random access iterator to the first key.
 *  \param values_first Random access iterator to the first value.
 *  \param i            Index of the left element of the pair.
 *  \param end          One-past-the-last index that may be touched.
 *  \param pred         Whether the caller takes part in this step at all.
 *  \param comp         Comparison functor invoked as comp(right_key, left_key).
 */
template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
__device__ void conditional_swap(RandomAccessIterator1 keys_first,
                                 RandomAccessIterator2 values_first,
                                 const unsigned int i,
                                 const unsigned int end,
                                 bool pred,
                                 Compare comp)
{
  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
  typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;

  const unsigned int next = i + 1;

  // nothing to do when the caller is masked out or has no right-hand neighbour
  if(!pred || next >= end)
  {
    return;
  }

  KeyType left_key  = keys_first[i];
  KeyType right_key = keys_first[next];

  // the pair is already in order unless the right key sorts before the left key
  if(!comp(right_key, left_key))
  {
    return;
  }

  // XXX this implementation should really dispatch swap via ADL
  ValueType left_value;
  left_value = values_first[i];
  ValueType right_value;
  right_value = values_first[next];

  // write both pairs back in exchanged order
  keys_first[i]      = right_key;
  keys_first[next]   = left_key;
  values_first[i]    = right_value;
  values_first[next] = left_value;
}
/*! Odd-even transposition sort of key/value pairs.
 *
 *  Each round consists of two phases separated by context.barrier():
 *  first the callers with an odd index \p i compare-exchange positions
 *  (i, i+1), then the callers with an even index do the same.
 *
 *  Odd-even transposition sort needs as many phases as there are elements,
 *  so ceil(size/2) rounds are executed. Plain size/2 would run one phase
 *  short for an odd \p size; for example {3,2,1} with size == 3 would end
 *  as {1,3,2}. For an even \p size the round count is unchanged.
 *
 *  context.barrier() is called unconditionally in every round, so every
 *  caller taking part in that barrier has to pass the same \p size.
 *  NOTE(review): barrier() is presumably a block-wide synchronization;
 *  confirm against the Context implementation.
 *
 *  \param context      Execution context providing barrier().
 *  \param keys_first   Random access iterator to the first key.
 *  \param values_first Random access iterator to the first value.
 *  \param i            Index of the element owned by the caller.
 *  \param end          One-past-the-last index of the caller's subsequence.
 *  \param size         Maximum length of a subsequence being sorted.
 *  \param comp         Comparison functor forwarded to conditional_swap.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename Compare>
__device__ void transposition_sort(Context context,
                                   RandomAccessIterator1 keys_first,
                                   RandomAccessIterator2 values_first,
                                   const unsigned int i,
                                   const unsigned int end,
                                   const unsigned int size,
                                   Compare comp)
{
  const bool is_odd = i&0x1;

  // ceil(size/2), written so that it cannot overflow for large sizes
  const unsigned int num_rounds = size/2 + (size&0x1);

  for(unsigned int round=num_rounds; round>0; --round)
  {
    // ODDS
    conditional_swap(keys_first, values_first, i, end, is_odd, comp);
    context.barrier();

    // EVENS
    conditional_swap(keys_first, values_first, i, end, !is_odd, comp);
    context.barrier();
  }
}
/*! Merge tree: repeatedly merges neighbouring sorted runs until a single
 *  run covers all \p n elements.
 *
 *  On entry every index \p i belongs to a sorted run [begin,end) of length
 *  at most \p h. In each pass the run length is doubled; the caller looks up
 *  the rank of its own key in the sibling run with a binary search and
 *  scatters its key/value pair to the resulting position of the merged run.
 *  Left-run elements use lower_bound_n and right-run elements use
 *  upper_bound_n when searching the opposite run.
 *
 *  The masking i&(~(h-1)) used to locate a run assumes that \p h is a
 *  power of two.
 *
 *  \param context      Execution context providing barrier().
 *  \param keys_first   Random access iterator to the first key.
 *  \param values_first Random access iterator to the first value.
 *  \param i            Index of the element owned by the caller.
 *  \param n            Total number of elements.
 *  \param begin        First index of the caller's current sorted run.
 *  \param end          One-past-the-last index of that run.
 *  \param h            Current run length.
 *  \param cmp          Comparison functor passed to the binary searches.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename StrictWeakOrdering>
__device__ void merge(Context context,
                      RandomAccessIterator1 keys_first,
                      RandomAccessIterator2 values_first,
                      const unsigned int i,
                      const unsigned int n,
                      unsigned int begin,
                      unsigned int end,
                      unsigned int h,
                      StrictWeakOrdering cmp)
{
  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
  typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;

  // INVARIANT: Every element i resides within a sequence [begin,end)
  // of length h which is already sorted
  while(h < n)
  {
    // the merged run is twice as long as the runs it is built from
    h <<= 1;

    const unsigned int merged_begin = i & ~(h - 1);
    const unsigned int merged_end   = min(n, merged_begin + h);

    // callers past the end of the data only take part in the barriers
    const bool in_range = i < merged_end;

    KeyType   key;
    ValueType value;

    // start from the offset of this element inside its own run
    unsigned int rank = i - begin;

    if(in_range)
    {
      key = keys_first[i];

      if(begin != merged_begin)
      {
        // right run: add the number of left-run keys that must precede this key
        RandomAccessIterator1 left_run = keys_first + merged_begin;
        RandomAccessIterator1 pos = thrust::system::detail::generic::scalar::upper_bound_n(left_run, begin - merged_begin, key, cmp);
        rank += (pos - left_run);
      }
      else
      {
        // left run: add the number of right-run keys that must precede this key
        RandomAccessIterator1 right_run = keys_first + end;
        RandomAccessIterator1 pos = thrust::system::detail::generic::scalar::lower_bound_n(right_run, merged_end - end, key, cmp);
        rank += (pos - right_run);
      }

      value = values_first[i];
    }

    // all reads of this pass must be complete before anything is overwritten
    context.barrier();

    if(in_range)
    {
      const unsigned int dest = merged_begin + rank;
      keys_first[dest]   = key;
      values_first[dest] = value;
    }

    // all writes must be complete before the next pass reads them
    context.barrier();

    begin = merged_begin;
    end   = merged_end;
  }
}
/*! Block-wise implementation of merge sort.
 *  It provides the same external interface as odd_even_sort.
 *
 *  The caller's element index is taken from context.thread_index().
 *  The sort runs in two phases: fixed-length runs are sorted with odd-even
 *  transposition sort, then the runs are combined by the merge tree.
 *
 *  \param context      Execution context providing thread_index() and barrier().
 *  \param keys_first   Random access iterator to the first key.
 *  \param values_first Random access iterator to the first value.
 *  \param n            Number of elements to sort.
 *  \param comp         Strict weak ordering used to compare keys.
 */
template<typename Context,
         typename RandomAccessIterator1,
         typename RandomAccessIterator2,
         typename StrictWeakOrdering>
__device__ void merging_sort(Context context,
                             RandomAccessIterator1 keys_first,
                             RandomAccessIterator2 values_first,
                             const unsigned int n,
                             StrictWeakOrdering comp)
{
  // Length of the initially sorted runs. The code below assumes that
  // this is a power of 2. Empirically, 32 delivers best results,
  // which is not surprising since that's the warp width.
  const unsigned int run_length = 32;

  const unsigned int tid = context.thread_index();

  // bounds of the run that contains this thread's element, clamped to n
  const unsigned int run_begin = tid & ~(run_length - 1);
  const unsigned int run_end   = min(n, run_begin + run_length);

  // Phase 1: Sort each run using odd-even transposition sort
  transposition_sort(context, keys_first, values_first, tid, run_end, run_length, comp);

  // Phase 2: Apply merge tree to produce final sorted results
  merge(context, keys_first, values_first, tid, n, run_begin, run_end, run_length, comp);
} // end merging_sort()
} // end namespace block
} // end namespace detail
} // end namespace cuda
} // end namespace system
} // end namespace thrust