GOSTCoin CUDA miner project, compatible with most nvidia cards, containing only gostd algo
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

434 lines
13 KiB

/*
* Copyright 2008-2012 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <limits>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/cstdint.h>
namespace thrust
{
namespace system
{
namespace detail
{
namespace internal
{
namespace scalar
{
namespace detail
{
template <typename T>
struct RadixEncoder : public thrust::identity<T>
{};
template <>
struct RadixEncoder<char> : public thrust::unary_function<char, unsigned char>
{
unsigned char operator()(char x) const
{
if(std::numeric_limits<char>::is_signed)
return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
else
return x;
}
};
template <>
struct RadixEncoder<signed char> : public thrust::unary_function<signed char, unsigned char>
{
unsigned char operator()(signed char x) const
{
return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
}
};
template <>
struct RadixEncoder<short> : public thrust::unary_function<short, unsigned short>
{
unsigned short operator()(short x) const
{
return x ^ static_cast<unsigned short>(1) << (8 * sizeof(unsigned short) - 1);
}
};
template <>
struct RadixEncoder<int> : public thrust::unary_function<int, unsigned int>
{
unsigned long operator()(long x) const
{
return x ^ static_cast<unsigned int>(1) << (8 * sizeof(unsigned int) - 1);
}
};
template <>
struct RadixEncoder<long> : public thrust::unary_function<long, unsigned long>
{
unsigned long operator()(long x) const
{
return x ^ static_cast<unsigned long>(1) << (8 * sizeof(unsigned long) - 1);
}
};
template <>
struct RadixEncoder<long long> : public thrust::unary_function<long long, unsigned long long>
{
unsigned long long operator()(long long x) const
{
return x ^ static_cast<unsigned long long>(1) << (8 * sizeof(unsigned long long) - 1);
}
};
// ideally we'd use uint32 here and uint64 below
template <>
struct RadixEncoder<float> : public thrust::unary_function<float, thrust::detail::uint32_t>
{
thrust::detail::uint32_t operator()(float x) const
{
union { float f; thrust::detail::uint32_t i; } u;
u.f = x;
thrust::detail::uint32_t mask = -static_cast<thrust::detail::int32_t>(u.i >> 31) | (static_cast<thrust::detail::uint32_t>(1) << 31);
return u.i ^ mask;
}
};
template <>
struct RadixEncoder<double> : public thrust::unary_function<double, thrust::detail::uint64_t>
{
thrust::detail::uint64_t operator()(double x) const
{
union { double f; thrust::detail::uint64_t i; } u;
u.f = x;
thrust::detail::uint64_t mask = -static_cast<thrust::detail::int64_t>(u.i >> 63) | (static_cast<thrust::detail::uint64_t>(1) << 63);
return u.i ^ mask;
}
};
template <unsigned int RadixBits,
bool HasValues,
typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename RandomAccessIterator4>
void radix_sort(RandomAccessIterator1 keys1,
RandomAccessIterator2 keys2,
RandomAccessIterator3 vals1,
RandomAccessIterator4 vals2,
const size_t N)
{
typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
typedef RadixEncoder<KeyType> Encoder;
typedef typename Encoder::result_type EncodedType;
static const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits;
static const unsigned int HistogramSize = 1 << RadixBits;
static const EncodedType BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
Encoder encode;
// storage for histograms
size_t histograms[NumHistograms][HistogramSize] = {{0}};
// see which passes can be eliminated
bool skip_shuffle[NumHistograms] = {false};
// false if most recent data is stored in (keys1,vals1)
bool flip = false;
// compute histograms
for (size_t i = 0; i < N; i++)
{
const EncodedType x = encode(keys1[i]);
for (unsigned int j = 0; j < NumHistograms; j++)
{
const EncodedType BitShift = RadixBits * j;
histograms[j][(x >> BitShift) & BitMask]++;
}
}
// scan histograms
for (unsigned int i = 0; i < NumHistograms; i++)
{
size_t sum = 0;
for (unsigned int j = 0; j < HistogramSize; j++)
{
size_t bin = histograms[i][j];
if (bin == N)
skip_shuffle[i] = true;
histograms[i][j] = sum;
sum = sum + bin;
}
}
// shuffle keys and (optionally) values
for (unsigned int i = 0; i < NumHistograms; i++)
{
const EncodedType BitShift = static_cast<EncodedType>(RadixBits * i);
if (!skip_shuffle[i])
{
if (flip)
{
for (size_t j = 0; j < N; j++)
{
const EncodedType x = encode(keys2[j]);
size_t position = histograms[i][(x >> BitShift) & BitMask]++;
RandomAccessIterator1 temp_keys1 = keys1;
temp_keys1 += position;
RandomAccessIterator2 temp_keys2 = keys2;
temp_keys2 += j;
// keys1[position] = keys2[j]
*temp_keys1 = *temp_keys2;
if (HasValues)
{
RandomAccessIterator3 temp_vals1 = vals1;
temp_vals1 += position;
RandomAccessIterator4 temp_vals2 = vals2;
temp_vals2 += j;
// vals1[position] = vals2[j]
*temp_vals1 = *temp_vals2;
}
}
}
else
{
for (size_t j = 0; j < N; j++)
{
const EncodedType x = encode(keys1[j]);
size_t position = histograms[i][(x >> BitShift) & BitMask]++;
RandomAccessIterator1 temp_keys1 = keys1;
temp_keys1 += j;
RandomAccessIterator2 temp_keys2 = keys2;
temp_keys2 += position;
// keys2[position] = keys1[j];
*temp_keys2 = *temp_keys1;
if (HasValues)
{
RandomAccessIterator3 temp_vals1 = vals1;
temp_vals1 += j;
RandomAccessIterator4 temp_vals2 = vals2;
temp_vals2 += position;
// vals2[position] = vals1[j]
*temp_vals2 = *temp_vals1;
}
}
}
flip = (flip) ? false : true;
}
}
// ensure final values are in (keys1,vals1)
if (flip)
{
thrust::copy(keys2, keys2 + N, keys1);
if (HasValues)
thrust::copy(vals2, vals2 + N, vals1);
}
}
// Select best radix sort parameters based on sizeof(T) and input size
// These particular values were determined through empirical testing on a Core i7 950 CPU
template <size_t KeySize>
struct radix_sort_dispatcher
{
};
template <>
struct radix_sort_dispatcher<1>
{
template <typename RandomAccessIterator1, typename RandomAccessIterator2>
void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N)
{
detail::radix_sort<8,false>(keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
}
template <typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename RandomAccessIterator4>
void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N)
{
detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N);
}
};
template <>
struct radix_sort_dispatcher<2>
{
template <typename RandomAccessIterator1, typename RandomAccessIterator2>
void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N)
{
if (N < (1 << 16))
detail::radix_sort<8,false>(keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
else
detail::radix_sort<16,false>(keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
}
template <typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename RandomAccessIterator4>
void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N)
{
if (N < (1 << 15))
detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N);
else
detail::radix_sort<16,true>(keys1, keys2, vals1, vals2, N);
}
};
template <>
struct radix_sort_dispatcher<4>
{
template <typename RandomAccessIterator1, typename RandomAccessIterator2>
void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N)
{
if (N < (1 << 22))
detail::radix_sort<8,false>(keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
else
detail::radix_sort<4,false>(keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
}
template <typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename RandomAccessIterator4>
void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N)
{
if (N < (1 << 22))
detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N);
else
detail::radix_sort<3,true>(keys1, keys2, vals1, vals2, N);
}
};
template <>
struct radix_sort_dispatcher<8>
{
template <typename RandomAccessIterator1, typename RandomAccessIterator2>
void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, const size_t N)
{
if (N < (1 << 21))
detail::radix_sort<8,false>(keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
else
detail::radix_sort<4,false>(keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
}
template <typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename RandomAccessIterator4>
void operator()(RandomAccessIterator1 keys1, RandomAccessIterator2 keys2, RandomAccessIterator3 vals1, RandomAccessIterator4 vals2, const size_t N)
{
if (N < (1 << 21))
detail::radix_sort<8,true>(keys1, keys2, vals1, vals2, N);
else
detail::radix_sort<3,true>(keys1, keys2, vals1, vals2, N);
}
};
template <typename RandomAccessIterator1,
typename RandomAccessIterator2>
void radix_sort(RandomAccessIterator1 keys1,
RandomAccessIterator2 keys2,
const size_t N)
{
typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
radix_sort_dispatcher<sizeof(KeyType)>()(keys1, keys2, N);
}
template <typename RandomAccessIterator1,
typename RandomAccessIterator2,
typename RandomAccessIterator3,
typename RandomAccessIterator4>
void radix_sort(RandomAccessIterator1 keys1,
RandomAccessIterator2 keys2,
RandomAccessIterator3 vals1,
RandomAccessIterator4 vals2,
const size_t N)
{
typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
radix_sort_dispatcher<sizeof(KeyType)>()(keys1, keys2, vals1, vals2, N);
}
} // namespace detail
//////////////
// Key Sort //
//////////////
template <typename RandomAccessIterator>
void stable_radix_sort(RandomAccessIterator first,
RandomAccessIterator last)
{
typedef typename thrust::iterator_system<RandomAccessIterator>::type ExecutionPolicy;
typedef typename thrust::iterator_value<RandomAccessIterator>::type KeyType;
size_t N = last - first;
// XXX assumes ExecutionPolicy is default constructible
// XXX consider how to get stateful systems into this function
ExecutionPolicy exec;
thrust::detail::temporary_array<KeyType, ExecutionPolicy> temp(exec, N);
detail::radix_sort(first, temp.begin(), N);
}
////////////////////
// Key-Value Sort //
////////////////////
template <typename RandomAccessIterator1,
typename RandomAccessIterator2>
void stable_radix_sort_by_key(RandomAccessIterator1 first1,
RandomAccessIterator1 last1,
RandomAccessIterator2 first2)
{
// XXX the type of exec should be
// typedef decltype(select_system(first1,last1,first2)) system;
typedef typename thrust::iterator_system<RandomAccessIterator1>::type ExecutionPolicy;
typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
typedef typename thrust::iterator_value<RandomAccessIterator2>::type ValueType;
size_t N = last1 - first1;
// XXX assumes ExecutionPolicy is default constructible
// XXX consider how to get stateful systems into this function
ExecutionPolicy exec;
thrust::detail::temporary_array<KeyType, ExecutionPolicy> temp1(exec, N);
thrust::detail::temporary_array<ValueType, ExecutionPolicy> temp2(exec, N);
detail::radix_sort(first1, temp1.begin(), first2, temp2.begin(), N);
}
} // end namespace scalar
} // end namespace internal
} // end namespace detail
} // end namespace system
} // end namespace thrust