1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
|
#pragma once
#include <intrin.h>

#include <algorithm>
#include <cstring>
#include <numeric>
#include <vector>
namespace SIMD
{
namespace detail
{
// Alignment, in bytes, associated with each supported SIMD register family.
// Kept as an unscoped enum with a size_t underlying type so the enumerators
// convert implicitly in size/address arithmetic (masks, element counts).
enum AlignTo : size_t
{
    SSE = 16, // used for __m128
    AVX = 32  // used for __m256
};
// Trait mapping a SIMD register type to its AlignTo value.
// The primary template has no `value` member, so a register type without a
// specialization fails to compile wherever `value` is used.
template<typename T>
struct MemoryTypeToSIMDType
{
};

// __m128 -> SSE
template<>
struct MemoryTypeToSIMDType<__m128>
{
    static constexpr AlignTo value = AlignTo::SSE;
};

// __m256 -> AVX
template<>
struct MemoryTypeToSIMDType<__m256>
{
    static constexpr AlignTo value = AlignTo::AVX;
};
// How many elements of type T fit into `alignTo` bytes (integer division).
template<typename T, AlignTo alignTo>
constexpr size_t numberInSIMD = static_cast<size_t>(alignTo) / sizeof(T);

// Shorthand for numberInSIMD with the AVX size.
template<typename T>
constexpr size_t numberInAVX = numberInSIMD<T, AlignTo::AVX>;
template<typename T, typename TOut = T, typename MemoryType>
static auto HorizontalSum(const MemoryType& data)
{
// these are usually done at the end of tight loops, so performance isn't really important. Something generic is better.
constexpr auto alignType = MemoryTypeToSIMDType<MemoryType>::value;
auto& values = reinterpret_cast<const float(&)[numberInSIMD<T, alignType>]>(data);
return std::accumulate(std::begin(values), std::end(values), TOut());
}
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably returns the sum of the arrSize floats starting at arr,
// computed with AVX - confirm against the definition.
float sumFloatsAVX(const float* arr, size_t arrSize);
// Runs `func` on the parts of [arr, arr + arrSize) that do not line up with
// `alignTo`-byte boundaries, and reports the aligned range left in between.
//
// alignTo - required byte alignment for the middle part.
// arr     - first element; assumed to be sizeof(T) aligned.
// arrSize - number of elements in the array.
// func    - callable invoked as func(T* first, size_t count); it is called twice,
//           head first and tail second, and either count may be 0.
//
// Returns a struct with:
//   mBeforeArrayStart  - func's result for the elements before the first boundary.
//   mAfterArrayEnd     - func's result for the elements after the last boundary.
//   mContinueFrom      - first element not covered by the head call.
//   mRemainingElements - number of elements between head and tail.
// When the array is too short to reach a boundary the head covers all of it,
// mRemainingElements is 0 and mContinueFrom is arr + arrSize (not necessarily aligned).
template<AlignTo alignTo, typename T, typename F>
static auto ProcessScalar(T* arr, size_t arrSize, const F& func)
{
    using ReturnValue = decltype(func(arr, arrSize));
    struct ProcessScalarResult
    {
        ReturnValue mBeforeArrayStart;
        ReturnValue mAfterArrayEnd;
        T* const mContinueFrom;
        size_t mRemainingElements;
    };

    // assuming array is sizeof(T) aligned
    constexpr size_t alignToMask = alignTo - 1;
    const auto addressStart = reinterpret_cast<size_t>(arr);

    // Bytes up to the next boundary; the extra mask turns "already aligned"
    // into 0 instead of a full alignTo bytes.
    const size_t bytesToBoundary = (alignTo - (addressStart & alignToMask)) & alignToMask;
    const size_t wantedAtStart = bytesToBoundary / sizeof(T);
    // Never hand func more elements than the array holds.
    const size_t elementsNeededToAlign = wantedAtStart < arrSize ? wantedAtStart : arrSize;

    const size_t elementsAfterHead = arrSize - elementsNeededToAlign;
    const auto addressEnd = addressStart + arrSize * sizeof(T);
    const size_t wantedAtEnd = (addressEnd & alignToMask) / sizeof(T);
    // The tail may not overlap the head, which also keeps the subtraction below from wrapping.
    const size_t elementsAtEnd = wantedAtEnd < elementsAfterHead ? wantedAtEnd : elementsAfterHead;

    return ProcessScalarResult
    {
        func(arr, elementsNeededToAlign),
        func(arr + (arrSize - elementsAtEnd), elementsAtEnd),
        arr + elementsNeededToAlign,
        elementsAfterHead - elementsAtEnd
    };
}
}
// Declaration only; the definition is not part of this header. The array and
// vector overloads in this namespace forward to it.
// NOTE(review): presumably returns the arithmetic mean of the arrSize floats
// starting at arr; behaviour for arrSize == 0 is not visible here - confirm.
float average(const float* arr, size_t arrSize);
// Overload for built-in float arrays: the element count N is deduced from the
// array type and forwarded, with a pointer to the first element, to
// average(const float*, size_t).
template<size_t N>
inline float average(const float(&arr)[N])
{
    return average(&arr[0], N);
}
inline float average(std::vector<float> data)
{
return average(&data[0], data.size());
}
}
|