Comments (6)
I don't fully understand your situation, but the autovectorizer in g++ does not support SLEEF.
SLEEF is not designed to be a drop-in replacement of SVML.
You need to manually rewrite function names exposed by SVML to the corresponding name in SLEEF in your source code.
from sleef.
Most of what I'm doing is trying to wrap Sleef in a type-generic way to dispatch the fastest versions of functions I can.
So, in order to make Sleef work, I'd need to implement the SVML methods myself or link against SVML?
Here's the wrapper I have:
#ifndef _VEC_H__
#define _VEC_H__
#define NOSVML
#include "sleef/include/sleefdft.h"
#include "sleef.h"
#include "x86intrin.h"
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <utility>
namespace vec {
// Primary template, declared but never defined: only the explicit
// specializations below (float and double) can be used.
template<typename ValueType>
struct SIMDTypes;
// OP(op, suf, sz) pastes together an intrinsic name of the form
// _mm<sz>_<op>_<suf>; an empty `sz` yields the _mm_<op>_<suf> form.
#define OP(op, suf, sz) _mm##sz##_##op##_##suf
// decop(op, suf, sz) declares a static constexpr member named `op` that holds
// the address of the intrinsic named by OP(op, suf, sz). The expansion already
// ends in a semicolon.
#define decop(op, suf, sz) static constexpr decltype(&OP(op, suf, sz)) op = &OP(op, suf, sz);
/*
 * declare_all(suf, sz) declares, inside a class body, one static constexpr
 * function-pointer member per intrinsic listed below, each pointing at
 * _mm<sz>_<op>_<suf>.
 *
 * `or` and `and` are declared by hand as `or_fn` / `and_fn` because both are
 * C++ alternative operator tokens and cannot be used as member names.
 *
 * The last line deliberately has no trailing backslash: a continuation there
 * would splice the next preprocessor line into this macro's replacement list.
 */
#define declare_all(suf, sz) \
decop(loadu, suf, sz); \
decop(storeu, suf, sz); \
decop(load, suf, sz); \
decop(store, suf, sz); \
static constexpr decltype(&OP(or, suf, sz)) or_fn = &OP(or, suf, sz);\
static constexpr decltype(&OP(and, suf, sz)) and_fn = &OP(and, suf, sz);\
decop(add, suf, sz); \
decop(sub, suf, sz); \
decop(mul, suf, sz); \
decop(set1, suf, sz); \
decop(setr, suf, sz); \
decop(set, suf, sz); \
decop(mask_and, suf, sz); \
decop(maskz_and, suf, sz); \
decop(maskz_andnot, suf, sz); \
decop(mask_andnot, suf, sz); \
decop(andnot, suf, sz); \
decop(blendv, suf, sz); \
decop(cmp, suf, sz);
/* SLEEF_OP(op, suf, prec, set) pastes together the SLEEF symbol name
 * Sleef_<op><suf>_<prec><set>. */
#define SLEEF_OP(op, suf, prec, set) Sleef_##op##suf##_##prec##set
/*
 * dec_sleefop_prec(op, suf, prec, instructset) declares, inside a class body:
 *   - <op>_<prec>: a static constexpr pointer to the SLEEF function named by
 *     SLEEF_OP(op, suf, prec, instructset);
 *   - struct apply_<op>_<prec>: a functor whose operator() forwards all of its
 *     arguments to that SLEEF function, and whose scalar() calls std::<op> on
 *     a single value.
 *
 * Each argument is forwarded with its own type, std::forward<T>(args)..., so
 * operator() also works for functions taking more than one vector argument.
 * scalar() takes exactly one argument.
 */
#define dec_sleefop_prec(op, suf, prec, instructset) \
static constexpr decltype(&SLEEF_OP(op, suf, prec, instructset)) op##_##prec = \
&SLEEF_OP(op, suf, prec, instructset); \
struct apply_##op##_##prec {\
template<typename... T>\
auto operator()(T &&...args) const {return op##_##prec(std::forward<T>(args)...);} \
template<typename OT>\
OT scalar(OT val) const {return std::op(val);} \
};
// dec_all_precs(op, suf, instructset) declares the SLEEF wrapper for `op`
// twice, once with the u35 precision tag and once with u10.
#define dec_all_precs(op, suf, instructset) \
dec_sleefop_prec(op, suf, u35, instructset) \
dec_sleefop_prec(op, suf, u10, instructset)
// dec_all_trig(suf, set) declares every SLEEF wrapper this header exposes for
// one vector suffix / instruction set pair:
//   - sin, cos, asin, acos, atan, atan2, cbrt: both u35 and u10 variants;
//   - log, log1p, expm1, exp, exp2, lgamma, tgamma, sinh, cosh, asinh, acosh,
//     tanh, atanh: u10 only.
// exp10 is commented out and therefore not declared.
// Despite the name, the list is not limited to trigonometric functions.
#define dec_all_trig(suf, set) \
dec_all_precs(sin, suf, set) \
dec_all_precs(cos, suf, set) \
dec_all_precs(asin, suf, set) \
dec_all_precs(acos, suf, set) \
dec_all_precs(atan, suf, set) \
dec_all_precs(atan2, suf, set) \
dec_all_precs(cbrt, suf, set) \
dec_sleefop_prec(log, suf, u10, set) \
dec_sleefop_prec(log1p, suf, u10, set) \
dec_sleefop_prec(expm1, suf, u10, set) \
dec_sleefop_prec(exp, suf, u10, set) \
dec_sleefop_prec(exp2, suf, u10, set) \
/*dec_sleefop_prec(exp10, suf, u10, set) */ \
dec_sleefop_prec(lgamma, suf, u10, set) \
dec_sleefop_prec(tgamma, suf, u10, set) \
dec_sleefop_prec(sinh, suf, u10, set) \
dec_sleefop_prec(cosh, suf, u10, set) \
dec_sleefop_prec(asinh, suf, u10, set) \
dec_sleefop_prec(acosh, suf, u10, set) \
dec_sleefop_prec(tanh, suf, u10, set) \
dec_sleefop_prec(atanh, suf, u10, set)
/*
 * SIMDTypes specialization for float.
 *
 * The preprocessor picks one vector width, testing _FEATURE_AVX512F first,
 * then __AVX2__, then __SSE2__, and stops the build with #error otherwise.
 * For the chosen width it defines:
 *   - Type: the packed-float vector type;
 *   - the intrinsic function-pointer members generated by declare_all;
 *   - ALN:  the vector size in bytes;
 *   - the SLEEF wrappers and apply_* functors generated by dec_all_trig;
 *   - MASK: ALN - 1, which the block functions AND with a pointer value to
 *           test whether it is aligned to ALN.
 *
 * NOTE(review): GCC and Clang advertise AVX-512F through __AVX512F__; confirm
 * that _FEATURE_AVX512F is defined the way this test expects on the
 * toolchains in use.
 */
template<>
struct SIMDTypes<float>{
#if _FEATURE_AVX512F
using Type = __m512;
declare_all(ps, 512)
static const size_t ALN = 64; // bytes per __m512
dec_all_trig(f16, avx512f);
#elif __AVX2__
using Type = __m256;
declare_all(ps, 256)
static const size_t ALN = 32; // bytes per __m256
dec_all_trig(f8, avx2);
#elif __SSE2__
using Type = __m128;
// Empty size argument: OP() then produces the _mm_<op>_ps names.
declare_all(ps, )
static const size_t ALN = 16; // bytes per __m128
dec_all_trig(f4, sse2);
#else
#error("Need at least sse2")
#endif
static const size_t MASK = ALN - 1;
};
/*
 * SIMDTypes specialization for double.
 *
 * The preprocessor picks one vector width, testing _FEATURE_AVX512F first,
 * then __AVX2__, then __SSE2__, and stops the build with #error otherwise.
 * For the chosen width it defines:
 *   - Type: the packed-double vector type;
 *   - the intrinsic function-pointer members generated by declare_all;
 *   - ALN:  the vector size in bytes;
 *   - the SLEEF wrappers and apply_* functors generated by dec_all_trig;
 *   - MASK: ALN - 1, which the block functions AND with a pointer value to
 *           test whether it is aligned to ALN.
 *
 * NOTE(review): GCC and Clang advertise AVX-512F through __AVX512F__; confirm
 * that _FEATURE_AVX512F is defined the way this test expects on the
 * toolchains in use.
 */
template<>
struct SIMDTypes<double>{
#if _FEATURE_AVX512F
using Type = __m512d;
declare_all(pd, 512)
static const size_t ALN = 64; // bytes per __m512d
dec_all_trig(d8, avx512f);
#elif __AVX2__
using Type = __m256d;
declare_all(pd, 256)
static const size_t ALN = 32; // bytes per __m256d
dec_all_trig(d4, avx2);
#elif __SSE2__
using Type = __m128d;
// Empty size argument: OP() then produces the _mm_<op>_pd names.
declare_all(pd, )
static const size_t ALN = 16; // bytes per __m128d
dec_all_trig(d2, sse2);
#else
#error("Need at least sse2")
#endif
static const size_t MASK = ALN - 1;
};
template<typename FloatType>
void blockmul(FloatType *pos, size_t nelem, FloatType div) {
#if __AVX2__ || _FEATURE_AVX512F || __SSE2__
#pragma message("Using vectorized scalar multiplication.")
using SIMDType = typename vec::SIMDTypes<FloatType>::Type;
SIMDType factor(vec::SIMDTypes<FloatType>::set1(div));
SIMDType *ptr((SIMDType *)pos);
FloatType *end(pos + nelem);
if((uint64_t)ptr & vec::SIMDTypes<FloatType>::MASK) {
while((FloatType *)ptr < end - sizeof(SIMDType) / sizeof(FloatType)) {
vec::SIMDTypes<FloatType>::storeu((FloatType *)ptr,
vec::SIMDTypes<FloatType>::mul(factor, vec::SIMDTypes<FloatType>::loadu((FloatType *)ptr)));
++ptr;
}
} else {
while((FloatType *)ptr < end - sizeof(SIMDType) / sizeof(FloatType)) {
vec::SIMDTypes<FloatType>::store((FloatType *)ptr,
vec::SIMDTypes<FloatType>::mul(factor, vec::SIMDTypes<FloatType>::load((FloatType *)ptr)));
++ptr;
}
}
pos = (FloatType *)ptr;
while(pos < end) *pos++ *= div;
#else
for(size_t i(0); i < (static_cast<size_t>(1) << nelem); ++i) pos[i] *= div; // Could be vectorized.
#endif
}
template<typename FloatType>
void blockadd(FloatType *pos, size_t nelem, FloatType val) {
#if __AVX2__ || _FEATURE_AVX512F || __SSE2__
#pragma message("Using vectorized scalar vector addition.")
using SIMDType = typename vec::SIMDTypes<FloatType>::Type;
SIMDType inc(vec::SIMDTypes<FloatType>::set1(val));
SIMDType *ptr((SIMDType *)pos);
FloatType *end(pos + nelem);
if((uint64_t)ptr & vec::SIMDTypes<FloatType>::MASK) {
while((FloatType *)ptr < end - sizeof(SIMDType) / sizeof(FloatType)) {
vec::SIMDTypes<FloatType>::storeu((FloatType *)ptr,
vec::SIMDTypes<FloatType>::add(inc, vec::SIMDTypes<FloatType>::loadu((FloatType *)ptr)));
++ptr;
}
} else {
while((FloatType *)ptr < end - sizeof(SIMDType) / sizeof(FloatType)) {
vec::SIMDTypes<FloatType>::store((FloatType *)ptr,
vec::SIMDTypes<FloatType>::add(inc, vec::SIMDTypes<FloatType>::load((FloatType *)ptr)));
++ptr;
}
}
pos = (FloatType *)ptr;
while(pos < end) *pos++ += div;
#else
#pragma message("Enjoy your serial version.")
for(size_t i(0); i < (static_cast<size_t>(1) << nelem); ++i) pos[i] += div; // Could be vectorized.
#endif
}
template<typename FloatType>
void vecmul(FloatType *to, const FloatType *from, size_t nelem) {
#if __AVX2__ || _FEATURE_AVX512F || __SSE2__
#pragma message("Using vectorized multiplication.")
using SIMDType = typename vec::SIMDTypes<FloatType>::Type;
SIMDType *ptr((SIMDType *)to), *fromptr((SIMDType *)from);
FloatType *end(to + nelem);
if((uint64_t)ptr & vec::SIMDTypes<FloatType>::MASK || (uint64_t)fromptr & (vec::SIMDTypes<FloatType>::MASK)) {
while((FloatType *)ptr < end - sizeof(SIMDType) / sizeof(FloatType)) {
vec::SIMDTypes<FloatType>::storeu((FloatType *)ptr,
vec::SIMDTypes<FloatType>::mul(vec::SIMDTypes<FloatType>::loadu((FloatType *)fromptr), vec::SIMDTypes<FloatType>::loadu((FloatType *)ptr)));
++ptr; ++fromptr;
}
} else {
while((FloatType *)ptr < end - sizeof(SIMDType) / sizeof(FloatType)) {
vec::SIMDTypes<FloatType>::store((FloatType *)ptr,
vec::SIMDTypes<FloatType>::mul(vec::SIMDTypes<FloatType>::load((FloatType *)fromptr), vec::SIMDTypes<FloatType>::load((FloatType *)ptr)));
++ptr; ++fromptr;
}
}
to = (FloatType *)ptr, from = (FloatType *)fromptr;
while(to < end) *to++ *= *from++;
#else
#pragma message("Enjoy your serial version.")
for(size_t i(0); i < (static_cast<size_t>(1) << nelem); ++i) to[i] *= from[i]; // Could be vectorized.
#endif
}
template<typename FloatType, typename Functor>
void block_apply(FloatType *pos, size_t nelem, const Functor &func=Functor{}) {
#if __AVX2__ || _FEATURE_AVX512F || __SSE2__
#pragma message("Using vectorized multiplication.")
using Space = typename vec::SIMDTypes<FloatType>;
using SIMDType = typename Space::Type;
SIMDType *ptr((SIMDType *)pos);
FloatType *end(pos + nelem);
if((uint64_t)ptr & Space::MASK) {
while((FloatType *)ptr < end - sizeof(SIMDType) / sizeof(FloatType)) {
Space::storeu((FloatType *)ptr,
func(Space::loadu((FloatType *)ptr)));
++ptr;
}
} else {
while((FloatType *)ptr < end - sizeof(SIMDType) / sizeof(FloatType)) {
Space::store((FloatType *)ptr,
func(Space::load((FloatType *)ptr)));
++ptr;
}
}
pos = (FloatType *)ptr;
while(pos < end) *pos = func.scalar(*pos), ++pos;
#else
#pragma message("Enjoy your serial version.")
for(size_t i(0); i < (static_cast<size_t>(1) << nelem); ++i) to[i] *= func.scalar(to[i]); // Could be vectorized.
#endif
}
/**
 * Applies a functor to every element of a container, in place.
 *
 * If the first two elements are adjacent in memory, the container is handed
 * to the pointer overload of block_apply as one buffer of con.size()
 * elements. Otherwise, and for containers with fewer than two elements, each
 * element is replaced by func.scalar(element).
 *
 * @param con   Container providing size(), operator[] and begin()/end().
 * @param func  Functor providing operator()(vector) and scalar(value); the
 *              caller's instance is used on both paths.
 */
template<typename Container, typename Functor>
void block_apply(Container &con, const Functor &func=Functor{}) {
    const size_t nelem(con.size());
    // The adjacency probe reads con[0] and con[1], so it is only attempted
    // when both exist; empty and single-element containers take the scalar loop.
    if(nelem > 1 && &con[1] - &con[0] == 1) {
        block_apply(&(*std::begin(con)), nelem, func);
    } else {
        for(auto &el: con) el = func.scalar(el);
    }
}
} // namespace vec
#undef OP
#undef decop
#undef SLEEF_OP
#undef dec_sleefop_prec
#undef dec_all_precs
#undef dec_all_trig
#endif // #ifndef _VEC_H__
This works just fine for the basic intrinsics. Using block_apply is what I have not been able to do.
My test program is
#include "blaze/Math.h"
#include <random>
#include "vec.h"
#ifndef FLOAT_TYPE
# define FLOAT_TYPE float
#endif
int main() {
blaze::DynamicVector<FLOAT_TYPE> vec(1 << 16);
std::mt19937_64 gen(13);
std::chi_squared_distribution<FLOAT_TYPE> dist(16);
for(auto &el: vec) el = dist(gen);
vec::block_apply(vec, vec::SIMDTypes<FLOAT_TYPE>::apply_cos_u10{});
}
On my machine with AVX2, that means that vec::SIMDTypes<FLOAT_TYPE>::apply_cos_u10{} knows it needs to call Sleef_cosf8_u10avx2. This symbol is present both in the header file and in the dynamically loaded library file.
from sleef.
Please tell me the compiler options you used to compile your code.
from sleef.
I used
g++-7 -O3 -funroll-loops -pipe -fno-strict-aliasing -mavx2 -march=native -mveclibabi=svml -ftree-vectorize -fno-rtti -std=c++17 -Wall -Wextra -DFLOAT_TYPE=double -Iblaze -I sleef/build/include/ -L. -Lsleef/build/lib src/vtest.cpp -o vtest -lsleef -lsvml
Actually, the above worked after I added -lsvml and -L$PATH_TO_SVML.
I just didn't realize that Sleef depended on SVML. I'd prefer it if it could be independent of SVML, given that SVML requires a license outside of academic/open-source use, even though I'm an academic and have a free license.
from sleef.
Since you specified -mveclibabi=svml, the automatic vectorizer in g++ generates references to svml.
It is not SLEEF that is dependent on SVML.
from sleef.
You're entirely right. Thank you -- I had that left over from when I was testing SVML before.
I'm sorry for the trouble, and thank you for your contribution!
(At some point, I might have this wrapper more complete, in case you're interested in an easier interface down the road.)
from sleef.
Related Issues (20)
- Tests fail to build with inline headers on for armhf
- Restart/Fix CI tests HOT 4
- Failures on various archs for gcc 12 HOT 2
- Failures for s390x VXE2 with qemu HOT 2
- Some tests fail on AArch64 with LLVM 17 HOT 1
- Deprecated usage of MD5 API
- Fix RVV inline header generation HOT 3
- Test more OS-es in Github Actions HOT 7
- Add GNU make as potential generator in CI
- Please, create tag release HOT 3
- dft test gets stuck during initialisation when hardware vector length is very long HOT 2
- Failure on ppc64 when cross compiled with gcc-11 toolchain
- Inaccuracy for f32 erf on non-FMA instruction set
- New test failures in 3.6 HOT 7
- F64 exp returns infinity slightly too soon
- RISC-V architecture missing on https://sleef.org/ HOT 1
- Clang on Windows and GNU ABI
- RISC-V: exploit `-mrvv-vector-bits=zvl` when used HOT 1
- Missing unversioned symbolic link for GNUABI version in 3.6 HOT 3
- A few API functions were (accidentally?) removed in 3.6 HOT 11
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from sleef.