Also noticing many of the oft-used routines like matrix transpose only have SIMD paths for SSE. Do the Neon paths on these go to scalar or simd ops?
static simd_float2x2 SIMD_CFUNC simd_transpose(simd_float2x2 __x) {
#if defined __SSE__ - why no Neon path?
simd_float4 __x0, __x1;
__x0.xy = __x.columns[0];
__x1.xy = __x.columns[1];
simd_float4 __r01 = _mm_unpacklo_ps(__x0, __x1);
return simd_matrix(__r01.lo, __r01.hi);
#else
return simd_matrix((simd_float2){__x.columns[0][0], __x.columns[1][0]},
(simd_float2){__x.columns[0][1], __x.columns[1][1]});
#endif
}
Also there are two abs ops that are a part of AVX512 that are used under __AVX2__ flag.
static inline SIMD_CFUNC simd_long2 simd_abs(simd_long2 x) {
#if defined __arm64__
return vabsq_s64(x);
#elif defined __SSE4_1__ - should be __AVX512F__
return (simd_long2) _mm_abs_epi64((__m128i)x);
#else
simd_long2 mask = x 63; return (x ^ mask) - mask;
#endif
}
static inline SIMD_CFUNC simd_long4 simd_abs(simd_long4 x) {
#if defined __AVX2__ - should be __AVX512F__
return _mm256_abs_epi64(x);
#else
return simd_make_long4(simd_abs(x.lo), simd_abs(x.hi));
#endif
}
Topic:
App & System Services
SubTopic:
Core OS
Tags: